In [1]:
from   sklearn.preprocessing   import MinMaxScaler
from   sklearn.linear_model    import LinearRegression
from   sklearn.metrics         import mean_squared_error, r2_score
import pandas                  as     pd
import numpy                   as     np
import seaborn                 as     sns
import matplotlib.pyplot       as     plt

# Dummy Encoding

In [2]:
medals = pd.read_pickle('../dat/medals.pkl')
medals.head()

Unnamed: 0,Year,Summer,Games,Host,NOC,Region,Athletes,Females,Sports,Events,Medals,Golds,Silvers,Bronzes
0,1896,1,1896 Summer,0,AUS,Australia,5,0,2,5,3,2,0,1
1,1896,1,1896 Summer,0,AUT,Austria,8,0,3,8,5,2,1,2
2,1896,1,1896 Summer,0,DEN,Denmark,15,0,5,12,6,1,2,3
3,1896,1,1896 Summer,0,FRA,France,26,0,6,18,11,5,4,2
4,1896,1,1896 Summer,0,GBR,UK,25,0,7,19,9,3,3,3


In [3]:
# create dummy variables
model = pd.get_dummies(data=medals, columns=['NOC'])
model.head()

Unnamed: 0,Year,Summer,Games,Host,Region,Athletes,Females,Sports,Events,Medals,...,NOC_VIE,NOC_VIN,NOC_VNM,NOC_WIF,NOC_YAR,NOC_YEM,NOC_YMD,NOC_YUG,NOC_ZAM,NOC_ZIM
0,1896,1,1896 Summer,0,Australia,5,0,2,5,3,...,0,0,0,0,0,0,0,0,0,0
1,1896,1,1896 Summer,0,Austria,8,0,3,8,5,...,0,0,0,0,0,0,0,0,0,0
2,1896,1,1896 Summer,0,Denmark,15,0,5,12,6,...,0,0,0,0,0,0,0,0,0,0
3,1896,1,1896 Summer,0,France,26,0,6,18,11,...,0,0,0,0,0,0,0,0,0,0
4,1896,1,1896 Summer,0,UK,25,0,7,19,9,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Correlation
NOC_medal_correlations = model.corr()
NOC_medal_correlations = NOC_medal_correlations[['Medals','Golds','Silvers','Bronzes']]
#NOC_medal_correlations = NOC_medal_correlations.abs()
NOC_medal_correlations = NOC_medal_correlations.sort_values(by='Medals', ascending=False)
NOC_medal_correlations.head(50)

Unnamed: 0,Medals,Golds,Silvers,Bronzes
Medals,1.0,0.933208,0.925465,0.910814
Golds,0.933208,1.0,0.787451,0.761291
Silvers,0.925465,0.787451,1.0,0.791388
Bronzes,0.910814,0.761291,0.791388,1.0
Athletes,0.843252,0.731369,0.801421,0.818615
Events,0.728245,0.621348,0.684731,0.728202
Females,0.715589,0.629688,0.662149,0.701756
Sports,0.611403,0.503231,0.588675,0.620934
NOC_USA,0.387638,0.443014,0.328241,0.278849
Host,0.327053,0.314088,0.322436,0.265433


# Time-Based Train / Test Split

In [24]:
y = pd.DataFrame(model['Medals'])
y.head()

Unnamed: 0,Medals
0,3
1,5
2,6
3,11
4,9


In [25]:
X = model.drop(['Medals','Games','Golds','Silvers','Bronzes','Region'], axis=1)
X.head()

Unnamed: 0,Year,Summer,Host,Athletes,Females,Sports,Events,NOC_AFG,NOC_AHO,NOC_ALB,...,NOC_VIE,NOC_VIN,NOC_VNM,NOC_WIF,NOC_YAR,NOC_YEM,NOC_YMD,NOC_YUG,NOC_ZAM,NOC_ZIM
0,1896,1,0,5,0,2,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1896,1,0,8,0,3,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1896,1,0,15,0,5,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1896,1,0,26,0,6,18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1896,1,0,25,0,7,19,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Scaling
scaler = MinMaxScaler()
scaler.fit(X)
scaler.transform(X)

# Linear Regressor Model

In [26]:
# Create linear regression object
regr = LinearRegression()

In [27]:
# Train the model using the training sets
regr.fit(X[['Year','Summer','Host','Athletes','Females','Sports','Events']], y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [28]:
# Make predictions using the testing set
y_pred = pd.DataFrame(regr.predict(X), columns=['Prediction'])
y_pred['Prediction'] = y_pred['Prediction'].astype('int64')
y_pred.head()

ValueError: shapes (3837,237) and (7,1) not aligned: 237 (dim 1) != 7 (dim 0)

In [None]:
y['Prediction'] = y_pred['Prediction']
y['NOC']        = medals['NOC']
y['Error']      = y['Medals'] - y['Prediction']
y               = y.sort_values(by='Medals', ascending=False)
y

# Validation on Test Set

In [None]:
X.columns

In [None]:
# The coefficients
features = pd.DataFrame(regr.coef_.T, columns=['Coefficients'])
features['Feature'] = X.columns.T
features = features.sort_values(by='Coefficients', ascending=False)
features

In [None]:
regr.intercept_

In [None]:
regr.score(X,y)

In [None]:
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y, y_pred))

In [None]:
# The mean squared error
print("Mean squared error: %.2f" % np.sqrt(mean_squared_error(y, y_pred)))

# Prediction of 2020 Tokyo Olympics

In [None]:
# Plot outputs
plt.figure(figsize = (10,6))
sns.set_style("whitegrid")
plt.scatter(X['Events'], y, c='navy', alpha=0.6)
plt.plot(X['Events'], y_pred, color='red', linewidth=3)
plt.show()