In [None]:
from   sklearn.preprocessing   import MinMaxScaler
from   sklearn.linear_model    import LinearRegression
from   sklearn.metrics         import mean_squared_error, r2_score
import pandas                  as     pd
import numpy                   as     np
import seaborn                 as     sns
import matplotlib.pyplot       as     plt

# Dummy Encoding

In [None]:
# Load the pickled medals table and preview its first rows.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
medals_path = '../dat/medals.pkl'
medals = pd.read_pickle(medals_path)
medals.head()

In [None]:
# Replace the categorical NOC column with one indicator column per value;
# every other column of `medals` is carried over unchanged.
dummy_cols = ['NOC']
model = pd.get_dummies(medals, columns=dummy_cols)
model.head()

In [None]:
# Correlation

# Time-Based Train / Test Split (TODO: not implemented yet — the cells below build X and y from all rows)

In [None]:
# Target: the medal count, kept as a one-column DataFrame named 'Medals'.
y = model['Medals'].to_frame()
y.head()

In [None]:
# Feature matrix: the encoded table minus the target and the other columns
# that are not used as predictors.
non_feature_cols = ['Medals', 'Games', 'Golds', 'Silvers', 'Bronzes', 'Region']
X = model.drop(columns=non_feature_cols)
X.head()

In [None]:
# Scaling
# Min-max scale every feature column and KEEP the result. Previously the
# output of `transform` was only echoed as cell output and then thrown away,
# so the scaling had no effect on anything downstream.
# `X` itself is left untouched on purpose: later cells select columns from
# `X` by name. Switch those cells to `X_scaled` to train on scaled inputs.
# NOTE(review): the scaler is fitted on all rows; once a train/test split
# exists, fit it on the training rows only to avoid leakage.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.head()

# Linear Regressor Model

In [None]:
# Instantiate an (as yet unfitted) ordinary least-squares regressor.
# `fit_intercept=True` is the library default, spelled out for clarity.
regr = LinearRegression(fit_intercept=True)

In [None]:
# Fit the regressor on seven named predictors, using every row of X
# (no train/test split has been made at this point). The remaining
# columns of X are not passed to the model.
base_features = ['Year', 'Summer', 'Host', 'Athletes', 'Females', 'Sports', 'Events']
regr.fit(X[base_features], y)

In [None]:
# Predict on the same rows the model was fitted on (there is no held-out
# test set in this notebook yet, so these are in-sample predictions).
#
# The model must be fed exactly the columns it was trained on; passing all of
# X raises a feature-count mismatch when the fit used only a subset.
# `feature_names_in_` is set by scikit-learn >= 1.0 when fitting on a
# DataFrame; on older versions we fall back to every column of X.
fit_cols = list(getattr(regr, 'feature_names_in_', X.columns))

y_pred = pd.DataFrame(
    np.ravel(regr.predict(X[fit_cols])),  # flatten (n, 1) output to 1-D
    columns=['Prediction'],
    index=X.index,  # keep X's row labels so later index-aligned joins line up
)

# Medal counts are whole numbers: round to the nearest integer. A bare
# astype('int64') would truncate toward zero and bias predictions downward.
y_pred['Prediction'] = y_pred['Prediction'].round().astype('int64')
y_pred.head()

In [None]:
# Build the actual-vs-predicted comparison in its own frame rather than
# adding columns to `y`. Mutating and re-sorting `y` here would turn the
# target into a multi-column frame containing strings, in a different row
# order from X, which breaks any later cell that passes `y` as the target.
# Columns are joined on the row index, so `y`, `y_pred` and `medals` are
# assumed to share the same index labels — TODO confirm.
results = y[['Medals']].assign(
    Prediction=y_pred['Prediction'],
    NOC=medals['NOC'],
)
results['Error'] = results['Medals'] - results['Prediction']
results = results.sort_values(by='Medals', ascending=False)

# Show only the top rows instead of dumping the whole table.
results.head(10)

# Validation on Test Set (TODO: no held-out split exists yet — the metrics below are computed in-sample)

In [None]:
# List every column of the feature matrix X (cell output only; nothing is assigned).
X.columns

In [None]:
# The coefficients
# Pair each fitted coefficient with the column it belongs to. The names must
# come from the columns the model was actually fitted on: using all of
# X.columns fails with a length mismatch when the fit used only a subset.
# `feature_names_in_` is available on scikit-learn >= 1.0 after fitting on a
# DataFrame; older versions fall back to X.columns.
coef_names = list(getattr(regr, 'feature_names_in_', X.columns))
features = pd.DataFrame({
    'Coefficients': np.ravel(regr.coef_),  # coef_ is (1, n) when y was 2-D
    'Feature': coef_names,
})
features = features.sort_values(by='Coefficients', ascending=False)
features

In [None]:
# Show the fitted intercept term of the linear model.
regr.intercept_

In [None]:
# In-sample R^2 of the fitted model.
# * Features: exactly the columns the model was fitted on (all of X fails
#   when the fit used a subset). Falls back to X.columns on scikit-learn
#   versions without `feature_names_in_`.
# * Target: only the 'Medals' column, re-ordered to X's row order, because
#   `y` may have gained extra columns or been re-sorted by earlier cells and
#   scikit-learn compares rows by position, not by index label.
#   Assumes the row index is unique — TODO confirm.
score_cols = list(getattr(regr, 'feature_names_in_', X.columns))
regr.score(X[score_cols], y.loc[X.index, 'Medals'])

In [None]:
# R^2 (coefficient of determination) of the integer predictions:
# 1 is a perfect prediction.
# Compare only the 'Medals' column, aligned row-for-row with `y_pred`;
# passing the whole `y` frame fails once it holds extra columns, and a
# re-sorted `y` would be compared against the wrong rows.
# Assumes `y` and `y_pred` share the same unique index labels — TODO confirm.
y_true_r2 = y.loc[y_pred.index, 'Medals']
print('Variance score: %.2f' % r2_score(y_true_r2, y_pred['Prediction']))

In [None]:
# Root mean squared error (RMSE), in medals.
# The value printed is the square root of the MSE, so it is labelled as
# RMSE (the label used to say "Mean squared error").
# Compare only the 'Medals' column, aligned row-for-row with `y_pred`.
# Assumes `y` and `y_pred` share the same unique index labels — TODO confirm.
y_true_rmse = y.loc[y_pred.index, 'Medals']
rmse = np.sqrt(mean_squared_error(y_true_rmse, y_pred['Prediction']))
print("Root mean squared error: %.2f" % rmse)

# Prediction of 2020 Tokyo Olympics

In [None]:
# Plot outputs: actual vs. predicted medal counts against the number of events.
sns.set_style("whitegrid")  # set before creating the figure so it takes effect
fig, ax = plt.subplots(figsize=(10, 6))

# Use plain arrays in X's row order so the three series line up by position.
# Only the 'Medals' column of `y` is plotted; `y` may hold extra columns or
# have been re-sorted by earlier cells.
events = X['Events'].to_numpy()
actual = y.loc[X.index, 'Medals'].to_numpy()
predicted = y_pred['Prediction'].to_numpy()

ax.scatter(events, actual, c='navy', alpha=0.6, label='Actual')
# Predictions depend on several features, not on Events alone, so they are
# drawn as points; joining them with a line in row order gives a zigzag.
ax.scatter(events, predicted, c='red', alpha=0.6, s=15, label='Predicted')

ax.set_xlabel('Events')
ax.set_ylabel('Medals')
ax.set_title('Actual vs. predicted medals by number of events')
ax.legend()
plt.show()