In [1]:
# Import required dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
import pipeline_utilities as p_utils
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder



In [2]:
# Read the medal records from Resources/summer.csv and preview the first rows
file_path = 'Resources/summer.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [3]:
# Print the column names, non-null counts and dtypes of the raw frame
# (all features plus the Medal target).
# Per the recorded output, Country is the only column with missing values
# (31161 non-null out of 31165 rows).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31165 entries, 0 to 31164
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        31165 non-null  int64 
 1   City        31165 non-null  object
 2   Sport       31165 non-null  object
 3   Discipline  31165 non-null  object
 4   Athlete     31165 non-null  object
 5   Country     31161 non-null  object
 6   Gender      31165 non-null  object
 7   Event       31165 non-null  object
 8   Medal       31165 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB


In [None]:
# TODO: data exploration (module 12 activities)
#   - p-value evaluation
#   - z-scores
#   - histograms
#   - plotting
#   - heatmap

In [5]:
# Drop unnecessary columns using the project helper in pipeline_utilities.
# Per the preview below, Year, Sport, Country, Gender and Medal remain
# (City, Discipline, Athlete and Event are gone).
# NOTE(review): confirm drop_columns returns a new frame rather than
# mutating df in place -- later cells still read from df.
clean_df = p_utils.drop_columns(df)
clean_df.head()

Unnamed: 0,Year,Sport,Country,Gender,Medal
0,1896,Aquatics,HUN,Men,Gold
1,1896,Aquatics,AUT,Men,Silver
2,1896,Aquatics,GRE,Men,Bronze
3,1896,Aquatics,GRE,Men,Gold
4,1896,Aquatics,GRE,Men,Silver


In [3]:
# Count medals for every (Country, Sport, Gender, Medal) combination and
# order the combinations from the highest count to the lowest.
# NOTE: groupby leaves out rows whose grouping key is missing by default,
# so records without a Country are not counted here.
medal_ct_country = (
    df.groupby(['Country', 'Sport', 'Gender', 'Medal'])
    .agg(Medal_count=('Medal', 'count'))
    .sort_values('Medal_count', ascending=False)
    .reset_index()
)
medal_ct_country.head()


Unnamed: 0,Country,Sport,Gender,Medal,Medal_count
0,USA,Athletics,Men,Gold,382
1,USA,Aquatics,Men,Gold,319
2,USA,Aquatics,Women,Gold,259
3,USA,Athletics,Men,Silver,240
4,USA,Aquatics,Men,Silver,188


In [4]:
# Run the project's model-comparison helper on the aggregated medal counts.
# Per the recorded output it reports Mean Squared Error, R-squared and
# adjusted R-squared for Linear Regression, Random Forest Regressor and
# XGB Regressor, then names the best model.
# NOTE(review): feature/target selection, encoding and the train/test split
# happen inside pipeline_utilities -- confirm there.
p_utils.medal_model_generator(medal_ct_country)

Testing Linear Regression
Mean Squared Error: 269.31503223059843
R-squared: 0.22481931857463744
Adjusted R-squared: 0.22078716938646914


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Testing Random Forest Regressor
Mean Squared Error: 168.9391525839793
R-squared: 0.4285833316132802
Adjusted R-squared: 0.42561107326016334
Testing XGB Regressor
Mean Squared Error: 140.0057771730882
R-squared: 0.6751101691795114
Adjusted R-squared: 0.6734202350790146
XGB Regressor is the best model


In [7]:
# columns_to_encode = ['Sport','Country','Gender']

In [8]:
# # Create an encoder for the categorical columns
# ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# # Set up the OneHotEncoder so it will transform to Pandas
# ohe.set_output(transform="pandas")

# # Fit and transform the OneHotEncoder to the columns to encode
# medals_data_ohe = ohe.fit_transform(clean_df[columns_to_encode])
# medals_data_ohe.head()

Unnamed: 0,Sport_Archery,Sport_Athletics,Sport_Badminton,Sport_Baseball,Sport_Basketball,Sport_Basque Pelota,Sport_Boxing,Sport_Canoe,Sport_Canoe / Kayak,Sport_Cricket,...,Country_USA,Country_UZB,Country_VEN,Country_VIE,Country_YUG,Country_ZAM,Country_ZIM,Country_ZZX,Country_nan,Gender_Women
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# # Create an encoder for the ordinal categorical column Medal
# medal_ord_enc = OrdinalEncoder(categories=[['Gold','Silver','Bronze']],
#                                             encoded_missing_value=-1, 
#                                             handle_unknown='use_encoded_value', 
#                                             unknown_value=-1)
# # Set up the OrdinalEncoder so it will transform to Pandas
# medal_ord_enc.set_output(transform="pandas")


# # Fit and transform the OrdinalEncoder on the Medal column
# medal_ord_enc = medal_ord_enc.fit_transform(clean_df[['Medal']])
# medal_ord_enc.head()


In [None]:
# # Create an encoder for the ordinal categorical column Medal
# medal_ord_enc = OrdinalEncoder(categories=[['Gold','Silver','Bronze']],
#                                             encoded_missing_value=-1, 
#                                             handle_unknown='use_encoded_value', 
#                                             unknown_value=-1)
# # Set up the OrdinalEncoder so it will transform to Pandas
# medal_ord_enc.set_output(transform="pandas")


# # Fit and transform the OrdinalEncoder on the Medal column
# medal_ord_enc = medal_ord_enc.fit_transform(clean_df[['Medal']])
# medal_ord_enc.head()

In [10]:
# #concat above dataframes
# df_enc = pd.concat([medals_data_ohe,medal_ord_enc],axis=1)
# df_enc.head()