# Import dependencies

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    RandomizedSearchCV,
    GridSearchCV,
)
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    mean_squared_error,
    precision_score, 
    recall_score,
    f1_score,
)

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# Import cleaned data

In [10]:
df = pd.read_csv("../data/summer_men_100m_swimming_freestyle.csv")

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Height,Weight,NOC,Year,City,Medal,Times_Participated,Participation_Number
0,209,Oszkr Abay-Nemes,22.0,185.308036,79.379836,HUN,1936,Berlin,no medal,1,1
1,244,Mohammed Abbas Sabih,26.0,170.0,70.0,IRQ,2004,Athina,no medal,1,1
2,348,Dorri Abdel Kader Said,21.0,185.308036,79.379836,EGY,1948,London,no medal,3,1
3,351,Dorri Abdel Kader Said,25.0,185.308036,79.379836,EGY,1952,Helsinki,no medal,3,2
4,551,Roosevelt M. Abdulgafur,24.0,181.0,73.0,PHI,1968,Mexico City,no medal,1,1


In [13]:
df = df.drop(columns=["Unnamed: 0"])

KeyError: "['Unnamed: 0'] not found in axis"

In [14]:
df.head()

Unnamed: 0,Name,Age,Height,Weight,NOC,Year,City,Medal,Times_Participated,Participation_Number
0,Oszkr Abay-Nemes,22.0,185.308036,79.379836,HUN,1936,Berlin,no medal,1,1
1,Mohammed Abbas Sabih,26.0,170.0,70.0,IRQ,2004,Athina,no medal,1,1
2,Dorri Abdel Kader Said,21.0,185.308036,79.379836,EGY,1948,London,no medal,3,1
3,Dorri Abdel Kader Said,25.0,185.308036,79.379836,EGY,1952,Helsinki,no medal,3,2
4,Roosevelt M. Abdulgafur,24.0,181.0,73.0,PHI,1968,Mexico City,no medal,1,1


## Preprocessing

### Split data into train and test

In [15]:
X = df.drop(columns=["Medal"])
y = df["Medal"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=55, stratify=y)       # test size 0.1 because I have a large dataset and i want to train on as much data as possible
X_train.shape, X_test.shape

((1112, 9), (124, 9))

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1112 entries, 1203 to 870
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  1112 non-null   object 
 1   Age                   1112 non-null   float64
 2   Height                1112 non-null   float64
 3   Weight                1112 non-null   float64
 4   NOC                   1112 non-null   object 
 5   Year                  1112 non-null   int64  
 6   City                  1112 non-null   object 
 7   Times_Participated    1112 non-null   int64  
 8   Participation_Number  1112 non-null   int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 86.9+ KB


### Separate categorical and numerical features

In [18]:
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()

categorical_features, numeric_features

(['Name', 'NOC', 'City'],
 ['Age',
  'Height',
  'Weight',
  'Year',
  'Times_Participated',
  'Participation_Number'])

### Define transformers pipelines

In [19]:
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore").set_output(transform="pandas"))])
categorical_transformer

0,1,2
,steps,"[('onehot', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [20]:
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler().set_output(transform="pandas"))])
numeric_transformer

0,1,2
,steps,"[('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [21]:
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                               ("cat", categorical_transformer, categorical_features)
                                               ]).set_output(transform="pandas")
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


## Models

### Create a dataframe to keep scores

In [22]:
scores_df = pd.DataFrame(columns=[
    "model_name",
    "F1",
    "Precision",
    "Recall",
])

scores_df

Unnamed: 0,model_name,F1,Precision,Recall


### Train and evaluate different models

#### Logistic regression

##### Initialize model and build pipeline

In [23]:
lr_model = LogisticRegression(random_state=42, solver='liblinear')
lr_model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


In [24]:
pipeline_lr = Pipeline(steps=[
                              ("pre_process", preprocessor),
                              ("model", lr_model)
                              ])
pipeline_lr

0,1,2
,steps,"[('pre_process', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


##### Train and evaluate model

In [25]:
pipeline_lr.fit(X_train, y_train)
y_pred = pipeline_lr.predict(X_test)

class_labels = pipeline_lr.named_steps['model'].classes_

print(classification_report(y_test, y_pred))

pd.DataFrame(confusion_matrix(y_test, y_pred),
             columns=class_labels, index=class_labels)

              precision    recall  f1-score   support

      Bronze       0.00      0.00      0.00         2
        Gold       0.00      0.00      0.00         3
      Silver       0.00      0.00      0.00         3
    no medal       0.94      1.00      0.97       116

    accuracy                           0.94       124
   macro avg       0.23      0.25      0.24       124
weighted avg       0.88      0.94      0.90       124



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,Bronze,Gold,Silver,no medal
Bronze,0,0,0,2
Gold,0,0,0,3
Silver,0,0,0,3
no medal,0,0,0,116


##### Save scores

In [26]:
# add the scores to the dataframe

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

new_row = {
    "model_name": "Logistic Regression",
    "F1": f1,
    "Precision": precision,
    "Recall": recall,
}

scores_df = pd.concat([scores_df, pd.DataFrame([new_row])], ignore_index=True)
scores_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  scores_df = pd.concat([scores_df, pd.DataFrame([new_row])], ignore_index=True)


Unnamed: 0,model_name,F1,Precision,Recall
0,Logistic Regression,0.904301,0.87513,0.935484


#### KNN classifier

##### Initialize model and build pipeline

In [27]:
knn = KNeighborsClassifier()
knn

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [28]:
pipeline_knn = Pipeline([("pre_process", preprocessor),
                         ("model", knn)])
pipeline_knn

0,1,2
,steps,"[('pre_process', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


##### Train and evaluate model

In [29]:
pipeline_knn.fit(X_train, y_train)
y_pred = pipeline_knn.predict(X_test)

class_labels = pipeline_knn.named_steps['model'].classes_

print(classification_report(y_test, y_pred))

pd.DataFrame(confusion_matrix(y_test, y_pred),
             columns=class_labels, index=class_labels)

              precision    recall  f1-score   support

      Bronze       0.00      0.00      0.00         2
        Gold       0.00      0.00      0.00         3
      Silver       0.00      0.00      0.00         3
    no medal       0.94      1.00      0.97       116

    accuracy                           0.94       124
   macro avg       0.24      0.25      0.24       124
weighted avg       0.88      0.94      0.91       124



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,Bronze,Gold,Silver,no medal
Bronze,0,1,0,1
Gold,0,0,0,3
Silver,0,0,0,3
no medal,0,0,0,116


##### Save scores

In [30]:
# add the scores to the dataframe

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

new_row = {
    "model_name": "KNN Classifier",
    "F1": f1,
    "Precision": precision,
    "Recall": recall,
}

scores_df = pd.concat([scores_df, pd.DataFrame([new_row])], ignore_index=True)
scores_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,model_name,F1,Precision,Recall
0,Logistic Regression,0.904301,0.87513,0.935484
1,KNN Classifier,0.908085,0.882245,0.935484


#### Naive Bayes

##### Initialize model and build pipeline

In [31]:
nb = BernoulliNB()
nb

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [32]:
pipeline_nb = Pipeline([("pre_process", preprocessor),
                        ("model", nb)])
pipeline_nb

0,1,2
,steps,"[('pre_process', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


##### Train and evaluate model

In [33]:
pipeline_nb.fit(X_train, y_train)
y_pred = pipeline_nb.predict(X_test)

class_labels = pipeline_nb.named_steps['model'].classes_

print(classification_report(y_test, y_pred))

pd.DataFrame(confusion_matrix(y_test, y_pred),
             columns=class_labels, index=class_labels)

              precision    recall  f1-score   support

      Bronze       0.00      0.00      0.00         2
        Gold       0.00      0.00      0.00         3
      Silver       0.00      0.00      0.00         3
    no medal       0.94      1.00      0.97       116

    accuracy                           0.94       124
   macro avg       0.23      0.25      0.24       124
weighted avg       0.88      0.94      0.90       124



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,Bronze,Gold,Silver,no medal
Bronze,0,0,0,2
Gold,0,0,0,3
Silver,0,0,0,3
no medal,0,0,0,116


##### Save scores

In [34]:
# add the scores to the dataframe

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

new_row = {
    "model_name": "Naive Bayes Classifier",
    "F1": f1,
    "Precision": precision,
    "Recall": recall,
}

scores_df = pd.concat([scores_df, pd.DataFrame([new_row])], ignore_index=True)
scores_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,model_name,F1,Precision,Recall
0,Logistic Regression,0.904301,0.87513,0.935484
1,KNN Classifier,0.908085,0.882245,0.935484
2,Naive Bayes Classifier,0.904301,0.87513,0.935484


#### SVM classifier

##### Linear Support vector classification

###### Initialize model and build pipeline

###### Train and evaluate model

###### Save scores

##### Support vector classification with Radial basis function

##### Initialize model and build pipeline

##### Train and evaluate model

##### Save scores

#### Decision tree

##### Initialize model and build pipeline

##### Train and evaluate model

##### Save scores

#### Random forest

##### Initialize model and build pipeline

##### Train and evaluate model

##### Save scores

#### AdaBoost

##### Initialize model and build pipeline

##### Train and evaluate model

##### Save scores

#### Gradient Boosting

##### Initialize model and build pipeline

##### Train and evaluate model

##### Save scores

#### XGBoost

##### Initialize model and build pipeline

##### Train and evaluate model

##### Save scores

### Compare model performance after one fit

### Compare model performance after cross validation

## Hyperparameter tuning

## Stacking classifier