# Age Prediction Using Marathon Data

In [132]:
import pandas as pd
import numpy as np
import matplotlib

In [133]:
df=pd.read_csv('/home/arnab782003s/kaggle/Data_Analysis_marathon/archive/Athletes.csv')
df.head()

Unnamed: 0,Bib,Zip,Age,Age Group,Gender,First Half,Second Half,Finish,Positive Split,Percent Change
0,25,93730,30,Under 35,M,3832,3961,7793,129,0.033664
1,9,97124,32,Under 35,M,3845,4059,7904,214,0.055657
2,20,80922,40,40-44,M,3795,4157,7952,362,0.095389
3,46,2136,38,35-39,M,3997,4065,8062,68,0.017013
4,51,6119,26,Under 35,M,3979,4198,8177,219,0.055039


### Dropping columns which are not useful for determining Age:

In [134]:
df=df.drop(columns=['Bib','Zip',])
df.head()

Unnamed: 0,Age,Age Group,Gender,First Half,Second Half,Finish,Positive Split,Percent Change
0,30,Under 35,M,3832,3961,7793,129,0.033664
1,32,Under 35,M,3845,4059,7904,214,0.055657
2,40,40-44,M,3795,4157,7952,362,0.095389
3,38,35-39,M,3997,4065,8062,68,0.017013
4,26,Under 35,M,3979,4198,8177,219,0.055039


### Encoding Categorical Data using Label Encoder:

In [135]:
from sklearn import preprocessing 
le=preprocessing.LabelEncoder()

col=['Age Group','Gender']

for c in col:
    df[c]=le.fit_transform(df[c])

df.head()

Unnamed: 0,Age,Age Group,Gender,First Half,Second Half,Finish,Positive Split,Percent Change
0,30,10,1,3832,3961,7793,129,0.033664
1,32,10,1,3845,4059,7904,214,0.055657
2,40,1,1,3795,4157,7952,362,0.095389
3,38,0,1,3997,4065,8062,68,0.017013
4,26,10,1,3979,4198,8177,219,0.055039


In [136]:
Y=df['Age']
df.drop(columns=['Age'])

Unnamed: 0,Age Group,Gender,First Half,Second Half,Finish,Positive Split,Percent Change
0,10,1,3832,3961,7793,129,0.033664
1,10,1,3845,4059,7904,214,0.055657
2,1,1,3795,4157,7952,362,0.095389
3,0,1,3997,4065,8062,68,0.017013
4,10,1,3979,4198,8177,219,0.055039
...,...,...,...,...,...,...,...
17215,2,1,7286,17096,24382,9810,1.346418
17216,5,1,10891,13701,24592,2810,0.258011
17217,6,1,11957,12829,24786,872,0.072928
17218,1,1,11536,13915,25451,2379,0.206224


### Scaling all values using Standard Scaler:

In [137]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df= scaler.fit_transform(df)
df

array([[-0.97427727,  1.37176717,  0.94484056, ..., -2.13547336,
        -1.04557206, -0.96628213],
       [-0.81479274,  1.37176717,  0.94484056, ..., -2.09797285,
        -0.95025839, -0.79771503],
       [-0.17685463, -0.96458621,  0.94484056, ..., -2.08175641,
        -0.78430047, -0.49318358],
       ...,
       [ 1.89644423,  0.33338789,  0.94484056, ...,  3.60548312,
        -0.21241846, -0.66533669],
       [-0.17685463, -0.96458621,  0.94484056, ...,  3.83014834,
         1.47743681,  0.35632978],
       [ 0.14211442, -0.96458621,  0.94484056, ...,  3.97035295,
         0.77435834, -0.11055603]])

### Train Test Splitting:

In [138]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df,Y, test_size=0.2, random_state=42)

### Model Used : RandomForestRegressor with K-Fold Cross Validation

In [139]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score

model = RandomForestRegressor(n_estimators=100, random_state=42)

In [140]:
y_train = y_train.values

### Mean Squared Error for each iteration:

In [141]:
k = 5
kf = KFold(n_splits=k)
cv_mse_scores = []

for train_index, val_index in kf.split(X_train):
    X_train_k, X_val_k = X_train[train_index], X_train[val_index]
    y_train_k, y_val_k = y_train[train_index], y_train[val_index]
    
    model.fit(X_train_k, y_train_k)
    y_val_pred = model.predict(X_val_k)
    mse = mean_squared_error(y_val_k, y_val_pred)
    r2 = r2_score(y_val_k, y_val_pred)
    cv_mse_scores.append(mse)
    print(f'Fold MSE: {mse}')

Fold MSE: 0.00012213352685050733
Fold MSE: 0.0017753539019963588
Fold MSE: 8.929219600725598e-06
Fold MSE: 0.0007821415607985446
Fold MSE: 0.00012947368421052487


In [142]:
mean_cv_mse = np.mean(cv_mse_scores)
print(f'Mean cross-validation MSE: {mean_cv_mse}')

Mean cross-validation MSE: 0.0005636063786913323


### Fitting Total Train Data and Computing Accuracy on Test:

In [143]:
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f'Test MSE: {test_mse}')
print(f'Test R²: {test_r2}')

Test MSE: 0.00021228222996515616
Test R²: 0.9999986911429551
