In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
import joblib

In [4]:
df = pd.read_csv('sbd_post_sql.csv')
df.shape

(405068, 16)

In [5]:
df.head()

Unnamed: 0,Sex,BirthClass,BW,S1,S2,S3,Best3S,B1,B2,B3,Best3B,D1,D2,D3,Best3D,Total
0,F,\N,50.6,85.0,90.0,92.5,92.5,57.5,60.0,62.5,62.5,137.5,142.5,150.0,150.0,305.0
1,F,24-39,51.0,-90.0,-90.0,90.0,90.0,50.0,52.5,-55.0,52.5,100.0,107.5,112.5,112.5,255.0
2,F,50-59,51.2,115.0,117.5,120.0,120.0,55.0,57.5,60.0,60.0,145.0,147.5,150.0,150.0,330.0
3,F,19-23,51.5,70.0,75.0,77.5,77.5,47.5,50.0,52.5,52.5,102.5,110.0,-117.5,110.0,240.0
4,F,24-39,51.8,-92.5,92.5,-95.0,92.5,60.0,62.5,66.0,66.0,117.5,122.5,125.0,125.0,283.5


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405068 entries, 0 to 405067
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Sex         405068 non-null  object
 1   BirthClass  405068 non-null  object
 2   BW          405068 non-null  object
 3   S1          405068 non-null  object
 4   S2          405068 non-null  object
 5   S3          405068 non-null  object
 6   Best3S      405068 non-null  object
 7   B1          405068 non-null  object
 8   B2          405068 non-null  object
 9   B3          405068 non-null  object
 10  Best3B      405068 non-null  object
 11  D1          405068 non-null  object
 12  D2          405068 non-null  object
 13  D3          405068 non-null  object
 14  Best3D      405068 non-null  object
 15  Total       405068 non-null  object
dtypes: object(16)
memory usage: 49.4+ MB


In order to model things, I'll first need to deal with nulls, one-hot encoding Sex and BirthClass, and ensuring the datatypes of the numerical values are numerical.

In [7]:
df.replace('\\N', None, inplace=True)
df.head()

Unnamed: 0,Sex,BirthClass,BW,S1,S2,S3,Best3S,B1,B2,B3,Best3B,D1,D2,D3,Best3D,Total
0,F,,50.6,85.0,90.0,92.5,92.5,57.5,60.0,62.5,62.5,137.5,142.5,150.0,150.0,305.0
1,F,24-39,51.0,-90.0,-90.0,90.0,90.0,50.0,52.5,-55.0,52.5,100.0,107.5,112.5,112.5,255.0
2,F,50-59,51.2,115.0,117.5,120.0,120.0,55.0,57.5,60.0,60.0,145.0,147.5,150.0,150.0,330.0
3,F,19-23,51.5,70.0,75.0,77.5,77.5,47.5,50.0,52.5,52.5,102.5,110.0,-117.5,110.0,240.0
4,F,24-39,51.8,-92.5,92.5,-95.0,92.5,60.0,62.5,66.0,66.0,117.5,122.5,125.0,125.0,283.5


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405068 entries, 0 to 405067
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Sex         405068 non-null  object
 1   BirthClass  348813 non-null  object
 2   BW          404317 non-null  object
 3   S1          308221 non-null  object
 4   S2          305952 non-null  object
 5   S3          299979 non-null  object
 6   Best3S      401970 non-null  object
 7   B1          307500 non-null  object
 8   B2          305465 non-null  object
 9   B3          298742 non-null  object
 10  Best3B      400479 non-null  object
 11  D1          306710 non-null  object
 12  D2          302907 non-null  object
 13  D3          294932 non-null  object
 14  Best3D      398549 non-null  object
 15  Total       390467 non-null  object
dtypes: object(16)
memory usage: 49.4+ MB


In [9]:
df[['BW', 'S1', 'S2','S3','Best3S','B1','B2','B3','Best3B','D1','D2', 'D3', 'Best3D', 'Total']] = df[['BW', 'S1', 'S2','S3','Best3S','B1','B2','B3','Best3B','D1','D2', 'D3', 'Best3D', 'Total']].astype(float)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405068 entries, 0 to 405067
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         405068 non-null  object 
 1   BirthClass  348813 non-null  object 
 2   BW          404317 non-null  float64
 3   S1          308221 non-null  float64
 4   S2          305952 non-null  float64
 5   S3          299979 non-null  float64
 6   Best3S      401970 non-null  float64
 7   B1          307500 non-null  float64
 8   B2          305465 non-null  float64
 9   B3          298742 non-null  float64
 10  Best3B      400479 non-null  float64
 11  D1          306710 non-null  float64
 12  D2          302907 non-null  float64
 13  D3          294932 non-null  float64
 14  Best3D      398549 non-null  float64
 15  Total       390467 non-null  float64
dtypes: float64(14), object(2)
memory usage: 49.4+ MB


Were there an API that I could call upon to cross reference lifter names (in the original dataframe) and ages, it might be possible to impute missing BirthClass values. However, since this data spans from the 1970s, this informaiton may not be available anywhere. There are too many to do by hand, so I will drop all rows with BirthClass nulls. Same thing for bodyweight.

In [12]:
df_no_null = df.dropna(subset=['BirthClass'])
df_no_null.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348813 entries, 1 to 405067
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         348813 non-null  object 
 1   BirthClass  348813 non-null  object 
 2   BW          348268 non-null  float64
 3   S1          271919 non-null  float64
 4   S2          269848 non-null  float64
 5   S3          264519 non-null  float64
 6   Best3S      346095 non-null  float64
 7   B1          271283 non-null  float64
 8   B2          269428 non-null  float64
 9   B3          263346 non-null  float64
 10  Best3B      344750 non-null  float64
 11  D1          270603 non-null  float64
 12  D2          267168 non-null  float64
 13  D3          260051 non-null  float64
 14  Best3D      343000 non-null  float64
 15  Total       335923 non-null  float64
dtypes: float64(14), object(2)
memory usage: 45.2+ MB


In [13]:
df_no_null = df_no_null.dropna(subset=['BW'])
df_no_null.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348268 entries, 1 to 405067
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         348268 non-null  object 
 1   BirthClass  348268 non-null  object 
 2   BW          348268 non-null  float64
 3   S1          271884 non-null  float64
 4   S2          269813 non-null  float64
 5   S3          264486 non-null  float64
 6   Best3S      345934 non-null  float64
 7   B1          271246 non-null  float64
 8   B2          269391 non-null  float64
 9   B3          263310 non-null  float64
 10  Best3B      344588 non-null  float64
 11  D1          270568 non-null  float64
 12  D2          267133 non-null  float64
 13  D3          260019 non-null  float64
 14  Best3D      342842 non-null  float64
 15  Total       335766 non-null  float64
dtypes: float64(14), object(2)
memory usage: 45.2+ MB


In [14]:
dfonehot = pd.get_dummies(df_no_null, columns=['Sex', 'BirthClass'])

In [15]:
dfonehot.head()

Unnamed: 0,BW,S1,S2,S3,Best3S,B1,B2,B3,Best3B,D1,...,Sex_F,Sex_M,Sex_Mx,BirthClass_14-18,BirthClass_19-23,BirthClass_24-39,BirthClass_40-49,BirthClass_50-59,BirthClass_60-69,BirthClass_70-999
1,51.0,-90.0,-90.0,90.0,90.0,50.0,52.5,-55.0,52.5,100.0,...,1,0,0,0,0,1,0,0,0,0
2,51.2,115.0,117.5,120.0,120.0,55.0,57.5,60.0,60.0,145.0,...,1,0,0,0,0,0,0,1,0,0
3,51.5,70.0,75.0,77.5,77.5,47.5,50.0,52.5,52.5,102.5,...,1,0,0,0,1,0,0,0,0,0
4,51.8,-92.5,92.5,-95.0,92.5,60.0,62.5,66.0,66.0,117.5,...,1,0,0,0,0,1,0,0,0,0
5,52.0,80.0,-87.5,87.5,87.5,57.5,62.5,-65.5,62.5,110.0,...,1,0,0,0,0,1,0,0,0,0


In [16]:
dfonehot['Sex_Mx'].sum()

5

I think it's important to represent data for underrepresented groups such as non-binary individuals, however, there are only 5 rows with informaiton on non-binary competitors. This means that the model won't actually learn anything about the performance of non-binary competitors. Given the technical limitation and the extremely low number of non-binary competitors, I will omit this information and I hope that the IPF continues to expand and record competitors in underrepresented genders.

In [17]:
dfonehot = dfonehot[dfonehot['Sex_Mx']==0]
dfonehot = dfonehot.drop('Sex_Mx', axis=1)

In [18]:
dfonehot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348263 entries, 1 to 405067
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   BW                 348263 non-null  float64
 1   S1                 271879 non-null  float64
 2   S2                 269808 non-null  float64
 3   S3                 264481 non-null  float64
 4   Best3S             345929 non-null  float64
 5   B1                 271241 non-null  float64
 6   B2                 269386 non-null  float64
 7   B3                 263305 non-null  float64
 8   Best3B             344583 non-null  float64
 9   D1                 270563 non-null  float64
 10  D2                 267128 non-null  float64
 11  D3                 260014 non-null  float64
 12  Best3D             342837 non-null  float64
 13  Total              335761 non-null  float64
 14  Sex_F              348263 non-null  uint8  
 15  Sex_M              348263 non-null  uint8  
 16  Bi

In [19]:
dfonehot['Sex_F'].sum()

112792

In [20]:
dfonehot['Sex_M'].sum()

235471

At this point in my model development, I will set acknowledge but set aside the imbalance in gender data here.

Starting with Squat, I will make a model that predicts the best S2 and then a model that predicts the best S3 using information like Sex, BirthClass, and BW. I will not make a model that predicts S1 since that is largely dependent on the athlete's one rep max at the time of competition along with fatigue and injury factors. This is not information I have access to in this data. Second and third attempts, however, are more strategic and dependent on the tone set by the first attempt.

Since there will be multiple models that consecutively determine the next lift attempt, I will create dataframes that drop the relevant values at each model development onset.

## Linear Regression

In [33]:
dflr1 = dfonehot.dropna(subset=['S1'])

In [35]:
dflr1 = dflr1.dropna(subset=['S2'])

In [36]:
dflr1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269752 entries, 1 to 405066
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   BW                 269752 non-null  float64
 1   S1                 269752 non-null  float64
 2   S2                 269752 non-null  float64
 3   S3                 264320 non-null  float64
 4   Best3S             269631 non-null  float64
 5   B1                 269098 non-null  float64
 6   B2                 267991 non-null  float64
 7   B3                 262128 non-null  float64
 8   Best3B             268641 non-null  float64
 9   D1                 268326 non-null  float64
 10  D2                 265975 non-null  float64
 11  D3                 259101 non-null  float64
 12  Best3D             267514 non-null  float64
 13  Total              260886 non-null  float64
 14  Sex_F              269752 non-null  uint8  
 15  Sex_M              269752 non-null  uint8  
 16  Bi

In [37]:
X = dflr1[['Sex_F', 'Sex_M', 'BirthClass_14-18', 'BirthClass_19-23', 'BirthClass_24-39', 'BirthClass_40-49', 'BirthClass_50-59', 'BirthClass_60-69', 'BirthClass_70-999', 'BW', 'S1']]
y = dflr1['S2']

In [38]:
X_full, X_test, y_full, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [39]:
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.2, random_state=2)

In [49]:
#first pass no hyperparameter optimizations specified
linreg = LinearRegression()
linreg.fit(X_train, y_train)
print(linreg.score(X_train, y_train))
print(linreg.score(X_val, y_val))

0.08973892421785135
0.09305034273523927


## HGradBoost Regressor

In [51]:
dfhgb = dfonehot.dropna(subset=['S2'])

In [52]:
Xgb = dfhgb[['Sex_F', 'Sex_M', 'BirthClass_14-18', 'BirthClass_19-23', 'BirthClass_24-39', 'BirthClass_40-49', 'BirthClass_50-59', 'BirthClass_60-69', 'BirthClass_70-999', 'BW', 'S1']]
ygb = dfhgb['S2']

In [53]:
X_fullgb, X_testgb, y_fullgb, y_testgb = train_test_split(Xgb, ygb, test_size=0.2, random_state=2)

In [54]:
X_traingb, X_valgb, y_traingb, y_valgb = train_test_split(X_fullgb, y_fullgb, test_size=0.2, random_state=2)

In [55]:
hgradboost = HistGradientBoostingRegressor()
hgradboost.fit(X_traingb, y_traingb)
print(hgradboost.score(X_traingb, y_traingb))
print(hgradboost.score(X_testgb, y_testgb))

0.11025672994018254
0.10273380772420981


In [60]:
params = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'l2_regularization' : [0.01, 0.1, 0, 10, 100],
                  'max_depth'    : [4,6,8,10]
}

In [61]:
grid = GridSearchCV(hgradboost, param_grid=params)
fitted_search = grid.fit(X_full, y_full)

In [62]:
print(f'The best parameters: {fitted_search.best_params_}')
print(f'The best score: {fitted_search.best_score_}')

The best parameters: {'l2_regularization': 10, 'learning_rate': 0.04, 'max_depth': 8}
The best score: 0.10192486345900986


## SVR

In [66]:
svrmodel = SVR()
params = {
    'C':[0.1,1,10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.01, 0.1, 1]
}

In [67]:
grid = GridSearchCV(svrmodel, param_grid=params)
fitted_search = grid.fit(X_full, y_full)

In [None]:
print(f'The best parameters: {fitted_search.best_params_}')
print(f'The best score: {fitted_search.best_score_}')

These are not great scores. I will revisit the data cleaning to figure out how to proceed. Failed attempts are currently marked with a negative number, and that might be throwing off the model. I will train one with only the successful attempts (after all, the goal is to be able to predict a successful attempt), and check the score of that model.

First, I'll try by turning the negative numbers to 0.

In [None]:
dflr2 = dflr1

In [None]:
dflr2['s1'] = dflr2['s1'].where(dflr2['s1'] >= 0, 0)

In [None]:
dflr2.sample(10)

In [None]:
X2 = dflr2[['Sex_F', 'Sex_M', 'BirthClass_14-18', 'BirthClass_19-23', 'BirthClass_24-39', 'BirthClass_40-49', 'BirthClass_50-59', 'BirthClass_60-69', 'BirthClass_70-999', 'BW', 'S1']]
y2 = dflr2['S2']

In [None]:
X_full2, X_test2, y_full2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=2)

In [None]:
X_train2, X_val2, y_train2, y_val2 = train_test_split(X_full2, y_full2, test_size=0.2, random_state=2)

In [None]:
hgradboost = HistGradientBoostingRegressor()
hgradboost.fit(X_train2, y_train2)
print(hgradboost.score(X_train2, y_train2))
print(hgradboost.score(X_test2, y_test2))

Now I'll try by getting rid of all rows with negative numbers.

In [None]:
dflr3 = dflr1[~(dflr1 < 0).any(axis=1)]
