In [1]:
# Add dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pathlib import Path
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [2]:
columns = ['Name', 'Sex', 'Event', 'Equipment', 'Age', 'AgeClass', 'Division',
       'BodyweightKg', 'WeightClassKg', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg',
       'Squat4Kg', 'Best3SquatKg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg',
       'Bench4Kg', 'Best3BenchKg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg',
       'Deadlift4Kg', 'Best3DeadliftKg', 'TotalKg', 'Place', 'Wilks',
       'McCulloch', 'Glossbrenner', 'IPFPoints', 'Tested', 'Country',
       'Federation', 'Date', 'MeetCountry', 'MeetState', 'MeetName']

target = ['TotalKg']

In [3]:
#Read data into DataFrame
file_path = Path('..\Resources\openpowerlifting.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

In [4]:
df

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,Division,BodyweightKg,WeightClassKg,Squat1Kg,...,McCulloch,Glossbrenner,IPFPoints,Tested,Country,Federation,Date,MeetCountry,MeetState,MeetName
0,Abbie Murphy,F,SBD,Wraps,29.0,24-34,F-OR,59.8,60,80.0,...,324.16,286.42,511.15,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
1,Abbie Tuong,F,SBD,Wraps,29.0,24-34,F-OR,58.5,60,100.0,...,378.07,334.16,595.65,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
2,Ainslee Hooper,F,B,Raw,40.0,40-44,F-OR,55.4,56,,...,38.56,34.12,313.97,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
3,Amy Moldenhauer,F,SBD,Wraps,23.0,20-23,F-OR,60.0,60,-105.0,...,345.61,305.37,547.04,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
4,Andrea Rowan,F,SBD,Wraps,45.0,45-49,F-OR,104.0,110,120.0,...,338.91,274.56,550.08,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423349,Marian Cafalik,M,SBD,Raw,60.5,60-64,Masters 2,73.5,74,160.0,...,438.27,316.52,469.67,Yes,,PZKFiTS,2017-04-01,Poland,,Polish Classic Powerlifting Cup
1423350,Marian Piwowarczyk,M,SBD,Raw,55.5,55-59,Masters 2,63.5,66,90.0,...,372.60,295.66,423.03,Yes,Poland,PZKFiTS,2017-04-01,Poland,,Polish Classic Powerlifting Cup
1423351,Andrzej Bryniarski,M,SBD,Raw,62.5,60-64,Masters 2,94.4,105,140.0,...,382.36,264.22,378.84,Yes,,PZKFiTS,2017-04-01,Poland,,Polish Classic Powerlifting Cup
1423352,Stanisław Goroczko,M,SBD,Raw,63.5,60-64,Masters 2,80.8,83,-165.0,...,,,,Yes,,PZKFiTS,2017-04-01,Poland,,Polish Classic Powerlifting Cup


In [5]:
# # Drop the null columns where all values are null
# df=df.dropna(axis=1,how='all')
place_mask = df['Place'] == '1'
df = df.loc[place_mask]

event_mask = df['Event'] == 'SBD'
df = df.loc[event_mask]

df = df.drop(['Equipment', 'AgeClass', 'Division','WeightClassKg','Squat1Kg','Squat2Kg','Squat3Kg','Squat4Kg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg',
       'Bench4Kg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg',
       'Deadlift4Kg','Name', 'Event',
       'McCulloch', 'Glossbrenner', 'IPFPoints', 'Tested', 'Country',
       'Federation','MeetCountry', 'MeetState', 'MeetName', 'Wilks'], axis=1)


In [6]:
df.dtypes

Sex                 object
Age                float64
BodyweightKg       float64
Best3SquatKg       float64
Best3BenchKg       float64
Best3DeadliftKg    float64
TotalKg            float64
Place               object
Date                object
dtype: object

In [7]:
# gender = {'M': 1,'F': 0}
# df = pd.get_dummies(df["Gender"])
df["Sex"] = df["Sex"].astype('category')
df["Sex"] = df["Sex"].cat.codes
df['Place'] = df['Place'].astype('int')
df['Date'] = pd.to_datetime(df['Date'])

df.insert(loc=0, column='ID', value=np.arange(len(df)))
df.dtypes

ID                          int32
Sex                          int8
Age                       float64
BodyweightKg              float64
Best3SquatKg              float64
Best3BenchKg              float64
Best3DeadliftKg           float64
TotalKg                   float64
Place                       int32
Date               datetime64[ns]
dtype: object

In [8]:
df

Unnamed: 0,ID,Sex,Age,BodyweightKg,Best3SquatKg,Best3BenchKg,Best3DeadliftKg,TotalKg,Place,Date
6,0,0,23.0,59.8,125.0,70.0,150.0,345.0,1,2018-10-27
8,1,0,36.0,108.0,220.0,100.0,200.0,520.0,1,2018-10-27
9,2,0,37.0,74.8,200.0,95.0,180.0,475.0,1,2018-10-27
12,3,0,27.0,78.6,182.5,105.0,205.0,492.5,1,2018-10-27
16,4,0,50.0,55.2,137.5,70.0,182.5,390.0,1,2018-10-27
...,...,...,...,...,...,...,...,...,...,...
1423327,345339,1,26.5,99.6,305.5,195.0,400.0,900.5,1,2017-04-01
1423332,345340,1,24.5,116.9,295.0,195.0,320.0,810.0,1,2017-04-01
1423337,345341,1,27.5,137.3,270.0,205.0,295.0,770.0,1,2017-04-01
1423340,345342,1,40.5,58.7,183.0,142.5,190.0,515.5,1,2017-04-01


In [9]:
# Determine y and x columns
X = pd.get_dummies(df, columns=['Sex','Age','Best3SquatKg','Best3BenchKg','Best3DeadliftKg','BodyweightKg','Date']).drop('TotalKg', axis=1)
y = df['TotalKg']

In [10]:
X.shape

(345344, 17838)

In [11]:
y.shape

(345344,)

In [12]:
# Check the balance of our target values
y.value_counts()

600.0    1726
500.0    1639
550.0    1495
520.0    1457
590.0    1457
         ... 
219.9       1
204.2       1
138.3       1
63.5        1
633.9       1
Name: TotalKg, Length: 4277, dtype: int64

In [13]:
# Create training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# Instantiate the model
model = LinearRegression()
model

LinearRegression()

In [15]:
# Train the dataset x_train and y_train
model.fit(X_train, y_train)

MemoryError: Unable to allocate 17.2 GiB for an array with shape (17838, 259008) and data type int32

In [None]:
# Validate the model by Predicting the data 
y_predicted = model.predict(X_test)

# Load the predicted outcome into a DataFrame with the y_test data)
predicted_outcome = pd.DataFrame({"Prediction": y_predicted, "Actual": y_test}).reset_index(drop = True)
predicted_outcome.head()

In [None]:
# Test the simple ML model
print({"The accuracy Score of the model is"} accuracy_score(y_test, y_predicted))