In [1]:
# Add dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn import metrics
from pathlib import Path
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import psycopg2

In [2]:
!pip install psycopg2



# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Connect to the database and build the necessary dataframes from existing tables
conn_string = "host='project-database.c34a9viyb12x.us-east-1.rds.amazonaws.com' dbname='FuturePowerLifting' port='5432' user='postgres' password='Fantastic'"

conn = psycopg2.connect(conn_string)
test_query_comp = "SELECT * FROM competitor"
test_query_meet = "SELECT * FROM meet"
test_query_perf = "SELECT * FROM performance"
test_query_totalkg = "SELECT * FROM totalkg"
test_query_project_df = "SELECT * FROM project_df"
conn.set_client_encoding('UNICODE')
cursor = conn.cursor()

cursor.execute(test_query_meet)
colnames = [desc[0] for desc in cursor.description]
df_imp = cursor.fetchall()
df_meet = pd.DataFrame(data=df_imp, columns=colnames)

cursor.execute(test_query_perf)
colnames = [desc[0] for desc in cursor.description]
df_imp = cursor.fetchall()
df_perf = pd.DataFrame(data=df_imp, columns=colnames)

cursor.execute(test_query_comp)
colnames = [desc[0] for desc in cursor.description]
df_imp = cursor.fetchall()
df_comp = pd.DataFrame(data=df_imp, columns=colnames)

cursor.execute(test_query_totalkg)
colnames = [desc[0] for desc in cursor.description]
df_imp = cursor.fetchall()
df_totalkg = pd.DataFrame(data=df_imp, columns=colnames)

cursor.execute(test_query_project_df)
colnames = [desc[0] for desc in cursor.description]
df_imp = cursor.fetchall()
df_project = pd.DataFrame(data=df_imp, columns=colnames)

In [4]:
# Competitor dataframe
df_comp.head(10)

Unnamed: 0,competitor_id,competitor_name,sex,country
0,0,Dakoda Plumridge,F,Australia
1,1,Helene Faccio,F,Australia
2,2,Chris Lepp,M,Australia
3,3,Emad Nayef,M,Australia
4,4,Luke Faulkner,M,Australia
5,5,Warrick Eccles,M,Australia
6,6,Ace Kirkwood,M,Australia
7,7,Billa Hamilton,F,Australia
8,8,Yvonne Wagstaff,F,Australia
9,9,Nina Markopoulos,F,Australia


In [5]:
# Totalkg dataframe
df_totalkg.head(10)

Unnamed: 0,competitor_id,bodyweightkg,best3benchkg,best3squatkg,best3deadliftkg,totalkg,wilks_score
0,0,79,105,182,205,492,455
1,1,55,70,138,182,390,464
2,2,82,160,272,268,700,469
3,3,88,162,272,280,715,460
4,4,99,170,290,295,755,462
5,5,137,200,370,330,900,504
6,6,74,138,235,240,612,438
7,7,67,100,175,185,460,472
8,8,50,40,60,102,202,258
9,9,61,70,115,130,315,347


In [6]:
# Project dataframe containing all columns used to feed the model
df_project.head(10)

Unnamed: 0,competitor_id,sex,age,bodyweightkg,best3squatkg,best3benchkg,best3deadliftkg,totalkg,place,meet_date,wilks_score
0,0,F,27,79,182,105,205,492,1,2018-10-27,455
1,1,F,50,55,138,70,182,390,1,2018-10-27,464
2,2,M,26,82,272,160,268,700,1,2018-10-27,469
3,3,M,41,88,272,162,280,715,1,2018-10-27,460
4,4,M,32,99,290,170,295,755,1,2018-10-27,462
5,5,M,26,137,370,200,330,900,1,2018-10-27,504
6,6,M,35,74,235,138,240,612,1,2018-08-25,438
7,7,F,31,67,175,100,185,460,1,2018-08-25,472
8,8,F,46,50,60,40,102,202,1,2013-04-28,258
9,9,F,48,61,115,70,130,315,1,2013-04-28,347


In [7]:
columns = ['Name', 'Sex', 'Event', 'Equipment', 'Age', 'AgeClass', 'Division',
       'BodyweightKg', 'WeightClassKg', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg',
       'Squat4Kg', 'Best3SquatKg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg',
       'Bench4Kg', 'Best3BenchKg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg',
       'Deadlift4Kg', 'Best3DeadliftKg', 'TotalKg', 'Place', 'Wilks',
       'McCulloch', 'Glossbrenner', 'IPFPoints', 'Tested', 'Country',
       'Federation', 'Date', 'MeetCountry', 'MeetState', 'MeetName']

target = ['TotalKg']

In [8]:
# Check the datatypes of the columns
df_project.dtypes

competitor_id      object
sex                object
age                object
bodyweightkg        int64
best3squatkg        int64
best3benchkg        int64
best3deadliftkg     int64
totalkg             int64
place              object
meet_date          object
wilks_score         int64
dtype: object

Some of the columns like 'place' and 'meet_date' will need to be formatted so that they can be used in the model. We can drop columns we will not be using to fit to the model. For our analysis, we are using only Sex, Age, Best3BenchKg, Best3SquatKg, Best3DeadliftKg, and meet_date as the features and TotalKg as the target.  

Some of the data has been preselected by filtering the 'Place' column for values only equal to '1', 'Event' column only for values equal to 'SBD', and the 'Age' column only for values greater than or equal to '18'. This allows for us to keep relevant samples for competitors over the age of 18 that placed 1st, with entries for the squat, bench, and deadlift. The 'Sex' column will be converted to a category to represent '0' for females and '1' for males. It is then converted to an integer dtype. "meet_date" will also be reformatted in order to make the column uniform and the "ID" column will replace "competitor_id".

In [9]:
df_project["sex"] = df_project["sex"].astype('category')
df_project["sex"] = df_project["sex"].cat.codes
df_project['place'] = df_project['place'].astype('int')
df_project['meet_date'] = pd.to_datetime(df_project['meet_date'])

df_project.insert(loc=0, column='ID', value=np.arange(len(df_project)))
df_project.dtypes

ID                          int32
competitor_id              object
sex                          int8
age                        object
bodyweightkg                int64
best3squatkg                int64
best3benchkg                int64
best3deadliftkg             int64
totalkg                     int64
place                       int32
meet_date          datetime64[ns]
wilks_score                 int64
dtype: object

In [10]:
# Drop competitor_id and wilks_score from dataframe
df_project= df_project.drop(['competitor_id','wilks_score'], axis=1)
df_project.head(10)

Unnamed: 0,ID,sex,age,bodyweightkg,best3squatkg,best3benchkg,best3deadliftkg,totalkg,place,meet_date
0,0,0,27,79,182,105,205,492,1,2018-10-27
1,1,0,50,55,138,70,182,390,1,2018-10-27
2,2,1,26,82,272,160,268,700,1,2018-10-27
3,3,1,41,88,272,162,280,715,1,2018-10-27
4,4,1,32,99,290,170,295,755,1,2018-10-27
5,5,1,26,137,370,200,330,900,1,2018-10-27
6,6,1,35,74,235,138,240,612,1,2018-08-25
7,7,0,31,67,175,100,185,460,1,2018-08-25
8,8,0,46,50,60,40,102,202,1,2013-04-28
9,9,0,48,61,115,70,130,315,1,2013-04-28


Before, the features (X) and the target (y) were defined. 

In [11]:
# Determine y and x columns
X = pd.get_dummies(df_project, columns=['sex','age','best3squatkg','best3benchkg','best3deadliftkg','bodyweightkg','meet_date']).drop('totalkg', axis=1)
y = df_project['totalkg']

In [12]:
X.describe()

Unnamed: 0,ID,place,sex_0,sex_1,age_10,age_10.5,age_11,age_11.5,age_12,age_12.5,...,meet_date_2019-03-24 00:00:00,meet_date_2019-03-28 00:00:00,meet_date_2019-03-30 00:00:00,meet_date_2019-03-31 00:00:00,meet_date_2019-04-05 00:00:00,meet_date_2019-04-06 00:00:00,meet_date_2019-04-07 00:00:00,meet_date_2019-04-12 00:00:00,meet_date_2019-04-13 00:00:00,meet_date_2019-04-14 00:00:00
count,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,...,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0,36587.0
mean,18293.0,1.0,0.340804,0.659196,0.000246,8.2e-05,0.000219,0.000164,0.000601,0.000683,...,0.000273,0.000301,0.001312,0.000219,0.000164,0.002159,2.7e-05,0.000902,0.000765,2.7e-05
std,10561.901486,0.0,0.473986,0.473986,0.015682,0.009055,0.014786,0.012805,0.024515,0.026131,...,0.01653,0.017337,0.036197,0.014786,0.012805,0.046418,0.005228,0.03002,0.027654,0.005228
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9146.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,18293.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,27439.5,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,36586.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
y.shape

(36587,)

In [14]:
# Check the balance of our target values
y.value_counts()

500     191
370     187
590     187
700     186
350     185
       ... 
1132      1
108       1
1205      1
993       1
1136      1
Name: totalkg, Length: 950, dtype: int64

In [15]:
# Create training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Because the dataset we are using is continuous, we are using linear regression as our supervised learning model. If we were looking for categorical and discreet results, we would use logistic regression instead. 

In [16]:
# Instantiate the model
model = LinearRegression()
model

LinearRegression()

In [17]:
# Train the dataset x_train and y_train
model.fit(X_train, y_train)

LinearRegression()

In [18]:
# Print the coefficients
print(model.coef_)

[ 2.30053999e-07  1.72284473e-07 -4.95988625e-03 ... -5.36321327e-02
  2.33723350e-01 -4.07882789e-01]


In [19]:
# Validate the model by Predicting the data 
y_predicted = model.predict(X_test)

# Load the predicted outcome into a DataFrame with the y_test data)
predicted_outcome = pd.DataFrame({"Prediction": y_predicted, "Actual": y_test}).reset_index(drop = True)
predicted_outcome.head()

Unnamed: 0,Prediction,Actual
0,859.972432,860
1,389.992561,390
2,738.736026,739
3,334.70212,335
4,462.499244,462


Initially, we attempted to use the balanced_accuracy_score function to produce the accuracy score for our model. We eventually realized this was a mistake because this is to be used as a classification metric and not for a regression problem. Thus, we decided to use the R-Squared (R2) score instead.

In [21]:
# Predict the accuracy using r2 score
from sklearn.metrics import r2_score
Accuracy=r2_score(y_test,y_predicted)*100
print("Accuracy of the model is %.2f" %Accuracy)

Accuracy of the model is 99.55
