In [248]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [249]:
# Raw results export; the path is resolved relative to the notebook's working directory.
RAW_DATA_CSV = 'Calisthenics Data - Sheet1.csv'
df = pd.read_csv(RAW_DATA_CSV)

In [250]:
# Preview the first rows of the raw data to check the columns that were loaded.
df.head()

Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise
0,Male,130,2019,2 MINUTES,AP,FF,HR,2
1,Male,119,2017,2 MINUTES,AP,FF,HR,2
2,Male,111,2017,2 MINUTES,AP,FF,HR,2
3,Male,107,2018,2 MINUTES,AP,FF,HR,2
4,Male,105,2019,2 MINUTES,AP,FF,HR,2


In [251]:
# List the distinct exercise codes present in the raw data.
print(df['Exercise'].unique())

[2 1 3 4 5]


# Cleaning up the dataset

In [252]:
# A row is discarded when ANY of the following holds:
#   * it is an 'UNTIMED' entry for exercise 1, 2, 4 or 5
#     (exercise 3 is deliberately absent from the list, so its 'UNTIMED' rows stay)
#   * it records 0 reps
#   * it is a half-rep entry ('HR')
#   * it is a freeform entry ('FF')
# Missing values never match any of these tests, so rows with NaN in a
# checked column are kept.
untimed_excluded = df['Exercise'].isin([1, 2, 4, 5]) & df['Time Format'].eq('UNTIMED')
zero_reps = df['Reps'].eq(0)
half_reps = df['HR/FR'].eq('HR')
freeform = df['FF/PR/PB'].eq('FF')

df = df[~(untimed_excluded | zero_reps | half_reps | freeform)]

In [253]:
# Preview the first rows again to confirm the cleaning filters were applied.
df.head()

Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise
70,Male,71,2023,2 MINUTES,JC,PB,FR,2
71,Male,70,2020,2 MINUTES,JC,PB,FR,2
72,Male,70,2021,2 MINUTES,JC,PB,FR,2
73,Male,70,2022,2 MINUTES,JC,PB,FR,2
74,Male,70,2023,2 MINUTES,JC,PB,FR,2


In [254]:
# Summary statistics of the numeric columns, reported separately for each
# exercise code found in the cleaned data.
unique_exercises = df['Exercise'].unique()
for exercise_id in unique_exercises:
    rows_for_exercise = df.loc[df['Exercise'].eq(exercise_id)]
    # One call emits: header line, the describe() table, then two blank lines.
    print(f"Exercise: {exercise_id}", rows_for_exercise.describe(), '\n', sep='\n')

Exercise: 2
             Reps         Year  Exercise
count  111.000000   111.000000     111.0
mean    47.288288  2021.630631       2.0
std     12.643638     1.043838       0.0
min      2.000000  2020.000000       2.0
25%     38.000000  2021.000000       2.0
50%     46.000000  2022.000000       2.0
75%     56.500000  2022.500000       2.0
max     71.000000  2023.000000       2.0


Exercise: 1
             Reps         Year  Exercise
count  165.000000   165.000000     165.0
mean    38.666667  2020.842424       1.0
std     14.561197     1.501834       0.0
min      1.000000  2019.000000       1.0
25%     28.000000  2019.000000       1.0
50%     38.000000  2021.000000       1.0
75%     47.000000  2022.000000       1.0
max     73.000000  2023.000000       1.0


Exercise: 3
             Reps         Year  Exercise
count  119.000000   119.000000     119.0
mean    18.016807  2021.630252       3.0
std      6.803518     1.080376       0.0
min      1.000000  2020.000000       3.0
25%     13.500000

In [255]:
# One scaler object is shared by every group; each call below refits it,
# so it is only a per-group helper here, not a reusable fitted model.
scaler = StandardScaler()


def _standardize_group(reps):
    """Return the Z-scores of one exercise's 'Reps' values as a flat 1-D array."""
    # StandardScaler expects a 2-D (n_samples, n_features) input.
    as_column = reps.to_numpy().reshape(-1, 1)
    return scaler.fit_transform(as_column).ravel()


# Standardize 'Reps' within each exercise so rows are comparable across exercises.
reps_by_exercise = df.groupby('Exercise')['Reps']
df['Reps_Standardized'] = reps_by_exercise.transform(_standardize_group)

In [256]:
# Spot-check the scaler output by inspecting the rows for exercise 1
exercise_1 = df.loc[df['Exercise'].eq(1)]
exercise_1

Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise,Reps_Standardized
290,Male,73,2022,2MD,JC,PB,,1,2.365042
291,Male,73,2023,2MD,JC,PB,,1,2.365042
294,Male,72,2019,2MD,JC,PB,,1,2.296158
295,Male,72,2022,2MD,JC,PB,,1,2.296158
296,Male,71,2022,2MD,JC,PB,,1,2.227273
...,...,...,...,...,...,...,...,...,...
553,Female,11,2019,2MD,JC,PB,,1,-1.905811
557,Male,7,2019,2MD,JC,PB,,1,-2.181350
560,Female,3,2019,2MD,JC,PB,,1,-2.456889
561,Female,1,2019,2MD,JC,PB,,1,-2.594658


# Why I chose this method
* Normalize the data to account for values that might be on different scales
* Calculates a score for each recorded rep count in each exercise, based on its Z-score
* Able to compare scores from different exercises
* This calculates Z-scores relative to the performance for a specific exercise and not across all exercises
* Assigns more 'value' to reps that are above the average for a specific exercise

In [257]:
import numpy as np

# Function to calculate score
def calculate_score(data, percentile=99, average_points=500, exceptional_points=1000):
    """
    Convert standardized rep counts (Z-scores) into points on a fixed scale.

    A standardized value of 0 (the average) maps to ``average_points`` and the
    value found at the ``percentile``-th percentile of
    ``data['Reps_Standardized']`` maps to ``exceptional_points``. All other
    values are placed linearly on that line; scores are not clamped, so they
    can fall below 0 or exceed ``exceptional_points``.

    The reference percentile is computed over all rows of ``data`` together
    (not per exercise). Missing (NaN) standardized values are ignored when
    computing it, and those rows receive a NaN score.

    The 'Score' column is added to ``data`` in place and the same object is
    returned.

    :param data: Pandas DataFrame with 'Reps_Standardized' column
    :param percentile: Percentile (0-100) of 'Reps_Standardized' that defines
        an exceptional performance. Defaults to 99.
    :param average_points: Score given to a standardized value of 0. Defaults to 500.
    :param exceptional_points: Score given to the reference percentile value.
        Defaults to 1000.
    :return: Given DataFrame with scores calculated in an additional 'Score' column.
    :raises ValueError: If the reference percentile value is not positive, in
        which case the scale would be undefined or inverted.
    """
    standardized = data['Reps_Standardized'].astype(float)
    observed = standardized.dropna()

    # Nothing to rank against: every score is undefined.
    if observed.empty:
        data['Score'] = np.full(len(data), np.nan)
        return data

    # Reference Z-score that will be mapped to `exceptional_points`.
    # NaNs were dropped above so one missing value cannot poison the result.
    exceptional_score = np.percentile(observed.to_numpy(), percentile)

    if exceptional_score <= 0:
        raise ValueError(
            f"The {percentile}th percentile of 'Reps_Standardized' is "
            f"{exceptional_score}; it must be positive to define the score scale."
        )

    # Linear map: 0 -> average_points, exceptional_score -> exceptional_points
    points_span = exceptional_points - average_points
    data['Score'] = ((standardized / exceptional_score) * points_span) + average_points

    return data

# Calculate score dataframe.
# calculate_score adds the 'Score' column to the frame it receives and returns
# that same object, so scores_df and df refer to the same DataFrame from here on.
scores_df = calculate_score(df)

# Display the scored data frame (the bare expression is rendered as the cell output)
scores_df

Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise,Reps_Standardized,Score
70,Male,71,2023,2 MINUTES,JC,PB,FR,2,1.883892,906.107249
71,Male,70,2020,2 MINUTES,JC,PB,FR,2,1.804442,888.980385
72,Male,70,2021,2 MINUTES,JC,PB,FR,2,1.804442,888.980385
73,Male,70,2022,2 MINUTES,JC,PB,FR,2,1.804442,888.980385
74,Male,70,2023,2 MINUTES,JC,PB,FR,2,1.804442,888.980385
...,...,...,...,...,...,...,...,...,...,...
1472,Female,67,2015,2 MINUTES,,,FR,5,-1.581917,158.988956
1473,Male,67,2021,2 MINUTES,,,FR,5,-1.581917,158.988956
1474,Male,67,2023,2 MINUTES,,,FR,5,-1.581917,158.988956
1486,Male,50,2016,2 MINUTES,,,FR,5,-2.630497,-67.051636


Here, `Reps_Standardized` represents the standardized repetitions for each row in the data. The `exceptional_score` is calculated as the 99th percentile of the 'Reps_Standardized' values, taken over all rows (every exercise pooled together). This means that about 99% of the 'Reps_Standardized' values are at or below the `exceptional_score`.

The formula divides each 'Reps_Standardized' value by the `exceptional_score`, multiplies it by 500 and then adds 500 to it. Because a standardized value of 0 corresponds to the mean for an exercise, an average performance scores 500, and a performance exactly at the `exceptional_score` scores 1000.

So, the score is calculated relative to the `exceptional_score`. If a value in 'Reps_Standardized' equals the `exceptional_score`, it gets a score of 1000. Lower 'Reps_Standardized' values get scores below 1000 and higher values get scores above 1000. Scores are not clamped, so a performance far below the mean can receive a negative score (for example, -67.05 in the last row of the table above).

In [258]:
# Points earned per repetition, row by row
scores_df['Score_Per_Rep'] = scores_df['Score'].div(scores_df['Reps'])

# Weight of an exercise = mean score-per-rep over that exercise's rows
score_per_rep_by_exercise = scores_df.groupby('Exercise')['Score_Per_Rep']
weights = score_per_rep_by_exercise.mean()
weights

Exercise
1    11.646832
2     8.827387
3    26.319109
4     6.941001
5     5.083621
Name: Score_Per_Rep, dtype: float64

In [259]:
# Predicting new scores based on the weights
new_data = df.copy(deep=True)  # just a place-holder for now
# New data would be the data that is being predicted

# Fail loudly when a row refers to an exercise that has no weight, exactly as
# a direct weights[...] lookup would, instead of silently predicting NaN.
has_no_weight = ~new_data['Exercise'].isin(weights.index)
if has_no_weight.any():
    missing_codes = new_data.loc[has_no_weight, 'Exercise'].unique().tolist()
    raise KeyError(f"No weight available for exercise(s): {missing_codes}")

# Vectorized prediction: look up each row's per-rep weight by exercise code,
# then scale it by the number of reps (replaces a row-by-row apply).
weight_per_row = new_data['Exercise'].map(weights)
new_data['predicted_score'] = new_data['Reps'] * weight_per_row