In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('Calisthenics Data - Sheet1.csv')

In [3]:
df.head()

Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise
0,Male,130,2019,2 MINUTES,AP,FF,HR,2
1,Male,119,2017,2 MINUTES,AP,FF,HR,2
2,Male,111,2017,2 MINUTES,AP,FF,HR,2
3,Male,107,2018,2 MINUTES,AP,FF,HR,2
4,Male,105,2019,2 MINUTES,AP,FF,HR,2


In [4]:
print(df['Exercise'].unique())

[2 1 3 4 5]


# Cleaning up the dataset

In [5]:
# Conditions to remove specific exercises which are 'UNTIMED'
condition_1 = (df['Exercise'] == 1) & (df['Time Format'] == 'UNTIMED')  # remove 'UNTIMED' rows for exercise 1
condition_2 = (df['Exercise'] == 2) & (df['Time Format'] == 'UNTIMED')  # remove 'UNTIMED' rows for exercise 2
condition_4 = (df['Exercise'] == 4) & (df['Time Format'] == 'UNTIMED')  # remove 'UNTIMED' rows for exercise 4
condition_5 = (df['Exercise'] == 5) & (df['Time Format'] == 'UNTIMED')  # remove 'UNTIMED' rows for exercise 5

# Removing 'UNTIMED' rows for specific exercises
df = df[~(condition_1 | condition_2 | condition_4 | condition_5)]

# Drops any rows where 'Reps' are 0
df = df[df['Reps'] != 0]  

# Removing any half-reps
df = df[df['HR/FR'] != 'HR']  

# Remove freeform
df = df[df['FF/PR/PB'] != 'FF']

In [6]:
df.head()

Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise
70,Male,71,2023,2 MINUTES,JC,PB,FR,2
71,Male,70,2020,2 MINUTES,JC,PB,FR,2
72,Male,70,2021,2 MINUTES,JC,PB,FR,2
73,Male,70,2022,2 MINUTES,JC,PB,FR,2
74,Male,70,2023,2 MINUTES,JC,PB,FR,2


In [7]:
unique_exercises = df['Exercise'].unique()
for exercise in unique_exercises:
    exercise_df = df[df['Exercise'] == exercise]
    print(f"Exercise: {exercise}")
    print(exercise_df.describe())
    print('\n')

Exercise: 2
             Reps         Year  Exercise
count  111.000000   111.000000     111.0
mean    47.288288  2021.630631       2.0
std     12.643638     1.043838       0.0
min      2.000000  2020.000000       2.0
25%     38.000000  2021.000000       2.0
50%     46.000000  2022.000000       2.0
75%     56.500000  2022.500000       2.0
max     71.000000  2023.000000       2.0


Exercise: 1
             Reps         Year  Exercise
count  165.000000   165.000000     165.0
mean    38.666667  2020.842424       1.0
std     14.561197     1.501834       0.0
min      1.000000  2019.000000       1.0
25%     28.000000  2019.000000       1.0
50%     38.000000  2021.000000       1.0
75%     47.000000  2022.000000       1.0
max     73.000000  2023.000000       1.0


Exercise: 3
             Reps         Year  Exercise
count  119.000000   119.000000     119.0
mean    18.016807  2021.630252       3.0
std      6.803518     1.080376       0.0
min      1.000000  2020.000000       3.0
25%     13.500000

In [8]:
# Standardize dataframe so reps are centered at 0
# Instantiate the scaler
scaler = StandardScaler()  

# Fit and add a standardized reps column
df['Reps_Standardized'] = scaler.fit_transform(df[['Reps']])  
df.head()

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise,Reps_Standardized
70,Male,71,2023,2 MINUTES,JC,PB,FR,2,0.642719
71,Male,70,2020,2 MINUTES,JC,PB,FR,2,0.609716
72,Male,70,2021,2 MINUTES,JC,PB,FR,2,0.609716
73,Male,70,2022,2 MINUTES,JC,PB,FR,2,0.609716
74,Male,70,2023,2 MINUTES,JC,PB,FR,2,0.609716


In [9]:
def calculate_scores(data):
    """
    Function to calculate scores and score value of each rep for different exercises.
    It uses the Z-Scores to determine the points.

    Args:
    df (pandas.DataFrame): Dataframe containing the data. 

    Returns:
    df (pandas.DataFrame): Dataframe with additional columns 'score' and 'score_per_rep'. 
    """
    # Create a copy of the input DataFrame to avoid changing original data
    df = data.copy(deep=True)

    # Identify unique exercises in the DataFrame
    unique_exercises = df['Exercise'].unique()

    # Process each exercise one at a time
    for exercise in unique_exercises:
        # Create a new DataFrame that only contains rows for the current exercise
        df_exercise = df[df['Exercise'] == exercise]

        # Calculate 'score' using 'Reps_Standardized'
        # 500 represents an average score and 1000 represents an exceptional score
        # Assume an exceptional score corresponds to a Z-Score of 3 (3 standard deviations above the mean)
        df_exercise['score'] = ( ( (df_exercise['Reps_Standardized'] / 3) + 1) / 2 ) * 500 + 500

        # Calculate 'score_per_rep' by dividing 'score' by the original 'Reps'
        df_exercise['score_per_rep'] = df_exercise['score'] / df_exercise['Reps']

        # Update the 'score' and 'score_per_rep' values in the original DataFrame for the current exercise
        df.loc[df['Exercise'] == exercise, 'score'] = df_exercise['score']
        df.loc[df['Exercise'] == exercise, 'score_per_rep'] = df_exercise['score_per_rep']

    # Return the updated DataFrame with the new 'score' and 'score_per_rep' columns
    return df

A Z-score represents how many standard deviations a given value is from the mean. For instance, a Z-score of 1.0 represents a value that is one standard deviation from the mean. Negative Z-scores are below the mean while positive Z-scores are above the mean.
<br>
In the expression (df_exercise['Reps_Standardized'] / 3), 'Reps_Standardized' is presumably a z-score and the division by 3 scales it to the range [-1, 1] under the assumption that a 'Reps_Standardized' score of 3 is extraordinary.
<br>
The + 1 afterwards shifts the range to [0, 2].
The division by 2 at the end scales the range down to [0, 1].
By performing this operation, the function is ensuring that the 'score' values will fall within the range [500, 1000] as desired.

In [10]:
# New dataframe with scores per rep
score_df = calculate_scores(df)
score_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exercise['score'] = ( ( (df_exercise['Reps_Standardized'] / 3) + 1) / 2 ) * 500 + 500
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exercise['score_per_rep'] = df_exercise['score'] / df_exercise['Reps']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exercise['score'] = ( ( (df_exercise['R

Unnamed: 0,Gender,Reps,Year,Time Format,AP/JC,FF/PR/PB,HR/FR,Exercise,Reps_Standardized,score,score_per_rep
70,Male,71,2023,2 MINUTES,JC,PB,FR,2,0.642719,803.559885,11.317745
71,Male,70,2020,2 MINUTES,JC,PB,FR,2,0.609716,800.809655,11.440138
72,Male,70,2021,2 MINUTES,JC,PB,FR,2,0.609716,800.809655,11.440138
73,Male,70,2022,2 MINUTES,JC,PB,FR,2,0.609716,800.809655,11.440138
74,Male,70,2023,2 MINUTES,JC,PB,FR,2,0.609716,800.809655,11.440138
...,...,...,...,...,...,...,...,...,...,...,...
1472,Female,67,2015,2 MINUTES,,,FR,5,0.510708,792.558966,11.829238
1473,Male,67,2021,2 MINUTES,,,FR,5,0.510708,792.558966,11.829238
1474,Male,67,2023,2 MINUTES,,,FR,5,0.510708,792.558966,11.829238
1486,Male,50,2016,2 MINUTES,,,FR,5,-0.050339,745.805060,14.916101


In [11]:
# Take the mean score per rep to get a weight
weights = score_df.groupby('Exercise')['score_per_rep'].mean()
weights

Exercise
1    29.013399
2    19.041024
3    48.407162
4    12.753596
5     9.576302
Name: score_per_rep, dtype: float64

In [12]:
# Predicting new scores based on the weights
new_data = df.copy(deep=True)  # just a place-holder for now
# New data would be the data that is being predicted
new_data['predicted_score'] = new_data.apply(lambda row: row['Reps'] * weights[row['Exercise']], axis=1)

# Why I chose this method
* Normalize the data to account for values that might be on different scales
* Calculates a score for each rep in each exercise, based on Z-score
* Able to compare scores from different exercises
* This calculates Z-scores relative to the performance for a specific exercise and not across all exercises
* Assigns more 'value' to reps that are above the average for a specific exercise