In [1]:
import os
import os.path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from datetime import datetime
import matplotlib.dates as mdates
import plotly.express as px
import plotly.graph_objects as go
sns.set()

In [2]:
datadir = "data/tidy_data"
data = os.path.join(datadir, "new_jump_data.csv")
df_jump = pd.read_csv(data)

In [3]:
print(df_jump.columns.tolist())

['athlete_code', 'TestId', 'Date', 'Time', 'Gender', 'Position', 'Type', 'Excluded', 'Tags', 'System Weight', 'Jump Height', 'Jump Momentum', 'Countermovement Depth', 'Braking RFD', 'Stiffness', 'Force at Min Displacement', 'Relative Force at Min Displacement', 'Avg. Braking Force', 'Avg. Relative Braking Force', 'Peak Braking Force', 'Peak Relative Braking Force', 'Avg. Propulsive Force', 'Avg. Relative Propulsive Force', 'Peak Propulsive Force', 'Peak Relative Propulsive Force', 'Unweighting Phase', 'Unweighting Phase %', 'Braking Phase', 'Braking Phase %', 'Propulsive Phase', 'Propulsive Phase %', 'Flight Time', 'Time To Takeoff', 'Braking Net Impulse', 'Propulsive Net Impulse', 'Positive Impulse', 'Positive Net Impulse', 'Impulse Ratio', 'Avg. Braking Velocity', 'Peak Braking Velocity', 'Avg. Propulsive Velocity', 'Takeoff Velocity', 'Peak Velocity', 'Avg. Braking Power', 'Avg. Relative Braking Power', 'Peak Braking Power', 'Peak Relative Braking Power', 'Avg. Propulsive Power', 'A

In [4]:
df_jump.head(5)

Unnamed: 0,athlete_code,TestId,Date,Time,Gender,Position,Type,Excluded,Tags,System Weight,...,team,problem_date,return_date,reported_date,body_part,affected_area,side_of_body,injury_type,specific_part,current_status
0,BASEB1,LBrzNxXMkCckqH6I37FC,04-16-2024,15:20:26,Men,,Countermovement Jump,,,730.371,...,Baseball,03-09-2024,03-09-2024,03-13-2024,Knee,Lower Extremity,Left,Fat Pad Syndrome,,Returned To Play As Tolerated
1,BASEB1,obdaLJJpA0veWNEbkgXZ,04-19-2024,14:54:35,Men,,Countermovement Jump,,,738.274,...,Baseball,03-09-2024,03-09-2024,03-13-2024,Knee,Lower Extremity,Left,Fat Pad Syndrome,,Returned To Play As Tolerated
2,BASEB1,xs4JaGoWbT6u3GfMrbMn,04-04-2024,15:21:41,Men,,Countermovement Jump,,,759.976,...,Baseball,03-09-2024,03-09-2024,03-13-2024,Knee,Lower Extremity,Left,Fat Pad Syndrome,,Returned To Play As Tolerated
3,BASEB1,2NXcR02pojld48VtJVsv,03-28-2024,15:20:09,Men,,Countermovement Jump,,,743.065,...,Baseball,03-09-2024,03-09-2024,03-13-2024,Knee,Lower Extremity,Left,Fat Pad Syndrome,,Returned To Play As Tolerated
4,BASEB1,idgUsdGMwwCRgpF1UlEC,04-19-2024,14:53:51,Men,,Countermovement Jump,,,738.847,...,Baseball,03-09-2024,03-09-2024,03-13-2024,Knee,Lower Extremity,Left,Fat Pad Syndrome,,Returned To Play As Tolerated


In [5]:
def date_coversion(date):
    date_object = datetime.strptime(date, '%m-%d-%Y')
    iso_format_date = date_object.strftime('%Y-%m-%d')
    return iso_format_date

# Changing date formatting
#df_jump.drop(columns = ["DateOfReturn", "ReportedDate"], inplace = True)
#df_jump["problem_date"] = df_jump["problem_date"].str.replace("/", "-").apply(date_coversion)
#df_jump["problem_date"] = pd.to_datetime(df_jump["problem_date"])

# Filling NA's
df_jump["side_of_body"] = df_jump["side_of_body"].fillna("Does Not Apply")

In [6]:
t_test_data = df_jump[[
    'TestId', 'Date', 'Gender', 'team', 'Peak Braking Velocity', 'Peak Propulsive Power', 'Avg. Braking Power',
    'Peak Braking Force', 'Avg. Braking Velocity'
]]

In [7]:
t_test_data['team'].unique()

array(['Baseball', nan, 'Football', 'Field Hockey', "Men's Golf",
       "Club Sports , Men's Golf", "Women's Golf", 'Mens Basketball',
       "Men's Lacrosse", "Men's Soccer", "Men's Swimming & Diving",
       "Men's Squash", "Men's Cross Country",
       "Men's Cross Country, Men's Track & Field", "Men's Track & Field",
       "Men's Tennis", 'Softball', 'Volleyball', "Women's Basketball",
       "Women's Fencing", "Women's Lacrosse", "Women's Swimming & Diving",
       'Womens Soccer', "Women's Squash", "Women's Cross Country",
       "Women's Track & Field",
       "Women's Cross Country, Women's Track & Field",
       "Club Sports , Women's Track & Field", "Women's Tennis"],
      dtype=object)

In [8]:
t_test_data['gender_binary'] = t_test_data['Gender'].apply(
    lambda x: 1 if x == 'Women' else (0 if x == 'Men' else np.nan)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_test_data['gender_binary'] = t_test_data['Gender'].apply(


In [9]:
t_test_data['sport_name'] = t_test_data['team'].str.replace(
    r"\b(Men's|Mens|Women's|Womens|Club Sports\s*,\s*)\b", "", regex=True
).str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_test_data['sport_name'] = t_test_data['team'].str.replace(


In [10]:
t_test_data['sport_name'].unique()

array(['Baseball', nan, 'Football', 'Field Hockey', 'Golf', 'Basketball',
       'Lacrosse', 'Soccer', 'Swimming & Diving', 'Squash',
       'Cross Country', 'Cross Country,  Track & Field', 'Track & Field',
       'Tennis', 'Softball', 'Volleyball', 'Fencing'], dtype=object)

In [11]:
# Define a list of field and court sports
field_sports = [
    'Baseball', 'Football', 'Golf', 'Lacrosse',
    'Soccer', 'Field Hockey', 'Softball'
]

court_sports = [
    'Basketball', 'Tennis', 'Squash', 'Volleyball',
    'Fencing' 
]

# Create the binary variable
t_test_data['field_court_binary'] = t_test_data['sport_name'].apply(
    lambda x: 1 if x in field_sports else (0 if x in court_sports else np.nan)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_test_data['field_court_binary'] = t_test_data['sport_name'].apply(


In [12]:
t_test_data['sport_name'].count()

5043

In [13]:
t_test_data = t_test_data[t_test_data['field_court_binary'].notna()]

In [14]:
t_test_data = t_test_data[t_test_data['gender_binary'].notna()]

In [15]:
t_test_data[t_test_data['field_court_binary'] == 1].count()

TestId                   1817
Date                     1817
Gender                   1817
team                     1817
Peak Braking Velocity    1817
Peak Propulsive Power    1817
Avg. Braking Power       1817
Peak Braking Force       1817
Avg. Braking Velocity    1817
gender_binary            1817
sport_name               1817
field_court_binary       1817
dtype: int64

In [16]:
t_test_data[t_test_data['field_court_binary'] == 0].count()

TestId                   1971
Date                     1971
Gender                   1971
team                     1971
Peak Braking Velocity    1971
Peak Propulsive Power    1971
Avg. Braking Power       1971
Peak Braking Force       1971
Avg. Braking Velocity    1971
gender_binary            1971
sport_name               1971
field_court_binary       1971
dtype: int64

In [17]:
t_test_data[t_test_data['gender_binary'] == 0].count()

TestId                   1932
Date                     1932
Gender                   1932
team                     1932
Peak Braking Velocity    1932
Peak Propulsive Power    1932
Avg. Braking Power       1932
Peak Braking Force       1932
Avg. Braking Velocity    1932
gender_binary            1932
sport_name               1932
field_court_binary       1932
dtype: int64

In [18]:
t_test_data[t_test_data['gender_binary'] == 1].count()

TestId                   1856
Date                     1856
Gender                   1856
team                     1856
Peak Braking Velocity    1856
Peak Propulsive Power    1856
Avg. Braking Power       1856
Peak Braking Force       1856
Avg. Braking Velocity    1856
gender_binary            1856
sport_name               1856
field_court_binary       1856
dtype: int64

In [None]:
#t_test_data.to_csv('new_ttest_data.csv', index=False) 