# Injury Analysis:

- Identify the unique injuries present in the dataset, both for the self-assessment and for the diagnosis of the professional medical reviewer.
- Provide a description of the injuries.
- Analyse the relationship between the self-assessment and the diagnosis of the medical reviewer.
- Decide whether the self-assessment adds information.

In [1]:
import pandas as pd
import os
from core.constants import RICKD_RESULTS_FOLDER, RICKD_PROCESSED_DATA_FOLDER, RICKD_SESSION_DATA_FULL_FILE

# Load the full session table from the CSV referenced by RICKD_SESSION_DATA_FULL_FILE.
run_data_full = pd.read_csv(RICKD_SESSION_DATA_FULL_FILE)

# Keep only the identifiers and the three raw injury columns for this analysis.
injury_columns = ['id', 'sub_id', 'InjDefn', 'SpecInjury', 'SpecInjury2']
run_injury_data = run_data_full[injury_columns]
run_injury_data

Unnamed: 0,id,sub_id,InjDefn,SpecInjury,SpecInjury2
0,100433_20101005t132240,100433,training volume/intensity affected,pain,
1,100434_20101117t132240,100434,training volume/intensity affected,disc degeneration,
2,100537_20120703t102550,100537,2 workouts missed in a row,other,
3,100560_20120717t103748,100560,no injury,,
4,101481_20120717t105021,101481,no injury,,
...,...,...,...,...,...
1827,200986_20150312t143944,200986,no injury,,
1828,200987_20150312t160840,200987,continuing to train in pain,pelvic malalignment,itb syndrome
1829,201100_20150409t155915,201100,training volume/intensity affected,muscle strain,ligament sprain
1830,201101_20150413t143152,201101,no injury,,


In [2]:
# Rows where SpecInjury is missing versus rows where it is the literal string 'no injury'.
spec_injury_is_missing = run_injury_data['SpecInjury'].isna()
spec_injury_is_no_injury = run_injury_data['SpecInjury'] == 'no injury'

# .count() tallies the non-null sub_id values among the selected rows.
print(f"Num of NaN in SpecInjury: {run_injury_data.loc[spec_injury_is_missing, 'sub_id'].count()}")
print(f"Num of no injury in SpecInjury: {run_injury_data.loc[spec_injury_is_no_injury, 'sub_id'].count()}")

Num of NaN in SpecInjury: 588
Num of no injury in SpecInjury: 70


## Clean Injury type:
Clean the injury types by introducing codes that are easier to handle. Remove duplicates and add descriptions.

In [3]:
# Stack both SpecInjury columns into one Series, then keep each non-null label once.
all_spec_injuries = pd.concat(
    [run_injury_data[column] for column in ('SpecInjury', 'SpecInjury2')]
)
unique_combined_injuries = all_spec_injuries.dropna().drop_duplicates()

# Print the labels alphabetically as a markdown-style bullet list.
print("### Distinct Injury Types")
sorted_injury_labels = sorted(unique_combined_injuries)
for injury in sorted_injury_labels:
    print(f"- {injury}")


### Distinct Injury Types
- achilles tendonitis
- bursitis
- calf muscle strain
- chondromalacia
- compartment syndrome
- cyst
- deformity (e.g hammer/claw toes, bunion)
- deformity(ie hammer/claw toes, bunion)
- disc degeneration
- disc protrusion
- dislocation
- fibula fracture
- fibula stress fracture
- fill in specifics below
- first ray tendonitis (turf toe)
- gastrocnemius strain
- groin muscle strain
- hamstring muscle strain
- high ankle sprain with tendonosis
- hip flexor strain
- hip joint irritation
- ischial bursitis
- itb syndrome
- labral tear
- ligament sprain
- ligament tear/rupture acl
- ligament tear/rupture mcl
- ligament tear/rupture pcl
- ligament tear/rupture-acl
- low back pain
- medial tibial stress syndrome
- meniscal tear medial
- meniscal tear-lateral
- meniscal tear-medial
- metarsalgia
- metatarsal stress fracture
- muscle spasm
- muscle strain
- mva, not running related
- nerve impingement
- no injury
- non-specific injury
- osteitis pubis
- osteoarthritis

In [4]:

from core.data_quality import map_injury_codes
from core.constants import RICKD_MAP_INJURY_CODE, RICKD_MAP_INJURY_DESC, RICKD_MAP_SELF_INJURY_CODE
from core.utils import save_df_as_table_image


# Each step is (mapping, source column, new column) and is applied in order.
# The description steps read the code columns created by the earlier steps,
# so the order matters.
injury_mapping_steps = [
    (RICKD_MAP_SELF_INJURY_CODE, "InjDefn", "self_injury_code"),
    (RICKD_MAP_INJURY_CODE, "SpecInjury", "injury_code"),
    (RICKD_MAP_INJURY_CODE, "SpecInjury2", "injury2_code"),
    (RICKD_MAP_INJURY_DESC, "injury_code", "injury_desc"),
    (RICKD_MAP_INJURY_DESC, "injury2_code", "injury2_desc"),
]

# Work on a copy so run_injury_data keeps only its original columns.
df = run_injury_data.copy()
for mapping, source_column, target_column in injury_mapping_steps:
    df = map_injury_codes(df, mapping, source_column, target_column)

clean_injury_data = df.copy()
clean_injury_data

Unnamed: 0,id,sub_id,InjDefn,SpecInjury,SpecInjury2,self_injury_code,injury_code,injury2_code,injury_desc,injury2_desc
0,100433_20101005t132240,100433,training volume/intensity affected,pain,,volume_intensity,pain,healthy,General sensation of discomfort signaling tiss...,No injury has been diagnosed.
1,100434_20101117t132240,100434,training volume/intensity affected,disc degeneration,,volume_intensity,disc_dege,healthy,Breakdown and gradual loss of spinal disc cush...,No injury has been diagnosed.
2,100537_20120703t102550,100537,2 workouts missed in a row,other,,missed_2_workouts,other,healthy,Unspecified biomechanical disruption or pain w...,No injury has been diagnosed.
3,100560_20120717t103748,100560,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
4,101481_20120717t105021,101481,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
...,...,...,...,...,...,...,...,...,...,...
1827,200986_20150312t143944,200986,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
1828,200987_20150312t160840,200987,continuing to train in pain,pelvic malalignment,itb syndrome,train_in_pain,pelv_mala,itb_synd,Misalignment or imbalance of pelvic bones caus...,Painful friction of iliotibial band rubbing ov...
1829,201100_20150409t155915,201100,training volume/intensity affected,muscle strain,ligament sprain,volume_intensity,musc_stra,liga_spra,Muscle fibers stretched or torn due to excessi...,Stretching or tearing of ligaments due to exce...
1830,201101_20150413t143152,201101,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.


In [5]:
# Append the mapped code/description columns from clean_injury_data to the full session table.
# pd.concat(axis=1) aligns rows on the index.
# NOTE(review): assumes map_injury_codes preserved the row index of run_data_full — confirm.
mapped_injury_columns = ['self_injury_code', 'injury_code', 'injury2_code', 'injury_desc', 'injury2_desc']
run_data_full_with_injury_codes = pd.concat(
    [run_data_full, clean_injury_data[mapped_injury_columns]],
    axis=1,
)

run_data_full_with_injury_codes


Unnamed: 0,id,sub_id,datestring,filename,speed_r,age,Height,Weight,Gender,DominantLeg,...,r_peak_hip_add_velocity,l_peak_pelvic_drop_velocity,r_peak_pelvic_drop_velocity,l_vertical_oscillation,r_vertical_oscillation,self_injury_code,injury_code,injury2_code,injury_desc,injury2_desc
0,100433_20101005t132240,100433,2010-10-05 13:22:40,20101005t132240.json,1.610861,53,,,unknown,,...,59.323001,-61.317978,-75.058744,61.125564,51.466247,volume_intensity,pain,healthy,General sensation of discomfort signaling tiss...,No injury has been diagnosed.
1,100434_20101117t132240,100434,2010-11-17 13:22:40,20101117t132240.json,2.237294,51,,,female,,...,183.573751,-99.163244,-96.178927,76.028357,62.307667,volume_intensity,disc_dege,healthy,Breakdown and gradual loss of spinal disc cush...,No injury has been diagnosed.
2,100537_20120703t102550,100537,2012-07-03 10:25:50,20120703t102550.json,2.127441,255,173.1,67.6,female,right,...,257.430836,-62.309317,-62.804925,78.130812,81.753809,missed_2_workouts,other,healthy,Unspecified biomechanical disruption or pain w...,No injury has been diagnosed.
3,100560_20120717t103748,100560,2012-07-17 10:37:48,20120717t103748.json,2.657365,33,179.3,83.0,female,right,...,320.445059,-128.814092,-141.687354,113.867318,111.686204,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
4,101481_20120717t105021,101481,2012-07-17 10:50:21,20120717t105021.json,2.625088,32,176.3,58.6,female,,...,169.271213,-37.007098,-49.506248,91.595199,99.045820,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1827,200986_20150312t143944,200986,2015-03-12 14:39:44,20150312t143944.json,4.876998,20,174.0,56.8,female,right,...,113.244646,-30.979746,-35.757575,89.724083,84.589713,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
1828,200987_20150312t160840,200987,2015-03-12 16:08:40,20150312t160840.json,2.765022,50,164.0,60.0,female,right,...,171.902428,-74.064966,-70.457833,66.498443,68.524480,train_in_pain,pelv_mala,itb_synd,Misalignment or imbalance of pelvic bones caus...,Painful friction of iliotibial band rubbing ov...
1829,201100_20150409t155915,201100,2015-04-09 15:59:15,20150409t155915.json,2.790966,52,170.0,80.0,male,right,...,203.683223,-146.535658,-120.697186,74.753470,86.710600,volume_intensity,musc_stra,liga_spra,Muscle fibers stretched or torn due to excessi...,Stretching or tearing of ligaments due to exce...
1830,201101_20150413t143152,201101,2015-04-13 14:31:52,20150413t143152.json,2.828602,21,162.0,65.5,male,right,...,168.473505,-56.000594,-79.974041,76.066245,67.343927,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.


In [6]:
# Raw injury columns to delete because they are replaced by the mapped values.
cols_to_delete = [
    "InjDefn",
    "SpecInjury",
    "SpecInjury2",
]

# errors="ignore" keeps this cell re-runnable: the frame is reassigned to the
# dropped version, so on a second run the columns are already gone and a plain
# drop() would raise KeyError.
run_data_full_with_injury_codes = run_data_full_with_injury_codes.drop(
    columns=cols_to_delete,
    errors="ignore",
)

# Persist the session table with injury codes. to_csv's default index=True
# writes the row index as an unnamed first column.
run_data_full_with_injury_codes.to_csv(os.path.join(RICKD_PROCESSED_DATA_FOLDER, "run_data_session_with_injury.csv"))

## Analyse Injury data:
- Describe types of injuries.
- Relationship between the self-assessment and the diagnosis of the medical reviewer.

In [7]:
# Tally the sessions per distinct InjDefn value; reset_index() turns the
# resulting Series into a two-column table.
injury_defn_counts = (
    clean_injury_data['InjDefn']
    .value_counts()
    .reset_index()
)
injury_defn_counts


Unnamed: 0,InjDefn,count
0,no injury,659
1,training volume/intensity affected,499
2,continuing to train in pain,320
3,2 workouts missed in a row,274


In [8]:
# Display the cleaned injury table: raw injury columns next to their mapped codes and descriptions.
clean_injury_data

Unnamed: 0,id,sub_id,InjDefn,SpecInjury,SpecInjury2,self_injury_code,injury_code,injury2_code,injury_desc,injury2_desc
0,100433_20101005t132240,100433,training volume/intensity affected,pain,,volume_intensity,pain,healthy,General sensation of discomfort signaling tiss...,No injury has been diagnosed.
1,100434_20101117t132240,100434,training volume/intensity affected,disc degeneration,,volume_intensity,disc_dege,healthy,Breakdown and gradual loss of spinal disc cush...,No injury has been diagnosed.
2,100537_20120703t102550,100537,2 workouts missed in a row,other,,missed_2_workouts,other,healthy,Unspecified biomechanical disruption or pain w...,No injury has been diagnosed.
3,100560_20120717t103748,100560,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
4,101481_20120717t105021,101481,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
...,...,...,...,...,...,...,...,...,...,...
1827,200986_20150312t143944,200986,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.
1828,200987_20150312t160840,200987,continuing to train in pain,pelvic malalignment,itb syndrome,train_in_pain,pelv_mala,itb_synd,Misalignment or imbalance of pelvic bones caus...,Painful friction of iliotibial band rubbing ov...
1829,201100_20150409t155915,201100,training volume/intensity affected,muscle strain,ligament sprain,volume_intensity,musc_stra,liga_spra,Muscle fibers stretched or torn due to excessi...,Stretching or tearing of ligaments due to exce...
1830,201101_20150413t143152,201101,no injury,,,no_injury,healthy,healthy,No injury has been diagnosed.,No injury has been diagnosed.


In [9]:
# Every session contributes its primary code (including 'healthy').
primary_codes = clean_injury_data[['injury_code']].rename(columns={'injury_code': 'Code'})

# A secondary code is added only when it is not 'healthy'.
has_secondary_injury = clean_injury_data['injury2_code'] != 'healthy'
secondary_codes = (
    clean_injury_data
    .loc[has_secondary_injury, ['injury2_code']]
    .rename(columns={'injury2_code': 'Code'})
)

# Stack both into a single 'Code' column.
medical_reviewer_injury = pd.concat([primary_codes, secondary_codes])

# Occurrences per code, listed alphabetically by code.
medical_reviewer_injury_counts = (
    medical_reviewer_injury
    .value_counts()
    .reset_index()
    .sort_values('Code')
)

# Lift the row limit temporarily so the whole table is shown.
with pd.option_context('display.max_rows', None):
    display(medical_reviewer_injury_counts)



Unnamed: 0,Code,count
5,achi_tend,75
10,burs,33
7,calf_musc_stra,63
28,chon,7
30,comp_synd,5
39,cyst,3
32,deformity,4
26,disc_dege,8
27,disc_prot,8
61,disl,1


In [10]:
# Read the injury-code mapping CSV, order it by its 'value' column and pass it
# to save_df_as_table_image together with the target path in the results folder.
injury_code_mapping_table = pd.read_csv(RICKD_MAP_INJURY_CODE).sort_values(by='value')
injury_code_mapping_path = os.path.join(RICKD_RESULTS_FOLDER, 'injury_code_mapping.png')
save_df_as_table_image(injury_code_mapping_table, injury_code_mapping_path)


In [11]:
# Pass the per-code counts (sorted by code) to save_df_as_table_image with a
# target path in the results folder.
injury_counts_sorted_path = os.path.join(RICKD_RESULTS_FOLDER, 'medical_reviewer_injury_counts_sorted.png')
save_df_as_table_image(medical_reviewer_injury_counts, injury_counts_sorted_path)


In [12]:
# Number of most frequent injury codes to export. It drives both the row
# cut-off and the file name, so the two cannot drift apart.
N = 20

# Most frequent codes first, truncated to the top N.
top_injury_counts = medical_reviewer_injury_counts.sort_values(by='count', ascending=False).head(N)

save_df_as_table_image(
    top_injury_counts,
    os.path.join(RICKD_RESULTS_FOLDER, f'medical_reviewer_injury_counts_top{N}.png')
)

In [None]:
def _most_frequent_code(codes, default='healthy'):
    """Return the most frequent non-null value in ``codes``, or ``default`` if there is none.

    ``value_counts()`` drops NaN and sorts by descending frequency, so its first
    index entry is the most common code. A group made only of NaN yields an
    empty result; that case falls back to ``default`` instead of raising
    IndexError.
    """
    counts = codes.value_counts()
    return counts.index[0] if not counts.empty else default


# Mean speed_r per subject across all of that subject's sessions.
subject_pace = run_data_full_with_injury_codes.groupby('sub_id')['speed_r'].mean().reset_index()

# Long table of (sub_id, Code): every primary code, plus secondary codes that are not 'healthy'.
has_secondary_injury = run_data_full_with_injury_codes['injury2_code'] != 'healthy'
subject_injuries = pd.concat([
    run_data_full_with_injury_codes[['sub_id', 'injury_code']].rename(columns={'injury_code': 'Code'}),
    run_data_full_with_injury_codes.loc[has_secondary_injury, ['sub_id', 'injury2_code']].rename(columns={'injury2_code': 'Code'})
])

# Get most common injury per subject
most_common_injury = subject_injuries.groupby('sub_id')['Code'].agg(_most_frequent_code).reset_index()

# One row per subject: mean speed and most common injury code.
subject_summary = pd.merge(subject_pace, most_common_injury, on='sub_id').rename(columns={'speed_r': 'pace (m/s)', 'Code': 'Injury Code'})

# Mean of the per-subject means and number of subjects per injury code, fastest first.
injury_pace_summary = subject_summary.groupby('Injury Code')['pace (m/s)'].agg(['mean', 'count']).reset_index()
injury_pace_summary = injury_pace_summary.sort_values('mean', ascending=False).rename(columns={'mean': 'mean pace (m/s)'})

display(injury_pace_summary)

save_df_as_table_image(
    injury_pace_summary,
    os.path.join(RICKD_RESULTS_FOLDER, 'injury_pace_summary.png')
)


Unnamed: 0,Injury Code,mean pace (m/s),count
51,stre_frac,3.290489,1
35,oste_pubis,3.044357,4
42,pelv_mala,3.001225,2
56,tibia_stre_frac,2.96621,7
27,meni_tear_lat,2.925006,1
52,stre_reac,2.914865,1
40,pate_tend,2.836377,9
41,pelv_dysf_gene,2.80067,4
14,groin_musc_stra,2.7791,2
50,strain,2.75095,1


In [34]:
# Parse the session timestamps so that subtracting two of them yields a timedelta.
run_data_full_with_injury_codes['datestring'] = pd.to_datetime(run_data_full_with_injury_codes['datestring'])

# One row per subject, holding the list of that subject's session timestamps.
sessions_by_subject = run_data_full_with_injury_codes.groupby('sub_id')['datestring']
subject_dates = sessions_by_subject.apply(list).reset_index()

def is_longitudinal(dates, min_gap_days=30):
    """Tell whether a subject's sessions are all spaced at least ``min_gap_days`` apart.

    Args:
        dates: Sequence of datetime-like objects (one per session). Subtracting
            two of them must yield an object with a ``.days`` attribute. The
            order does not matter; a sorted copy is used.
        min_gap_days: Minimum number of whole days required between two
            consecutive sessions. Defaults to 30.

    Returns:
        False when there are fewer than two sessions, or when any pair of
        consecutive sessions is fewer than ``min_gap_days`` whole days apart;
        True otherwise.
    """
    if len(dates) < 2:
        return False
    ordered = sorted(dates)
    # .days truncates to whole days, so a gap of 29 days 23 hours counts as 29.
    # "not any(... < ...)" rather than "all(... >= ...)" keeps the comparison
    # direction of the original loop.
    return not any(
        (later - earlier).days < min_gap_days
        for earlier, later in zip(ordered, ordered[1:])
    )

# Flag each subject from its list of session dates, then keep the flagged ones.
subject_dates['is_longitudinal'] = subject_dates['datestring'].apply(is_longitudinal)
longitudinal_subjects = subject_dates.loc[subject_dates['is_longitudinal']]

# Report the absolute and relative size of the longitudinal subset.
n_longitudinal = len(longitudinal_subjects)
n_subjects = len(subject_dates)
print(f"Number of subjects with longitudinal data (all sessions >= 1 month apart): {n_longitudinal}")
print(f"Total number of subjects: {n_subjects}")
print(f"Percentage of subjects with longitudinal data: {n_longitudinal/n_subjects*100:.1f}%")

# # Calculate and display time differences between consecutive sessions
# all_diffs = []
# for dates in subject_dates['datestring']:
#     dates = sorted(dates)
#     for i in range(len(dates)-1):
#         all_diffs.append((dates[i+1] - dates[i]).days / 30)

# print("\nSummary statistics of time differences between consecutive sessions (months):")
# print(pd.Series(all_diffs).describe())

Number of subjects with longitudinal data (all sessions >= 1 month apart): 30
Total number of subjects: 1403
Percentage of subjects with longitudinal data: 2.1%
