In [None]:
# import
import pandas as pd
import numpy as np

In [None]:
# Tokens (compared after stripping and lower-casing) that denote a missing value.
_MISSING_TOKENS = ('unknown', 'n/a', 'na', 'nan', 'none', 'null', '')

# First signed integer/decimal embedded in a string, e.g. "55%" -> "55".
_NUMBER_PATTERN = r'([-+]?\d*\.?\d+)'


def _object_column_to_numeric(series):
    """
    Convert a text/object column to numbers.

    Returns a ``(values, was_encoded)`` pair. ``values`` holds the first
    number found in each string (NaN where there is none). When the column
    has real labels but not a single number (e.g. "grass"/"turf"), the
    labels are encoded as sorted integer category codes instead, so the
    column is not wiped out; ``was_encoded`` is True in that case.
    """
    text = series.astype(str).str.strip().str.lower()
    is_missing = (series.isna() | text.isin(list(_MISSING_TOKENS))).to_numpy(dtype=bool)

    # None of the missing tokens contains a digit, so extracting on the full
    # string column gives NaN for them without a separate replace step.
    extracted = text.str.extract(_NUMBER_PATTERN, expand=False)
    numeric = pd.to_numeric(extracted, errors='coerce')

    if numeric.notna().any() or is_missing.all():
        return numeric, False

    # Purely categorical column: map each distinct label to a stable code.
    codes, _ = pd.factorize(text[~is_missing], sort=True)
    encoded = np.full(len(series), np.nan)
    encoded[~is_missing] = codes
    return pd.Series(encoded, index=series.index, name=series.name), True


def prepare_data_complete(df):
    """
    Clean the predictor columns and build the binary injury target.

    Works on a copy of ``df``; the caller's frame is not modified.

    Steps:
      1. Keep only the known predictor columns that exist in ``df``.
      2. Convert every predictor to numeric. Text columns have the first
         number in each string extracted; text columns with no numbers at
         all are encoded as integer category codes. +/-inf becomes NaN.
      3. Create ``is_injured`` (1 when ``game_status`` is out, doubtful or
         questionable, else 0). Without a ``game_status`` column the target
         is all zeros. ``game_status`` itself is overwritten with its
         stripped, lower-cased string form.
      4. Drop rows whose predictors are all missing (skipped when there are
         no predictor columns, so the frame is not emptied by accident).
      5. Fill remaining gaps with the column median. A column with no valid
         value at all is left as NaN and reported.

    Progress and summary lines are printed to stdout.

    Returns:
        (prepared_df, predictor_features) where ``predictor_features`` is the
        list of predictor column names found in ``df``.
    """
    df = df.copy()

    # Define feature sets
    predictor_features = [
        'average_weather_before', 'average_humidity_before', 'average_wind_before',
        'most_common_surface', 'most_common_roof', 'average_snaps_before',
        'sum_travel_magnitude', 'sum_tz_diff_magnitude', 'sum_elevation_difference',
        'prev_weather', 'prev_humidity', 'prev_wind', 'prev_surface', 'prev_roof',
        'prev_snaps', 'prev_travel_magnitude', 'prev_is_international',
        'prev_elevation_difference', 'prev_travel_direction', 'prev_elevation_difference_abs_m'
    ]

    # Filter to only existing columns
    predictor_features = [col for col in predictor_features if col in df.columns]

    print(f"Found {len(predictor_features)} predictor features")

    # Step 1: make every predictor numeric and finite-or-NaN.
    for col in predictor_features:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            print(f"  Cleaning {col} (object type)...")
            column, was_encoded = _object_column_to_numeric(column)
            if was_encoded:
                print(f"    {col}: no numeric content, encoded labels as category codes")

        column = pd.to_numeric(column, errors='coerce')
        # Treat infinities as missing before any median is computed, so they
        # cannot distort the imputed value.
        df[col] = column.replace([np.inf, -np.inf], np.nan)

    # Step 2: binary target, 1 for injured and 0 for not injured.
    if 'game_status' in df.columns:
        df['game_status'] = df['game_status'].astype(str).str.strip().str.lower()

        injury_keywords = ['out', 'doubtful', 'questionable']
        df['is_injured'] = df['game_status'].isin(injury_keywords).astype(int)

        print(f"Injury distribution: {df['is_injured'].value_counts().to_dict()}")
    else:
        print("Warning: No game_status column, creating dummy target")
        df['is_injured'] = 0

    # Step 3: drop rows with no predictor information. Guarded because
    # `.all(axis=1)` over zero columns is True for every row.
    if predictor_features:
        nan_mask = df[predictor_features].isna().all(axis=1)
        if nan_mask.any():
            print(f"  Removing {nan_mask.sum()} rows with all NaN predictors")
            # Copy so the later column assignments hit an independent frame.
            df = df.loc[~nan_mask].copy()

    # Step 4: median imputation.
    for col in predictor_features:
        missing = df[col].isna()
        if not missing.any():
            continue
        if missing.all():
            # No median exists; say so instead of silently filling with NaN.
            print(f"  Warning: {col} has no valid values; left as NaN")
            continue
        df[col] = df[col].fillna(df[col].median())

    print(f"\nFinal data shape: {df.shape}")
    print(f"Injured: {df['is_injured'].sum()}, Not injured: {len(df) - df['is_injured'].sum()}")

    return df, predictor_features

def analyze_injury_patterns(df):
    """
    Print and return a simple summary of injury patterns.

    Sections (each is skipped when the columns it needs are absent):
      1. Injury rate by ``position`` (needs ``position`` and ``is_injured``);
         the ten highest rates are printed, the full table is returned.
      2. Ten most common injury locations. The column is looked up as
         ``injury_lcoation`` (the misspelling this function has always
         read) or, failing that, ``injury_location``.
      3. Pearson correlation between ``is_injured`` and a fixed list of
         numeric features.

    Returns:
        dict with any of these keys:
          - 'injury_by_position': DataFrame with 'mean' and 'count' columns,
            sorted by 'mean' descending.
          - 'common_injury_locations': Series of counts per location.
          - 'correlations': dict mapping feature name to its correlation
            (NaN when it cannot be computed, e.g. a constant column).
    """
    print("\n" + "=" * 60)
    print("INJURY PATTERN ANALYSIS")
    print("=" * 60)

    results = {}
    has_target = 'is_injured' in df.columns

    # Injury rate by position
    if 'position' in df.columns and has_target:
        injury_by_position = df.groupby('position')['is_injured'].agg(['mean', 'count'])
        injury_by_position = injury_by_position.sort_values('mean', ascending=False)

        print("\n1. Injury Rate by Position:")
        for position, row in injury_by_position.head(10).iterrows():
            print(f"   {position}: {row['mean']:.1%} ({int(row['count'])} players)")

        results['injury_by_position'] = injury_by_position

    # Most common injury locations
    location_col = next(
        (name for name in ('injury_lcoation', 'injury_location') if name in df.columns),
        None,
    )
    if location_col is not None:
        injury_locations = df[location_col].value_counts().head(10)

        print("\n2. Most Common Injury Locations:")
        for location, count in injury_locations.items():
            print(f"   {location}: {count} injuries")

        results['common_injury_locations'] = injury_locations

    # Correlation of is_injured with numeric features
    numeric_features = ['average_snaps_before', 'sum_travel_magnitude',
                        'sum_elevation_difference', 'average_weather_before']

    if has_target:
        # Header keeps these lines from reading as part of the section above.
        print("\n3. Correlation with Injury Status:")
        target = pd.to_numeric(df['is_injured'], errors='coerce')
        correlations = {}

        for feature in numeric_features:
            if feature not in df.columns:
                continue

            feature_series = pd.to_numeric(df[feature], errors='coerce')
            try:
                # A zero-variance column divides by zero inside the
                # correlation; silence that and report it as NaN below.
                with np.errstate(divide='ignore', invalid='ignore'):
                    corr = float(feature_series.corr(target))
            except (TypeError, ValueError):
                corr = float('nan')

            correlations[feature] = corr
            if np.isnan(corr):
                print(f"   {feature}: Could not calculate correlation")
            else:
                print(f"   {feature}: {corr:.3f}")

        results['correlations'] = correlations

    return results

In [None]:
# Banner for the end-to-end run.
print("\n" + "=" * 60, "FULL PIPELINE", "=" * 60, sep="\n")

# Load the aggregated dataset from disk.
df = pd.read_csv('clean_data/agged_data2.csv')

# Clean the predictor columns and derive the binary `is_injured` target.
df_prepared, predictor_features = prepare_data_complete(df)

# Simple analysis on the prepared frame:
# - injury rate by position and the most common injury locations
# - correlation coefficients between selected features and `is_injured`
analysis_results = analyze_injury_patterns(df_prepared)

Found 20 predictor features
  Cleaning most_common_surface (object type)...
  Cleaning most_common_roof (object type)...
  Cleaning prev_humidity (object type)...
  Cleaning prev_wind (object type)...
  Cleaning prev_surface (object type)...
  Cleaning prev_roof (object type)...
  Cleaning prev_travel_magnitude (object type)...
  Cleaning prev_is_international (object type)...
  Cleaning prev_travel_direction (object type)...
Injury distribution: {0: 280024, 1: 27612}
  Removing 6796 rows with all NaN predictors


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)



Final data shape: (300840, 138)
Injured: 27085, Not injured: 273755

INJURY PATTERN ANALYSIS

1. Injury Rate by Position:
   CB: 56.1% (6643 players)
   K: 54.5% (352 players)
   S: 53.9% (4420 players)
   FB: 53.8% (173 players)
   WR: 51.5% (7267 players)
   RB: 51.3% (4158 players)
   LB: 50.4% (7258 players)
   T: 50.0% (4769 players)
   TE: 48.8% (3483 players)
   G: 46.7% (3463 players)

2. Most Common Injury Locations:
   Knee: 4234 injuries
   Ankle: 3726 injuries
   Hamstring: 2923 injuries
   Shoulder: 1651 injuries
   Concussion: 1600 injuries
   Groin: 1139 injuries
   Foot: 1131 injuries
   Calf: 974 injuries
   Back: 930 injuries
   Illness: 891 injuries
   average_snaps_before: 0.075
   sum_travel_magnitude: nan
   sum_elevation_difference: 0.003
   average_weather_before: 0.014


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  c /= stddev[:, None]
  c /= stddev[None, :]
