In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:

# Load your data
# df = pd.read_csv('your_injury_analysis_data.csv')

def prepare_data(df):
    """
    Prepare the raw injury dataset for modeling.

    Works on a copy of ``df``; the caller's frame is never modified.

    Steps:
      1. Keep only the predictor / performance columns actually present in ``df``.
      2. Derive the three-level target ``injury_severity`` from ``game_status``
         (0 = Probable, 1 = Questionable/Doubtful, 2 = Out). Any status outside
         that mapping, including a missing one, is left as NaN.
      3. Fill missing values in numeric columns with the column median.
         Non-numeric columns are not touched by this step.
      4. Label-encode the categorical predictors after casting them to ``str``,
         so a missing value ends up as its own category.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain a ``game_status`` column.

    Returns
    -------
    tuple
        ``(prepared_df, predictor_features, performance_features)``; the two
        lists hold the names of the candidate columns that exist in ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``game_status`` column.
    """
    df = df.copy()

    # Candidate feature sets; filtered below to the columns that exist.
    predictor_features = [
        'average_weather_before', 'average_humidity_before', 'average_wind_before',
        'most_common_surface', 'most_common_roof', 'average_snaps_before',
        'sum_travel_magnitude', 'sum_tz_diff_magnitude', 'sum_elevation_difference',
        'prev_weather', 'prev_humidity', 'prev_wind', 'prev_surface', 'prev_roof',
        'prev_snaps', 'prev_travel_magnitude', 'prev_is_international',
        'prev_elevation_difference', 'prev_travel_direction', 'prev_elevation_difference_abs_m'
    ]

    performance_features = [
        'pass_cmp_x', 'pass_att', 'pass_yds', 'pass_td', 'pass_int', 'pass_sacked_x',
        'pass_sacked_yds', 'pass_long', 'pass_rating', 'rush_att_x', 'rush_yds_x',
        'rush_td_x', 'rush_long', 'targets_x', 'rec_x', 'rec_yds_x', 'rec_td_x',
        'rec_long', 'fumbles', 'fumbles_lost', 'def_int_x', 'def_int_yds',
        'def_int_td', 'def_int_long', 'pass_defended', 'sacks_x', 'tackles_combined_x',
        'tackles_solo', 'tackles_assists', 'tackles_loss', 'qb_hits', 'fumbles_rec',
        'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced', 'xpm', 'xpa', 'fgm',
        'fga', 'punt', 'punt_yds', 'punt_yds_per_punt', 'punt_long', 'kick_ret',
        'kick_ret_yds', 'kick_ret_yds_per_ret', 'kick_ret_td', 'kick_ret_long',
        'punt_ret', 'punt_ret_yds', 'punt_ret_yds_per_ret', 'punt_ret_td',
        'punt_ret_long', 'pass_first_down', 'pass_first_down_pct', 'pass_target_yds',
        'pass_tgt_yds_per_att', 'pass_air_yds', 'pass_air_yds_per_cmp',
        'pass_air_yds_per_att', 'pass_yac', 'pass_yac_per_cmp', 'pass_drops',
        'pass_drop_pct', 'pass_poor_throws', 'pass_poor_throw_pct', 'pass_sacked_y',
        'pass_blitzed', 'pass_hurried', 'pass_hits', 'pass_pressured',
        'pass_pressured_pct', 'rush_scrambles', 'rush_scrambles_yds_per_att',
        'rush_first_down', 'rush_yds_before_contact', 'rush_yds_bc_per_rush',
        'rush_yac', 'rush_yac_per_rush', 'rush_broken_tackles',
        'rush_broken_tackles_per_rush', 'rec_first_down', 'rec_air_yds',
        'rec_air_yds_per_rec', 'rec_yac', 'rec_yac_per_rec', 'rec_adot',
        'rec_broken_tackles', 'rec_broken_tackles_per_rec', 'rec_drops',
        'rec_drop_pct', 'rec_target_int', 'rec_pass_rating', 'def_targets',
        'def_cmp', 'def_cmp_perc', 'def_cmp_yds', 'def_yds_per_cmp',
        'def_yds_per_target', 'def_cmp_td', 'def_pass_rating', 'def_tgt_yds_per_att',
        'def_air_yds', 'def_yac', 'blitzes', 'qb_hurry', 'qb_knockdown', 'pressures',
        'tackles_missed', 'tackles_missed_pct', 'avg_snap_count_after'
    ]

    # Filter to only existing columns
    predictor_features = [col for col in predictor_features if col in df.columns]
    performance_features = [col for col in performance_features if col in df.columns]

    # Three-level severity target derived from game_status.
    # Statuses not listed here map to NaN.
    df['injury_severity'] = df['game_status'].map({
        'Out': 2,           # Severe injury
        'Doubtful': 1,      # Moderate injury
        'Questionable': 1,  # Moderate injury
        'Probable': 0       # Minor/no injury
    })

    # Fill missing values with the per-column median.
    # numeric_only=True is required because the predictor set still contains
    # string-valued categorical columns at this point; without it pandas >= 2.0
    # raises TypeError when asked for the median of a string column.
    for feature_group in (predictor_features, performance_features):
        if not feature_group:
            continue  # nothing to fill when none of the candidates exist
        medians = df[feature_group].median(numeric_only=True)
        df[feature_group] = df[feature_group].fillna(medians)

    # Encode categorical features as integer codes.
    categorical_cols = ['most_common_surface', 'most_common_roof',
                        'prev_surface', 'prev_roof', 'prev_travel_direction']

    for col in categorical_cols:
        if col in df.columns:
            # astype(str) turns missing values into a string so they get a code too.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    return df, predictor_features, performance_features

In [None]:

# ============================================================================
# MODEL 1: Predict Injury Risk
# ============================================================================

def model_injury_risk(df, predictor_features):
    """
    Model 1: Predict injury severity from pre-injury features.

    Trains a Random Forest, a Logistic Regression and an XGBoost classifier on
    a stratified 80/20 split of standardized features. For each model it prints
    the accuracy and a classification report; for models exposing
    ``feature_importances_`` it also prints and plots the top 10 features.

    Rows whose ``injury_severity`` is NaN are excluded, because neither the
    stratified split nor the classifiers can use an unknown label.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data containing ``predictor_features`` and ``injury_severity``
        (0 = minor, 1 = moderate, 2 = severe).
    predictor_features : list of str
        Names of the feature columns.

    Returns
    -------
    dict
        Maps each model name to ``{'model', 'feature_importance', 'scaler'}``.
        ``feature_importance`` is a DataFrame sorted by descending importance,
        or ``None`` for models without ``feature_importances_``.

    Raises
    ------
    ValueError
        If no row has a known ``injury_severity``.
    """
    print("=" * 60)
    print("MODEL 1: INJURY RISK PREDICTION")
    print("=" * 60)

    # Keep only rows with a known target.
    labelled = df['injury_severity'].notna()
    if not labelled.any():
        raise ValueError(
            "No rows with a known injury_severity; cannot train injury risk models."
        )
    n_unlabelled = int((~labelled).sum())
    if n_unlabelled:
        print(f"Excluding {n_unlabelled} rows with unknown injury severity.")

    X = df.loc[labelled, predictor_features]
    y = df.loc[labelled, 'injury_severity'].astype(int)  # 0=minor, 1=moderate, 2=severe

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features (fit on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'XGBoost': xgb.XGBClassifier(random_state=42)
    }

    severity_labels = [0, 1, 2]
    severity_names = ['Minor', 'Moderate', 'Severe']
    results = {}

    for name, model in models.items():
        print(f"\nTraining {name}...")

        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        # Accuracy from the predictions already made (avoids a second predict).
        accuracy = float(np.mean(np.asarray(y_pred) == y_test.to_numpy()))
        print(f"Accuracy: {accuracy:.3f}")
        print("Classification Report:")
        # Explicit labels keep the report aligned with the three names even when
        # a class is absent from the test split; zero_division=0 reports 0 for
        # such a class instead of emitting undefined-metric warnings.
        print(classification_report(
            y_test, y_pred,
            labels=severity_labels,
            target_names=severity_names,
            zero_division=0
        ))

        feature_importance = None
        if hasattr(model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': predictor_features,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            top_features = feature_importance.head(10)
            print("\nTop 10 Most Important Features for Injury Prediction:")
            print(top_features.to_string(index=False))

            # Plot feature importance
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.barh(top_features['feature'], top_features['importance'])
            ax.set_xlabel('Importance')
            ax.set_title(f'{name} - Top 10 Injury Risk Features')
            ax.invert_yaxis()  # most important feature at the top
            fig.tight_layout()
            plt.show()

        # Store every trained model, not only those exposing importances.
        results[name] = {
            'model': model,
            'feature_importance': feature_importance,
            'scaler': scaler
        }

    return results

In [None]:

# ============================================================================
# MODEL 2: Predict Post-Injury Performance
# ============================================================================

def model_post_injury_performance(df, predictor_features, performance_features,
                                  max_targets=10):
    """
    Model 2: Predict post-injury performance from injury characteristics.

    For each target metric a separate RandomForestRegressor is trained on the
    label-encoded injury location, practice status and position (whichever of
    those columns exist in ``df``), using an 80/20 split and standardized
    inputs. ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data.
    predictor_features : list of str
        Not used by this model; kept so the call signature matches the other
        model functions.
    performance_features : list of str
        Candidate target columns.
    max_targets : int or None, default 10
        Only the first ``max_targets`` entries of ``performance_features`` are
        modelled. ``None`` models all of them.

    Returns
    -------
    dict
        Maps each modelled target to
        ``{'model', 'mse', 'r2', 'scaler', 'feature_importance'}``.
        Empty when no injury descriptor column exists or no target has at
        least 20 non-missing values.
    """
    print("\n" + "=" * 60)
    print("MODEL 2: POST-INJURY PERFORMANCE PREDICTION")
    print("=" * 60)

    # 'injury_lcoation' spelling kept as-is; presumably it matches the dataset's
    # column name - verify against the data source.
    injury_columns = ['injury_lcoation', 'practice_status', 'position']
    available = [col for col in injury_columns if col in df.columns]
    if not available:
        print("\nNo injury descriptor columns found - skipping performance models.")
        return {}

    # Encode each descriptor into an integer code; missing values become 'Unknown'.
    encoded = {
        f'{col}_encoded': LabelEncoder().fit_transform(
            df[col].fillna('Unknown').astype(str)
        )
        for col in available
    }
    X = pd.DataFrame(encoded, index=df.index)
    X_features = list(X.columns)

    # Apply the limit first, then drop names that are not columns of df.
    candidates = (performance_features if max_targets is None
                  else performance_features[:max_targets])
    targets = [col for col in candidates if col in df.columns]

    performance_results = {}

    print(f"\nPredicting {len(targets)} of {len(performance_features)} performance metrics...")

    for target_col in targets:
        print(f"\n{target_col}...")

        y = df[target_col]

        # Remove rows where target is NaN
        valid_idx = y.notna()
        X_clean = X[valid_idx]
        y_clean = y[valid_idx]

        if len(y_clean) < 20:  # Need enough data
            continue

        X_train, X_test, y_train, y_test = train_test_split(
            X_clean, y_clean, test_size=0.2, random_state=42
        )

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train_scaled, y_train)

        y_pred = model.predict(X_test_scaled)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"  MSE: {mse:.3f}, R²: {r2:.3f}")

        performance_results[target_col] = {
            'model': model,
            'mse': mse,
            'r2': r2,
            'scaler': scaler,
            'feature_importance': dict(zip(X_features, model.feature_importances_))
        }

    return performance_results

In [None]:

# ============================================================================
# MODEL 3: Integrated Analysis - Connecting Injury Risk to Performance
# ============================================================================

def integrated_analysis(df, predictor_features, performance_features, n_clusters=4):
    """
    Model 3: Connect injury risk factors to post-injury performance.

    1. Clusters rows on the standardized predictor features with K-Means
       (an elbow plot of inertia is shown first).
    2. Prints, per cluster, the five performance metrics whose cluster mean
       differs most (in absolute value) from the overall mean.
    3. Prints the ten predictors most correlated with ``injury_severity``.
    4. If an ``injury_lcoation`` column exists, prints for the five most
       frequent locations the metrics with the largest relative deviation of
       the location mean from the overall mean.

    The input frame is not modified; the cluster labels are added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data containing the listed features and ``injury_severity``.
    predictor_features : list of str
        Columns used for clustering and for the correlation ranking.
    performance_features : list of str
        Columns summarized per cluster and per injury location.
    n_clusters : int, default 4
        Number of K-Means clusters used for the risk profiles.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an added integer ``risk_cluster`` column.
    """
    # Imported here because the notebook's import cell does not provide KMeans.
    from sklearn.cluster import KMeans

    print("\n" + "=" * 60)
    print("MODEL 3: INTEGRATED ANALYSIS")
    print("Connecting Injury Risk Factors to Performance Impact")
    print("=" * 60)

    # Work on a copy so adding 'risk_cluster' never alters the caller's frame.
    df = df.copy()

    # 3.1: Cluster players by injury risk profile
    X_risk = df[predictor_features].fillna(df[predictor_features].median())

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_risk)

    # Elbow curve. K-Means cannot fit more clusters than there are rows, so the
    # scanned range is capped by the row count (2..9 for any frame with >= 9 rows).
    k_values = list(range(2, min(9, len(X_scaled)) + 1))
    inertias = []
    for k in k_values:
        # n_init is pinned because its default differs between scikit-learn
        # releases, which would otherwise change the clustering silently.
        elbow_model = KMeans(n_clusters=k, random_state=42, n_init=10)
        elbow_model.fit(X_scaled)
        inertias.append(elbow_model.inertia_)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(k_values, inertias, 'bo-')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.set_title('Elbow Method for Optimal Clusters')
    plt.show()

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['risk_cluster'] = kmeans.fit_predict(X_scaled)

    # Overall means are the same for every cluster and location: compute once.
    overall_avg = df[performance_features].mean()

    # 3.2: Analyze performance by risk cluster
    print("\nPerformance by Risk Cluster:")

    for cluster in range(n_clusters):
        cluster_mask = df['risk_cluster'] == cluster
        cluster_size = cluster_mask.sum()

        print(f"\nCluster {cluster} (n={cluster_size}):")

        cluster_avg = df.loc[cluster_mask, performance_features].mean()

        # Top 5 metrics by absolute difference from the overall mean
        top_diffs = (cluster_avg - overall_avg).abs().nlargest(5)

        print("  Most affected performance metrics:")
        for metric in top_diffs.index:
            cluster_val = cluster_avg[metric]
            overall_val = overall_avg[metric]
            # A relative change is undefined for a zero baseline; report 0 then.
            pct_change = ((cluster_val - overall_val) / overall_val * 100) if overall_val != 0 else 0
            print(f"    {metric}: {cluster_val:.2f} vs {overall_val:.2f} ({pct_change:+.1f}%)")

    # 3.3: Key injury risk factors analysis
    print("\n" + "=" * 40)
    print("KEY FINDINGS: Injury Risk Factors")
    print("=" * 40)

    # Signed correlations are computed once; undefined ones (NaN) are dropped.
    signed_corrs = {}
    for feature in predictor_features:
        corr = df[feature].corr(df['injury_severity'])
        if not pd.isna(corr):
            signed_corrs[feature] = corr
    # Explicit float dtype keeps nlargest working when no correlation is defined.
    signed_corrs = pd.Series(signed_corrs, dtype=float)

    top_corrs = signed_corrs.abs().nlargest(10)
    print("\nTop 10 factors correlated with injury severity:")
    for feature, strength in top_corrs.items():
        direction = "increases" if signed_corrs[feature] > 0 else "decreases"
        print(f"  {feature}: {strength:.3f} ({direction} risk)")

    # 3.4: Performance impact by injury type
    print("\n" + "=" * 40)
    print("Performance Impact by Injury Type")
    print("=" * 40)

    if 'injury_lcoation' in df.columns:
        injury_locations = df['injury_lcoation'].value_counts().head(5).index

        for location in injury_locations:
            location_mask = df['injury_lcoation'] == location
            if location_mask.sum() > 10:  # Enough samples
                print(f"\n{location}:")

                # Mean performance for this location relative to the overall mean
                location_perf = df.loc[location_mask, performance_features].mean()

                # Percent deviation; metrics with a zero baseline yield inf/NaN
                # and are discarded.
                perf_diff = (location_perf - overall_avg) / overall_avg * 100
                perf_diff = perf_diff.replace([np.inf, -np.inf], np.nan).dropna()

                biggest_drops = perf_diff.nsmallest(3)
                biggest_gains = perf_diff.nlargest(3)

                print("  Biggest performance drops:")
                for metric, pct in biggest_drops.items():
                    print(f"    {metric}: {pct:+.1f}%")

                print("  Biggest performance gains:")
                for metric, pct in biggest_gains.items():
                    print(f"    {metric}: {pct:+.1f}%")

    return df

In [None]:

# ============================================================================
# MAIN PIPELINE
# ============================================================================

def run_injury_analysis_pipeline(df):
    """
    Execute the full injury analysis: preparation, the three models, and a
    printed summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw injury data passed to ``prepare_data``.

    Returns
    -------
    dict
        Keys ``'data'`` (frame returned by ``integrated_analysis``),
        ``'injury_risk_models'``, ``'performance_models'``,
        ``'predictor_features'`` and ``'performance_features'``.
    """
    rule = "=" * 60

    print("INJURY ANALYSIS PIPELINE")
    print(rule)

    # Step 1: data preparation
    prepared, predictor_features, performance_features = prepare_data(df)
    for line in (
        f"Data prepared: {len(prepared)} samples",
        f"Predictor features: {len(predictor_features)}",
        f"Performance features: {len(performance_features)}",
    ):
        print(line)

    # Steps 2-4: the three models, all fed from the prepared frame
    risk_models = model_injury_risk(prepared, predictor_features)
    perf_models = model_post_injury_performance(
        prepared, predictor_features, performance_features
    )
    analyzed = integrated_analysis(prepared, predictor_features, performance_features)

    # Step 5: summary
    print("\n" + rule)
    print("SUMMARY & RECOMMENDATIONS")
    print(rule)

    # Model 1 takeaways: five strongest Random Forest features, if available
    print("\n1. KEY INJURY RISK FACTORS:")
    forest_entry = risk_models.get('Random Forest', {})
    importance_table = forest_entry.get('feature_importance')
    if importance_table is not None:
        leaders = importance_table.head(5)
        for feature, importance in zip(leaders['feature'], leaders['importance']):
            print(f"   • {feature}: Importance = {importance:.3f}")

    # Model 2 takeaways: the five targets with the highest R²
    print("\n2. PERFORMANCE IMPACT FINDINGS:")
    if perf_models:
        ranked = sorted(
            perf_models.items(),
            key=lambda item: item[1]['r2'],
            reverse=True
        )

        print("   Most predictable post-injury metrics:")
        for metric, info in ranked[:5]:
            print(f"   • {metric}: R² = {info['r2']:.3f}")

    # Fixed list of practical recommendations
    print("\n3. PRACTICAL RECOMMENDATIONS:")
    recommendations = (
        "   a) Monitor high-risk factors identified (travel, workload, environment)",
        "   b) Adjust training based on injury location and severity predictions",
        "   c) Use performance predictions to set realistic return expectations",
        "   d) Consider player position in risk assessment",
    )
    for recommendation in recommendations:
        print(recommendation)

    summary = {}
    summary['data'] = analyzed
    summary['injury_risk_models'] = risk_models
    summary['performance_models'] = perf_models
    summary['predictor_features'] = predictor_features
    summary['performance_features'] = performance_features
    return summary

In [None]:

# ============================================================================
# VISUALIZATION FUNCTIONS
# ============================================================================

def create_visualizations(results):
    """
    Draw the summary figures for a finished analysis.

    Figures produced:
      1. Bar chart of the ``injury_severity`` distribution.
      2. Correlation heatmap of the first ten predictor features plus
         ``injury_severity`` (only when at least five predictors exist).
      3. Scatter of snaps before vs. after injury, one panel per position for
         the three most frequent positions (only when the ``position``,
         ``average_snaps_before`` and ``avg_snap_count_after`` columns exist).

    Parameters
    ----------
    results : dict
        Output of ``run_injury_analysis_pipeline``; the ``'data'`` and
        ``'predictor_features'`` entries are read.

    Returns
    -------
    None
    """
    df = results['data']
    predictor_features = results['predictor_features']

    # 1. Injury Severity Distribution
    fig, ax = plt.subplots(figsize=(10, 6))
    df['injury_severity'].value_counts().sort_index().plot(kind='bar', ax=ax, rot=0)
    ax.set_xlabel('Injury Severity (0=Minor, 1=Moderate, 2=Severe)')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Injury Severity')
    plt.show()

    # 2. Predictor Features Heatmap
    if len(predictor_features) >= 5:
        # First ten predictors in list order (not ranked by importance).
        shown_features = list(predictor_features[:10])

        fig, ax = plt.subplots(figsize=(12, 8))
        correlation_matrix = df[shown_features + ['injury_severity']].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
        ax.set_title('Correlation Heatmap: Predictor Features vs Injury Severity')
        fig.tight_layout()
        plt.show()

    # 3. Snaps Before vs After Injury (for up to 3 most frequent positions)
    before_col = 'average_snaps_before'
    after_col = 'avg_snap_count_after'
    # All three columns are read below, so all three must exist.
    if all(col in df.columns for col in ('position', before_col, after_col)):
        top_positions = df['position'].value_counts().head(3).index

        if len(top_positions) > 0:
            # One panel per position actually present, so no axes stay empty.
            fig, axes = plt.subplots(
                1, len(top_positions),
                figsize=(5 * len(top_positions), 5),
                squeeze=False
            )

            # y = x reference line: spans 0..100, extended when the data go beyond.
            data_max = df[[before_col, after_col]].max().max()
            line_end = data_max if pd.notna(data_max) and data_max > 100 else 100

            for ax, position in zip(axes[0], top_positions):
                pos_mask = df['position'] == position
                ax.scatter(
                    df.loc[pos_mask, before_col],
                    df.loc[pos_mask, after_col],
                    alpha=0.6
                )
                ax.plot([0, line_end], [0, line_end], 'r--', alpha=0.5)  # Reference line
                ax.set_xlabel('Average Snaps Before Injury')
                ax.set_ylabel('Average Snaps After Injury')
                ax.set_title(f'{position} - Snaps Before vs After')
                ax.grid(True, alpha=0.3)

            fig.tight_layout()
            plt.show()

In [None]:
# ============================================================================
# EXECUTION
# ============================================================================

if __name__ == "__main__":
    # Load your data. Replace the placeholder path with the real dataset;
    # loading here (rather than relying on a variable left over in the kernel)
    # keeps the cell runnable after "Restart Kernel -> Run All".
    injury_data_path = 'your_injury_data.csv'
    raw_df = pd.read_csv(injury_data_path)

    # Run the complete pipeline
    analysis_results = run_injury_analysis_pipeline(raw_df)

    # Create visualizations
    create_visualizations(analysis_results)

    # Save results
    analysis_results['data'].to_csv('injury_analysis_results.csv', index=False)
    print("\nAnalysis complete! Results saved to 'injury_analysis_results.csv'")