In [1]:
import random
import numpy as np
import pandas as pd

# ====== PARAMETERS (replace with data-sourced values) ======
SEASON_GAMES = 16
SIM_REPS = 100
NUM_TEAMS = 1

# Concussions per team per season (Poisson lambda)
lambda_team = 0.438  

concussion_prob_by_pos = { 
    'QB': 0.16666666666666666,  
    'RB': 0.19047619047619047,
    'WR': 0.3333333333333333,
    'TE': 0.30952380952380953,
    'K': 0.00
}

# Position buckets and data-based probabilities
positions = ['QB', 'RB', 'WR', 'TE', 'K'] 

pos_probs = [
    7/55,   # QB
    10/55,  # RB
    20/55,  # WR
    18/55,  # TE
    0/55    # K
]

# Average games missed by position 
games_missed_by_position = {
    'QB': 3,
    'RB': 2,
    'WR': 2,
    'TE': 4,
    'K': 0
}

# Probability that the position "produces" their points_loss in a game (when available)
# Example: QB contributes their average 4.0 points in ~95% of games; adjust as needed.
# PLACEHOLDER
prob_score_by_pos = {
    'QB': 0.3,
    'RB': 0.3,
    'WR': 0.3,
    'TE': 0.3,
    'K': 0.5
}
# expected points lost per game by position 
points_loss = {'QB':-3.73, 'RB':1-8.23, 'WR':-.36, 'TE':-0.16,'K':0.0}

# per-player per-game salary (median values) (NOT PLACEHOLDERS)
per_game_salary = {'QB': 600000, 'RB':158823, 'WR':142000, 'TE':121875, 'K':125000}

# points to wins conversion (NOT PLACEHOLDER)
POINTS_PER_WIN = 47.0

# optional: value per win (for revenue/valuation impact) (NOT PLACEHOLDER)
VALUE_PER_WIN = 2000000

# percent of fans not showing up when players are injured per position 
FAN_DROP_PERCENT = {'QB':0.05, 'RB':0.03, 'WR':0.02, 'TE':0.01,'K':0.005}


In [9]:
# Random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ========= Simulation ========
def simulate_season():
    total_points_lost = 0.0
    total_salary_lost = 0.0
    total_fan_drop = 0.0
    
    # Simulate number of concussions for the team this season
    num_concussions = np.random.poisson(lambda_team)
    
    for _ in range(num_concussions):
        # Determine position of injured player
        injured_pos = np.random.choice(positions, p=pos_probs)
        
        # Determine games missed
        games_missed = games_missed_by_position[injured_pos]
        
        # Calculate points lost
        points_lost = points_loss[injured_pos] * games_missed
        total_points_lost += points_lost
        
        # Calculate salary lost
        salary_lost = per_game_salary[injured_pos] * games_missed
        total_salary_lost += salary_lost
        
        # Calculate fan drop
        fan_drop = FAN_DROP_PERCENT[injured_pos] * games_missed
        total_fan_drop += fan_drop
    
    return total_points_lost, total_salary_lost, total_fan_drop

# Run simulations
results = []
for sim in range(SIM_REPS):
    points_lost, salary_lost, fan_drop = simulate_season()
    results.append({
        'Simulation': sim + 1,
        'Points_Lost': points_lost,
        'Salary_Lost': salary_lost,
        'Fan_Drop': fan_drop
    })
# print results
#print(results)


In [15]:
### Overall Results Summary ###
# Convert results to DataFrame for easier analysis
df_results = pd.DataFrame(results)
print(df_results)
print("Simulation Averages (100 seasons):")

#averages
df_averages = pd.DataFrame({
    'Metric': ['Points_Lost', 'Money_Lost', 'Fan_Drop'],
    'Average': [
        np.mean([r['Points_Lost'] for r in results]),
        np.mean([r['Salary_Lost'] for r in results]),
        np.mean([r['Fan_Drop'] for r in results])
    ]
})
print(df_averages)
print(f"On average, you can expect to loose ${np.mean([r['Salary_Lost'] for r in results]):,.2f} per season due to concussions.")


    Simulation  Points_Lost  Salary_Lost  Fan_Drop
0            1         0.00          0.0      0.00
1            2       -28.92     635292.0      0.12
2            3         0.00          0.0      0.00
3            4        -0.64     487500.0      0.04
4            5         0.00          0.0      0.00
..         ...          ...          ...       ...
95          96         0.00          0.0      0.00
96          97        -0.72     284000.0      0.04
97          98         0.00          0.0      0.00
98          99         0.00          0.0      0.00
99         100         0.00          0.0      0.00

[100 rows x 4 columns]
Simulation Averages (100 seasons):
        Metric      Average
0  Points_Lost      -1.7501
1   Money_Lost  208460.2200
2     Fan_Drop       0.0221
On average, you can expect to loose $208,460.22 per season due to concussions.


have (throughout 3 years)
- head injuries of contributing players 
- amount of points scored while out
- amount of points in first game back
- the amount of games won and lost while gone 
- whether they won or lost first game back

want 
- chance of conncussion per position
- probability of each position making a point
- value per win 
- money lost while players are out

In [6]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

# ========= LOAD AND PARSE THE CSV =========
def parse_nfl_data(csv_path):
    """
    Parse the NFL injury data from the messy CSV format.
    Returns a cleaned DataFrame with injury records.
    """
    df = pd.read_csv(r"C:\Users\bella\OneDrive\Documents\PSY_341L\CompSim_Final_Project\NFL Data.csv")
    
    # Forward-fill team names (column B)
    if len(df.columns) > 1:
        df.iloc[:, 1] = df.iloc[:, 1].ffill()
    
    # Identify week columns
    week_cols = [col for col in df.columns if 'Week' in str(col)]
    
    injury_records = []
    
    for idx, row in df.iterrows():
        player_info = str(row.iloc[0]) if pd.notna(row.iloc[0]) else ""
        team = str(row.iloc[1]) if len(df.columns) > 1 and pd.notna(row.iloc[1]) else ""
        
        # Extract position and injury weeks from player info
        pos_match = re.search(r'\b(QB|RB|WR|TE|K)\d?\b', player_info)
        injury_match = re.search(r'WK\s*(\d+)\s*-\s*(\d+)', player_info)
        
        if pos_match and injury_match:
            position = pos_match.group(1)  # Get just QB, RB, etc.
            start_week = int(injury_match.group(1))
            end_week = int(injury_match.group(2))
            games_missed = end_week - start_week + 1
            
            # Extract player name (everything before position)
            player_name = player_info[:pos_match.start()].strip()
            
            # Find scores during injury period
            scores_during = []
            scores_after = []
            
            for week_col in week_cols:
                week_num_match = re.search(r'\d+', week_col)
                if week_num_match:
                    week_num = int(week_num_match.group())
                    score_val = str(row[week_col]) if pd.notna(row[week_col]) else ""
                    
                    # Extract numeric score
                    score_match = re.search(r'(\d+)', score_val)
                    if score_match:
                        score = int(score_match.group(1))
                        
                        if start_week <= week_num <= end_week:
                            scores_during.append(score)
                        elif week_num == end_week + 1:
                            scores_after.append(score)
            
            injury_records.append({
                'Player': player_name,
                'Team': team,
                'Position': position,
                'Start_Week': start_week,
                'End_Week': end_week,
                'Games_Missed': games_missed,
                'Scores_During_Injury': scores_during,
                'Score_First_Game_Back': scores_after[0] if scores_after else None
            })
    
    return pd.DataFrame(injury_records)


# ========= CALCULATE STATISTICS =========
def calculate_injury_stats(injury_df):
    """
    Calculate injury statistics by position for the simulation.
    """
    stats = {}
    
    # Group by position
    for pos in ['QB', 'RB', 'WR', 'TE', 'K']:
        pos_data = injury_df[injury_df['Position'] == pos]
        
        if len(pos_data) > 0:
            # Average games missed
            avg_games_missed = pos_data['Games_Missed'].mean()
            
            # Total injuries
            total_injuries = len(pos_data)
            
            # Average points during injury (team performance)
            all_scores = []
            for scores in pos_data['Scores_During_Injury']:
                all_scores.extend(scores)
            avg_score_during = np.mean(all_scores) if all_scores else 0
            
            # Average first game back score
            first_back_scores = pos_data['Score_First_Game_Back'].dropna()
            avg_first_back = first_back_scores.mean() if len(first_back_scores) > 0 else 0
            
            stats[pos] = {
                'total_injuries': total_injuries,
                'avg_games_missed': round(avg_games_missed, 2),
                'avg_score_during_injury': round(avg_score_during, 2),
                'avg_score_first_back': round(avg_first_back, 2)
            }
        else:
            stats[pos] = {
                'total_injuries': 0,
                'avg_games_missed': 0,
                'avg_score_during_injury': 0,
                'avg_score_first_back': 0
            }
    
    return stats


# ========= CALCULATE SIMULATION PARAMETERS =========
def calculate_simulation_params(injury_df, total_teams=32, seasons=3):
    """
    Calculate parameters needed for the simulation.
    """
    total_games = total_teams * 16 * seasons  # 16 games per season
    total_injuries = len(injury_df)
    
    # Lambda for Poisson (injuries per team per season)
    lambda_team = total_injuries / (total_teams * seasons)
    
    # Position distribution
    pos_counts = injury_df['Position'].value_counts()
    total = pos_counts.sum()
    pos_probs = {pos: pos_counts.get(pos, 0) / total for pos in ['QB', 'RB', 'WR', 'TE', 'K']}
    
    # Games missed by position
    games_missed = {}
    for pos in ['QB', 'RB', 'WR', 'TE', 'K']:
        pos_data = injury_df[injury_df['Position'] == pos]
        if len(pos_data) > 0:
            games_missed[pos] = int(round(pos_data['Games_Missed'].mean()))
        else:
            games_missed[pos] = 0
    
    params = {
        'lambda_team': round(lambda_team, 3),
        'pos_probs': pos_probs,
        'games_missed_by_position': games_missed,
        'total_injuries_observed': total_injuries
    }
    
    return params


# ========= MAIN EXECUTION =========
if __name__ == "__main__":
    
    csv_path = r"C:\Users\bella\OneDrive\Documents\PSY_341L\CompSim_Final_Project\NFL Data.csv"
    
    print("=" * 60)
    print("NFL INJURY DATA PARSER")
    print("=" * 60)
    
    # Parse the data
    print("\nðŸ“Š Parsing injury data...")
    injury_df = parse_nfl_data(csv_path)
    
    print(f"âœ“ Found {len(injury_df)} injury records\n")
    
    # Display sample records
    print("Sample injury records:")
    print(injury_df.head(10).to_string())
    
    # Calculate statistics
    print("\n" + "=" * 60)
    print("INJURY STATISTICS BY POSITION")
    print("=" * 60)
    stats = calculate_injury_stats(injury_df)
    
    for pos, data in stats.items():
        print(f"\n{pos}:")
        print(f"  Total injuries: {data['total_injuries']}")
        print(f"  Avg games missed: {data['avg_games_missed']}")
        print(f"  Avg score during injury: {data['avg_score_during_injury']}")
        print(f"  Avg score first game back: {data['avg_score_first_back']}")
    
    # Calculate simulation parameters
    print("\n" + "=" * 60)
    print("SIMULATION PARAMETERS")
    print("=" * 60)
    params = calculate_simulation_params(injury_df)
    
    print(f"\nÎ» (lambda_team): {params['lambda_team']}")
    print("\nPosition probabilities:")
    for pos, prob in params['pos_probs'].items():
        print(f"  {pos}: {prob:.4f}")
    
    print("\nAverage games missed by position:")
    for pos, games in params['games_missed_by_position'].items():
        print(f"  {pos}: {games} games")
    
    print(f"\nTotal injuries observed: {params['total_injuries_observed']}")
    
    # Save cleaned data
    output_path = "cleaned_nfl_injury_data.csv"
    injury_df.to_csv(output_path, index=False)
    print(f"\nâœ“ Cleaned data saved to: {output_path}")
    
    print("\n" + "=" * 60)
    print("READY TO UPDATE YOUR SIMULATION!")
    print("=" * 60)
    print("\nUse these values in your simulation code:")
    print(f"lambda_team = {params['lambda_team']}")
    print(f"pos_probs = {list(params['pos_probs'].values())}")
    print(f"games_missed_by_position = {params['games_missed_by_position']}")

NFL INJURY DATA PARSER

ðŸ“Š Parsing injury data...
âœ“ Found 42 injury records

Sample injury records:
           Player       Team Position  Start_Week  End_Week  Games_Missed                                Scores_During_Injury  Score_First_Game_Back
0      Jay Cutler      Bears       QB          11        12             2                                             [7, 28]                   17.0
1    Devin Hester      Bears       WR          13        14             2                                                  []                    NaN
2                      Bills       RB          11        12             2                                            [19, 13]                    NaN
3                     Browns       TE          16        17             2                                                  []                    NaN
4                  Cardinals       WR          15        17             3                                        [38, 13, 13]                    NaN
5 

In [7]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

def calculate_points_contribution(csv_path):
    """
    Calculate the typical point contribution (and loss when injured) 
    for each position based on team scoring data.
    """
    df = pd.read_csv(csv_path)
    
    # Forward-fill team names
    if len(df.columns) > 1:
        df.iloc[:, 1] = df.iloc[:, 1].ffill()
    
    week_cols = [col for col in df.columns if 'Week' in str(col)]
    
    # Track scores during injuries and baseline scores
    position_data = defaultdict(lambda: {
        'scores_during_injury': [],
        'baseline_scores': [],
        'team_baseline': defaultdict(list)
    })
    
    # First pass: collect all team scores when NO injuries
    team_all_scores = defaultdict(list)
    injured_weeks = set()  # Track (team, week) pairs with injuries
    
    for idx, row in df.iterrows():
        player_info = str(row.iloc[0]) if pd.notna(row.iloc[0]) else ""
        team = str(row.iloc[1]) if len(df.columns) > 1 and pd.notna(row.iloc[1]) else ""
        
        injury_match = re.search(r'WK\s*(\d+)\s*-\s*(\d+)', player_info)
        pos_match = re.search(r'\b(QB|RB|WR|TE|K)\d?\b', player_info)
        
        if injury_match and pos_match:
            start_week = int(injury_match.group(1))
            end_week = int(injury_match.group(2))
            position = pos_match.group(1)
            
            for week_col in week_cols:
                week_num_match = re.search(r'\d+', week_col)
                if week_num_match:
                    week_num = int(week_num_match.group())
                    score_val = str(row[week_col]) if pd.notna(row[week_col]) else ""
                    score_match = re.search(r'(\d+)', score_val)
                    
                    if score_match:
                        score = int(score_match.group(1))
                        
                        # Track injured weeks
                        if start_week <= week_num <= end_week:
                            injured_weeks.add((team, week_num))
                            position_data[position]['scores_during_injury'].append(score)
                        
                        # Collect all scores for baseline
                        team_all_scores[team].append(score)
    
    # Second pass: get baseline scores (when no injuries)
    for idx, row in df.iterrows():
        team = str(row.iloc[1]) if len(df.columns) > 1 and pd.notna(row.iloc[1]) else ""
        
        for week_col in week_cols:
            week_num_match = re.search(r'\d+', week_col)
            if week_num_match:
                week_num = int(week_num_match.group())
                score_val = str(row[week_col]) if pd.notna(row[week_col]) else ""
                score_match = re.search(r'(\d+)', score_val)
                
                if score_match and (team, week_num) not in injured_weeks:
                    score = int(score_match.group(1))
                    # This is a healthy game for this team
                    for pos in ['QB', 'RB', 'WR', 'TE', 'K']:
                        position_data[pos]['baseline_scores'].append(score)
    
    # Calculate point contributions
    results = {}
    
    print("=" * 70)
    print("POINT CONTRIBUTION ANALYSIS BY POSITION")
    print("=" * 70)
    
    for pos in ['QB', 'RB', 'WR', 'TE', 'K']:
        data = position_data[pos]
        
        if data['scores_during_injury'] and data['baseline_scores']:
            avg_with_injury = np.mean(data['scores_during_injury'])
            avg_baseline = np.mean(data['baseline_scores'])
            points_lost = avg_baseline - avg_with_injury
            
            # Probability they contribute (games where they made a difference)
            # Estimate: if points_lost > 0, they contribute in most games
            prob_contribute = min(0.95, max(0.30, (points_lost / avg_baseline) * 3))
            
            results[pos] = {
                'avg_score_with_injury': round(avg_with_injury, 2),
                'avg_baseline_score': round(avg_baseline, 2),
                'points_lost_per_game': round(points_lost, 2),
                'prob_contributes': round(prob_contribute, 2),
                'injury_samples': len(data['scores_during_injury']),
                'baseline_samples': len(data['baseline_scores'])
            }
        else:
            results[pos] = {
                'avg_score_with_injury': 0,
                'avg_baseline_score': 0,
                'points_lost_per_game': 0,
                'prob_contributes': 0.50,
                'injury_samples': 0,
                'baseline_samples': 0
            }
        
        print(f"\n{pos}:")
        print(f"  Avg team score when {pos} injured: {results[pos]['avg_score_with_injury']}")
        print(f"  Avg team score (baseline): {results[pos]['avg_baseline_score']}")
        print(f"  â†’ Points lost per game: {results[pos]['points_lost_per_game']}")
        print(f"  â†’ Estimated prob contributes: {results[pos]['prob_contributes']}")
        print(f"  Sample size: {results[pos]['injury_samples']} injured games, "
              f"{results[pos]['baseline_samples']} baseline games")
    
    # Generate code snippets
    print("\n" + "=" * 70)
    print("COPY THESE VALUES INTO YOUR SIMULATION:")
    print("=" * 70)
    
    points_loss_dict = {pos: results[pos]['points_lost_per_game'] 
                        for pos in ['QB', 'RB', 'WR', 'TE', 'K']}
    prob_score_dict = {pos: results[pos]['prob_contributes'] 
                       for pos in ['QB', 'RB', 'WR', 'TE', 'K']}
    
    print("\n# Expected points lost per game by position")
    print(f"points_loss = {points_loss_dict}")
    
    print("\n# Probability that position contributes points in a game")
    print(f"prob_score_by_pos = {prob_score_dict}")
    
    # Calculate win probability impact
    print("\n" + "=" * 70)
    print("WIN PROBABILITY ANALYSIS")
    print("=" * 70)
    print(f"\nAssuming {47} points needed to win on average:")
    
    for pos in ['QB', 'RB', 'WR', 'TE', 'K']:
        pts_lost = results[pos]['points_lost_per_game']
        baseline = results[pos]['avg_baseline_score']
        expected_with_injury = baseline - pts_lost
        
        print(f"\n{pos} injury:")
        print(f"  Expected score: {expected_with_injury:.1f} points")
        print(f"  Point deficit: {47 - expected_with_injury:.1f} points below win threshold")
        if expected_with_injury < 47:
            win_prob = max(0, (expected_with_injury / 47) * 0.5)  # Rough estimate
            print(f"  Est. win probability: ~{win_prob:.1%}")
    
    return results


if __name__ == "__main__":
    csv_path = r"C:\Users\bella\OneDrive\Documents\PSY_341L\CompSim_Final_Project\NFL Data.csv"
    
    print("Analyzing NFL injury impact on scoring...\n")
    results = calculate_points_contribution(csv_path)
    
    print("\nâœ“ Analysis complete!")

Analyzing NFL injury impact on scoring...

POINT CONTRIBUTION ANALYSIS BY POSITION

QB:
  Avg team score when QB injured: 22.83
  Avg team score (baseline): 19.1
  â†’ Points lost per game: -3.73
  â†’ Estimated prob contributes: 0.3
  Sample size: 18 injured games, 48 baseline games

RB:
  Avg team score when RB injured: 27.33
  Avg team score (baseline): 19.1
  â†’ Points lost per game: -8.23
  â†’ Estimated prob contributes: 0.3
  Sample size: 12 injured games, 48 baseline games

WR:
  Avg team score when WR injured: 21.47
  Avg team score (baseline): 19.1
  â†’ Points lost per game: -2.36
  â†’ Estimated prob contributes: 0.3
  Sample size: 30 injured games, 48 baseline games

TE:
  Avg team score when TE injured: 19.26
  Avg team score (baseline): 19.1
  â†’ Points lost per game: -0.16
  â†’ Estimated prob contributes: 0.3
  Sample size: 42 injured games, 48 baseline games

K:
  Avg team score when K injured: 0
  Avg team score (baseline): 0
  â†’ Points lost per game: 0
  â†’ Est