<a href="https://colab.research.google.com/github/bhaveshasasik/nfl_game_predictor/blob/main/Random_Forest_NFL_Game_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

In [50]:
def get_impactful(top_players):
    """
    Summarise a table of players as a dictionary keyed by team.

    Parameters:
        top_players (pandas.DataFrame): Rows providing 'Team', 'Pos',
            'Player' and 'Impact_Score' values.

    Returns:
        dict: Team -> {'Position', 'Player', 'Impact_Score'}. If a team
        occurs in more than one row, the last of those rows is kept.
    """
    by_team = {}
    for _, player in top_players.iterrows():
        team = player['Team']
        by_team[team] = {
            'Position': player['Pos'],
            'Player': player['Player'],
            'Impact_Score': player['Impact_Score'],
        }
    return by_team

In [51]:

def rb_func(file_path, min_attempts=60):
    """
    Find the most impactful running back on each team.

    Parameters:
        file_path (str): Path to the rushing-stats CSV. The header is read
            from the second line of the file (header=1) and the columns are
            then renamed positionally, so the file must have exactly 16
            columns.
        min_attempts (int): Minimum number of rushing attempts ('Att') a
            player needs to be considered. Defaults to 60.

    Returns:
        pandas.DataFrame: One row per team, holding the running back with
        the highest 'Impact_Score'. The metric columns (including 'Y/G')
        contain min-max normalised values, not the raw statistics.
    """
    # Load data
    data = pd.read_csv(file_path, header=1)

    # Rename columns for easy access
    data.columns = [
        'Rk', 'Player', 'Team', 'Age', 'Pos', 'G', 'GS', 'Att', 'Yds', 'TD',
        '1D', 'Succ%', 'Lng', 'Y/A', 'Y/G', 'Fmb'
    ]

    # Filter for running backs with minimum attempts. The explicit copy makes
    # the column assignments below write to an independent frame instead of
    # a slice of the raw data (which triggers SettingWithCopyWarning).
    eligible = (data['Pos'] == 'RB') & (data['Att'] >= min_attempts)
    data = data[eligible].copy()

    # Calculate additional metrics
    data['Yards_per_Attempt'] = data['Yds'] / data['Att']
    data['Touchdowns_per_Attempt'] = data['TD'] / data['Att']
    data['Success_Rate'] = data['Succ%'] / 100  # Assuming Succ% is already a percentage

    # Select metrics and normalize each one to the 0-1 range so the weights
    # below act on comparable scales
    metrics = ['Yards_per_Attempt', 'Touchdowns_per_Attempt', 'Success_Rate', 'Y/G']
    scaler = MinMaxScaler()
    data[metrics] = scaler.fit_transform(data[metrics])

    # Calculate impact score
    data['Impact_Score'] = (
        0.4 * data['Yards_per_Attempt'] +
        0.3 * data['Touchdowns_per_Attempt'] +
        0.2 * data['Success_Rate'] +
        0.1 * data['Y/G']
    )

    # Get top player per team: after sorting by descending score within each
    # team, the first row of every group is that team's best player
    top_players_per_team = (
        data.sort_values(by=['Team', 'Impact_Score'], ascending=[True, False])
        .groupby('Team')
        .head(1)
    )

    return top_players_per_team

In [52]:
def safety_func(file_path):
    """
    Find the most impactful safety on each team.

    Parameters:
        file_path (str): Path to a defensive-stats CSV with 'Team', 'Pos',
            'G', 'Int', 'PD', 'FF', 'IntTD', 'Yds', 'Yds.1', 'Solo', 'Ast',
            'TFL' and 'QBHits' columns.

    Returns:
        pandas.DataFrame: One row per team, holding the safety ('Pos' equal
        to "S") with the highest 'Impact_Score'. The per-game metric columns
        contain min-max normalised values.
    """
    data = pd.read_csv(file_path)

    # Copy the filtered rows so the new columns are written to an independent
    # frame rather than a slice of the raw data (SettingWithCopyWarning).
    data = data[data["Pos"] == "S"].copy()

    games = data["G"]
    data["Interceptions_Per_Game"] = data["Int"] / games
    data["Passing_Defended_Per_Game"] = data["PD"] / games
    data["Forced_Fumbles_Per_Game"] = data["FF"] / games
    data["Defensive_Touchdowns_Per_Game"] = data["IntTD"] / games
    # Both yardage columns are summed before dividing; without the
    # parentheses only "Yds.1" was turned into a per-game figure.
    data["Yards_From_Interceptions"] = (data["Yds"] + data["Yds.1"]) / games
    data["Solo_Per_Game"] = data["Solo"] / games
    data["Ast_Per_Game"] = data["Ast"] / games
    data["TFL_Per_Game"] = data["TFL"] / games
    data["QBHits_Per_Game"] = data["QBHits"] / games

    metrics = [
        "Interceptions_Per_Game",
        "Passing_Defended_Per_Game",
        "Forced_Fumbles_Per_Game",
        "Defensive_Touchdowns_Per_Game",
        "Yards_From_Interceptions",
        "Solo_Per_Game",
        "Ast_Per_Game",
        "TFL_Per_Game",
        "QBHits_Per_Game",
    ]

    # Bring every metric to the 0-1 range so the weights below are comparable
    scaler = MinMaxScaler()
    data[metrics] = scaler.fit_transform(data[metrics])

    data['Impact_Score'] = (
        0.35 * data['Interceptions_Per_Game'] +
        0.2 * data['Passing_Defended_Per_Game'] +
        0.2 * (data['Solo_Per_Game'] + data['Ast_Per_Game']) +
        0.15 * data['Defensive_Touchdowns_Per_Game'] +
        0.1 * (data['TFL_Per_Game'] + data['QBHits_Per_Game'])
    )

    # After sorting by descending score within each team, the first row of
    # every group is that team's best player
    top_players_per_team = (
        data.sort_values(by=['Team', 'Impact_Score'], ascending=[True, False])
        .groupby('Team')
        .head(1)
    )

    return top_players_per_team

In [59]:
def qb_func(file_path):
    """
    Find the most impactful quarterback on each team.

    Prints the first rows of the filtered quarterback table as a side effect.

    Parameters:
        file_path (str): Path to a passing-stats CSV with at least 'Team',
            'Pos', 'Yds', 'Att', 'TD', 'Int', 'Succ%' and 'Y/G' columns.
            Surrounding whitespace in column names is stripped.

    Returns:
        pandas.DataFrame: One row per team, holding the quarterback with the
        highest 'Impact_Score'. The metric columns (including 'Y/G') contain
        min-max normalised values.
    """
    data = pd.read_csv(file_path)

    data.columns = data.columns.str.strip()

    # Convert the raw stat columns that are present to numbers, coercing
    # unparseable values to NaN
    numeric_columns = [
        col for col in ('Yds', 'Cmp', 'Att', 'TD', 'Int', 'Rate', 'Succ%')
        if col in data.columns
    ]
    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Keep quarterbacks only. The copy makes the column assignments below
    # write to an independent frame rather than a slice of the raw data.
    data['Pos'] = data['Pos'].str.strip().str.upper()
    data = data[data['Pos'] == 'QB'].copy()

    # Display the first few rows
    print(data.head())

    data['Yards_per_Attempt'] = data['Yds'] / data['Att']
    data['Touchdowns_per_Attempt'] = data['TD'] / data['Att']
    # Same ratio as Yards_per_Attempt; kept as its own column because it
    # carries a separate weight in the score below
    data['Passing_Yards_per_Attempt'] = data['Yds'] / data['Att']
    data['Interceptions_per_Attempt'] = data['Int'] / data['Att']
    data['Success_Rate'] = data['Succ%'] / 100  # Assuming Succ% is a percentage

    metrics = [
        'Yards_per_Attempt',
        'Touchdowns_per_Attempt',
        'Passing_Yards_per_Attempt',
        'Interceptions_per_Attempt',
        'Success_Rate',
        'Y/G',
    ]

    data[metrics] = data[metrics].apply(pd.to_numeric, errors='coerce')  # Coerce strings to NaN
    # Turn +/-inf (division by zero attempts) into NaN *before* filling, so
    # that every missing or undefined value ends up as 0 instead of leaking
    # NaN into the impact score.
    data[metrics] = data[metrics].replace([np.inf, -np.inf], np.nan).fillna(0)

    scaler = MinMaxScaler()
    data[metrics] = scaler.fit_transform(data[metrics])

    data['Impact_Score'] = (
        0.4 * data['Yards_per_Attempt'] +      # Focus on yard efficiency
        0.3 * data['Touchdowns_per_Attempt'] +  # Prioritize scoring ability
        0.15 * data['Passing_Yards_per_Attempt'] +  # Adjust for passing contribution
        0.05 * data['Success_Rate'] +         # Capture overall play success
        0.05 * data['Y/G'] -                  # Account for consistency in yardage
        0.05 * data['Interceptions_per_Attempt']  # Penalize interceptions slightly
    )

    # After sorting by descending score within each team, the first row of
    # every group is that team's best player
    top_players_per_team = (
        data.sort_values(by=['Team', 'Impact_Score'], ascending=[True, False])
        .groupby('Team')
        .head(1)
    )
    return top_players_per_team

In [65]:
def wr_func(file_path):
    """
    Find the most impactful wide receiver on each team.

    Parameters:
        file_path (str): Path to the receiving-stats CSV. Its columns are
            renamed positionally, so the file must have exactly 20 columns.

    Returns:
        pandas.DataFrame: One row per team, holding the wide receiver with
        the highest 'Impact_Score'. 'Y/G', 'R/G' and 'Success_Rate' contain
        min-max normalised values.
    """
    data = pd.read_csv(file_path)

    # Rename columns for easier access
    data.columns = [
        'Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'Tgt', 'Rec', 'Yds',
        'Y/R', 'TD', '1D', 'Succ%', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt', 'Fmb'
    ]

    # Keep wide receivers only. The copy makes the column assignments below
    # write to an independent frame rather than a slice of the raw data
    # (which triggers SettingWithCopyWarning).
    data = data[data['Pos'] == 'WR'].copy()

    # Convert 'Succ%' column to numeric; unparseable values become NaN
    data['Succ%'] = pd.to_numeric(data['Succ%'], errors='coerce')

    # Calculate additional impact metrics
    data['Success_Rate'] = data['Succ%'] / 100  # Assuming Succ% is already a percentage

    # Select the metrics for impact scoring
    metrics = ['Y/G', 'R/G', 'Success_Rate']

    # Normalize the metrics to bring them to a 0-1 range
    scaler = MinMaxScaler()
    data[metrics] = scaler.fit_transform(data[metrics])

    # Calculate the final impact score as a weighted sum of metrics
    data['Impact_Score'] = (
        0.45 * data['Y/G'] +
        0.35 * data['R/G'] +
        0.2 * data['Success_Rate']
    )

    # Sort and group by team to get the top player per team based on Impact Score
    top_players_per_team = (
        data.sort_values(by=['Team', 'Impact_Score'], ascending=[True, False])
        .groupby('Team')
        .head(1)
    )
    return top_players_per_team

In [67]:
def cb_and_lb_func(file_path):
    """
    Find the most impactful cornerback or linebacker on each team.

    Parameters:
        file_path (str): Path to a defensive-stats CSV with 'Team', 'Pos',
            'G', 'Int', 'PD', 'FF', 'IntTD', 'Yds', 'Yds.1', 'Solo', 'Ast',
            'TFL' and 'QBHits' columns.

    Returns:
        pandas.DataFrame: One row per team, holding the matching player with
        the highest 'Impact_Score'. Every column listed in the normalised
        metrics holds a min-max scaled value; 'Defensive_Touchdowns_Per_Game'
        and 'Assisted_Tackles_Per_Game' stay as raw per-game rates.
    """
    data = pd.read_csv(file_path)

    # NOTE(review): the original filter matched "LB" and "C" only. "CB" is
    # accepted as well because this function targets cornerbacks; "C" is kept
    # for backward compatibility. Confirm which label the data actually uses.
    # The copy makes the column assignments below write to an independent
    # frame rather than a slice of the raw data.
    data = data[data["Pos"].isin(["LB", "CB", "C"])].copy()

    games = data["G"]
    data["Interceptions_Per_Game"] = data["Int"] / games
    data["Passes_Defended_Per_Game"] = data["PD"] / games
    data["Forced_Fumbles_Per_Game"] = data["FF"] / games
    data["Defensive_Touchdowns_Per_Game"] = data["IntTD"] / games
    data["Yards_From_Interceptions_Per_Game"] = (data["Yds"] + data["Yds.1"]) / games
    data["Solo_Tackles_Per_Game"] = data["Solo"] / games
    data["Assisted_Tackles_Per_Game"] = data["Ast"] / games
    data["Tackles_For_Loss_Per_Game"] = data["TFL"] / games
    data["QB_Hits_Per_Game"] = data["QBHits"] / games

    # Every term of the impact score must be normalised; the tackles-for-loss
    # and QB-hit rates were previously left on their raw scale while the
    # other terms were scaled to 0-1.
    metrics = [
        "Interceptions_Per_Game",
        "Passes_Defended_Per_Game",
        "Solo_Tackles_Per_Game",
        "Forced_Fumbles_Per_Game",
        "Yards_From_Interceptions_Per_Game",
        "Tackles_For_Loss_Per_Game",
        "QB_Hits_Per_Game",
    ]

    scaler = MinMaxScaler()
    data[metrics] = scaler.fit_transform(data[metrics])

    data["Impact_Score"] = (
        0.25 * data["Interceptions_Per_Game"] +        # Turnover ability
        0.2 * data["Passes_Defended_Per_Game"] +       # Coverage contribution
        0.2 * data["Solo_Tackles_Per_Game"] +          # Tackling reliability
        0.15 * data["Forced_Fumbles_Per_Game"] +       # Playmaking on forced fumbles
        0.1 * data["Tackles_For_Loss_Per_Game"] +      # Tackles behind the line
        0.1 * data["QB_Hits_Per_Game"]                 # Pressure on quarterback
    )

    # After sorting by descending score within each team, the first row of
    # every group is that team's best player
    top_players_per_team = (
        data.sort_values(by=['Team', 'Impact_Score'], ascending=[True, False])
        .groupby('Team')
        .head(1)
    )
    return top_players_per_team

In [68]:
# For each position: score the players, keep the best one per team, then
# reshape that table into a team -> player-info dictionary.
impactful_rb = rb_func('../2023 rushing stats.csv')
rb_player = get_impactful(impactful_rb)

impactful_safety = safety_func('../safety_stats.csv')
safety_player = get_impactful(impactful_safety)

# Quarterback scoring is disabled for now.
#impactful_qb = qb_func('../qb_stats.csv')
#qb_player = get_impactful(impactful_qb)

impactful_wr = wr_func('../2023_Wide_Receiver_Stats.csv')
wr_player = get_impactful(impactful_wr)

impactful_cb_lb = cb_and_lb_func('../cornerback_data1.csv')
cb_lb_player = get_impactful(impactful_cb_lb)

# One dictionary per output line
print(rb_player, safety_player, wr_player, cb_lb_player, sep='\n')
#print(qb_player)



{'2TM': {'Position': 'RB', 'Player': 'Cam Akers', 'Impact_Score': 0.12730514096185735}, 'ARI': {'Position': 'RB', 'Player': 'James Conner', 'Impact_Score': 0.4835199881295264}, 'ATL': {'Position': 'RB', 'Player': 'Bijan Robinson', 'Impact_Score': 0.32359213802566994}, 'BAL': {'Position': 'RB', 'Player': 'Gus Edwards', 'Impact_Score': 0.5150651843467031}, 'BUF': {'Position': 'RB', 'Player': 'Latavius Murray', 'Impact_Score': 0.42558187338512193}, 'CAR': {'Position': 'RB', 'Player': 'Chuba Hubbard', 'Impact_Score': 0.2940749858651862}, 'CHI': {'Position': 'RB', 'Player': "D'Onta Foreman", 'Impact_Score': 0.39237034173318275}, 'CIN': {'Position': 'RB', 'Player': 'Joe Mixon', 'Impact_Score': 0.385267004982589}, 'CLE': {'Position': 'RB', 'Player': 'Kareem Hunt', 'Impact_Score': 0.3494676170360179}, 'DAL': {'Position': 'RB', 'Player': 'Tony Pollard', 'Impact_Score': 0.34571976049029}, 'DEN': {'Position': 'RB', 'Player': 'Jaleel McLaughlin', 'Impact_Score': 0.4068734918398893}, 'DET': {'Posit

In [None]:
def combine_impact_scores(qb_func, rb_func, wr_func, te_func, sft_func, cb_lb_func):
    """
    Gather the per-team results of several position scorers into one dictionary.

    Each argument is called with no arguments and must return a dictionary
    keyed by team.

    Parameters:
        qb_func (function): Returns the QB impact dictionary.
        rb_func (function): Returns the RB impact dictionary.
        wr_func (function): Returns the WR impact dictionary.
        te_func (function): Returns the TE impact dictionary.
        sft_func (function): Returns the SFT impact dictionary.
        cb_lb_func (function): Returns the CB/LB impact dictionary.

    Returns:
        dict: Team -> list of that team's entries, in the order
        QB, RB, WR, TE, SFT, CB/LB (positions without an entry are skipped).
    """
    # Run every scorer up front, in signature order
    providers = (qb_func, rb_func, wr_func, te_func, sft_func, cb_lb_func)
    per_position = [provider() for provider in providers]

    # Append each team's entry to that team's list, creating it on first use
    merged = {}
    for position_scores in per_position:
        for team, entry in position_scores.items():
            merged.setdefault(team, []).append(entry)

    return merged


In [None]:

# Turn a standings file into normalised win percentages per team
def process_team_standings(file_path):
    """
    Compute a min-max normalised win percentage for every team.

    Parameters:
        file_path (str): Path to a standings CSV with 'Team', 'Wins' and
            'Losses' columns.

    Returns:
        dict: Team -> {'Win_Percentage_Normalized': value}.
    """
    table = pd.read_csv(file_path)

    # Share of wins among wins and losses
    games_decided = table['Wins'] + table['Losses']
    table['Win_Percentage'] = table['Wins'] / games_decided

    # Rescale so the lowest win percentage maps to 0 and the highest to 1
    table['Win_Percentage_Normalized'] = MinMaxScaler().fit_transform(table[['Win_Percentage']])

    # Keep only the normalised figure, keyed by team
    normalized = table[['Team', 'Win_Percentage_Normalized']]
    return normalized.set_index('Team').to_dict('index')

# Join per-team impact scores with the standings data
def combine_team_data(rb_impact_data, team_data):
    """
    Build one table row for every team present in both inputs.

    Parameters:
        rb_impact_data (dict): Team -> mapping with an 'Impact_Score' entry.
        team_data (dict): Team -> mapping with a 'Win_Percentage_Normalized'
            entry.

    Returns:
        pandas.DataFrame: Columns 'Team', 'Impact_Score' and 'Win_Percentage',
        in the iteration order of rb_impact_data. Teams missing from
        team_data are left out.
    """
    rows = [
        {
            'Team': team,
            'Impact_Score': rb_info['Impact_Score'],
            'Win_Percentage': team_data[team]['Win_Percentage_Normalized'],
        }
        for team, rb_info in rb_impact_data.items()
        if team in team_data
    ]
    return pd.DataFrame(rows)






# Fit the random forest classifier
def train_random_forest(data):
    """
    Fit a random forest on the combined team data.

    Parameters:
        data (pandas.DataFrame): Must contain the feature columns
            'Impact_Score' and 'Win_Percentage' and the label column
            'Outcome' (binary outcome: 1 = Win, 0 = Loss).

    Returns:
        RandomForestClassifier: The fitted model (random_state fixed at 42).
    """
    feature_columns = ['Impact_Score', 'Win_Percentage']
    features = data[feature_columns]
    labels = data['Outcome']

    model = RandomForestClassifier(random_state=42)
    model.fit(features, labels)
    return model

# Example Usage
if __name__ == "__main__":
    # Calculate RB impact: rank the running backs, then reshape the per-team
    # winners into the team -> player-info mapping that combine_team_data
    # expects. (The previously called calculate_top_rb_impact is not defined
    # in this file.)
    rb_file_path = '2023_rushing_stats.csv'  # Path to rushing stats file
    rb_impact = get_impactful(rb_func(rb_file_path))
    print("RB Impact:", rb_impact)

    # Process team standings
    standings_file_path = 'team_standings.csv'  # Path to standings file
    team_standings = process_team_standings(standings_file_path)
    print("Team Standings:", team_standings)

    # Combine data
    combined_data = combine_team_data(rb_impact, team_standings)
    print("Combined Data:", combined_data)

    # Train random forest. combine_team_data only produces Team, Impact_Score
    # and Win_Percentage, so training is possible only once an 'Outcome'
    # label column has been added to the combined data.
    if 'Outcome' in combined_data.columns:
        rf_model = train_random_forest(combined_data)
        print("Random Forest Model Trained.")
    else:
        print("Skipping Random Forest training: combined data has no 'Outcome' column.")
