In [1]:
import sqlite3
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score

In [None]:
cp ../Scraping/nfl.db .

In [2]:
# Convert probabilities to American odds
def probability_to_american_odds(prob):
    epsilon = 1e-10
    prob = max(min(prob, 1 - epsilon), epsilon)
    if prob > 0.5:
        return -100 * (prob / (1 - prob))
    else:
        return 100 * ((1 - prob) / prob)

# Test Run 1
prob_1 = 0.75
odds_1 = probability_to_american_odds(prob_1)
print(f"Probability: {prob_1}, American Odds: {odds_1}")

# Test Run 2
prob_2 = 0.40
odds_2 = probability_to_american_odds(prob_2)
print(f"Probability: {prob_2}, American Odds: {odds_2}")

# Test Run 3
prob_3 = 0.95
odds_3 = probability_to_american_odds(prob_3)
print(f"Probability: {prob_3}, American Odds: {odds_3}")

# Test Run 4
prob_3 = 0.25
odds_3 = probability_to_american_odds(prob_3)
print(f"Probability: {prob_3}, American Odds: {odds_3}")

Probability: 0.75, American Odds: -300.0
Probability: 0.4, American Odds: 149.99999999999997
Probability: 0.95, American Odds: -1899.9999999999982
Probability: 0.25, American Odds: 300.0


### Key Metrics for the Model:

   - **Recent Form**: A player's performance trend over recent weeks—whether they’re in a scoring slump or a hot streak.
   - **Historical TD Scoring Rate**: The frequency with which a player scores touchdowns in past games.
   - **Red Zone Targets**: Number of times a player is targeted in the red zone, where TD scoring opportunities are higher.
   - **Yards Per Carry/Reception**: Efficiency metrics that show how many yards a player gains on average per carry or reception.
   - **Touch Rate**: The percentage of plays in which the player is involved (e.g., carries, targets).
   - **Injury Status**: Whether the player is fully healthy, playing through an injury, or recently recovered.
   - **Team's Red Zone Efficiency**: How often the team converts red zone visits into touchdowns.
   - **Defensive Line Strength**: Metrics like defensive line grades, which can impact a running back's effectiveness.
   - **Opponent's Defensive Strength**: How strong the opposing defense is, particularly in stopping the run or pass.
   - **Opponent’s Red Zone Defense**: How well the opponent prevents touchdowns in the red zone.
   - **Defensive Adjustments**: The opponent's tendency to double-cover key players or adjust schemes for specific matchups.
   - **Weather/Situational Adjustments**: Expected flow of the game—whether a team is likely to be in a pass-heavy or run-heavy situation due to being ahead or behind.
   - **Home vs. Away**: Players might perform differently based on whether they’re playing at home or away.
   - **Expected Points Added (EPA)**: How much a player contributes to the team's scoring opportunities.
   - **Player Usage Rate in High-Leverage Situations**: How often the player is used in critical game situations, like third downs or within the 5-yard line.
   - **Game Pace**: The number of plays run per game by both teams, which can influence opportunities.
   - **Intangible Factors**: This could include things like a player’s motivation (e.g., playing against a former team), team morale, or locker room dynamics.


For tight ends, running backs, and wide receivers using data in that table, can you write code that analyzes each players performances (such as good, bad, avg, etc) against each specific team? I want to dig deep into these head to head matchups

For tight ends, running backs, and wide receivers using data in that table, how can I start developing an AI ML model that analyzes and predicts each players likelihood to score in an upcoming game against a given opponent? Base it on things such as historical performance against the team, in situational spots, coming off good/bag games, etc and other advanced trends.

In [None]:
# Players performances against each team (yards & touchdowns)

conn = sqlite3.connect('nfl.db')

df = pd.read_sql_query("SELECT * FROM PlayerStats", conn)

conn.close()

# Filter the data for tight ends, running backs, and wide receivers
positions = ['TE', 'RB', 'WR']
df_filtered = df[df['position'].isin(positions)].copy()

# Extract both teams from the game_id assuming the format is "season_week_home_away"
df_filtered['home_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[2])
df_filtered['away_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[3])

# Determine the opponent team for each game
df_filtered['opponent_team'] = df_filtered.apply(
    lambda row: row['away_team'] if row['player_current_team'] == row['home_team'] else row['home_team'],
    axis=1
)

# Group by player and opponent team to aggregate TDs, yards, and count games played
# performance_metrics = df_filtered.groupby(['player_display_name', 'opponent_team']).agg({
#     'rushing_tds': 'sum',
#     'receiving_tds': 'sum',
#     'rushing_yards': 'sum',
#     'receiving_yards': 'sum',
#     'game_id': 'count'  # This will count the number of games played
# }).reset_index()
performance_metrics = df_filtered.groupby(['player_display_name', 'opponent_team', 'position']).agg({
    'rushing_tds': 'sum',
    'receiving_tds': 'sum',
    'rushing_yards': 'sum',
    'receiving_yards': 'sum',
    'game_id': 'count'  # This will count the number of games played
}).reset_index()

# Rename the 'game_id' column to 'games_played'
performance_metrics.rename(columns={'game_id': 'games_played'}, inplace=True)

# Calculate averages
performance_metrics['total_tds'] = performance_metrics['rushing_tds'] + performance_metrics['receiving_tds']
performance_metrics = performance_metrics.drop(columns=['rushing_tds', 'receiving_tds']) # Drop the individual touchdown columns
performance_metrics['avg_total_tds'] = performance_metrics['total_tds'] / performance_metrics['games_played']
performance_metrics['avg_rushing_yards'] = performance_metrics['rushing_yards'] / performance_metrics['games_played']
performance_metrics['avg_receiving_yards'] = performance_metrics['receiving_yards'] / performance_metrics['games_played']

# Define thresholds for performance classification based on total touchdowns
def classify_performance_by_tds(row):
    total_tds = row['total_tds']
    if total_tds >= 2:
        return 'Good'
    elif total_tds == 1:
        return 'Average'
    else:
        return 'Bad'

# Apply classification to each row using the new criteria
performance_metrics['performance'] = performance_metrics.apply(classify_performance_by_tds, axis=1)

# Rearrange columns to place 'games_played' after 'opponent_team'
# column_order = [
#     'player_display_name', 
#     'opponent_team', 
#     'games_played', 
#     'total_tds', 
#     'rushing_yards', 
#     'receiving_yards',
#     'avg_total_tds',
#     'avg_rushing_yards', 
#     'avg_receiving_yards',
#     'performance'
# ]
column_order = [
    'player_display_name', 
    'opponent_team', 
    'position',
    'games_played', 
    'total_tds', 
    'rushing_yards', 
    'receiving_yards',
    'avg_total_tds',
    'avg_rushing_yards', 
    'avg_receiving_yards',
    'performance'
]
performance_metrics = performance_metrics[column_order]

# Save the results to a CSV file
performance_metrics.to_csv('player_against_teams_stats.csv', index=False)

print(f"Analysis complete. Results saved to 'player_against_teams_stats.csv'")

!open player_against_teams_stats.csv

In [None]:
# Generate Top 10 Averages by Position and Opponent

df = pd.read_csv('player_against_teams_stats.csv')

# Filter for players who have played at least 2 games
df_filtered = df[df['games_played'] >= 2]

# Function to get top 10 averages for a given position
def get_top_10_averages_by_position(position):
    # Filter by position
    df_position = df_filtered[df_filtered['position'] == position]
    
    # Group by player, position, and opponent team to calculate the average total touchdowns per game
    top_averages_opponent = df_position.groupby(['player_display_name', 'position', 'opponent_team']).agg({
        'avg_total_tds': 'mean',
        'games_played': 'max'  # Ensure we include the games played column for clarity
    }).reset_index()
    
    # Sort by average total touchdowns and get the top 10
    top_10 = top_averages_opponent.nlargest(10, 'avg_total_tds')
    
    return top_10

# Get the top 10 for each position
top_10_rb = get_top_10_averages_by_position('RB')
top_10_te = get_top_10_averages_by_position('TE')
top_10_wr = get_top_10_averages_by_position('WR')

# Display the results
print("Top 10 Running Backs (RB) by Average Touchdowns per Game:")
print(top_10_rb)
print("\nTop 10 Tight Ends (TE) by Average Touchdowns per Game:")
print(top_10_te)
print("\nTop 10 Wide Receivers (WR) by Average Touchdowns per Game:")
print(top_10_wr)


# XGBoost 1

In [None]:
# XGBoost
# 1.	Rushing Yards
# 2.	Receiving Yards
# 3.	Targets
# 4.	Home/Away Flag


conn = sqlite3.connect('nfl.db')
df = pd.read_sql_query("SELECT * FROM PlayerStats", conn)
conn.close()

positions = ['TE', 'RB', 'WR']
df_filtered = df[df['position'].isin(positions)].copy()

# Create a binary target variable: whether a player scored a touchdown in a game
df_filtered['scored_td'] = df_filtered.apply(
    lambda row: 1 if row['rushing_tds'] + row['receiving_tds'] > 0 else 0, axis=1
)

# Creating a 'home_away_flag' based on game_id (assuming 'game_id' contains home/away information)
# df_filtered['home_away_flag'] = df_filtered['game_id'].apply(lambda x: 1 if 'H' in x else 0)
df_filtered['home_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[2])
df_filtered['away_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[1])
df_filtered['home_away_flag'] = df_filtered.apply(
    lambda row: 1 if row['player_current_team'] == row['home_team'] else 0,
    axis=1
)

# Example: Creating an 'opponent_defense_score' (simple placeholder, this would ideally be based on real defensive stats)
# df_filtered['opponent_defense_score'] = df_filtered['opponent_team'].apply(lambda x: 1 if x in ['BUF', 'BAL', 'NE'] else 0)

# Example: Historical performance against specific opponents
# df_filtered = df_filtered.merge(historical_performance, on=['player_display_name', 'opponent_team'], how='left')

# Drop rows with missing values (if any) to clean the dataset
# df_filtered = df_filtered.dropna(subset=['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag'])

# Assuming 'df_filtered' is prepared with relevant features
X = df_filtered[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag']]
y = df_filtered['scored_td']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize an XGBoost model
# model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model = xgb.XGBClassifier(eval_metric='logloss')

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Convert probabilities to American odds
def probability_to_american_odds(prob):
    if prob > 0.5:
        return -100 * (prob / (1 - prob))
    else:
        return 100 * ((1 - prob) / prob)

american_odds = [probability_to_american_odds(prob) for prob in y_pred_proba]

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_pred_proba)
log_loss_value = log_loss(y_test, y_pred_proba)

print("ROC AUC Score:", roc_auc)
print("Log Loss:", log_loss_value)
print("Example American Odds:", american_odds[:5])
total_items = len(american_odds)
print(f"Total number of items in Example American Odds: {total_items}")

# # Filter for Travis Kelce
# travis_kelce_df = df_filtered[df_filtered['player_display_name'] == 'Travis Kelce']
# travis_kelce_test_idx = X_test.index.intersection(travis_kelce_df.index)

# # Get and print Travis Kelce's odds for each game in the test set
# if not travis_kelce_test_idx.empty:
#     kelce_proba_indices = [X_test.index.get_loc(idx) for idx in travis_kelce_test_idx]
#     kelce_probas = y_pred_proba[kelce_proba_indices]
#     kelce_odds = [probability_to_american_odds(proba) for proba in kelce_probas]
    
#     print(f"Travis Kelce's Odds for each game in the test set: {kelce_odds}")
#     print(f"Total number of games for Travis Kelce in the test set: {len(kelce_odds)}")
# else:
#     print("Travis Kelce's data is not in the test set.")


In [None]:
# Run it

# Single Player #
# player_name = "Zay Flowers"
# player_name = "Travis Kelce"
# print(predict_td_odds_general(player_name))

# Loop #
players = [
    {"player_name": "Travis Kelce"},
    {"player_name": "Derrick Henry"},
    {"player_name": "Isiah Pacheco"},
    {"player_name": "Xavier Worthy"},
    {"player_name": "Rashee Rice"},
    {"player_name": "Zay Flowers"},
    {"player_name": "Marquise Brown"},
    {"player_name": "Mark Andrews"},
    {"player_name": "Kadarius Toney"}
]



def predict_td_odds_general(player_name):
    # Filter the data for the player
    player_data = df_filtered[df_filtered['player_display_name'] == player_name]
    
    if player_data.empty:
        return f"No data available for player {player_name}."

    # Use the trained model to predict the probability of a touchdown
    features = player_data[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag']]
    probability = model.predict_proba(features)[:, 1][0]
    
    # Convert the probability to American odds
    american_odds = probability_to_american_odds(probability)
    
    return f"Predicted American Odds for {player_name} scoring: {american_odds}"

# Run the prediction for each player
for player in players:
    player_name = player["player_name"]
    print(predict_td_odds_general(player_name))

# XGBoost 2.5

In [2]:
# XGBoost (only using specific opponents games for each run)
# 1.	Rushing Yards
# 2.	Receiving Yards
# 3.	Targets
# 4.	Home/Away Flag
# 5.    Opponent Team: filtered to consider only games where the player faced the specific opponent team


# List of players and their opponent teams
players = [
    {"player_name": "Travis Kelce", "opponent_team": "BAL"},
    {"player_name": "Derrick Henry", "opponent_team": "KC"},
    {"player_name": "Isiah Pacheco", "opponent_team": "BAL"},
    {"player_name": "Rashee Rice", "opponent_team": "BAL"},
    {"player_name": "Zay Flowers", "opponent_team": "KC"},
    {"player_name": "Marquise Brown", "opponent_team": "KC"},
    {"player_name": "Mark Andrews", "opponent_team": "KC"},
    {"player_name": "Kadarius Toney", "opponent_team": "BAL"}
]

# Connect to the NFL database
conn = sqlite3.connect('nfl.db')
df = pd.read_sql_query("SELECT * FROM PlayerStats", conn)
conn.close()

# Filter for relevant positions (TE, RB, WR)
positions = ['TE', 'RB', 'WR']
df_filtered = df[df['position'].isin(positions)].copy()

# Create a binary target variable: whether a player scored a touchdown in a game
df_filtered['scored_td'] = df_filtered.apply(
    lambda row: 1 if row['rushing_tds'] + row['receiving_tds'] > 0 else 0, axis=1
)

# Creating a 'home_away_flag' based on game_id
df_filtered['home_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[2])
df_filtered['away_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[1])
df_filtered['home_away_flag'] = df_filtered.apply(
    lambda row: 1 if row['player_current_team'] == row['home_team'] else 0,
    axis=1
)

# Create the 'opponent_team' column based on home/away flag
df_filtered['opponent_team'] = df_filtered.apply(
    lambda row: row['away_team'] if row['home_away_flag'] == 1 else row['home_team'],
    axis=1
)

# Drop rows with missing values (if any) to clean the dataset
df_filtered = df_filtered.dropna(subset=['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag'])

# Prepare feature set and target
X = df_filtered[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag']]
y = df_filtered['scored_td']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
model = xgb.XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)


# Run the prediction for each player-opponent pair
for player in players:
    player_name = player["player_name"]
    opponent_team = player["opponent_team"]

    # Filter data for the specific player and opponent team
    player_data = df_filtered[(df_filtered['player_display_name'] == player_name) &
                              (df_filtered['opponent_team'] == opponent_team)]

    if player_data.empty:
        print(f"No data available for player {player_name} against {opponent_team}.")
    else:
        # Use the trained model to predict the probability of a touchdown
        features = player_data[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag']]
        probability = model.predict_proba(features)[:, 1][0]

        # Convert the probability to American odds
        def probability_to_american_odds(prob):
            if prob > 0.5:
                return -100 * (prob / (1 - prob))
            else:
                return 100 * ((1 - prob) / prob)

        american_odds = probability_to_american_odds(probability)

        print(f"Predicted American Odds for {player_name} scoring against {opponent_team}: {american_odds}")

Predicted American Odds for Travis Kelce scoring against BAL: 173.63739035460597
Predicted American Odds for Derrick Henry scoring against KC: -287.9608580121315
No data available for player Isiah Pacheco against BAL.
No data available for player Rashee Rice against BAL.
Predicted American Odds for Zay Flowers scoring against KC: -144.31489173073837
Predicted American Odds for Marquise Brown scoring against KC: 752.6573284857294
Predicted American Odds for Mark Andrews scoring against KC: 872.530004058436
No data available for player Kadarius Toney against BAL.


# XGBoost 3

In [None]:
# XGBoost
# 1.	Rushing Yards
# 2.	Receiving Yards
# 3.	Targets
# 4.	Home/Away Flag
# 5.    Upcoming Opponent Historical Performances

In [17]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score, log_loss
import matplotlib.pyplot as plt

# Load the player stats data
conn = sqlite3.connect('nfl.db')
df = pd.read_sql_query("SELECT * FROM PlayerStats", conn)
conn.close()

# Load the opponent stats data from the uploaded CSV, focusing on average stats
opponent_stats_df = pd.read_csv('player_against_teams_stats.csv')

# Filter by position
positions = ['TE', 'RB', 'WR']
df_filtered = df[df['position'].isin(positions)].copy()

# Create a binary target variable: whether a player scored a touchdown in a game
df_filtered['scored_td'] = df_filtered.apply(
    lambda row: 1 if row['rushing_tds'] + row['receiving_yards'] > 0 else 0, axis=1
)

# Create 'home_away_flag' based on game_id
df_filtered['home_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[2])
df_filtered['away_team'] = df_filtered['game_id'].apply(lambda x: x.split('_')[1])
df_filtered['home_away_flag'] = df_filtered.apply(
    lambda row: 1 if row['player_current_team'] == row['home_team'] else 0,
    axis=1
)

# Create opponent team column
df_filtered['opponent_team'] = df_filtered.apply(
    lambda row: row['away_team'] if row['player_current_team'] == row['home_team'] else row['home_team'], axis=1
)

# Merge the opponent stats with the main player stats DataFrame, using only the average stats
df_filtered = pd.merge(
    df_filtered, 
    opponent_stats_df[['player_display_name', 'opponent_team', 'avg_total_tds', 'avg_rushing_yards', 'avg_receiving_yards']], 
    on=['player_display_name', 'opponent_team'], 
    how='left'
)

# Drop rows with missing values after the merge
# df_filtered = df_filtered.dropna()

# Prepare features for both cases
X_with_history = df_filtered[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag', 
                              'avg_total_tds', 'avg_rushing_yards', 'avg_receiving_yards']]
y_with_history = df_filtered['scored_td']

X_no_history = df_filtered[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag']]
y_no_history = df_filtered['scored_td']

# Split the data into training and testing sets
X_train_with, X_test_with, y_train_with, y_test_with = train_test_split(X_with_history, y_with_history, test_size=0.2, random_state=42)
X_train_no, X_test_no, y_train_no, y_test_no = train_test_split(X_no_history, y_no_history, test_size=0.2, random_state=42)

# Initialize two XGBoost models
model_with_history = xgb.XGBClassifier(eval_metric='logloss')
model_no_history = xgb.XGBClassifier(eval_metric='logloss')

# Train both models
model_with_history.fit(X_train_with, y_train_with)
model_no_history.fit(X_train_no, y_train_no)

# Function to predict TD odds for a given player
def predict_td_odds_general(player_name, opponent_team):
    # Filter the data for the player
    player_data = df_filtered[df_filtered['player_display_name'] == player_name].copy()
    
    if player_data.empty:
        return f"No data available for player {player_name}."

    # Check if the player has any history against the opponent
    opponent_history = player_data[player_data['opponent_team'] == opponent_team]
    
    if not opponent_history.empty:
        # Use the model trained with opponent-specific stats
        features = opponent_history[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag', 
                                     'avg_total_tds', 'avg_rushing_yards', 'avg_receiving_yards']]
        probability = model_with_history.predict_proba(features)[:, 1][0]
    else:
        # Use the model trained without opponent-specific stats
        features = player_data[['rushing_yards', 'receiving_yards', 'targets', 'home_away_flag']]
        probability = model_no_history.predict_proba(features)[:, 1][0]
    
    # Convert the probability to American odds
    american_odds = probability_to_american_odds(probability)
    
    return f"Predicted American Odds for {player_name} scoring against {opponent_team}: {american_odds}"

# Example predictions
predictions = [
    {"player_name": "Travis Kelce", "opponent_team": "BAL"},
    {"player_name": "Derrick Henry", "opponent_team": "KC"},
    {"player_name": "Isiah Pacheco", "opponent_team": "BAL"},
    {"player_name": "Rashee Rice", "opponent_team": "BAL"},
    {"player_name": "Mark Andrews", "opponent_team": "KC"}
]

# Run the prediction for each player against a specific opponent
for prediction in predictions:
    player_name = prediction["player_name"]
    opponent_team = prediction["opponent_team"]
    print(predict_td_odds_general(player_name, opponent_team))



Predicted American Odds for Travis Kelce scoring against BAL: -8923951.063829787
Predicted American Odds for Derrick Henry scoring against KC: -7231458.620689655
Predicted American Odds for Isiah Pacheco scoring against BAL: 153.66811589929912
Predicted American Odds for Rashee Rice scoring against BAL: -9641978.16091954
Predicted American Odds for Mark Andrews scoring against KC: -8305452.475247525


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import LabelEncoder

# Load the aggregated player performance data from the CSV file
performance_metrics = pd.read_csv('player_against_teams_stats.csv')

# Encode the 'opponent_team' as a categorical feature
label_encoder_opponent = LabelEncoder()
performance_metrics['opponent_team_encoded'] = label_encoder_opponent.fit_transform(performance_metrics['opponent_team'])

# Define the feature set (X) and target variable (y)
X = performance_metrics[['games_played', 'rushing_yards', 'receiving_yards', 
                         'avg_total_tds', 'avg_rushing_yards', 'avg_receiving_yards',
                         'opponent_team_encoded']]
y = performance_metrics['total_tds'].apply(lambda x: 1 if x > 0 else 0)  # Binary target: whether the player scored any TDs

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize an XGBoost model with regularization
model = xgb.XGBClassifier(eval_metric='logloss', max_depth=3, n_estimators=100, learning_rate=0.01,
                          reg_alpha=1, reg_lambda=1, use_label_encoder=False)

# Use cross-validation to check for generalization
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-Validation ROC AUC Scores:", cross_val_scores)
print("Mean Cross-Validation ROC AUC Score:", cross_val_scores.mean())

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Convert probabilities to American odds
def probability_to_american_odds(prob):
    if prob > 0.5:
        return -100 * (prob / (1 - prob))
    else:
        return 100 * ((1 - prob) / prob)

american_odds = [probability_to_american_odds(prob) for prob in y_pred_proba]

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_pred_proba)
log_loss_value = log_loss(y_test, y_pred_proba)

print("ROC AUC Score:", roc_auc)
print("Log Loss:", log_loss_value)
print("Example American Odds:", american_odds[:5])

total_items = len(american_odds)
print(f"Total number of items in Example American Odds: {total_items}")

In [None]:
# # Logistic Regression

# # Code Explanation:
# # Data Loading and Preparation:
# # Connect to the nfl.db SQLite database and load the PlayerStats table.
# # Filter for relevant positions: Tight Ends (TE), Running Backs (RB), and Wide Receivers (WR).
# # Create a binary target variable scored_td that indicates whether a player scored a touchdown in the game.
# # Feature Selection:
# # Use rushing_yards, receiving_yards, and targets as features for the model.
# # Drop rows with missing values to ensure clean data for training.
# # Model Setup:
# # Split the data into training and testing sets (80% train, 20% test).
# # Initialize and train a Logistic Regression model.
# # Model Evaluation:
# # Evaluate the model using accuracy, ROC AUC score, and confusion matrix to understand its performance.
# # Output:
# # Print the accuracy, ROC AUC score, and confusion matrix to assess the model's effectiveness.

    
# import sqlite3
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# # Connect to the NFL database
# conn = sqlite3.connect('nfl.db')

# # Load the relevant tables (assuming 'PlayerStats' and any others you may need are present)
# df = pd.read_sql_query("SELECT * FROM PlayerStats", conn)

# conn.close()

# # Basic data preparation (e.g., filter for relevant positions)
# positions = ['TE', 'RB', 'WR']
# df_filtered = df[df['position'].isin(positions)].copy()

# # Create a binary target variable: whether a player scored a touchdown in a game
# df_filtered['scored_td'] = df_filtered.apply(
#     lambda row: 1 if row['rushing_tds'] + row['receiving_tds'] > 0 else 0, axis=1
# )

# # Example feature selection:
# # 1. 'rushing_yards' - rushing yards in the game
# # 2. 'receiving_yards' - receiving yards in the game
# # 3. 'targets' - number of times the player was targeted (as a receiver)

# # Drop rows with missing values (if any) to clean the dataset
# df_filtered = df_filtered.dropna(subset=['rushing_yards', 'receiving_yards', 'targets'])

# # Feature set and target
# X = df_filtered[['rushing_yards', 'receiving_yards', 'targets']]
# y = df_filtered['scored_td']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize a simple logistic regression model
# model = LogisticRegression()

# # Train the model
# model.fit(X_train, y_train)

# # Predictions
# y_pred = model.predict(X_test)
# y_pred_proba = model.predict_proba(X_test)[:, 1]

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, y_pred_proba)
# conf_matrix = confusion_matrix(y_test, y_pred)

# # Output the results
# print("Accuracy:", accuracy)
# print("ROC AUC Score:", roc_auc)
# print("Confusion Matrix:\n", conf_matrix)
