# Imports

In [3]:
import mfl as mfl
import pandas as pd
import numpy as np
import mfl.api.data_loaders as mfldata
import nfl_data_py as nfl

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, precision_recall_curve

from xgboost import XGBClassifier, XGBRFClassifier

from catboost import CatBoostClassifier

import keras
from keras.layers import Dense, ReLU, Bidirectional, Normalization, Dropout, Input
from keras.models import Sequential

import requests
from bs4 import BeautifulSoup
import time

# Cleaning

In [3]:
def scrape_NFL_REF_QB(player_name):

    first_name = player_name.split(' ')[0].lower()
    last_name = player_name.split(' ')[1].lower()
    player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-1.html'
    html_content = requests.get(player_url).text

    if len(player_name.split(' ')) > 2:
        first_name = player_name.split(' ')[0].lower()
        last_name = player_name.split(' ')[1].lower()
        suffix = player_name.split(' ')[2].lower()
        player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-{suffix}-1.html'
        html_content = requests.get(player_url).text


    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', {'id': 'passing_standard'})

    if not table:
        for idx in range(2, 6):
            time.sleep(3)
            first_name = player_name.split(' ')[0].lower()
            last_name = player_name.split(' ')[1].lower()
            player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-{idx}.html'
            html_content = requests.get(player_url).text
            
            soup = BeautifulSoup(html_content, 'html.parser')
            table = soup.find('table', {'id': 'passing_standard'})
            
            if not table:
                print(f"Passing stats table not found for {player_name}. Failed on index {idx}")
            else:
                print(f"Player found on index {idx}")
                break
    
    try:
        table.find_all('th')
        headers = [th.getText() for th in table.find_all('th')]
        yrs = [i for i in headers if ('2' in i) & (len(i) == 4 or len(i) == 5)]
        career_idx = headers.index(yrs[-1]) + 1
        baseline_headers = []
        headers = [th.getText() for th in table.find_all('th')]
        rows = []

        for tr in table.find_all('tr')[1:]:
            cells = [td.getText() for td in tr.find_all('td')]
            if cells: 
                rows.append(cells)

        career_stats = []

        for row in rows:
            if row.count('') == 2:
                career_stats.extend(row)
                break

        career_stats.remove('')
        career_stats.remove('')

        career_stats.append(len(yrs))
        career_stats.append(player_name)

        column_names = [
        'G',        
        'Cmp',      
        'Att',      
        'Cmp%',     
        'Yds',      
        'TD',       
        'TD%',      
        'Int',      
        'Int%',     
        'Y/A',      
        'AY/A',     
        'Y/C',      
        'Y/G',      
        'Rate',
        'seasons',
        'name'
        ]

        final = pd.DataFrame({name: [value] for name, value in zip(column_names, career_stats)})

        if len(career_stats) == len(column_names):
            final = pd.DataFrame({name: [value] for name, value in zip(column_names, career_stats)})
        else:
            print("Error: Number of stats does not match number of column names.")

        return final    
    except:
        print(f"FAILED: On {player_name}")

In [107]:
years = range(2010, 2023)
data = nfl.import_weekly_data(years)
draft = nfl.import_draft_picks(years)

Downcasting floats.


In [117]:
draft.columns

Index(['season', 'round', 'pick', 'team', 'gsis_id', 'pfr_player_id',
       'cfb_player_id', 'pfr_player_name', 'hof', 'position', 'category',
       'side', 'college', 'age', 'to', 'allpro', 'probowls', 'seasons_started',
       'w_av', 'car_av', 'dr_av', 'games', 'pass_completions', 'pass_attempts',
       'pass_yards', 'pass_tds', 'pass_ints', 'rush_atts', 'rush_yards',
       'rush_tds', 'receptions', 'rec_yards', 'rec_tds', 'def_solo_tackles',
       'def_ints', 'def_sacks'],
      dtype='object')

In [119]:
draft[draft['pfr_player_name'] == "Cam Newton"]['seasons_started']

8832    9
Name: seasons_started, dtype: int32

In [5]:
all_qbs = draft[draft['position'] == 'QB']
all_qb_names = all_qbs['pfr_player_name'].unique().tolist()

In [8]:
raw_data = []
for name in all_qb_names:
    print(name)
    raw_data.append(scrape_NFL_REF_QB(player_name=name))
    time.sleep(3)

Sam Bradford
Tim Tebow
Jimmy Clausen
Colt McCoy
Mike Kafka
John Skelton
Passing stats table not found for John Skelton. Failed on index 2
Passing stats table not found for John Skelton. Failed on index 3
Passing stats table not found for John Skelton. Failed on index 4
Passing stats table not found for John Skelton. Failed on index 5
FAILED: On John Skelton
Jonathan Crompton
Rusty Smith
Dan LeFevour
Tony Pike
Levi Brown
Sean Canfield
Zac Robinson
Cam Newton
Jake Locker
Blaine Gabbert
Christian Ponder
Andy Dalton
Colin Kaepernick
Ryan Mallett
Ricky Stanzi
T.J. Yates
Passing stats table not found for T.J. Yates. Failed on index 2
Passing stats table not found for T.J. Yates. Failed on index 3
Passing stats table not found for T.J. Yates. Failed on index 4
Passing stats table not found for T.J. Yates. Failed on index 5
FAILED: On T.J. Yates
Nathan Enderle
Tyrod Taylor
Greg McElroy
Andrew Luck
Robert Griffin III
Ryan Tannehill
Brandon Weeden
Brock Osweiler
Russell Wilson
Nick Foles
Kirk Co

In [11]:
extracted_data = pd.concat([i for i in raw_data if i is not None])

In [15]:
print(raw_data.__len__())
print(extracted_data.shape[0])

149
133


In [16]:
processing = data[data['player_display_name'].isin(all_qb_names)].sort_values(['player_display_name', 'season']).groupby(['player_display_name', 'recent_team'], sort=False).agg({'season':'nunique'}).reset_index()
final_years_1st_team = processing.groupby('player_display_name').agg({'season':'first'})

In [17]:
draft_teams = data[data['player_display_name'].isin(all_qb_names)].groupby(['player_display_name']).agg({'recent_team':'first'})

In [18]:
df_with_teams = pd.merge(left=extracted_data, 
                         right=draft_teams, 
                         left_on='name',
                         right_on='player_display_name',
                         how='left')

In [125]:
draft.columns

Index(['season', 'round', 'pick', 'team', 'gsis_id', 'pfr_player_id',
       'cfb_player_id', 'pfr_player_name', 'hof', 'position', 'category',
       'side', 'college', 'age', 'to', 'allpro', 'probowls', 'seasons_started',
       'w_av', 'car_av', 'dr_av', 'games', 'pass_completions', 'pass_attempts',
       'pass_yards', 'pass_tds', 'pass_ints', 'rush_atts', 'rush_yards',
       'rush_tds', 'receptions', 'rec_yards', 'rec_tds', 'def_solo_tackles',
       'def_ints', 'def_sacks'],
      dtype='object')

In [19]:
df_with_draft_teams = pd.merge(left=all_qbs[['pfr_player_name', 'round', 'pick', 'season']],
                               right=df_with_teams,
                               left_on='pfr_player_name',
                               right_on='name',
                               how='left'
                               )

In [20]:
final_years_1st_team = final_years_1st_team.rename(columns={'season':'seasons_with_draft_team'})

In [21]:
final = pd.merge(left=df_with_draft_teams,
                 right=final_years_1st_team,
                 left_on='name',
                 right_on='player_display_name',
                 how='left')

In [22]:
final = final[['pfr_player_name', 'round', 'pick', 'season', 'G', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate',
       'seasons','recent_team', 'seasons_with_draft_team']]

In [23]:
final = final.rename(columns={'season':'draft_year','pfr_player_name':'player_name', 'seasons':'college_seasons'})

In [120]:
draft.columns

Index(['season', 'round', 'pick', 'team', 'gsis_id', 'pfr_player_id',
       'cfb_player_id', 'pfr_player_name', 'hof', 'position', 'category',
       'side', 'college', 'age', 'to', 'allpro', 'probowls', 'seasons_started',
       'w_av', 'car_av', 'dr_av', 'games', 'pass_completions', 'pass_attempts',
       'pass_yards', 'pass_tds', 'pass_ints', 'rush_atts', 'rush_yards',
       'rush_tds', 'receptions', 'rec_yards', 'rec_tds', 'def_solo_tackles',
       'def_ints', 'def_sacks'],
      dtype='object')

In [122]:
final = pd.read_csv("~/Desktop/mfl_project/mfl/data/full_qb_dataset_v2.csv")

In [None]:
pd.merge(left=final, right=draft.drop(['round', 'pick']), left_on='player_name', right_on='pfr_player_name', how='left')

Unnamed: 0,player_name,round_x,pick_x,draft_year,G,Cmp,Att,Cmp%,Yds,TD,...,pass_ints,rush_atts,rush_yards,rush_tds,receptions,rec_yards,rec_tds,def_solo_tackles,def_ints,def_sacks
0,Sam Bradford,1,1,2010,31.0,604.0,893.0,67.6,8403.0,88.0,...,61.0,146.0,340.0,2.0,1.0,5.0,0.0,,,
1,Tim Tebow,1,25,2010,55.0,661.0,995.0,66.4,9285.0,88.0,...,9.0,197.0,989.0,12.0,0.0,0.0,0.0,1.0,,
2,Jimmy Clausen,2,48,2010,35.0,695.0,1110.0,62.6,8148.0,60.0,...,14.0,37.0,102.0,0.0,0.0,0.0,0.0,,,
3,Colt McCoy,3,85,2010,53.0,1157.0,1645.0,70.3,13253.0,112.0,...,32.0,175.0,582.0,2.0,2.0,8.0,0.0,1.0,,
4,Mike Kafka,4,122,2010,30.0,408.0,637.0,64.1,4265.0,19.0,...,2.0,3.0,0.0,0.0,0.0,0.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,C.J. Beathard,3,104,2017,,,,,,,...,14.0,64.0,266.0,4.0,0.0,0.0,0.0,,,
145,Joshua Dobbs,4,135,2017,37.0,614.0,999.0,61.5,7138.0,53.0,...,15.0,100.0,520.0,8.0,0.0,0.0,0.0,,,
146,Nathan Peterman,5,171,2017,36.0,398.0,663.0,60.0,5236.0,47.0,...,13.0,24.0,87.0,1.0,0.0,0.0,0.0,,,
147,Brad Kaaya,6,215,2017,38.0,721.0,1189.0,60.6,9972.0,69.0,...,,,,,,,,,,


In [25]:
final[final['player_name'] == "Cam Newton"]

Unnamed: 0,player_name,round,pick,draft_year,G,Cmp,Att,Cmp%,Yds,TD,...,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,college_seasons,recent_team,seasons_with_draft_team
13,Cam Newton,1,1,2011,20,191,292,65.4,2908,30,...,7,2.4,10.0,10.93,15.2,145.4,178.2,3,CAR,10.0


In [27]:
final.to_csv("~/Desktop/mfl_project/mfl/data/full_qb_dataset_v2.csv",index=False)

# Modeling Pipeline

In [97]:
def map_response(x):
        if x >= 4:
            return 1
        else: 
            return 0
    
def score(y_test, y_probs, y_preds):
    accuracy = accuracy_score(y_test, y_preds)
    f1 = f1_score(y_test, y_preds)
    roc_auc = roc_auc_score(y_test, y_probs)

    metric_dict = {
        'accuracy' : accuracy,
        'f1' : f1,
        'roc_auc': roc_auc
    }

    return metric_dict

def catboost(df, year_cutoff=2019, feature_set=None, kfold=False, folds=2):

    df = df.dropna()
    df = df[df['draft_year'] <= year_cutoff]

    if feature_set is None:
        X = df.drop(['player_name', 'seasons_with_draft_team'],axis=1)
        y = df['seasons_with_draft_team']
    elif feature_set is not None:
        pass
    
    X = X
    y = y
    y_mapped = y.apply(map_response)

    X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=.25, stratify=y_mapped, shuffle=True)

    model = CatBoostClassifier(one_hot_max_size=10,
                                iterations=500, 
                                cat_features=X.select_dtypes(include='object').columns.tolist(),
                                depth=10)
    
    model.fit(X_train, y_train)

    y_preds = model.predict(X_test)
    y_probs = model.predict_proba(X_test)[:,1]

    metrics = score(y_test, y_probs, y_preds)
    
    model.fit(X, y_mapped)
    fit_model = model
    model_results = pd.DataFrame(metrics, index=[0])

    return model, model_results
    

def predict_2025_qb(model, player_name, round, pick, recent_team, season=2025):
    season = season
    variant_features = ['round', 'pick', 'draft_year']
    available_features = np.setdiff1d(model.feature_names_[:-1], variant_features).tolist()

    initial_features = pd.DataFrame({
        'round' : round,
        'pick' : pick,
        'draft_year' : season
    }, index=[0])

    predictors = mfldata.scrape_NFL_REF_QB(player_name=player_name)[available_features]
    
    processing = pd.concat([initial_features, predictors], axis=1)
    processing['recent_team'] = recent_team

    return model.predict_proba(processing).tolist()[0][1]

In [93]:
df = pd.read_csv("/Users/benstager/Desktop/mfl_project/mfl/data/full_qb_dataset_v2.csv")

In [98]:
model, results = catboost(df, year_cutoff=2018)

Learning rate set to 0.005594
0:	learn: 0.6893884	total: 4.67ms	remaining: 2.33s
1:	learn: 0.6857479	total: 7.61ms	remaining: 1.9s
2:	learn: 0.6823072	total: 9.58ms	remaining: 1.59s
3:	learn: 0.6798508	total: 10ms	remaining: 1.24s
4:	learn: 0.6755109	total: 10.9ms	remaining: 1.08s
5:	learn: 0.6726729	total: 13.6ms	remaining: 1.12s
6:	learn: 0.6691961	total: 15.6ms	remaining: 1.1s
7:	learn: 0.6659139	total: 17.3ms	remaining: 1.06s
8:	learn: 0.6632603	total: 19ms	remaining: 1.03s
9:	learn: 0.6611032	total: 19.6ms	remaining: 958ms
10:	learn: 0.6584057	total: 20.1ms	remaining: 895ms
11:	learn: 0.6556395	total: 22.5ms	remaining: 916ms
12:	learn: 0.6533571	total: 22.9ms	remaining: 858ms
13:	learn: 0.6497296	total: 24.6ms	remaining: 854ms
14:	learn: 0.6470246	total: 25.7ms	remaining: 830ms
15:	learn: 0.6440788	total: 27.5ms	remaining: 832ms
16:	learn: 0.6410897	total: 28.3ms	remaining: 805ms
17:	learn: 0.6390645	total: 29.3ms	remaining: 784ms
18:	learn: 0.6371593	total: 30.1ms	remaining: 761m

In [99]:
results

Unnamed: 0,accuracy,f1,roc_auc
0,0.736842,0.615385,0.845238


In [105]:
predict_2025_qb(model, player_name='Cameron Ward', recent_team='CLE', round=2, pick=33, season=2025)

0.5243127978166613

# Let's try to predict out of sample

In [55]:
prospect_names = pd.read_excel("~/Downloads/Top Prospects for the 2025 NFL Draft.xlsx", sheet_name=0, skiprows=2)

In [64]:
pd.read_excel("~/Downloads/Top Prospects for the 2025 NFL Draft.xlsx", sheet_name=2, skiprows=1)[['Pick', 'Team']]

Unnamed: 0,Pick,Team
0,1,Tennessee Titans
1,2,Cleveland Browns
2,3,New York Giants
3,4,New England Patriots
4,5,Jacksonville Jaguars
...,...,...
252,253,Miami Dolphins
253,254,New Orleans Saints
254,255,Cleveland Browns
255,256,Los Angeles Chargers


In [56]:
qbs_2025 = prospect_names[prospect_names['Position'] == "QB"]