In [30]:
import mfl as mfl
import pandas as pd
import numpy as np
import mfl.api.data_loaders as mfldata
import nfl_data_py as nfl

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, precision_recall_curve

from xgboost import XGBClassifier, XGBRFClassifier

from catboost import CatBoostClassifier

import keras
from keras.layers import Dense, ReLU, Bidirectional, Normalization, Dropout, Input
from keras.models import Sequential

import joblib

In [113]:
# Display the cleaned QB table returned by the project loader. The result is
# not bound to a name, so it is only rendered here and not reused below.
mfldata.load_qb_data_cleaned()

Unnamed: 0,pfr_player_name,round,pick,season,G,Cmp,Att,Cmp%,Yds,TD,...,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,name,recent_team,seasons_with_draft_team
0,Sam Bradford,1,1,2010,31.0,604.0,893.0,67.6,8403.0,88.0,...,1.8,9.4,10.57,13.9,271.1,175.6,3.0,Sam Bradford,LA,4.0
1,Tim Tebow,1,25,2010,55.0,661.0,995.0,66.4,9285.0,88.0,...,1.6,9.3,10.38,14.0,168.8,170.8,4.0,Tim Tebow,DEN,2.0
2,Jimmy Clausen,2,48,2010,35.0,695.0,1110.0,62.6,8148.0,60.0,...,2.4,7.3,7.33,11.7,232.8,137.2,3.0,Jimmy Clausen,CAR,1.0
3,Colt McCoy,3,85,2010,53.0,1157.0,1645.0,70.3,13253.0,112.0,...,2.7,8.1,8.19,11.5,250.1,155.0,4.0,Colt McCoy,CLE,3.0
4,Mike Kafka,4,122,2010,30.0,408.0,637.0,64.1,4265.0,19.0,...,3.1,6.7,5.88,10.5,142.2,123.9,4.0,Mike Kafka,PHI,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,C.J. Beathard,3,104,2017,,,,,,,...,,,,,,,,,,
145,Joshua Dobbs,4,135,2017,37.0,614.0,999.0,61.5,7138.0,53.0,...,2.9,7.1,6.90,11.6,192.9,133.2,4.0,Joshua Dobbs,PIT,2.0
146,Nathan Peterman,5,171,2017,10.0,20.0,43.0,46.5,94.0,0.0,...,4.7,2.2,0.09,4.7,9.4,55.6,7.0,Nathan Peterman,BUF,2.0
147,Brad Kaaya,6,215,2017,38.0,721.0,1189.0,60.6,9972.0,69.0,...,2.0,8.4,8.64,13.8,262.4,146.2,3.0,Brad Kaaya,,


In [140]:
class FranchiseQB:
    """Train and apply binary "franchise quarterback" classifiers.

    The response is derived from the ``seasons_with_draft_team`` column: a row
    is labelled 1 when that value is at least 4 and 0 otherwise (see
    :meth:`map_response`).

    Parameters
    ----------
    feature_set : str
        ``'numeric'`` keeps only float-typed columns (integer-typed columns
        are therefore excluded); ``'cat'`` keeps every column except the two
        player-name columns and the target.
    model : str
        Key into ``model_map`` selecting the estimator stored on
        ``model_func``. An unknown key raises ``KeyError``.
    dataset : str
        Path of the CSV to load. Rows containing any missing value are
        dropped at load time.
    **kwargs
        Stored on ``self.kwargs``; not consumed anywhere in this class yet.
    """

    # Raw response column and the identifier columns excluded from the
    # 'cat' feature set.
    TARGET = 'seasons_with_draft_team'
    ID_COLUMNS = ['name', 'pfr_player_name']

    def __init__(self, 
                 feature_set='numeric', 
                 model='catboost', 
                 dataset='../data/for_modeling.csv', 
                 **kwargs):
        
        self.feature_set = feature_set
        self.model = model
        self.dataset = dataset
        self.kwargs = kwargs
        self.model_map = {
            'catboost' : CatBoostClassifier(),
            'xgb' : XGBClassifier(), 
            'rf' : RandomForestClassifier(),
            'lr' : LogisticRegression(),
            'hgb' : HistGradientBoostingClassifier(),
            'svm' : SVC(),
            # Placeholder: no neural-network estimator is wired in yet.
            'nn_basic': ...
        }
        self.available_models = list(self.model_map.keys())
        if model not in self.model_map:
            # Same exception type as the plain dict lookup, but with a
            # message that lists the valid choices.
            raise KeyError(
                f"Unknown model {model!r}; choose one of {self.available_models}"
            )
        self.model_func = self.model_map[model]
        self.df = pd.read_csv(dataset).dropna()
        self.all_players = self.df['pfr_player_name']

    def create_training_data(self, n_splits=4, stratify=True):
        """Placeholder; currently does nothing and returns ``None``."""
        pass

    def map_response(self, x):
        """Binarise the raw response: 1 when ``x >= 4``, otherwise 0."""
        return 1 if x >= 4 else 0
        
    def score(self, y_test, y_probs, y_preds):
        """Return accuracy, F1 and ROC-AUC for one set of predictions.

        Parameters
        ----------
        y_test : sequence
            True binary labels.
        y_probs : sequence
            Scores for the positive class (used for ROC-AUC).
        y_preds : sequence
            Hard class predictions (used for accuracy and F1).

        Returns
        -------
        dict
            Keys ``'accuracy'``, ``'f1'`` and ``'roc_auc'``.
        """
        return {
            'accuracy' : accuracy_score(y_test, y_preds),
            'f1' : f1_score(y_test, y_preds),
            'roc_auc': roc_auc_score(y_test, y_probs)
        }

    def _design_matrix(self):
        """Build ``(X, y)`` from ``self.df`` according to ``self.feature_set``.

        Raises
        ------
        ValueError
            If ``self.feature_set`` is neither ``'numeric'`` nor ``'cat'``.
        """
        df = self.df

        if self.feature_set == 'numeric':
            float_cols = df.select_dtypes(include='float').columns
            # errors='ignore': the target is only present here when it is
            # float-typed.
            X = df[float_cols].drop(columns=self.TARGET, errors='ignore')
        elif self.feature_set == 'cat':
            X = df.drop(columns=self.ID_COLUMNS + [self.TARGET])
        else:
            raise ValueError(
                f"Unknown feature_set {self.feature_set!r}; expected 'numeric' or 'cat'"
            )

        y = df[self.TARGET].apply(self.map_response)
        return X, y

    @staticmethod
    def _new_catboost(X):
        """Return an unfitted CatBoost model whose categorical features are
        the object-typed columns of ``X`` (none for a purely numeric ``X``)."""
        return CatBoostClassifier(
            one_hot_max_size=5,
            iterations=300,
            cat_features=X.select_dtypes(include='object').columns.tolist(),
        )

    def catboost(self, kfold=False, folds=2, test_size=.25, random_state=None):
        """Evaluate a classifier, then refit CatBoost on every row.

        Parameters
        ----------
        kfold : bool
            ``True`` evaluates with stratified k-fold (no shuffling) and
            pools the out-of-fold predictions; ``False`` uses one stratified
            shuffled hold-out split.
        folds : int
            Number of folds when ``kfold`` is true.
        test_size : float
            Hold-out fraction when ``kfold`` is false.
        random_state : int or None
            Seed for the hold-out split; ``None`` keeps it unseeded.

        Side effects
        ------------
        Sets ``self.X``, ``self.y``, ``self.y_tests``, ``self.y_preds``,
        ``self.y_probs``, ``self.fit_model`` (CatBoost fitted on all of
        ``self.X``) and ``self.model_results`` (one-row DataFrame of metrics).

        Raises
        ------
        ValueError
            If ``self.feature_set`` is not recognised.
        """
        # Validate and build features first so a bad feature_set fails
        # before any training happens.
        X, y = self._design_matrix()
        self.X = X
        self.y = y

        y_tests, y_preds, y_probs = [], [], []

        if kfold:
            # The numeric feature set evaluates the estimator chosen at
            # construction; the categorical one needs a CatBoost model that
            # knows which columns are categorical.
            if self.feature_set == 'numeric':
                model = self.model_func
            else:
                model = self._new_catboost(X)

            splitter = StratifiedKFold(n_splits=folds, shuffle=False)
            for train_idx, test_idx in splitter.split(X, y):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.values[train_idx], y.values[test_idx]

                model.fit(X_train, y_train)
                y_tests.extend(y_test)
                y_preds.extend(model.predict(X_test))
                y_probs.extend(model.predict_proba(X_test)[:, 1])
        else:
            # Split the selected features (not the raw frame) so the
            # hold-out evaluation honours feature_set.
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=test_size,
                shuffle=True,
                stratify=y,
                random_state=random_state,
            )
            model = self._new_catboost(X)
            model.fit(X_train, y_train)

            y_tests.extend(y_test)
            y_preds.extend(model.predict(X_test))
            y_probs.extend(model.predict_proba(X_test)[:, 1])

        self.y_tests = y_tests
        self.y_preds = y_preds
        self.y_probs = y_probs

        # The final model takes its categorical columns from X itself, so
        # they always exist in the data it is fitted on.
        final_model = self._new_catboost(X)
        final_model.fit(X, y)
        self.fit_model = final_model

        metrics = self.score(y_tests, y_probs, y_preds)
        self.model_results = pd.DataFrame(metrics, index=[0])

    def lr(self, **kwargs):
        """Placeholder for a logistic-regression workflow; not implemented."""
        ...

    def predict_2025_qb(self, player_name, round, pick, recent_team, model='catboost',
                        season=2020,
                        model_dir="/Users/benstager/Desktop/mfl_project/mfl/api/saved_models"):
        """Score one prospect with a previously saved model.

        Parameters
        ----------
        player_name : str
            Passed to ``mfldata.scrape_NFL_REF_QB`` and used as
            ``pfr_player_name``.
        round, pick : int
            Draft round and pick.
        recent_team : str
            Value for the ``recent_team`` feature.
        model : str
            Name prefix of the saved model file (``{model}_v_0_0.pkl``).
        season : int
            Value for the ``season`` feature.
        model_dir : str
            Directory holding the saved model files.

        Returns
        -------
        The loaded model's ``predict_proba`` output for the single row.

        Raises
        ------
        ValueError
            If the loaded model expects a feature that cannot be built here.
        """
        model_path = f"{model_dir}/{model}_v_0_0.pkl"
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only load model files you produced yourself.
        fitted = joblib.load(model_path)

        features = pd.DataFrame({
            'pfr_player_name' : player_name, 
            'round' : round,
            'pick' : pick,
            'season' : season
        }, index=[0])

        # Reset the index so the scraped row lines up with `features` in the
        # column-wise concat regardless of the scraper's own index.
        predictors = mfldata.scrape_NFL_REF_QB(player_name=player_name).reset_index(drop=True)
        processing = pd.concat([features, predictors], axis=1)
        processing['recent_team'] = recent_team

        # Prefer the exact columns (and order) the model was fitted with.
        expected = getattr(fitted, 'feature_names_', None)
        if expected is None:
            expected = getattr(fitted, 'feature_names_in_', None)
        expected = list(expected) if expected is not None else []

        if expected:
            missing = [col for col in expected if col not in processing.columns]
            if missing:
                raise ValueError(
                    f"Saved model {model_path!r} expects features that are unavailable: {missing}"
                )
            processing = processing[expected]
        else:
            # No recorded feature names: fall back to dropping the scraped
            # 'name' column (a column, not a row label).
            processing = processing.drop(columns='name', errors='ignore')

        return fitted.predict_proba(processing)


In [134]:
# Build an instance and score one prospect. predict_2025_qb loads a saved
# model from disk via joblib; it does not use a model fitted on this instance.
NFL = FranchiseQB(feature_set='cat', model='catboost')
NFL.predict_2025_qb(player_name='Joe Burrow', round=1, pick=1, recent_team='CIN')

array([[0.81131964, 0.18868036]])

In [144]:
# One-row metrics table (accuracy, f1, roc_auc). The attribute is only set by
# NFL.catboost(), so that method must have run on this instance first.
NFL.model_results

Unnamed: 0,accuracy,f1,roc_auc
0,0.740741,0.222222,0.757143


In [139]:
# NOTE(review): `model` is a notebook-level variable, not an attribute of NFL;
# the only visible assignment is the joblib.load cell, so run that cell first.
model.feature_names_

['pfr_player_name',
 'round',
 'pick',
 'season',
 'G',
 'Cmp',
 'Att',
 'Cmp%',
 'Yds',
 'TD',
 'TD%',
 'Int',
 'Int%',
 'Y/A',
 'AY/A',
 'Y/C',
 'Y/G',
 'Rate',
 'seasons',
 'name',
 'recent_team']

In [142]:
# Re-create the instance and train. With the default kfold=False, catboost()
# evaluates on a stratified 75/25 hold-out split, then refits on all rows and
# stores that model on NFL.fit_model.
NFL = FranchiseQB(feature_set='cat', model='catboost')
NFL.catboost()

Learning rate set to 0.010625
0:	learn: 0.6876656	total: 2.27ms	remaining: 680ms
1:	learn: 0.6804419	total: 3.71ms	remaining: 553ms
2:	learn: 0.6748704	total: 5.49ms	remaining: 544ms
3:	learn: 0.6684385	total: 6.45ms	remaining: 477ms
4:	learn: 0.6617654	total: 7.42ms	remaining: 438ms
5:	learn: 0.6539512	total: 8.17ms	remaining: 400ms
6:	learn: 0.6468635	total: 9.13ms	remaining: 382ms
7:	learn: 0.6403487	total: 10.3ms	remaining: 375ms
8:	learn: 0.6334392	total: 11.2ms	remaining: 362ms
9:	learn: 0.6262633	total: 20.3ms	remaining: 589ms
10:	learn: 0.6206962	total: 42.4ms	remaining: 1.11s
11:	learn: 0.6153045	total: 43.6ms	remaining: 1.05s
12:	learn: 0.6100817	total: 44.5ms	remaining: 982ms
13:	learn: 0.6055429	total: 45.2ms	remaining: 924ms
14:	learn: 0.6010950	total: 45.9ms	remaining: 872ms
15:	learn: 0.5959014	total: 48.7ms	remaining: 864ms
16:	learn: 0.5897763	total: 50.9ms	remaining: 847ms
17:	learn: 0.5838615	total: 52.6ms	remaining: 824ms
18:	learn: 0.5800092	total: 53.2ms	remaining

In [143]:
# Feature names seen by the model that catboost() refit on the full data.
NFL.fit_model.feature_names_

['round',
 'pick',
 'season',
 'G',
 'Cmp',
 'Att',
 'Cmp%',
 'Yds',
 'TD',
 'TD%',
 'Int',
 'Int%',
 'Y/A',
 'AY/A',
 'Y/C',
 'Y/G',
 'Rate',
 'seasons',
 'recent_team']

In [37]:
# Persist the refit model. NOTE(review): this is an absolute, machine-specific
# path; predict_2025_qb reads `{model}_v_0_0.pkl` from the same directory.
joblib.dump(NFL.fit_model, '/Users/benstager/Desktop/mfl_project/mfl/api/saved_models/catboost_v_0_0.pkl')

['/Users/benstager/Desktop/mfl_project/mfl/api/saved_models/catboost_v_0_0.pkl']

In [40]:
# Reload the persisted model into the notebook-level `model` variable.
# joblib.load unpickles the file, so only load files you trust.
model = joblib.load('/Users/benstager/Desktop/mfl_project/mfl/api/saved_models/catboost_v_0_0.pkl')

In [41]:
# Feature names recorded on the model reloaded from disk.
model.feature_names_

['pfr_player_name',
 'round',
 'pick',
 'season',
 'G',
 'Cmp',
 'Att',
 'Cmp%',
 'Yds',
 'TD',
 'TD%',
 'Int',
 'Int%',
 'Y/A',
 'AY/A',
 'Y/C',
 'Y/G',
 'Rate',
 'seasons',
 'name',
 'recent_team']

In [46]:
# NOTE(review): `shedeur` is assigned by the scrape cell listed after this one
# (lower execution count); on a fresh kernel that cell has to run first.
shedeur.columns

Index(['G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', 'Y/A',
       'AY/A', 'Y/C', 'Y/G', 'Rate', 'seasons', 'name'],
      dtype='object')

In [44]:
# Fetch one player's stat row(s) through the project's scrape_NFL_REF_QB
# helper; the same helper supplies the predictors in predict_2025_qb.
shedeur = mfldata.scrape_NFL_REF_QB(player_name='Shedeur Sanders')