In [10]:
import mfl as mfl
import pandas as pd
import numpy as np
import mfl.api.data_loaders as mfldata
import nfl_data_py as nfl

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, precision_recall_curve

from xgboost import XGBClassifier, XGBRFClassifier

from catboost import CatBoostClassifier

import keras
from keras.layers import Dense, ReLU, Bidirectional, Normalization, Dropout, Input
from keras.models import Sequential

import joblib

In [94]:
class FranchiseQB:
    def __init__(self, 
                 feature_set='numeric', 
                 model='catboost', 
                 dataset='../data/for_modeling.csv', 
                 **kwargs):
        
        self.feature_set = feature_set
        self.model = model 
        self.dataset = dataset
        self.kwargs = kwargs
        self.model_map = {
            'catboost' : CatBoostClassifier(),
            'xgb' : XGBClassifier(), 
            'rf' : RandomForestClassifier(),
            'lr' : LogisticRegression(),
            'hgb' : HistGradientBoostingClassifier(),
            'svm' : SVC(),
            'nn_basic': ...
        }
        self.available_models = list(self.model_map.keys())
        self.model_func = self.model_map[model]
        self.full_dataset = pd.read_csv(dataset)

    def create_training_data(self, n_splits=4, stratify=True):
        pass

    def map_response(self, x):
        if x >= 4:
            return 1
        else: 
            return 0
        
    def score(self, y_test, y_probs, y_preds):
        accuracy = accuracy_score(y_test, y_preds)
        f1 = f1_score(y_test, y_preds)
        roc_auc = roc_auc_score(y_test, y_probs)

        metric_dict = {
            'accuracy' : accuracy,
            'f1' : f1,
            'roc_auc': roc_auc
        }

        return metric_dict

    def catboost(self, feature_set=None, kfold=False, folds=2):
        self.feature_set = feature_set
        self.kfold = kfold
        self.folds = folds

        df = self.full_dataset.dropna()
        df = df[df['season'] <= 2019]

        if feature_set is None:
            X = df.drop(['pfr_player_name', 'seasons_with_draft_team', 'name'],axis=1)
            y = df['seasons_with_draft_team']
        elif feature_set is not None:
            pass
        

        self.X = X
        self.y = y
        self.y_mapped = y.apply(self.map_response)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, 
                                                                                self.y_mapped, 
                                                                                test_size=.25, 
                                                                                stratify=self.y_mapped, 
                                                                                shuffle=True)

        model = CatBoostClassifier(one_hot_max_size=5, 
                                   iterations=300, 
                                   cat_features=X.select_dtypes(include='object').columns.tolist())
        
        model.fit(self.X_train, self.y_train)

        self.y_preds = model.predict(self.X_test)
        self.y_probs = model.predict_proba(self.X_test)[:,1]

        metrics = self.score(self.y_test, self.y_probs, self.y_preds)
        
        model.fit(X, self.y_mapped)
        self.fit_model = model
        self.model_results = pd.DataFrame(metrics, index=[0])
        

    def predict_2025_qb(self, player_name, round, pick, recent_team, season=2020):
        
        name = player_name
        season = season
        
        variant_features = ['round', 'pick', 'season']
        available_features = np.setdiff1d(self.fit_model.feature_names_[:-1], variant_features).tolist()

        initial_features = pd.DataFrame({
            'round' : round,
            'pick' : pick,
            'season' : season
        }, index=[0])

        predictors = mfldata.scrape_NFL_REF_QB(player_name=player_name)[available_features]
        
        processing = pd.concat([initial_features, predictors], axis=1)
        processing['recent_team'] = recent_team

        return self.fit_model.predict_proba(processing)



In [110]:
mfldata.scrape_NFL_REF_QB(player_name='Joe Burrow')

Unnamed: 0,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,name
0,11,29,39,74.4,287,2,5.1,0,0.0,7.4,8.38,9.9,26.1,153.1,7,Joe Burrow


In [91]:
sim.full_dataset.sort_values(['round', 'pick']).head(10)

Unnamed: 0,pfr_player_name,round,pick,season,G,Cmp,Att,Cmp%,Yds,TD,...,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,name,recent_team,seasons_with_draft_team
0,Sam Bradford,1,1,2010,31.0,604.0,893.0,67.6,8403.0,88.0,...,1.8,9.4,10.57,13.9,271.1,175.6,3.0,Sam Bradford,LA,4.0
13,Cam Newton,1,1,2011,6.0,6.0,12.0,50.0,54.0,0.0,...,0.0,4.5,4.5,9.0,9.0,87.8,7.0,Cam Newton,CAR,10.0
25,Andrew Luck,1,1,2012,38.0,713.0,1064.0,67.0,9430.0,82.0,...,2.1,8.9,9.47,13.2,248.2,162.8,3.0,Andrew Luck,IND,6.0
61,Jameis Winston,1,1,2015,27.0,562.0,851.0,66.0,7964.0,65.0,...,3.3,9.4,9.41,14.2,295.0,163.3,2.0,Jameis Winston,TB,5.0
68,Jared Goff,1,1,2016,37.0,977.0,1569.0,62.3,12200.0,96.0,...,1.9,7.8,8.14,12.5,329.7,144.0,3.0,Jared Goff,LA,5.0
92,Trevor Lawrence,1,1,2021,40.0,758.0,1138.0,66.6,10098.0,90.0,...,1.5,8.9,9.78,13.3,252.5,164.3,3.0,Trevor Lawrence,JAX,2.0
102,Joe Burrow,1,1,2020,11.0,29.0,39.0,74.4,287.0,2.0,...,0.0,7.4,8.38,9.9,26.1,153.1,7.0,Joe Burrow,CIN,3.0
115,Kyler Murray,1,1,2019,8.0,72.0,121.0,59.5,686.0,5.0,...,5.8,5.7,3.89,9.5,85.8,109.2,7.0,Kyler Murray,ARI,4.0
126,Baker Mayfield,1,1,2018,8.0,218.0,340.0,64.1,2315.0,12.0,...,2.6,6.8,6.32,10.6,289.4,127.7,8.0,Baker Mayfield,CLE,4.0
26,Robert Griffin III,1,2,2012,41.0,800.0,1192.0,67.1,10366.0,78.0,...,1.4,8.7,9.36,13.0,252.8,158.9,4.0,Robert Griffin III,,


In [95]:
sim = FranchiseQB(model='catboost', feature_set='cat')
sim.catboost()

Learning rate set to 0.009413
0:	learn: 0.6882563	total: 1.75ms	remaining: 524ms
1:	learn: 0.6830361	total: 3.6ms	remaining: 536ms
2:	learn: 0.6765017	total: 4.36ms	remaining: 431ms
3:	learn: 0.6718960	total: 5.12ms	remaining: 379ms
4:	learn: 0.6677894	total: 6.13ms	remaining: 362ms
5:	learn: 0.6639782	total: 7ms	remaining: 343ms
6:	learn: 0.6587472	total: 7.77ms	remaining: 325ms
7:	learn: 0.6542079	total: 8.23ms	remaining: 300ms
8:	learn: 0.6481893	total: 8.92ms	remaining: 288ms
9:	learn: 0.6447308	total: 9.44ms	remaining: 274ms
10:	learn: 0.6401899	total: 10.2ms	remaining: 268ms
11:	learn: 0.6361296	total: 10.9ms	remaining: 261ms
12:	learn: 0.6323282	total: 11.5ms	remaining: 253ms
13:	learn: 0.6289539	total: 12.1ms	remaining: 246ms
14:	learn: 0.6244628	total: 12.7ms	remaining: 241ms
15:	learn: 0.6200875	total: 13.4ms	remaining: 237ms
16:	learn: 0.6158711	total: 13.9ms	remaining: 232ms
17:	learn: 0.6101955	total: 14.5ms	remaining: 228ms
18:	learn: 0.6078847	total: 15.1ms	remaining: 22

In [96]:
sim.model_results

Unnamed: 0,accuracy,f1,roc_auc
0,0.761905,0.615385,0.867347


In [108]:
sim.predict_2025_qb(player_name='Kyle McCord', round=1, pick=21, recent_team='PIT', season=2025)

array([[0.40099278, 0.59900722]])

In [30]:
sim = FranchiseQB(model='catboost', feature_set='cat')

In [31]:
sim.catboost()

Learning rate set to 0.010625
0:	learn: 0.6846945	total: 2.17ms	remaining: 648ms
1:	learn: 0.6763246	total: 3.25ms	remaining: 484ms
2:	learn: 0.6685398	total: 4.46ms	remaining: 441ms
3:	learn: 0.6637113	total: 5.62ms	remaining: 416ms
4:	learn: 0.6546585	total: 6.6ms	remaining: 390ms
5:	learn: 0.6468109	total: 8.79ms	remaining: 431ms
6:	learn: 0.6405927	total: 9.98ms	remaining: 418ms
7:	learn: 0.6324809	total: 11.1ms	remaining: 405ms
8:	learn: 0.6270464	total: 12ms	remaining: 388ms
9:	learn: 0.6219342	total: 13ms	remaining: 377ms
10:	learn: 0.6160191	total: 14ms	remaining: 368ms
11:	learn: 0.6093047	total: 15ms	remaining: 360ms
12:	learn: 0.6030638	total: 16.2ms	remaining: 358ms
13:	learn: 0.5970717	total: 17.7ms	remaining: 361ms
14:	learn: 0.5902319	total: 19.3ms	remaining: 367ms
15:	learn: 0.5858150	total: 20.4ms	remaining: 362ms
16:	learn: 0.5779605	total: 21.3ms	remaining: 355ms
17:	learn: 0.5764234	total: 21.6ms	remaining: 338ms
18:	learn: 0.5705851	total: 22.3ms	remaining: 330ms
1

In [34]:
sim.fit_model.score()

TypeError: CatBoostClassifier.score() missing 1 required positional argument: 'X'

In [6]:
NFL.run()

Learning rate set to 0.010625
0:	learn: 0.6852129	total: 60.9ms	remaining: 18.2s
1:	learn: 0.6775699	total: 62.4ms	remaining: 9.3s
2:	learn: 0.6707844	total: 64.5ms	remaining: 6.39s
3:	learn: 0.6628242	total: 65.8ms	remaining: 4.87s
4:	learn: 0.6586915	total: 67.8ms	remaining: 4s
5:	learn: 0.6521197	total: 69.9ms	remaining: 3.42s
6:	learn: 0.6454259	total: 71.1ms	remaining: 2.98s
7:	learn: 0.6401258	total: 72.1ms	remaining: 2.63s
8:	learn: 0.6352136	total: 73.3ms	remaining: 2.37s
9:	learn: 0.6282357	total: 74.5ms	remaining: 2.16s
10:	learn: 0.6239145	total: 75.4ms	remaining: 1.98s
11:	learn: 0.6194098	total: 75.9ms	remaining: 1.82s
12:	learn: 0.6154147	total: 76.5ms	remaining: 1.69s
13:	learn: 0.6099335	total: 77.7ms	remaining: 1.59s
14:	learn: 0.6042443	total: 78.9ms	remaining: 1.5s
15:	learn: 0.5979067	total: 79.9ms	remaining: 1.42s
16:	learn: 0.5922089	total: 80.9ms	remaining: 1.35s
17:	learn: 0.5870287	total: 81.8ms	remaining: 1.28s
18:	learn: 0.5815895	total: 82.6ms	remaining: 1.2

In [7]:
NFL.model_results

Unnamed: 0,accuracy,f1,roc_auc
0,0.814815,0.545455,0.885714


In [5]:
df = mfldata.load_qb_data_cleaned()

In [89]:
df = df.dropna()

In [24]:
numerics_only = df.select_dtypes(include='float').dropna()

In [26]:
numerics_only.head(5)

Unnamed: 0,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,seasons_with_draft_team
0,31.0,604.0,893.0,67.6,8403.0,88.0,9.9,16.0,1.8,9.4,10.57,13.9,271.1,175.6,3.0,4.0
1,55.0,661.0,995.0,66.4,9285.0,88.0,8.8,16.0,1.6,9.3,10.38,14.0,168.8,170.8,4.0,2.0
2,35.0,695.0,1110.0,62.6,8148.0,60.0,5.4,27.0,2.4,7.3,7.33,11.7,232.8,137.2,3.0,1.0
3,53.0,1157.0,1645.0,70.3,13253.0,112.0,6.8,45.0,2.7,8.1,8.19,11.5,250.1,155.0,4.0,3.0
4,30.0,408.0,637.0,64.1,4265.0,19.0,3.0,20.0,3.1,6.7,5.88,10.5,142.2,123.9,4.0,1.0


In [220]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('seasons_with_draft_team',
                                                                       axis=1),
                                                                       df['seasons_with_draft_team'].apply(map_response),
                                                                       test_size=.25,
                                                                       shuffle=True,
                                                                       stratify=df['seasons_with_draft_team'].apply(map_response))

In [111]:
X_train, X_test, y_train, y_test = train_test_split(numerics_only.drop('seasons_with_draft_team',
                                                                       axis=1),
                                                                       numerics_only['seasons_with_draft_team'].apply(map_response),
                                                                       test_size=.25,
                                                                       shuffle=True,
                                                                       stratify=df['seasons_with_draft_team'].apply(map_response))

In [None]:
CatBoostClassifier()

In [221]:
model = CatBoostClassifier(one_hot_max_size=5, iterations=300, cat_features=df.select_dtypes(include='object').columns.tolist())
model.fit(X_train, y_train)
y_preds = model.predict(X_test)
y_probs = model.predict_proba(X_test)[:,1]
accuracy_score(y_test, y_preds)
roc_auc_score(y_test, y_probs)

Learning rate set to 0.010625
0:	learn: 0.6860037	total: 2.22ms	remaining: 664ms
1:	learn: 0.6777927	total: 4.22ms	remaining: 629ms
2:	learn: 0.6695467	total: 5.27ms	remaining: 522ms
3:	learn: 0.6632005	total: 7.8ms	remaining: 578ms
4:	learn: 0.6562832	total: 9.6ms	remaining: 567ms
5:	learn: 0.6509253	total: 10.7ms	remaining: 524ms
6:	learn: 0.6448863	total: 11.7ms	remaining: 488ms
7:	learn: 0.6415797	total: 12.2ms	remaining: 446ms
8:	learn: 0.6364113	total: 13.4ms	remaining: 433ms
9:	learn: 0.6290033	total: 14.4ms	remaining: 419ms
10:	learn: 0.6220800	total: 15.3ms	remaining: 403ms
11:	learn: 0.6148267	total: 16.5ms	remaining: 397ms
12:	learn: 0.6101433	total: 18.4ms	remaining: 406ms
13:	learn: 0.6037479	total: 19.6ms	remaining: 400ms
14:	learn: 0.5977267	total: 20.6ms	remaining: 391ms
15:	learn: 0.5923923	total: 21.5ms	remaining: 381ms
16:	learn: 0.5867612	total: 22.4ms	remaining: 373ms
17:	learn: 0.5827596	total: 23.7ms	remaining: 372ms
18:	learn: 0.5771357	total: 24.6ms	remaining: 

0.7928571428571428

In [226]:
X_train

Unnamed: 0,pfr_player_name,round,pick,season,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,name,recent_team
102,Joe Burrow,1,1,2020,11.0,29.0,39.0,74.4,287.0,2.0,5.1,0.0,0.0,7.4,8.38,9.9,26.1,153.1,7.0,Joe Burrow,CIN
97,Kyle Trask,2,64,2021,29.0,552.0,813.0,67.9,7386.0,69.0,8.5,15.0,1.8,9.1,9.95,13.4,254.7,168.5,5.0,Kyle Trask,TB
105,Jordan Love,1,26,2020,38.0,689.0,1125.0,61.2,8600.0,60.0,5.3,29.0,2.6,7.6,7.55,12.5,226.3,137.9,3.0,Jordan Love,GB
146,Nathan Peterman,5,171,2017,10.0,20.0,43.0,46.5,94.0,0.0,0.0,2.0,4.7,2.2,0.09,4.7,9.4,55.6,7.0,Nathan Peterman,BUF
119,Will Grier,3,100,2019,22.0,516.0,785.0,65.7,7354.0,71.0,9.0,20.0,2.5,9.4,10.03,14.3,334.3,169.2,7.0,Will Grier,CAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Jameis Winston,1,1,2015,27.0,562.0,851.0,66.0,7964.0,65.0,7.6,28.0,3.3,9.4,9.41,14.2,295.0,163.3,2.0,Jameis Winston,TB
31,Nick Foles,3,88,2012,1.0,5.0,8.0,62.5,57.0,0.0,0.0,0.0,0.0,7.1,7.13,11.4,57.0,122.4,8.0,Nick Foles,PHI
13,Cam Newton,1,1,2011,6.0,6.0,12.0,50.0,54.0,0.0,0.0,0.0,0.0,4.5,4.50,9.0,9.0,87.8,7.0,Cam Newton,CAR
62,Marcus Mariota,1,2,2015,41.0,779.0,1167.0,66.8,10796.0,105.0,9.0,14.0,1.2,9.3,10.51,13.9,263.3,171.8,3.0,Marcus Mariota,TEN


In [21]:
df.select_dtypes(include='float')

Unnamed: 0,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,seasons_with_draft_team
0,31.0,604.0,893.0,67.6,8403.0,88.0,9.9,16.0,1.8,9.4,10.57,13.9,271.1,175.6,3.0,4.0
1,55.0,661.0,995.0,66.4,9285.0,88.0,8.8,16.0,1.6,9.3,10.38,14.0,168.8,170.8,4.0,2.0
2,35.0,695.0,1110.0,62.6,8148.0,60.0,5.4,27.0,2.4,7.3,7.33,11.7,232.8,137.2,3.0,1.0
3,53.0,1157.0,1645.0,70.3,13253.0,112.0,6.8,45.0,2.7,8.1,8.19,11.5,250.1,155.0,4.0,3.0
4,30.0,408.0,637.0,64.1,4265.0,19.0,3.0,20.0,3.1,6.7,5.88,10.5,142.2,123.9,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,,,,,,,,,,,,,,,,
145,37.0,614.0,999.0,61.5,7138.0,53.0,5.3,29.0,2.9,7.1,6.90,11.6,192.9,133.2,4.0,2.0
146,10.0,20.0,43.0,46.5,94.0,0.0,0.0,2.0,4.7,2.2,0.09,4.7,9.4,55.6,7.0,2.0
147,38.0,721.0,1189.0,60.6,9972.0,69.0,5.8,24.0,2.0,8.4,8.64,13.8,262.4,146.2,3.0,


In [159]:
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(units=32, activation='tanh'))
model.add(Normalization())
model.add(Dropout(.5))
model.add(Dense(units=16, activation='tanh'))
model.add(Dense(units=8, activation='tanh'))
model.add(Normalization())
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='poisson')
model.fit(X_train, y_train)
y_probs = model.predict(X_test)
y_preds = (y_probs >= 5).astype(int)

roc_auc_score(y_test, y_probs)
accuracy_score(y_test, y_preds)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7525  
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


0.7407407407407407

In [260]:
import pydantic

In [262]:
from pydantic import BaseModel

class MyFirstModel(BaseModel):
    first_name: str
    last_name: str

validating = MyFirstModel(first_name="marc", last_name="nealer")

In [272]:
vars()

{'__name__': '__main__',
 '__doc__': 'Automatically created module for IPython interactive environment',
 '__package__': None,
 '__loader__': None,
 '__spec__': None,
 '__builtin__': <module 'builtins' (built-in)>,
 '__builtins__': <module 'builtins' (built-in)>,
 '_ih': ['',
  'import mfl as mfl',
  'import mfl as mfl\nimport pandas as pd\nimport numpy as np',
  'df = mfl.api.data_loaders.load_qb_data_cleaned()',
  'import mfl as mfl\nimport pandas as pd\nimport numpy as np\nimport mfl.api.data_loaders as mfldata',
  'df = mfldata.load_qb_data_cleaned()',
  'df',
  'import mfl as mfl\nimport pandas as pd\nimport numpy as np\nimport mfl.api.data_loaders as mfldata\nimport nfl_data_py as nfl\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier\nfrom sklearn.ensemble',
  'import mfl as mfl\nimport pandas as pd\nimport numpy as np\nimport mfl.api.data_loaders as mfldata\nimport nfl_data_py as nfl\n\nfro

In [263]:
MyFirstModel(first_name='Ben', last_name=0)

ValidationError: 1 validation error for MyFirstModel
last_name
  Input should be a valid string [type=string_type, input_value=0, input_type=int]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type