In [164]:
import mfl as mfl
import pandas as pd
import numpy as np
import mfl.api.data_loaders as mfldata
import nfl_data_py as nfl

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, precision_recall_curve

from xgboost import XGBClassifier, XGBRFClassifier

from catboost import CatBoostClassifier

import keras
from keras.layers import Dense, ReLU, Bidirectional, Normalization, Dropout, Input
from keras.models import Sequential

In [256]:
class FranchiseQB:
    def __init__(self, 
                 feature_set='numeric', 
                 model='catboost', 
                 dataset='../data/for_modeling.csv', 
                 **kwargs):
        
        self.feature_set = feature_set
        self.model = model
        self.dataset = dataset
        self.kwargs = kwargs
        self.model_map = {
            'catboost' : CatBoostClassifier(),
            'xgb' : XGBClassifier(), 
            'rf' : RandomForestClassifier(),
            'lr' : LogisticRegression(),
            'hgb' : HistGradientBoostingClassifier(),
            'svm' : SVC(),
            'nn_basic': ...
        }
        self.available_models = list(self.model_map.keys())
        self.model_func = self.model_map[model]
        self.full_dataset = pd.read_csv(dataset)

    def create_training_data(self, n_splits=4, stratify=True):
        pass

    def map_response(self, x):
        if x >= 4:
            return 1
        else: 
            return 0
        
    def score(self, y_test, y_probs, y_preds):
        accuracy = accuracy_score(y_test, y_preds)
        f1 = f1_score(y_test, y_preds)
        roc_auc = roc_auc_score(y_test, y_probs)

        metric_dict = {
            'accuracy' : accuracy,
            'f1' : f1,
            'roc_auc': roc_auc
        }

        return metric_dict

    def run(self, kfold=False, folds=2):
        df = self.full_dataset.dropna()
        model = self.model_func

        if self.feature_set == 'numeric':
            X = df[df.select_dtypes(include='float').columns].drop('seasons_with_draft_team',axis=1)
        elif self.feature_set == 'cat':
            model = CatBoostClassifier(one_hot_max_size=5, iterations=300, cat_features=df.select_dtypes(include='object').columns.tolist())
            X = df.drop('seasons_with_draft_team', axis=1)
        
        y = df['seasons_with_draft_team'].apply(self.map_response)

        self.X = X
        self.y = y

        k_strat = StratifiedKFold(n_splits=folds, shuffle=False)
        folds = k_strat.split(X, y)
        y_preds = []
        y_probs = []
        y_tests = []
        
        if kfold:
            for train, test in folds:
                
                X_train, X_test = X.iloc[train], X.iloc[test]
                y_train, y_test = y.values[train], y.values[test]    

                model.fit(X_train, y_train)
                y_tests.extend(y_test)
                y_preds.extend(model.predict(X_test))
                y_probs.extend(model.predict_proba(X_test)[:,1])
        
        else:
            X_train, X_test, y_train, y_test = train_test_split(df.drop('seasons_with_draft_team',
                                                                       axis=1),
                                                                       df['seasons_with_draft_team'].apply(self.map_response),
                                                                       test_size=.25,
                                                                       shuffle=True,
                                                                       stratify=df['seasons_with_draft_team'].apply(self.map_response))
            model = CatBoostClassifier(one_hot_max_size=5, iterations=300, cat_features=df.select_dtypes(include='object').columns.tolist())
            model.fit(X_train, y_train)

            y_tests.extend(y_test)
            y_preds.extend(model.predict(X_test))
            y_probs.extend((model.predict_proba(X_test)[:,1]))
        
        self.y_preds = y_preds
        self.y_probs = y_probs

        metrics = self.score(y_tests, y_probs, y_preds)
        self.model_results = pd.DataFrame(metrics, index=[0])

In [257]:
NFL = FranchiseQB(model='catboost', feature_set='cat')

In [258]:
NFL.run()

Learning rate set to 0.010625
0:	learn: 0.6869298	total: 2.11ms	remaining: 630ms
1:	learn: 0.6790383	total: 3.53ms	remaining: 527ms
2:	learn: 0.6718350	total: 4.75ms	remaining: 470ms
3:	learn: 0.6661520	total: 7.38ms	remaining: 546ms
4:	learn: 0.6617756	total: 9.25ms	remaining: 546ms
5:	learn: 0.6567523	total: 10.6ms	remaining: 520ms
6:	learn: 0.6519768	total: 11.9ms	remaining: 496ms
7:	learn: 0.6483100	total: 12.8ms	remaining: 466ms
8:	learn: 0.6457270	total: 13.7ms	remaining: 443ms
9:	learn: 0.6395348	total: 15.3ms	remaining: 443ms
10:	learn: 0.6348406	total: 16.3ms	remaining: 428ms
11:	learn: 0.6287834	total: 18.2ms	remaining: 437ms
12:	learn: 0.6244050	total: 19.2ms	remaining: 424ms
13:	learn: 0.6194762	total: 20.1ms	remaining: 411ms
14:	learn: 0.6148721	total: 21.2ms	remaining: 402ms
15:	learn: 0.6097979	total: 22.2ms	remaining: 394ms
16:	learn: 0.6046732	total: 23.6ms	remaining: 392ms
17:	learn: 0.5999661	total: 24.6ms	remaining: 386ms
18:	learn: 0.5936067	total: 25.7ms	remaining

In [259]:
NFL.model_results

Unnamed: 0,accuracy,f1,roc_auc
0,0.851852,0.666667,0.921429


In [5]:
df = mfldata.load_qb_data_cleaned()

In [89]:
df = df.dropna()

In [24]:
numerics_only = df.select_dtypes(include='float').dropna()

In [26]:
numerics_only.head(5)

Unnamed: 0,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,seasons_with_draft_team
0,31.0,604.0,893.0,67.6,8403.0,88.0,9.9,16.0,1.8,9.4,10.57,13.9,271.1,175.6,3.0,4.0
1,55.0,661.0,995.0,66.4,9285.0,88.0,8.8,16.0,1.6,9.3,10.38,14.0,168.8,170.8,4.0,2.0
2,35.0,695.0,1110.0,62.6,8148.0,60.0,5.4,27.0,2.4,7.3,7.33,11.7,232.8,137.2,3.0,1.0
3,53.0,1157.0,1645.0,70.3,13253.0,112.0,6.8,45.0,2.7,8.1,8.19,11.5,250.1,155.0,4.0,3.0
4,30.0,408.0,637.0,64.1,4265.0,19.0,3.0,20.0,3.1,6.7,5.88,10.5,142.2,123.9,4.0,1.0


In [220]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('seasons_with_draft_team',
                                                                       axis=1),
                                                                       df['seasons_with_draft_team'].apply(map_response),
                                                                       test_size=.25,
                                                                       shuffle=True,
                                                                       stratify=df['seasons_with_draft_team'].apply(map_response))

In [111]:
X_train, X_test, y_train, y_test = train_test_split(numerics_only.drop('seasons_with_draft_team',
                                                                       axis=1),
                                                                       numerics_only['seasons_with_draft_team'].apply(map_response),
                                                                       test_size=.25,
                                                                       shuffle=True,
                                                                       stratify=df['seasons_with_draft_team'].apply(map_response))

In [None]:
CatBoostClassifier()

In [221]:
model = CatBoostClassifier(one_hot_max_size=5, iterations=300, cat_features=df.select_dtypes(include='object').columns.tolist())
model.fit(X_train, y_train)
y_preds = model.predict(X_test)
y_probs = model.predict_proba(X_test)[:,1]
accuracy_score(y_test, y_preds)
roc_auc_score(y_test, y_probs)

Learning rate set to 0.010625
0:	learn: 0.6860037	total: 2.22ms	remaining: 664ms
1:	learn: 0.6777927	total: 4.22ms	remaining: 629ms
2:	learn: 0.6695467	total: 5.27ms	remaining: 522ms
3:	learn: 0.6632005	total: 7.8ms	remaining: 578ms
4:	learn: 0.6562832	total: 9.6ms	remaining: 567ms
5:	learn: 0.6509253	total: 10.7ms	remaining: 524ms
6:	learn: 0.6448863	total: 11.7ms	remaining: 488ms
7:	learn: 0.6415797	total: 12.2ms	remaining: 446ms
8:	learn: 0.6364113	total: 13.4ms	remaining: 433ms
9:	learn: 0.6290033	total: 14.4ms	remaining: 419ms
10:	learn: 0.6220800	total: 15.3ms	remaining: 403ms
11:	learn: 0.6148267	total: 16.5ms	remaining: 397ms
12:	learn: 0.6101433	total: 18.4ms	remaining: 406ms
13:	learn: 0.6037479	total: 19.6ms	remaining: 400ms
14:	learn: 0.5977267	total: 20.6ms	remaining: 391ms
15:	learn: 0.5923923	total: 21.5ms	remaining: 381ms
16:	learn: 0.5867612	total: 22.4ms	remaining: 373ms
17:	learn: 0.5827596	total: 23.7ms	remaining: 372ms
18:	learn: 0.5771357	total: 24.6ms	remaining: 

0.7928571428571428

In [226]:
X_train

Unnamed: 0,pfr_player_name,round,pick,season,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,name,recent_team
102,Joe Burrow,1,1,2020,11.0,29.0,39.0,74.4,287.0,2.0,5.1,0.0,0.0,7.4,8.38,9.9,26.1,153.1,7.0,Joe Burrow,CIN
97,Kyle Trask,2,64,2021,29.0,552.0,813.0,67.9,7386.0,69.0,8.5,15.0,1.8,9.1,9.95,13.4,254.7,168.5,5.0,Kyle Trask,TB
105,Jordan Love,1,26,2020,38.0,689.0,1125.0,61.2,8600.0,60.0,5.3,29.0,2.6,7.6,7.55,12.5,226.3,137.9,3.0,Jordan Love,GB
146,Nathan Peterman,5,171,2017,10.0,20.0,43.0,46.5,94.0,0.0,0.0,2.0,4.7,2.2,0.09,4.7,9.4,55.6,7.0,Nathan Peterman,BUF
119,Will Grier,3,100,2019,22.0,516.0,785.0,65.7,7354.0,71.0,9.0,20.0,2.5,9.4,10.03,14.3,334.3,169.2,7.0,Will Grier,CAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Jameis Winston,1,1,2015,27.0,562.0,851.0,66.0,7964.0,65.0,7.6,28.0,3.3,9.4,9.41,14.2,295.0,163.3,2.0,Jameis Winston,TB
31,Nick Foles,3,88,2012,1.0,5.0,8.0,62.5,57.0,0.0,0.0,0.0,0.0,7.1,7.13,11.4,57.0,122.4,8.0,Nick Foles,PHI
13,Cam Newton,1,1,2011,6.0,6.0,12.0,50.0,54.0,0.0,0.0,0.0,0.0,4.5,4.50,9.0,9.0,87.8,7.0,Cam Newton,CAR
62,Marcus Mariota,1,2,2015,41.0,779.0,1167.0,66.8,10796.0,105.0,9.0,14.0,1.2,9.3,10.51,13.9,263.3,171.8,3.0,Marcus Mariota,TEN


In [21]:
df.select_dtypes(include='float')

Unnamed: 0,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,seasons,seasons_with_draft_team
0,31.0,604.0,893.0,67.6,8403.0,88.0,9.9,16.0,1.8,9.4,10.57,13.9,271.1,175.6,3.0,4.0
1,55.0,661.0,995.0,66.4,9285.0,88.0,8.8,16.0,1.6,9.3,10.38,14.0,168.8,170.8,4.0,2.0
2,35.0,695.0,1110.0,62.6,8148.0,60.0,5.4,27.0,2.4,7.3,7.33,11.7,232.8,137.2,3.0,1.0
3,53.0,1157.0,1645.0,70.3,13253.0,112.0,6.8,45.0,2.7,8.1,8.19,11.5,250.1,155.0,4.0,3.0
4,30.0,408.0,637.0,64.1,4265.0,19.0,3.0,20.0,3.1,6.7,5.88,10.5,142.2,123.9,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,,,,,,,,,,,,,,,,
145,37.0,614.0,999.0,61.5,7138.0,53.0,5.3,29.0,2.9,7.1,6.90,11.6,192.9,133.2,4.0,2.0
146,10.0,20.0,43.0,46.5,94.0,0.0,0.0,2.0,4.7,2.2,0.09,4.7,9.4,55.6,7.0,2.0
147,38.0,721.0,1189.0,60.6,9972.0,69.0,5.8,24.0,2.0,8.4,8.64,13.8,262.4,146.2,3.0,


In [159]:
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(units=32, activation='tanh'))
model.add(Normalization())
model.add(Dropout(.5))
model.add(Dense(units=16, activation='tanh'))
model.add(Dense(units=8, activation='tanh'))
model.add(Normalization())
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='poisson')
model.fit(X_train, y_train)
y_probs = model.predict(X_test)
y_preds = (y_probs >= 5).astype(int)

roc_auc_score(y_test, y_probs)
accuracy_score(y_test, y_preds)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7525  
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


0.7407407407407407