In [13]:
import skimpy

In [14]:
import nfl_data_py as nfl

import pandas as pd
import numpy as np
from skimpy import skim

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import recall_score, precision_score, precision_recall_curve, f1_score, auc, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest

from keras.layers import Dense, ReLU, Normalization, Dropout
from keras.models import Sequential
from keras.activations import relu, sigmoid
from keras.losses import binary_crossentropy

from xgboost import XGBClassifier

import pickle
import requests
from bs4 import BeautifulSoup
import seaborn as sns

import time
import os

import warnings
import sys

warnings.filterwarnings("ignore")

sys.path.append("~/Desktop/MfL/")

In [12]:
rd1_qb_data = pd.read_csv('~/Desktop/MfL/first_round_qb_training_data.csv')

In [11]:
nfl.import_players()

Unnamed: 0,status,display_name,first_name,last_name,esb_id,gsis_id,birth_date,college_name,position_group,position,...,status_description_abbr,status_short_description,gsis_it_id,short_name,smart_id,headshot,draft_number,draftround,uniform_number,suffix
0,RET,'Omar Ellison,'Omar,Ellison,ELL711319,00-0004866,1971-10-08,,WR,WR,...,,,,,3200454c-4c71-1319-728e-d49d3d236f8f,,,,,
1,ACT,A'Shawn Robinson,A'Shawn,Robinson,ROB367960,00-0032889,1995-03-21,Alabama,DL,DE,...,A01,Active,43335.0,A.Robinson,3200524f-4236-7960-bf20-bc060ac0f49c,https://static.www.nfl.com/image/upload/f_auto...,46.0,2.0,94,
2,RES,A.J. Arcuri,A.J.,Arcuri,ARC716900,00-0037845,1997-08-13,Michigan State,OL,T,...,R23,Reserve/Future,54726.0,A.Arcuri,32004152-4371-6900-5185-8cdd66b2ad11,https://static.www.nfl.com/image/upload/f_auto...,261.0,7.0,61,
3,ACT,A.J. Barner,A.J.,Barner,BAR235889,00-0039793,2002-05-03,Michigan,TE,TE,...,A01,Active,57242.0,A.Barner,32004241-5223-5889-95d9-0ba3aeeb36ed,https://static.www.nfl.com/image/upload/f_auto...,121.0,4.0,88,
4,RES,A.J. Bouye,Arlandus,Bouye,BOU651714,00-0030228,1991-08-16,Central Florida,DB,CB,...,R01,R/Injured,40688.0,A.Bouye,3200424f-5565-1714-cb38-07c822111a12,https://static.www.nfl.com/image/private/f_aut...,,,24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20749,CUT,Zuri Henry,Zuri,Henry,HEN713594,00-0039689,2000-04-12,Texas-El Paso,OL,T,...,W03,Waivers/No Rec.,57839.0,Z.Henry,32004845-4e71-3594-d6f6-5f3aaca3acfc,,,,65,
20750,RET,Zuriel Smith,Zuriel,Smith,SMI828252,00-0022024,1980-01-15,,WR,WR,...,,,,,3200534d-4982-8252-c800-85b370612021,,,,,
20751,CUT,Zurlon Tipton,Zurlon,Tipton,TIP645432,00-0030855,1989-04-27,Central Michigan,RB,RB,...,A01,Active,41693.0,,32005449-5064-5432-923a-94c7fae9b469,https://static.www.nfl.com/image/private/f_aut...,,,,
20752,DEV,Zyon Gilbert,Zyon,Gilbert,GIL144859,00-0037373,1999-02-04,Florida Atlantic,DB,CB,...,P02,Prac Sq.; Inj,54958.0,Z.Gilbert,32004749-4c14-4859-6e4a-327bb4fcec5a,https://static.www.nfl.com/image/upload/f_auto...,,,42,


In [14]:
rd1_qb_data.columns

Index(['player', 'season', 'round', 'pick', 'team', 'age', 'college',
       'power_5', 'seasons', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%',
       'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'successful'],
      dtype='object')

In [None]:
class BustModel:
    
    # charts here?
    # add player cleaning pipeline
    rd1_qb_data = pd.read_csv('~/Desktop/MfL/first_round_qb_training_data.csv')
    rd1_qb_list = rd1_qb_data['player'].unique()
    valid_feature_list = rd1_qb_data.columns
    response_distribution = rd1_qb_data['successful'].value_counts()

    def __init__(self, 
                 save_model=False, 
                 model_list=['lr', 'svm', 'rf'],
                 feature_list=['G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%',
                            'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate'],
                 trial_id= 'abc',
                 scale_data=False):

        self.save_model = save_model
        self.model_list = model_list
        self.feature_list = feature_list
        self.trial_id = trial_id
        self.scale_data = scale_data
        self.rd1_qb_summary = skim(BustModel.rd1_qb_data)

    def load_data(self):
        
        df = BustModel.rd1_qb_data
        df = df[df['AY/A'].isna() == False]
        
        self.df = df

        mappings = {False:0, True:1}
        df['successful'] = df['successful'].apply(lambda bool: mappings[bool])
        df['power_5'] = df['power_5'].apply(lambda bool: mappings[bool])

        df_final = df[self.feature_list]

        if self.scale_data:
            scaler = StandardScaler()
            scaled_df = scaler.fit_transform(df_final)

            df_final = pd.DataFrame()

            for idx in range(len(self.feature_list)):
                df_final[self.feature_list[idx]] = scaled_df[:,idx]
        
        df_final = pd.concat([df_final.reset_index(), df['successful'].reset_index()],axis=1).drop('index', axis=1)
        
        self.cleaned_features = df_final
    
    def create_training_data(self):
        
        self.load_data()
        
        if self.feature_list == 'all':
            X_train, X_test, y_train, y_test = train_test_split(self.df.drop('successful', axis=1), self.df['successful'], test_size=.3, stratify=self.df['successful'])
        else:
            X_train, X_test, y_train, y_test = train_test_split(self.df[self.feature_list], self.df['successful'], test_size=.3, stratify=self.df['successful'])

        self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test

    def run_SVM(self, kernel='rbf', save_SVM=False):

        self.create_training_data()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test 

        df_features = pd.concat([X_train, X_test])
        df_responses = pd.concat([y_train, y_test])

        k_strat = StratifiedKFold(n_splits=4, shuffle=False)
        folds = k_strat.split(df_features, df_responses)
        
        acc = []
        roc = []
        
        for train, test in folds:
            
            X_train, X_test = df_features.iloc[train], df_features.iloc[test]
            y_train, y_test = df_responses.values[train], df_responses.values[test]
            
            model = SVC(kernel=kernel, probability=True)
            
            model.fit(X_train, y_train)
            y_preds = model.predict(X_test)
            y_proba = model.predict_proba(X_test)

            acc.append(accuracy_score(y_test, y_preds))
            roc.append(roc_auc_score(y_test, y_proba[:,1]))
        
        print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')

        model.fit(df_features, df_responses)

        self.SVM_model = model

        result_dict = {
            
        }

        return 'SVM: SUCCESS'
    
    def run_RF(self):

        self.create_training_data()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test 

        df_features = pd.concat([X_train, X_test])
        df_responses = pd.concat([y_train, y_test])

        k_strat = StratifiedKFold(n_splits=5, shuffle=False)
        folds = k_strat.split(df_features, df_responses)
        
        acc = []
        roc = []
        
        actuals = []
        preds = []
        probs = []
        
        for train, test in folds:
            
            X_train, X_test = df_features.iloc[train], df_features.iloc[test]
            y_train, y_test = df_responses.values[train], df_responses.values[test]
            
            model = RandomForestClassifier()
            
            model.fit(X_train, y_train)
            y_preds = model.predict(X_test)
            y_preds = (y_preds >= 5).astype(int)
            y_proba = model.predict_proba(X_test)

            actuals.extend(y_test)
            preds.extend(y_preds)
            probs.extend(y_proba)

            acc.append(accuracy_score(y_test, y_preds))
            roc.append(roc_auc_score(y_test, y_proba[:,1]))
        
        print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')

        self.RF_acc = acc

        acc = np.average(acc)
        auc = np.average(roc)

        return (actuals, preds, probs)
    
    def run_LR(self):
        
        self.create_training_data()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test 
        
        df_features = pd.concat([X_train, X_test])
        df_responses = pd.concat([y_train, y_test])

        k_strat = StratifiedKFold(n_splits=5, shuffle=False)
        folds = k_strat.split(df_features, df_responses)
        
        acc = []
        roc = []
        
        for train, test in folds:
            
            X_train, X_test = df_features.iloc[train], df_features.iloc[test]
            y_train, y_test = df_responses.values[train], df_responses.values[test]
            
            model = LogisticRegression()
            
            model.fit(X_train, y_train)
            y_preds = model.predict(X_test)
            y_preds = (y_preds >= 5).astype(int)
            y_proba = model.predict_proba(X_test)

            acc.append(accuracy_score(y_test, y_preds))
            roc.append(roc_auc_score(y_test, y_proba[:,1]))
        
        print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')

        return 'SUCCESS:'
    
    
    def run_NFL_model(self, model_name, **kwargs):
        model_dict = {
            'lr' : LogisticRegression(),
            'rf' : RandomForestClassifier(),
            'xgb': XGBClassifier(),
            'svm': SVC()
        }

    def run_model_simulation(self):
        self.model_dict = {
            'lr': self.run_LR(),
            'svm': self.run_SVM(),
            'rf': self.run_RF(),
            'model1':...,
            'model2':...
        }

        models_to_use = [self.model_dict[key] for key in self.model_list]

        print("Running sim...\n")
        
        results = []
        for model in models_to_use:
            result = model
            results.append(model)
        
        return results
    
    def predict_player(self, player_name='Caleb Williams', use_model='best'):
        raw_data['pick'] = 1
        raw_data['power_5'] = 1

        with open('models/SVM_model_3.pkl', 'rb') as file_path:
            model = pickle.load(file_path)

        predict_sample = [int(raw_data['G'].values[0]), int(raw_data['TD'].values[0]), int(raw_data['pick'].values[0]), int(raw_data['Int'].values[0]), int(raw_data['power_5'].values[0]), int(raw_data['seasons'].values[0])]
        print(predict_sample)
        sample = model.predict(np.array(predict_sample).reshape(1,-1))
        prob = model.predict_proba(np.array(predict_sample).reshape(1,-1))

        return sample, prob

    # fix to include qb names
    def preview_feature_set(self):
        return self.load_data()

In [None]:
def predict_out_sample_player_test_new(player_name):

    #feature_subset=['G', 'TD', 'pick', 'Int', 'power_5']
    #feature_subset=['G', 'TD', 'pick', 'Int', 'power_5', 'seasons']
    # testing for Caleb Williams / Drake Maye
    raw_data['pick'] = 1
    raw_data['power_5'] = 1

    with open('models/SVM_model_3.pkl', 'rb') as file_path:
        model = pickle.load(file_path)

    predict_sample = [int(raw_data['G'].values[0]), int(raw_data['TD'].values[0]), int(raw_data['pick'].values[0]), int(raw_data['Int'].values[0]), int(raw_data['power_5'].values[0]), int(raw_data['seasons'].values[0])]
    print(predict_sample)
    sample = model.predict(np.array(predict_sample).reshape(1,-1))
    prob = model.predict_proba(np.array(predict_sample).reshape(1,-1))
    
    return sample, prob

In [9]:
nfl_model = BustModel(scale_data=False)

In [10]:
nfl_model.run_model_simulation()

Accuracy: 0.5535714285714285, ROC: 0.6733333333333333
Accuracy: 0.6555555555555554, ROC: 0.7841666666666667
Accuracy: 0.5535714285714285, ROC: 0.665
Running sim...



['SUCCESS:', 'SVM: SUCCESS', 'SUCCESS: Random Forest']