In [10]:
import nfl_data_py as nfl

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import recall_score, precision_score, precision_recall_curve, f1_score, auc, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest

from keras.layers import Dense, ReLU, Normalization, Dropout
from keras.models import Sequential
from keras.activations import relu, sigmoid
from keras.losses import binary_crossentropy

from xgboost import XGBClassifier

import pickle
import requests
from bs4 import BeautifulSoup
import seaborn as sns

import time
import os

import warnings
import sys

warnings.filterwarnings("ignore")

sys.path.append("~/Desktop/MfL/")

In [11]:
pd.read_csv('~/Desktop/MfL/first_round_qb_training_data.csv')

Unnamed: 0,player,season,round,pick,team,age,college,power_5,seasons,G,...,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,successful
0,Sam Bradford,2010,1,1,LA,22.0,Oklahoma,False,3.0,31.0,...,88.0,9.9,16.0,1.8,9.4,10.57,13.9,271.1,175.6,False
1,Tim Tebow,2010,1,25,DEN,23.0,Florida,True,4.0,55.0,...,88.0,8.8,16.0,1.6,9.3,10.38,14.0,168.8,170.8,False
2,Jake Locker,2011,1,8,TEN,23.0,Washington,True,4.0,40.0,...,53.0,4.6,35.0,3.1,6.7,6.21,12.3,191.0,119.1,False
3,Blaine Gabbert,2011,1,10,JAX,21.0,Missouri,True,3.0,31.0,...,40.0,4.3,18.0,1.9,7.3,7.3,12.0,220.1,132.6,False
4,Christian Ponder,2011,1,12,MIN,23.0,Florida St.,True,4.0,35.0,...,49.0,5.1,30.0,3.1,7.1,6.74,11.5,196.3,132.1,False
5,Andrew Luck,2012,1,1,IND,22.0,Stanford,True,3.0,38.0,...,82.0,7.7,22.0,2.1,8.9,9.47,13.2,248.2,162.8,True
6,Robert Griffin III,2012,1,2,WAS,22.0,Baylor,True,4.0,41.0,...,78.0,6.5,17.0,1.4,8.7,9.36,13.0,252.8,158.9,False
7,Ryan Tannehill,2012,1,8,MIA,24.0,Texas A&M,True,4.0,50.0,...,42.0,5.4,21.0,2.7,7.0,6.91,11.3,109.0,134.2,True
8,Brandon Weeden,2012,1,22,CLE,28.0,Oklahoma St.,True,4.0,30.0,...,75.0,6.8,27.0,2.4,8.4,8.65,12.1,308.7,157.6,False
9,EJ Manuel,2013,1,16,BUF,23.0,Florida St.,True,4.0,43.0,...,47.0,5.2,28.0,3.1,8.6,8.27,12.9,180.0,150.4,False


In [12]:
rd1_qb_data = pd.read_csv('~/Desktop/MfL/first_round_qb_training_data.csv')


In [14]:
rd1_qb_data.columns

Index(['player', 'season', 'round', 'pick', 'team', 'age', 'college',
       'power_5', 'seasons', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%',
       'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'successful'],
      dtype='object')

In [82]:
class BustModel:
    
    # charts here?
    rd1_qb_data = pd.read_csv('~/Desktop/MfL/first_round_qb_training_data.csv')
    rd1_qb_list = rd1_qb_data['player'].unique()
    valid_feature_list = rd1_qb_data.columns
    response_distribution = rd1_qb_data['successful'].value_counts()

    def __init__(self, save_model=False, 
                 model_list=['lr', 'svm', 'xgboost', 'rf'],
                 feature_list=['G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%',
                            'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate'],
                 trial_id= 'abc',
                 scale_data=False):

        self.save_model = save_model
        self.model_list = model_list
        self.feature_list = feature_list
        self.trial_id = trial_id
        self.scale_data = scale_data

    def load_data(self):
        
        df = BustModel.rd1_qb_data
        df = df[df['AY/A'].isna() == False]
        
        self.df = df

        mappings = {False:0, True:1}
        df['successful'] = df['successful'].apply(lambda bool: mappings[bool])
        df['power_5'] = df['power_5'].apply(lambda bool: mappings[bool])

        df_final = df[self.feature_list]

        if self.scale_data:
            scaler = StandardScaler()
            scaled_df = scaler.fit_transform(df_final)

            df_final = pd.DataFrame()

            for idx in range(len(self.feature_list)):
                df_final[self.feature_list[idx]] = scaled_df[:,idx]
        
        df_final = pd.concat([df_final.reset_index(), df['successful'].reset_index()],axis=1).drop('index', axis=1)
        
        self.cleaned_features = df_final
    
    def create_training_data(self):
        
        self.load_data()
        
        if self.feature_list == 'all':
            X_train, X_test, y_train, y_test = train_test_split(self.df.drop('successful', axis=1), self.df['successful'], test_size=.3, stratify=self.df['successful'])
        else:
            X_train, X_test, y_train, y_test = train_test_split(self.df[self.feature_list], self.df['successful'], test_size=.3, stratify=self.df['successful'])

        self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test

    def run_SVM(self, kernel='rbf', save_SVM=False):

        self.create_training_data()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test 

        df_features = pd.concat([X_train, X_test])
        df_responses = pd.concat([y_train, y_test])

        k_strat = StratifiedKFold(n_splits=4, shuffle=False)
        folds = k_strat.split(df_features, df_responses)
        
        acc = []
        roc = []
        
        for train, test in folds:
            
            X_train, X_test = df_features.iloc[train], df_features.iloc[test]
            y_train, y_test = df_responses.values[train], df_responses.values[test]
            
            model = SVC(kernel=kernel, probability=True)
            
            model.fit(X_train, y_train)
            y_preds = model.predict(X_test)
            y_proba = model.predict_proba(X_test)

            acc.append(accuracy_score(y_test, y_preds))
            roc.append(roc_auc_score(y_test, y_proba[:,1]))
        
        print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')

        model.fit(df_features, df_responses)

        self.SVM_model = model

        return 'SVM: SUCCESS'
    
    def run_RF(self):

        self.create_training_data()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test 

        df_features = pd.concat([X_train, X_test])
        df_responses = pd.concat([y_train, y_test])

        k_strat = StratifiedKFold(n_splits=5, shuffle=False)
        folds = k_strat.split(df_features, df_responses)
        
        acc = []
        roc = []
        
        for train, test in folds:
            
            X_train, X_test = df_features.iloc[train], df_features.iloc[test]
            y_train, y_test = df_responses.values[train], df_responses.values[test]
            
            model = RandomForestClassifier()
            
            model.fit(X_train, y_train)
            y_preds = model.predict(X_test)
            y_preds = (y_preds >= 5).astype(int)
            y_proba = model.predict_proba(X_test)

            acc.append(accuracy_score(y_test, y_preds))
            roc.append(roc_auc_score(y_test, y_proba[:,1]))
        
        print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')
        
        return "SUCCESS: Random Forest"
    
    def run_LR(self):
        
        self.create_training_data()
        X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test 
        
        df_features = pd.concat([X_train, X_test])
        df_responses = pd.concat([y_train, y_test])

        k_strat = StratifiedKFold(n_splits=5, shuffle=False)
        folds = k_strat.split(df_features, df_responses)
        
        acc = []
        roc = []
        
        for train, test in folds:
            
            X_train, X_test = df_features.iloc[train], df_features.iloc[test]
            y_train, y_test = df_responses.values[train], df_responses.values[test]
            
            model = LogisticRegression()
            
            model.fit(X_train, y_train)
            y_preds = model.predict(X_test)
            y_preds = (y_preds >= 5).astype(int)
            y_proba = model.predict_proba(X_test)

            acc.append(accuracy_score(y_test, y_preds))
            roc.append(roc_auc_score(y_test, y_proba[:,1]))
        
        print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')

        return 'SUCCESS:'
    
    def run_model_list(self):
        pass
    
    # fix to include qb names
    def preview_feature_set(self):
        return self.load_data()

In [83]:
hi = BustModel(scale_data=False)

In [85]:
hi.run_SVM()

TypeError: BustModel.create_training_data() got an unexpected keyword argument 'feature_list'