In [9]:
import nfl_data_py as nfl

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import recall_score, precision_score, precision_recall_curve, f1_score, auc, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest

from keras.layers import Dense, ReLU, Normalization, Dropout
from keras.models import Sequential
from keras.activations import relu, sigmoid
from keras.losses import binary_crossentropy

from xgboost import XGBClassifier

import pickle
import requests
from bs4 import BeautifulSoup
import seaborn as sns

import time
import os

import warnings
warnings.filterwarnings("ignore")

In [66]:
def save_model(model, file_name, cols):
   
    current_dir = os.path.dirname('nfl_model.ipynb')
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    csv_path = os.path.join(parent_dir, "models", f"{file_name}.pkl")

    with open(csv_path, 'wb') as csv_path:
        pickle.dump(model, csv_path)

    #pd.DataFrame(cols).to_csv(csv_path, index=False)

In [11]:
def reset_scaler(run_iter, cols_to_use):

    current_dir = os.path.dirname('nfl_model.ipynb')
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    csv_path = os.path.join(parent_dir, "data", "first_round_qb_training_data.csv")
    
    df = pd.read_csv(csv_path)
    df = df[df['AY/A'].isna() == False]

    mappings = {False:0, True:1}
    df['successful'] = df['successful'].apply(lambda bool: mappings[bool])

    df_scale = df[['season', 'pick', 'age', 'seasons', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate']]
    scaled_cols = ['season', 'pick', 'age', 'seasons', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate']
    new_scaled_cols = df[['G', 'TD', 'pick', 'Int']]

    scaler = StandardScaler()
    scaled_df = scaler.fit_transform(new_scaled_cols)

    scaler_path = f'scaler_{run_iter}.pkl'
    csv_path = os.path.join(parent_dir, "models", scaler_path)
    
    with open(csv_path, 'wb') as csv_path:
        pickle.dump(scaler, csv_path)

In [13]:
current_dir = os.path.dirname('nfl_model.ipynb')
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
csv_path = os.path.join(parent_dir, "data", "first_round_qb_training_data.csv")

In [14]:
df = pd.read_csv(csv_path)

In [16]:
df.columns

Index(['player', 'season', 'round', 'pick', 'team', 'age', 'college',
       'power_5', 'seasons', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%',
       'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'successful'],
      dtype='object')

In [22]:
cols = ['G', 'TD', 'pick', 'Int', 'power_5']

In [50]:
def load_data(run_iter, cols_to_use):

    current_dir = os.path.dirname('nfl_model.ipynb')
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    csv_path = os.path.join(parent_dir, "data", "first_round_qb_training_data.csv")
    
    df = pd.read_csv(csv_path)
    df = df[df['AY/A'].isna() == False]

    mappings = {False:0, True:1}
    df['successful'] = df['successful'].apply(lambda bool: mappings[bool])
    df['power_5'] = df['power_5'].apply(lambda bool: mappings[bool])

    df_final = df[cols_to_use]

    scaler = StandardScaler()
    scaled_df = scaler.fit_transform(df_final)

    scaler_path = f'scaler_{run_iter}.pkl'
    csv_path = os.path.join(parent_dir, "models", scaler_path)
    
    with open(csv_path, 'wb') as csv_path:
        pickle.dump(scaler, csv_path)

    final_df = pd.DataFrame()

    for idx in range(len(cols_to_use)):
        final_df[cols_to_use[idx]] = scaled_df[:,idx]
    
    final_df = pd.concat([final_df.reset_index(), df['successful'].reset_index()],axis=1).drop('index', axis=1)

    return final_df

In [51]:
df_final = load_data(7, cols)

In [52]:
df_final

Unnamed: 0,G,TD,pick,Int,power_5,successful
0,-0.179415,1.035071,-0.956459,-0.427941,-2.104417,0
1,1.973565,1.035071,1.601577,-0.427941,0.475191,0
2,0.627953,-0.31458,-0.210365,1.688315,0.475191,0
3,-0.179415,-0.815879,0.002805,-0.205177,0.475191,0
4,0.179415,-0.468826,0.215975,1.131405,0.475191,0
5,0.448538,0.803702,-0.956459,0.24035,0.475191,1
6,0.71766,0.649456,-0.849874,-0.316559,0.475191,0
7,1.525028,-0.738756,-0.210365,0.128968,0.475191,1
8,-0.269123,0.533772,1.281823,0.79726,0.475191,0
9,0.897075,-0.545949,0.642314,0.908642,0.475191,0


In [34]:
def load_data_unscaled():

    current_dir = os.path.dirname('nfl_model.ipynb')
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    csv_path = os.path.join(parent_dir, "data", "first_round_qb_training_data.csv")
    
    df = pd.read_csv(csv_path)
    df = df[df['AY/A'].isna() == False]
    df = df[['power_5', 'season', 'pick', 'age', 'seasons', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'successful']]

    mappings = {False:0, True:1}
    df['successful'] = df['successful'].apply(lambda bool: mappings[bool])
    df['successful'] = df['successful'].apply(lambda bool: mappings[bool])
    df['power_5'] = df['power_5'].apply(lambda bool: mappings[bool])

    return df

In [235]:
def extract_passing_table(player_name):

    first_name = player_name.split(' ')[0].lower()
    last_name = player_name.split(' ')[1].lower()
    player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-1.html'
    html_content = requests.get(player_url).text
    
    if player_name == 'Zach Wilson':
        first_name = player_name.split(' ')[0].lower()
        last_name = player_name.split(' ')[1].lower()
        player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-3.html'
        html_content = requests.get(player_url).text
        
    if player_name == 'Justin Fields' or player_name == 'Jordan Love':
        first_name = player_name.split(' ')[0].lower()
        last_name = player_name.split(' ')[1].lower()
        player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-2.html'
        html_content = requests.get(player_url).text
    
    if player_name == 'Daniel Jones':
        first_name = player_name.split(' ')[0].lower()
        last_name = player_name.split(' ')[1].lower()
        player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-4.html'
        html_content = requests.get(player_url).text
    
    if player_name == 'Josh Allen':
        first_name = player_name.split(' ')[0].lower()
        last_name = player_name.split(' ')[1].lower()
        player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-7.html'
        html_content = requests.get(player_url).text

    if player_name == 'Mitchell Trubisky':
        first_name = player_name.split(' ')[0].lower()
        first_name = 'Mitch'
        last_name = player_name.split(' ')[1].lower()
        player_url = 'https://www.sports-reference.com/cfb/players/mitch-trubisky-1.html'
        html_content = requests.get(player_url).text

    if player_name == 'Caleb Williams':
        first_name = player_name.split(' ')[0].lower()
        first_name = 'Mitch'
        last_name = player_name.split(' ')[1].lower()
        player_url = 'https://www.sports-reference.com/cfb/players/caleb-williams-3.html'
        html_content = requests.get(player_url).text

    if player_name == 'Anthony Richardson':
        first_name = player_name.split(' ')[0].lower()
        first_name = 'Mitch'
        last_name = player_name.split(' ')[1].lower()
        player_url = 'https://www.sports-reference.com/cfb/players/anthony-richardson-2.html'
        html_content = requests.get(player_url).text

    if len(player_name.split(' ')) > 2:
        first_name = player_name.split(' ')[0].lower()
        last_name = player_name.split(' ')[1].lower()
        suffix = player_name.split(' ')[2].lower()
        player_url = f'https://www.sports-reference.com/cfb/players/{first_name}-{last_name}-{suffix}-1.html'
        html_content = requests.get(player_url).text
    
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', {'id': 'passing_standard'})
    
    if not table:
        print("Passing stats table not found")
        return None

    baseline_headers = []
    headers = [th.getText() for th in table.find_all('th')]
    rows = []
    
    for tr in table.find_all('tr')[1:]:
        cells = [td.getText() for td in tr.find_all('td')]
        if cells: 
            rows.append(cells)
    
    yr_college = len(rows) - 1

    column_names = [
    'G',        
    'Cmp',      
    'Att',      
    'Cmp%',     
    'Yds',      
    'TD',       
    'TD%',      
    'Int',      
    'Int%',     
    'Y/A',      
    'AY/A',     
    'Y/C',      
    'Y/G',      
    'Rate',
    'seasons',
    'name'
    ]
    
    totals = rows[len(rows)-1]
    totals = [stat for stat in totals if stat.strip()]
    totals.append(yr_college)
    totals.append(player_name)

    if len(totals) == len(column_names):
        final = pd.DataFrame({name: [value] for name, value in zip(column_names, totals)})
    else:
        print("Error: Number of stats does not match number of column names.")

    return final

In [291]:
def predict_in_sample_player(player_name):
    features = pd.read_csv('first_round_features_responses.csv')
    df = pd.read_csv('first_round_qb_training_data.csv')
    
    df = df[df['AY/A'].isna() == False]
    features['player_name'] = np.array(df['player'].values)

    sample = features[features['player_name'] == player_name]

    with open('SVM_model_0.pkl', 'rb') as file_path:
        model = pickle.load(file_path)
    
    mapping = {0:'a bust', 1:'not a bust'}
    response = model.predict(sample.drop(['successful', 'player_name'][['G', 'seasons', 'TD', 'pick']], axis=1))[0]
    str_response = mapping[response]
    
    print(f'Model says that {player_name} is {str_response}')
    return response

In [299]:
player_name = 'Shedeur Sanders'
raw_data = extract_passing_table(player_name)

In [304]:
def predict_out_sample_player_test(player_name):

    # testing for Caleb Williams / Drake Maye
    raw_data['age'] = 22
    raw_data['pick'] = 1
    raw_data['power_5'] = True
    raw_data['season'] = 2024
    df = raw_data

    new_data = pd.DataFrame()
    
    for i in ['season', 'pick', 'age', 'seasons', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate']:
        new_data[i] = raw_data[i]
    
    with open('models/scaler.pkl', 'rb') as file_path:
        scaler = pickle.load(file_path)
    x = scaler.transform(new_data)

    
    power_5 = 1
    feature_subset=['G', 'TD', 'pick', 'Int', 'power_5']
    sample = [x[0][4], x[0][9], x[0][1], x[0][11], power_5]
    mappings = {False:0, True:1}

    mappings = {False:0, True:1}
    df['power_5'] = df['power_5'].apply(lambda bool: mappings[bool])

    predict_sample =  [x[0][4], x[0][9], x[0][1], x[0][11], power_5]

    with open('models/SVM_model_1.pkl', 'rb') as file_path:
        model = pickle.load(file_path)

    sample = model.predict(np.array(predict_sample).reshape(1,-1))
    prob = model.predict_proba(np.array(predict_sample).reshape(1,-1))
    
    return sample, prob


In [438]:
player_name = 'Caleb Williams'
raw_data = extract_passing_table(player_name)

In [440]:
def predict_out_sample_player_test_new(player_name):

    #feature_subset=['G', 'TD', 'pick', 'Int', 'power_5']
    #feature_subset=['G', 'TD', 'pick', 'Int', 'power_5', 'seasons']
    # testing for Caleb Williams / Drake Maye
    raw_data['pick'] = 1
    raw_data['power_5'] = 1

    with open('models/SVM_model_3.pkl', 'rb') as file_path:
        model = pickle.load(file_path)

    predict_sample = [int(raw_data['G'].values[0]), int(raw_data['TD'].values[0]), int(raw_data['pick'].values[0]), int(raw_data['Int'].values[0]), int(raw_data['power_5'].values[0]), int(raw_data['seasons'].values[0])]
    print(predict_sample)
    sample = model.predict(np.array(predict_sample).reshape(1,-1))
    prob = model.predict_proba(np.array(predict_sample).reshape(1,-1))
    
    return sample, prob


In [441]:
test = predict_out_sample_player_test_new('test')
print(test[0])
print(test[1])

[26, 72, 1, 10, 1, 6]
[0]
[[0.62534866 0.37465134]]


In [305]:
test = predict_out_sample_player_test('Shedeur Sanders')
print(test[0])
print(test[1])



In [37]:
def create_training_data(df_final, feature_subset=[]):
    if feature_subset == []:
        X_train, X_test, y_train, y_test = train_test_split(df_final.drop('successful', axis=1), df_final['successful'], test_size=.3, stratify=df_final['successful'])
    else:
        X_train, X_test, y_train, y_test = train_test_split(df_final[feature_subset], df_final['successful'], test_size=.3, stratify=df_final['successful'])

    return X_train, X_test, y_train, y_test

# model mess around space

In [35]:
df_final = load_data(8, cols)
df_final.columns

Index(['G', 'TD', 'pick', 'Int', 'power_5'], dtype='object')

In [144]:
X_new = SelectKBest(k=8).fit_transform(df_final.drop('successful',axis=1), df_final['successful'])
df_testing = pd.concat([pd.DataFrame(X_new).reset_index(), df_final['successful'].reset_index()],axis=1).reset_index().drop(['index','level_0'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(df_testing.drop('successful',axis=1), df_testing['successful'], stratify=df_testing['successful'], test_size=.3)

In [54]:
X_train, X_test, y_train, y_test = create_training_data(df_final, feature_subset=[])

In [58]:
def run_SVC(X_train, X_test, y_train, y_test):
    
    df_features = pd.concat([X_train, X_test])
    df_responses = pd.concat([y_train, y_test])

    k_strat = StratifiedKFold(n_splits=4, shuffle=False)
    folds = k_strat.split(df_features, df_responses)
    
    acc = []
    roc = []
    
    for train, test in folds:
        
        X_train, X_test = df_features.iloc[train], df_features.iloc[test]
        y_train, y_test = df_responses.values[train], df_responses.values[test]
        
        model = SVC(kernel='rbf', probability=True)
        
        model.fit(X_train, y_train)
        y_preds = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        acc.append(accuracy_score(y_test, y_preds))
        roc.append(roc_auc_score(y_test, y_proba[:,1]))
    
    print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')

    model.fit(df_features, df_responses)

    return model, X_train.columns

x = run_SVC(X_train, X_test, y_train, y_test)

Accuracy: 0.8416666666666667, ROC: 0.8820833333333334


In [67]:
save_model(x[0], 'SVM_model_final', [])

In [59]:
def run_RF(X_train, X_test, y_train, y_test):

    df_features = pd.concat([X_train, X_test])
    df_responses = pd.concat([y_train, y_test])

    k_strat = StratifiedKFold(n_splits=5, shuffle=False)
    folds = k_strat.split(df_features, df_responses)
    
    acc = []
    roc = []
    
    for train, test in folds:
        
        X_train, X_test = df_features.iloc[train], df_features.iloc[test]
        y_train, y_test = df_responses.values[train], df_responses.values[test]
        
        model = XGBClassifier()
        
        model.fit(X_train, y_train)
        y_preds = model.predict(X_test)
        y_preds = (y_preds >= 5).astype(int)
        y_proba = model.predict_proba(X_test)

        acc.append(accuracy_score(y_test, y_preds))
        roc.append(roc_auc_score(y_test, y_proba[:,1]))
    
    print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')
    feature_dict = {i:j for i, j in zip(X_train.columns, model.feature_importances_)}
    
run_RF(X_train, X_test, y_train, y_test)

Accuracy: 0.5535714285714285, ROC: 0.7458333333333333


In [60]:
def run_LR(X_train, X_test, y_train, y_test):
    
    df_features = pd.concat([X_train, X_test])
    df_responses = pd.concat([y_train, y_test])

    k_strat = StratifiedKFold(n_splits=5, shuffle=False)
    folds = k_strat.split(df_features, df_responses)
    
    acc = []
    roc = []
    
    for train, test in folds:
        
        X_train, X_test = df_features.iloc[train], df_features.iloc[test]
        y_train, y_test = df_responses.values[train], df_responses.values[test]
        
        model = LogisticRegression()
        
        model.fit(X_train, y_train)
        y_preds = model.predict(X_test)
        y_preds = (y_preds >= 5).astype(int)
        y_proba = model.predict_proba(X_test)

        acc.append(accuracy_score(y_test, y_preds))
        roc.append(roc_auc_score(y_test, y_proba[:,1]))
    
    print(f'Accuracy: {np.average(acc)}, ROC: {np.average(roc)}')

run_LR(X_train, X_test, y_train, y_test)


Accuracy: 0.5535714285714285, ROC: 0.6525


In [61]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_preds_LR = model.predict(X_test)
y_probs_LR = model.predict_proba(X_test)

accuracy_LR = accuracy_score(y_test, y_preds_LR)
roc_LR = roc_auc_score(y_test, y_probs_LR[:,1])
print('---Linear Regresion---')
print(accuracy_LR)
print(roc_LR)
print()

model = SVC(kernel='rbf', probability=True)
model.fit(X_train, y_train)
y_preds_SVC = model.predict(X_test)
y_probs_SVC = model.predict_proba(X_test)[:,1]
ROC = roc_auc_score(y_test, y_probs_SVC)

y_preds_SVC = (y_preds_SVC >= .5).astype(int)
accuracy_SVC = accuracy_score(y_test, y_preds_SVC)

print('---SVM---')
print(ROC)
print(accuracy_SVC)
print()

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_preds_RF = model.predict(X_test)
y_probs_RF = model.predict_proba(X_test)

accuracy_RF = accuracy_score(y_test, y_preds_RF)
roc_RF = roc_auc_score(y_test, y_probs_RF[:,1])
print('---Random Forest---')
print(accuracy_RF)
print(roc_RF)
print()

model = Sequential()

model.add(Dense(units=32, input_dim=X_test.shape[1], activation='tanh'))
model.add(Normalization())
model.add(Dropout(.5))
model.add(Dense(units=16, activation='tanh'))
model.add(Normalization())
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy')


model.fit(X_train, y_train)
y_preds_NN = model.predict(X_test)
y_preds_NN = (y_preds_NN >= .5).astype(int)

accuracy_NN = accuracy_score(y_test, y_preds_NN)

print('Neural Network')
print(accuracy_NN)

---Linear Regresion---
0.5833333333333334
0.5142857142857142

---SVM---
0.7714285714285715
0.8333333333333334

---Random Forest---
0.6666666666666666
0.7

Neural Network
0.5


In [1153]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_preds_RF = model.predict(X_test)
y_probs_RF = model.predict_proba(X_test)

accuracy_RF = accuracy_score(y_test, y_preds_RF)
roc_RF = roc_auc_score(y_test, y_probs_RF[:,1])
print('---Random Forest---')
print(accuracy_RF)
print(roc_RF)
print()

---Random Forest---
0.8333333333333334
0.8714285714285714



In [160]:
model = SVC(kernel='rbf', probability=True)
model.fit(X_train, y_train)
y_preds_SVC = model.predict(X_test)
y_probs_SVC = model.predict_proba(X_test)[:,1]
ROC = roc_auc_score(y_test, y_probs_SVC)

y_preds_SVC = (y_preds_SVC >= .5).astype(int)
accuracy_SVC = accuracy_score(y_test, y_preds_SVC)

print('---SVM---')
print(ROC)
print(accuracy_SVC)
print()

---SVM---
0.6285714285714286
0.5833333333333334



In [48]:
model.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

In [221]:
def save_model(model, file_name):

    with open(file_name, 'wb') as file_path:
        pickle.dump(model, file_path)

In [49]:
with open('SVM_model_0.pkl', 'wb') as file_path:
    pickle.dump(model, file_path)

In [162]:
def run_baseline(df):

    k_strat = StratifiedKFold(n_splits=5, shuffle=False)
    folds = k_strat.split(df.drop('successful',axis=1), df['successful'])
    acc = []
    roc = []
    
    for train, test in folds:
        X_train, X_test = df.drop('successful',axis=1).iloc[train], df.drop('successful',axis=1).iloc[test]
        y_train, y_test = df['successful'].values[train], df['successful'].values[test]

        y_preds = np.tile(0, len(y_test))
        accuracy = accuracy_score(y_test, y_preds)
        roc_auc = roc_auc_score(y_test, y_preds)

        acc.append(accuracy)
        roc.append(roc_auc)

    print(np.average(acc), np.average(roc))

In [163]:
run_baseline(df_final)

0.5535714285714285 0.5


In [164]:
def run_RF(df):

    k_strat = StratifiedKFold(n_splits=2)
    folds = k_strat.split(df[['pick','seasons','Int', 'power_5', 'G','Yds','TD']], df['successful'])
    acc = []
    roc = []
    
    for train, test in folds:
        X_train, X_test = df[['pick','seasons','Int', 'power_5', 'G','Yds','TD']].iloc[train], df[['pick','seasons','Int', 'power_5', 'G','Yds','TD']].iloc[test]
        y_train, y_test = df['successful'].values[train], df['successful'].values[test]

        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        
        y_preds = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1]
        
        accuracy = accuracy_score(y_preds, y_test)
        roc_auc = roc_auc_score(y_test, y_proba)

        print(accuracy)
        print(roc_auc)
        acc.append(accuracy)
        roc.append(roc_auc)

    print(np.average(acc), np.average(roc))

In [732]:
df_final = load_data()

In [736]:
run_RF(df_final)

0.631578947368421
0.6761363636363636
0.47368421052631576
0.6277777777777778
0.5526315789473684 0.6519570707070708


In [250]:
df_final.to_csv('first_round_features_responses.csv',index=False)

In [241]:
pd.read_csv('first_round_qb_training_data.csv').shape[0]

40

In [575]:
def run_baseline(df):

    k_strat = StratifiedKFold(n_splits=5, shuffle=False)
    folds = k_strat.split(df.drop(['successful', 'season', 'AY/A', 'pick'],axis=1), df['successful'])
    acc = []
    roc = []
    
    for train, test in folds:
        X_train, X_test = df.drop(['successful', 'season', 'AY/A', 'pick'],axis=1).iloc[train], df.drop(['successful', 'season', 'AY/A', 'pick'],axis=1).iloc[test]
        y_train, y_test = df['successful'].values[train], df['successful'].values[test]

        y_preds = np.tile(0, len(y_test))
        accuracy = accuracy_score(y_test, y_preds)
        roc_auc = roc_auc_score(y_test, y_preds)

        acc.append(accuracy)
        roc.append(roc_auc)

    print(np.average(acc), np.average(roc))



In [1042]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_preds_LR = model.predict(X_test)
y_probs_LR = model.predict_proba(X_test)

accuracy_LR = accuracy_score(y_test, y_preds_LR)
roc_LR = roc_auc_score(y_test, y_probs_LR[:,1])
print(accuracy_LR)
print(roc_LR)

model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)
y_preds_SVC = model.predict(X_test)
y_probs_SVC = model.predict_proba(X_test)[:,1]
ROC = roc_auc_score(y_test, y_probs_SVC)

y_preds_SVC = (y_preds_SVC >= .5).astype(int)
accuracy_SVC = accuracy_score(y_test, y_preds_SVC)

print(ROC)
print(accuracy_SVC)

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_preds_RF = model.predict(X_test)
y_probs_RF = model.predict_proba(X_test)

accuracy_RF = accuracy_score(y_test, y_preds_RF)
roc_RF = roc_auc_score(y_test, y_probs_RF[:,1])
print(accuracy_RF)
print(roc_RF)

0.6666666666666666
0.6571428571428571
0.6285714285714286
0.6666666666666666
0.75
0.7142857142857143


In [852]:
from xgboost import XGBClassifier

In [865]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_preds_RF = model.predict(X_test)
y_probs_RF = model.predict_proba(X_test)

accuracy_RF = accuracy_score(y_test, y_preds_RF)
roc_RF = roc_auc_score(y_test, y_probs_RF[:,1])
print(accuracy_RF)
print(roc_RF)

0.6
0.5833333333333333


In [1073]:
model = Sequential()

model.add(Dense(units=32, input_dim=X_test.shape[1], activation='tanh'))
model.add(Normalization())
model.add(Dropout(.5))
model.add(Dense(units=16, activation='tanh'))
model.add(Normalization())
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy')


model.fit(X_train, y_train)
y_preds_NN = model.predict(X_test)
y_preds_NN = (y_preds_NN >= .5).astype(int)

accuracy_NN = accuracy_score(y_test, y_preds_NN)

print(accuracy_NN)

0.6666666666666666


In [170]:
accuracy_NN

0.7142857142857143

In [126]:
accuracy_NN

0.5