## Baseball HOF Classifier

### Data Preprocessing:

First, we will clean up our data and join together the tables to create a finished table with all the features we need to build our models.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Start with the fangraphs table
fg = pd.read_csv('fangraphs/fg_career_data.csv')
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,HBP,SF,SH,GDP,BB/K,wRAA,wRC,WPA,RE24,playerid
0,Eduardo Rodriguez,Brewers,30,1,0,1,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,,,1011102
1,Scott Munninghoff,Phillies,4,1,0,1,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,0.07,0.92,1009324
2,Eric Cammack,Mets,8,1,0,0,1.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.1,1,0.0,1.0,1001864
3,Frank O'Connor,Phillies,3,2,1,1,3.0,0.0,1.5,1.0,...,0.0,,,,0.0,1.9,2,,,1009712
4,Hub Knolls,Superbas,2,2,0,0,0.0,0.0,1.0,,...,0.0,,1.0,,,1.5,2,,,1007047


In [3]:
# Filter to just batters who played at least 1200 games
fg = fg[fg['G'] >= 1200]
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,HBP,SF,SH,GDP,BB/K,wRAA,wRC,WPA,RE24,playerid
249,Babe Ruth,- - -,2503,10616,714,2174,2217.0,123.0,0.348,0.34,...,43.0,,113.0,2.0,1.55,1437.2,2727,,,1011327
265,Ted Williams,Red Sox,2292,9791,521,1798,1839.0,24.0,0.289,0.328,...,39.0,20.0,5.0,197.0,2.85,1219.8,2349,,,1014040
310,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.332,...,45.0,,106.0,2.0,1.91,1046.8,2265,,,1004598
354,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.336,...,13.0,,71.0,69.0,1.11,934.2,2136,,,1004285
357,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.365,...,48.0,,216.0,3.0,1.53,899.0,2018,,,1006030


In [4]:
# id mapping table
id_map = pd.read_csv('mlb_rosetta-master/mlb_rosetta.csv', dtype={'retrosheet_id': object, 'lahman_id': object, 'baseball_reference_id': object})
id_map.head()

Unnamed: 0,id,first,last,current,bis_id,bis_milb_id,retrosheet_id,stats_inc_id,baseball_db_id,baseball_prospectus_id,lahman_id,westbay_id,korea_kbo_id,japan_npb_id,baseball_reference_id,uuid,duplicate,created_at,updated_at
0,110001,Hank,Aaron,,1000001.0,,aaroh101,,,AARON19340205A,aaronha01,,,,aaronha01,5a36cc6f-e91d-4cbe-b7a4-25178b6a6123,,2010-12-30 04:39:45,2010-12-30 04:39:45
1,110002,Tommie,Aaron,,1000002.0,,aarot101,,,AARON19390805A,aaronto01,,,,aaronto01,a3f2f0b1-6c75-42dc-a29b-98ea86396fb2,,2010-12-30 04:39:45,2010-12-30 04:39:45
2,110003,Don,Aase,,1000003.0,,aased001,,,AASE19540908A,aasedo01,,,,aasedo01,d7b213bc-85e2-49ab-8911-1b9b26e4d327,,2010-12-30 04:39:45,2011-03-18 23:52:53
3,110004,John,Abadie,,1000004.0,,abadj101,,,ABADIE18541104A,abadijo01,,,,abadijo01,14973d8b-c6eb-44f3-b14b-0c245ec56515,,2010-12-30 04:39:45,2010-12-30 04:39:45
4,110005,Ed,Abbaticchio,,1000005.0,,abbae101,,,ABBATICCH18770415A,abbated01,,,,abbated01,e51cc3d1-ec1f-43d7-a5f1-e62bddc3ea3d,,2010-12-30 04:39:45,2010-12-30 04:39:45


In [5]:
# Just keep lahman_id and bis_id (fangraphs id)
id_map = id_map[['bis_id', 'lahman_id', 'baseball_reference_id']]
# Remove rows where there is no lahman_id
id_map = id_map.dropna(subset=['lahman_id'])
# Remove rows where there is no baseball_reference_id
id_map = id_map.dropna(subset=['baseball_reference_id'])
id_map.head()

Unnamed: 0,bis_id,lahman_id,baseball_reference_id
0,1000001.0,aaronha01,aaronha01
1,1000002.0,aaronto01,aaronto01
2,1000003.0,aasedo01,aasedo01
3,1000004.0,abadijo01,abadijo01
4,1000005.0,abbated01,abbated01


In [6]:
# Rename fangraphs playerid column to be bis_id
fg = fg.rename(columns={'playerid': 'bis_id'})
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,HBP,SF,SH,GDP,BB/K,wRAA,wRC,WPA,RE24,bis_id
249,Babe Ruth,- - -,2503,10616,714,2174,2217.0,123.0,0.348,0.34,...,43.0,,113.0,2.0,1.55,1437.2,2727,,,1011327
265,Ted Williams,Red Sox,2292,9791,521,1798,1839.0,24.0,0.289,0.328,...,39.0,20.0,5.0,197.0,2.85,1219.8,2349,,,1014040
310,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.332,...,45.0,,106.0,2.0,1.91,1046.8,2265,,,1004598
354,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.336,...,13.0,,71.0,69.0,1.11,934.2,2136,,,1004285
357,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.365,...,48.0,,216.0,3.0,1.53,899.0,2018,,,1006030


In [7]:
# Join in lahman_id into fangraphs df
fg = fg.merge(id_map, how='inner', left_on='bis_id', right_on='bis_id')
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,SH,GDP,BB/K,wRAA,wRC,WPA,RE24,bis_id,lahman_id,baseball_reference_id
0,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.332,...,106.0,2.0,1.91,1046.8,2265,,,1004598,gehrilo01,gehrilo01
1,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.336,...,71.0,69.0,1.11,934.2,2136,,,1004285,foxxji01,foxxji01
2,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.365,...,216.0,3.0,1.53,899.0,2018,,,1006030,hornsro01,hornsro01
3,Hank Greenberg,- - -,1394,6096,331,1051,1276.0,58.0,0.292,0.323,...,35.0,66.0,1.01,556.1,1287,,,1004996,greenha01,greenha01
4,Ty Cobb,- - -,3035,13072,117,2246,1937.0,892.0,0.146,0.378,...,295.0,,2.7,1093.8,2534,,,1002378,cobbty01,cobbty01


In [8]:
# People table
people = pd.read_csv('lahman/People.csv')
people.head()

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,...,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,,,,...,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939.0,8.0,5.0,USA,AL,Mobile,1984.0,8.0,16.0,...,Aaron,Tommie Lee,190.0,75.0,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954.0,9.0,8.0,USA,CA,Orange,,,,...,Aase,Donald William,190.0,75.0,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972.0,8.0,25.0,USA,FL,Palm Beach,,,,...,Abad,Fausto Andres,184.0,73.0,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [9]:
# Just keep debut and finalGame
people = people[['playerID', 'debut', 'finalGame']]
people.head()

Unnamed: 0,playerID,debut,finalGame
0,aardsda01,2004-04-06,2015-08-23
1,aaronha01,1954-04-13,1976-10-03
2,aaronto01,1962-04-10,1971-09-26
3,aasedo01,1977-07-26,1990-10-03
4,abadan01,2001-09-10,2006-04-13


In [10]:
# Get debut year, final year and number of seasons
people['debut_year'] = pd.DatetimeIndex(people['debut']).year
people['final_year'] = pd.DatetimeIndex(people['finalGame']).year
people['seasons'] = people['final_year'] - people['debut_year'] + 1
people = people[['playerID', 'debut_year', 'final_year', 'seasons']]
people.head()

Unnamed: 0,playerID,debut_year,final_year,seasons
0,aardsda01,2004.0,2015.0,12.0
1,aaronha01,1954.0,1976.0,23.0
2,aaronto01,1962.0,1971.0,10.0
3,aasedo01,1977.0,1990.0,14.0
4,abadan01,2001.0,2006.0,6.0


In [11]:
# filter to final_year <= 2006 and seasons >= 10 to get only a list of HOF eligible players who we are confident have already had their HOF fate determined
people = people[(people['final_year'] <= 2006) & (people['seasons'] >= 10)]
people.head()

Unnamed: 0,playerID,debut_year,final_year,seasons
1,aaronha01,1954.0,1976.0,23.0
2,aaronto01,1962.0,1971.0,10.0
3,aasedo01,1977.0,1990.0,14.0
7,abbated01,1897.0,1910.0,14.0
12,abbotgl01,1973.0,1984.0,12.0


In [12]:
# Join debut year and final year into fangraphs df
fg = fg.merge(people, how='inner', left_on='lahman_id', right_on='playerID')
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,wRC,WPA,RE24,bis_id,lahman_id,baseball_reference_id,playerID,debut_year,final_year,seasons
0,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.332,...,2265,,,1004598,gehrilo01,gehrilo01,gehrilo01,1923.0,1939.0,17.0
1,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.336,...,2136,,,1004285,foxxji01,foxxji01,foxxji01,1925.0,1945.0,21.0
2,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.365,...,2018,,,1006030,hornsro01,hornsro01,hornsro01,1915.0,1937.0,23.0
3,Hank Greenberg,- - -,1394,6096,331,1051,1276.0,58.0,0.292,0.323,...,1287,,,1004996,greenha01,greenha01,greenha01,1930.0,1947.0,18.0
4,Ty Cobb,- - -,3035,13072,117,2246,1937.0,892.0,0.146,0.378,...,2534,,,1002378,cobbty01,cobbty01,cobbty01,1905.0,1928.0,24.0


In [13]:
# Allstar table
allstar = pd.read_csv('lahman/AllstarFull.csv')
allstar.head()

Unnamed: 0,playerID,yearID,gameNum,gameID,teamID,lgID,GP,startingPos
0,gomezle01,1933,0,ALS193307060,NYA,AL,1.0,1.0
1,ferreri01,1933,0,ALS193307060,BOS,AL,1.0,2.0
2,gehrilo01,1933,0,ALS193307060,NYA,AL,1.0,3.0
3,gehrich01,1933,0,ALS193307060,DET,AL,1.0,4.0
4,dykesji01,1933,0,ALS193307060,CHA,AL,1.0,5.0


In [14]:
# Get number of allstar appearances per player
allstar = allstar[allstar['gameNum'].isin([0, 1])]
allstar = allstar.groupby('playerID').size().to_frame('all_star_apps').reset_index()
allstar.head()

Unnamed: 0,playerID,all_star_apps
0,aaronha01,21
1,aasedo01,1
2,abreubo01,2
3,abreujo02,2
4,adamsac01,1


In [15]:
# Join raw_all_star_apps into fangraphs df
fg = fg.merge(allstar, how='left', left_on='lahman_id', right_on='playerID')
fg['all_star_apps'].fillna(0, inplace=True)
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,RE24,bis_id,lahman_id,baseball_reference_id,playerID_x,debut_year,final_year,seasons,playerID_y,all_star_apps
0,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.332,...,,1004598,gehrilo01,gehrilo01,gehrilo01,1923.0,1939.0,17.0,gehrilo01,7.0
1,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.336,...,,1004285,foxxji01,foxxji01,foxxji01,1925.0,1945.0,21.0,foxxji01,9.0
2,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.365,...,,1006030,hornsro01,hornsro01,hornsro01,1915.0,1937.0,23.0,,0.0
3,Hank Greenberg,- - -,1394,6096,331,1051,1276.0,58.0,0.292,0.323,...,,1004996,greenha01,greenha01,greenha01,1930.0,1947.0,18.0,greenha01,5.0
4,Ty Cobb,- - -,3035,13072,117,2246,1937.0,892.0,0.146,0.378,...,,1002378,cobbty01,cobbty01,cobbty01,1905.0,1928.0,24.0,,0.0


In [16]:
# HOF labels table
hof = pd.read_csv('baseball_ref/hof.csv')
hof.head()

Unnamed: 0,Year,Name,Unnamed: 2,Voted By,Inducted As,Votes,% of Ballots
0,,,,,,,
1,2019.0,Harold Baines\baineha01,1959-Living,Veterans,Player,,
2,2019.0,Roy Halladay\hallaro01,1977-2017,BBWAA,Player,363.0,85.4%
3,2019.0,Edgar Martinez\martied01,1963-Living,BBWAA,Player,363.0,85.4%
4,2019.0,Mike Mussina\mussimi01,1968-Living,BBWAA,Player,326.0,76.7%


In [17]:
# Clean HOF table
hof[['Name', 'baseball_reference_id']] = hof.Name.str.split('\\',expand=True,)
hof['HOF'] = 1
hof = hof[~hof['baseball_reference_id'].isnull()]
hof = hof[['baseball_reference_id', 'HOF']]
hof.head()

Unnamed: 0,baseball_reference_id,HOF
1,baineha01,1
2,hallaro01,1
3,martied01,1
4,mussimi01,1
5,riverma01,1


In [18]:
# Join HOF labels into fangraphs df
fg = fg.merge(hof, how='left', left_on='baseball_reference_id', right_on='baseball_reference_id')
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,bis_id,lahman_id,baseball_reference_id,playerID_x,debut_year,final_year,seasons,playerID_y,all_star_apps,HOF
0,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.332,...,1004598,gehrilo01,gehrilo01,gehrilo01,1923.0,1939.0,17.0,gehrilo01,7.0,1.0
1,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.336,...,1004285,foxxji01,foxxji01,foxxji01,1925.0,1945.0,21.0,foxxji01,9.0,1.0
2,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.365,...,1006030,hornsro01,hornsro01,hornsro01,1915.0,1937.0,23.0,,0.0,1.0
3,Hank Greenberg,- - -,1394,6096,331,1051,1276.0,58.0,0.292,0.323,...,1004996,greenha01,greenha01,greenha01,1930.0,1947.0,18.0,greenha01,5.0,1.0
4,Ty Cobb,- - -,3035,13072,117,2246,1937.0,892.0,0.146,0.378,...,1002378,cobbty01,cobbty01,cobbty01,1905.0,1928.0,24.0,,0.0,1.0


In [19]:
fg['HOF'].fillna(0, inplace=True)
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,BABIP,...,bis_id,lahman_id,baseball_reference_id,playerID_x,debut_year,final_year,seasons,playerID_y,all_star_apps,HOF
0,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.332,...,1004598,gehrilo01,gehrilo01,gehrilo01,1923.0,1939.0,17.0,gehrilo01,7.0,1.0
1,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.336,...,1004285,foxxji01,foxxji01,foxxji01,1925.0,1945.0,21.0,foxxji01,9.0,1.0
2,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.365,...,1006030,hornsro01,hornsro01,hornsro01,1915.0,1937.0,23.0,,0.0,1.0
3,Hank Greenberg,- - -,1394,6096,331,1051,1276.0,58.0,0.292,0.323,...,1004996,greenha01,greenha01,greenha01,1930.0,1947.0,18.0,greenha01,5.0,1.0
4,Ty Cobb,- - -,3035,13072,117,2246,1937.0,892.0,0.146,0.378,...,1002378,cobbty01,cobbty01,cobbty01,1905.0,1928.0,24.0,,0.0,1.0


In [20]:
# Remove features with NaN values
fg = fg[[x for x in fg.columns if x not in [x for x in fg.columns if fg[x].isnull().values.any()]]]
fg.head()

Unnamed: 0,Name,Team,G,PA,HR,R,RBI,SB,ISO,AVG,...,wRC,bis_id,lahman_id,baseball_reference_id,playerID_x,debut_year,final_year,seasons,all_star_apps,HOF
0,Lou Gehrig,Yankees,2164,9660,493,1888,1995.0,102.0,0.292,0.34,...,2265,1004598,gehrilo01,gehrilo01,gehrilo01,1923.0,1939.0,17.0,7.0,1.0
1,Jimmie Foxx,- - -,2317,9670,534,1751,1922.0,87.0,0.284,0.325,...,2136,1004285,foxxji01,foxxji01,foxxji01,1925.0,1945.0,21.0,9.0,1.0
2,Rogers Hornsby,- - -,2259,9475,301,1579,1584.0,135.0,0.218,0.358,...,2018,1006030,hornsro01,hornsro01,hornsro01,1915.0,1937.0,23.0,0.0,1.0
3,Hank Greenberg,- - -,1394,6096,331,1051,1276.0,58.0,0.292,0.313,...,1287,1004996,greenha01,greenha01,greenha01,1930.0,1947.0,18.0,5.0,1.0
4,Ty Cobb,- - -,3035,13072,117,2246,1937.0,892.0,0.146,0.366,...,2534,1002378,cobbty01,cobbty01,cobbty01,1905.0,1928.0,24.0,0.0,1.0


## Creating Training and Testing Sets

First we will scale our features using the StandardScaler. Then, we will split our data into a training set and a testing set (25%). 

In [21]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

features = ['G', 'PA', 'HR', 'R', 'RBI', 'SB', 'ISO', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR', 'AB', 'H', '1B', '2B', '3B', 'BB', 'HBP', 'wRAA', 'wRC', 'debut_year', 'final_year', 'seasons', 'all_star_apps']
X = StandardScaler().fit_transform(fg[features])
y = fg['HOF']

scorer = make_scorer(accuracy_score)

## Over-Sampling

We will use Over-Sampling on the training data to handle the fact that we have imbalanced classes. 

In [22]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)

Using TensorFlow backend.


## Random Forests

In [23]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {'max_depth': [3, None],
              'max_features': [1, 3, 10],
              'min_samples_split': [2, 3, 10],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy'],
              'n_estimators': [10, 100]
             }

rf = RandomForestClassifier(random_state=0)
rf_grid_obj = GridSearchCV(rf, param_grid, scoring=scorer, cv=5)
rf_grid_fit = rf_grid_obj.fit(X_train, y_train)
best_rf = rf_grid_fit.best_estimator_

y_pred = best_rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
rf_results = {'model': 'Random Forests', 'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}
print("Random Forests Testing Set Accuracy: " + str(round(100*acc, 2)) + "%")
print("Random Forests Testing Set Precision: " + str(round(100*precision, 2)) + "%")
print("Random Forests Testing Set Recall: " + str(round(100*recall, 2)) + "%")
print("Random Forests Testing Set F1-score: " + str(round(f1, 4)))

Random Forests Testing Set Accuracy: 96.3%
Random Forests Testing Set Precision: 87.5%
Random Forests Testing Set Recall: 82.35%
Random Forests Testing Set F1-score: 0.8485


## Logistic Regression

We will use Logistic Regression with L1 Regularization to choose the optimal Logistic Regression model (with implicit feature selection):

In [24]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)
lr.fit(X_train,y_train)

y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
lr_results = {'model': 'Logistic Regression', 'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}
print("Logistic Regression Testing Set Accuracy: " + str(round(100*acc, 2)) + "%")
print("Logistic Regression Testing Set Precision: " + str(round(100*precision, 2)) + "%")
print("Logistic Regression Testing Set Recall: " + str(round(100*recall, 2)) + "%")
print("Logistic Regression Testing Set F1-score: " + str(round(f1, 4)))

Logistic Regression Testing Set Accuracy: 85.93%
Logistic Regression Testing Set Precision: 46.88%
Logistic Regression Testing Set Recall: 88.24%
Logistic Regression Testing Set F1-score: 0.6122


## Neural Networks (MLP)

After experimenting with various architechtures, epochs and batch size, this MLP performed the best:

In [25]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout

model = Sequential()
model.add(Dense(32, input_dim=len(features), kernel_initializer='normal', activation='relu'))
model.add(Dense(16, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=40, batch_size=10, verbose=0)

y_pred = np.array([1 if x > 0.5 else 0 for x in model.predict(X_test)]).astype(float)
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
nn_results = {'model': 'Neural Networks', 'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}
print("Neural Networks Testing Set Accuracy: " + str(round(100*acc, 2)) + "%")
print("Neural Networks Testing Set Precision: " + str(round(100*precision, 2)) + "%")
print("Neural Networks Testing Set Recall: " + str(round(100*recall, 2)) + "%")
print("Neural Networks Testing Set F1-score: " + str(round(f1, 4)))

Neural Networks Testing Set Accuracy: 91.85%
Neural Networks Testing Set Precision: 63.64%
Neural Networks Testing Set Recall: 82.35%
Neural Networks Testing Set F1-score: 0.7179


## Conclusion

In [26]:
results = pd.DataFrame([rf_results, lr_results, nn_results])
results = results[['model', 'accuracy', 'precision', 'recall', 'f1']]
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,Random Forests,0.962963,0.875,0.823529,0.848485
1,Logistic Regression,0.859259,0.46875,0.882353,0.612245
2,Neural Networks,0.918519,0.636364,0.823529,0.717949


The Random Forests model performed the best in this case. The Random Forests accuracy (96.3%), precision (87.5%), and F1-score (0.8485) were the best of the 3 models by a significant margin. It did have a lower recall than the other two models (82% versus 88%), but overall it was still the best performing.