In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from functools import reduce

# File Paths
hof_path = 'Resources/HallOfFame.csv'
batting_path = 'Resources/Batting.csv'
player_path = 'Resources/People.csv'
awards_path = 'Resources/AwardsPlayers.csv'
allstar_path = 'Resources/AllstarFull.csv'

# Create Dataframes
hof_df = pd.read_csv(hof_path)
batting_df = pd.read_csv(batting_path)
player_df = pd.read_csv(player_path)
awards_df = pd.read_csv(awards_path)
allstar_df = pd.read_csv(allstar_path)


In [2]:
# sum all batter stats by year for career totals
d_bat = dict.fromkeys(('G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 
                   'IBB', 'HBP', 'SH', 'SF', 'GIDP'), ['sum'])

group_batting_df = batting_df.groupby('playerID', as_index = False).agg(d_bat)
group_batting_df.columns = group_batting_df.columns.droplevel(1)
group_batting_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,aardsda01,331,4,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,1.0,0.0,0.0
1,aaronha01,3298,12364,2174,3771,624,98,755,2297.0,240.0,73.0,1402,1383.0,293.0,32.0,21.0,121.0,328.0
2,aaronto01,437,944,102,216,42,6,13,94.0,9.0,8.0,86,145.0,3.0,0.0,9.0,6.0,36.0
3,aasedo01,448,5,0,0,0,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0
4,abadan01,15,21,1,2,0,0,0,0.0,0.0,1.0,4,5.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20161,zupofr01,16,18,3,3,1,0,0,0.0,0.0,0.0,2,6.0,0.0,0.0,0.0,0.0,0.0
20162,zuvelpa01,209,491,41,109,17,2,2,20.0,2.0,0.0,34,50.0,1.0,2.0,18.0,0.0,8.0
20163,zuverge01,266,142,5,21,2,1,0,7.0,0.0,1.0,9,39.0,0.0,0.0,16.0,0.0,3.0
20164,zwilldu01,366,1280,167,364,76,15,30,202.0,46.0,0.0,128,155.0,0.0,4.0,31.0,0.0,0.0


In [3]:
# sum all batter all star games for career
d_asg = {'GP':'sum'}
group_allstar_df = allstar_df.groupby('playerID', as_index = False).agg(d_asg)
group_allstar_df

Unnamed: 0,playerID,GP
0,aaronha01,24
1,aasedo01,1
2,abreubo01,2
3,abreujo02,3
4,acunaro01,1
...,...,...
1902,zimmery01,2
1903,ziskri01,2
1904,zitoba01,2
1905,zobribe01,2


In [4]:
# merge the 5 dataframes into one dataframe
merged_df = reduce(lambda x,y: pd.merge(x,y, on='playerID', how='outer'), 
                    [hof_df, group_batting_df, player_df, awards_df, group_allstar_df])
merged_df.head()

Unnamed: 0,playerID,yearID_x,votedBy,ballots,needed,votes,inducted,category,needed_note,G,...,debut,finalGame,retroID,bbrefID,awardID,yearID_y,lgID,tie,notes,GP
0,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,3035.0,...,1905-08-30,1928-09-11,cobbt101,cobbty01,Baseball Magazine All-Star,1908.0,AL,,RF,
1,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,3035.0,...,1905-08-30,1928-09-11,cobbt101,cobbty01,Baseball Magazine All-Star,1908.0,ML,,RF,
2,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,3035.0,...,1905-08-30,1928-09-11,cobbt101,cobbty01,Baseball Magazine All-Star,1909.0,AL,,RF,
3,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,3035.0,...,1905-08-30,1928-09-11,cobbt101,cobbty01,Baseball Magazine All-Star,1909.0,ML,,RF,
4,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,3035.0,...,1905-08-30,1928-09-11,cobbt101,cobbty01,Triple Crown,1909.0,AL,,,


In [5]:
# sort by playerID and drop duplicates and keeping the last playerID
sorted_df = merged_df.sort_values(by=['playerID', 'inducted','AB'], ascending=False)
sorted_df.drop_duplicates(subset=['playerID'], keep='last', inplace = True)
sorted_df

Unnamed: 0,playerID,yearID_x,votedBy,ballots,needed,votes,inducted,category,needed_note,G,...,debut,finalGame,retroID,bbrefID,awardID,yearID_y,lgID,tie,notes,GP
44177,zychto01,,,,,,,,,70.0,...,2015-09-04,2017-08-19,zycht001,zychto01,,,,,,
44176,zwilldu01,,,,,,,,,366.0,...,1910-08-14,1916-07-12,zwild101,zwilldu01,,,,,,
44175,zuverge01,,,,,,,,,266.0,...,1951-04-21,1959-06-15,zuveg101,zuverge01,,,,,,
44174,zuvelpa01,,,,,,,,,209.0,...,1982-09-04,1991-05-02,zuvep001,zuvelpa01,,,,,,
44173,zupofr01,,,,,,,,,16.0,...,1957-07-01,1961-05-09,zupof101,zupofr01,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24093,abadan01,,,,,,,,,15.0,...,2001-09-10,2006-04-13,abada001,abadan01,,,,,,
24092,aasedo01,,,,,,,,,448.0,...,1977-07-26,1990-10-03,aased001,aasedo01,,,,,,1.0
24091,aaronto01,,,,,,,,,437.0,...,1962-04-10,1971-09-26,aarot101,aaronto01,,,,,,
16631,aaronha01,1982.0,BBWAA,415.0,312.0,406.0,Y,Player,,3298.0,...,1954-04-13,1976-10-03,aaroh101,aaronha01,TSN All-Star,1971.0,NL,,OF,24.0


In [6]:
sorted_df.columns

Index(['playerID', 'yearID_x', 'votedBy', 'ballots', 'needed', 'votes',
       'inducted', 'category', 'needed_note', 'G', 'AB', 'R', 'H', '2B', '3B',
       'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP',
       'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState',
       'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry',
       'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven',
       'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID',
       'bbrefID', 'awardID', 'yearID_y', 'lgID', 'tie', 'notes', 'GP'],
      dtype='object')

In [7]:
sorted_df.drop(['yearID_x', 'votedBy', 'ballots', 'needed', 'votes', 'category', 
               'needed_note', 'yearID_y', 'deathYear',
              'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity',
              'nameFirst', 'nameLast', 'nameGiven', 'birthYear', 'birthMonth',
              'birthDay', 'birthCountry', 'birthState', 'birthCity', 
              'retroID', 'bbrefID', 'yearID_x', 'tie', 'notes', 'yearID_y',
              'lgID'], axis = 1, inplace = True)

In [8]:
cleaned_df = sorted_df.reset_index()

In [9]:
cleaned_df['inducted'].fillna('N', inplace=True)
cleaned_df['awardID'].fillna('None', inplace=True)
cleaned_df['GP'].fillna('0', inplace=True)

In [10]:
names = {'playerID':'Player ID','inducted':'Inducted into HOF', 'weight':'Weight', 'height':'Height', 
         'bats':'Batting Hand', 'throws':'Throwing Hand', 'debut':'MLB Debut', 'finalGame':'Final MLB Game',
         'awardID':'Award Name', 'GP':'All Star Games Played'
        }

In [11]:
cleaned_df.rename(columns=names, inplace=True)

In [12]:
cleaned_df.isnull().sum()

index                       0
Player ID                   0
Inducted into HOF           0
G                         204
AB                        204
R                         204
H                         204
2B                        204
3B                        204
HR                        204
RBI                       204
SB                        204
CS                        204
BB                        204
SO                        204
IBB                       204
HBP                       204
SH                        204
SF                        204
GIDP                      204
Weight                    816
Height                    736
Batting Hand             1181
Throwing Hand             977
MLB Debut                 210
Final MLB Game            210
Award Name                  0
All Star Games Played       0
dtype: int64

In [13]:
cleaned_df.notnull().sum()

index                    20370
Player ID                20370
Inducted into HOF        20370
G                        20166
AB                       20166
R                        20166
H                        20166
2B                       20166
3B                       20166
HR                       20166
RBI                      20166
SB                       20166
CS                       20166
BB                       20166
SO                       20166
IBB                      20166
HBP                      20166
SH                       20166
SF                       20166
GIDP                     20166
Weight                   19554
Height                   19634
Batting Hand             19189
Throwing Hand            19393
MLB Debut                20160
Final MLB Game           20160
Award Name               20370
All Star Games Played    20370
dtype: int64

In [14]:
filtered_df = cleaned_df.loc[cleaned_df['AB'] >= 3000]
filtered_df

Unnamed: 0,index,Player ID,Inducted into HOF,G,AB,R,H,2B,3B,HR,...,SF,GIDP,Weight,Height,Batting Hand,Throwing Hand,MLB Debut,Final MLB Game,Award Name,All Star Games Played
15,44162,zobribe01,N,1651.0,5880.0,884.0,1566.0,349.0,44.0,167.0,...,67.0,124.0,210.0,75.0,B,R,2006-08-01,2019-09-29,World Series MVP,2.0
19,17832,ziskri01,N,1453.0,5144.0,681.0,1477.0,245.0,26.0,207.0,...,41.0,138.0,200.0,73.0,R,R,1971-09-08,1983-09-21,TSN All-Star,2.0
28,44147,zimmery01,N,1799.0,6654.0,963.0,1846.0,417.0,22.0,284.0,...,69.0,212.0,215.0,75.0,R,R,2005-09-01,2021-10-03,Comeback Player of the Year,2.0
35,44136,zimmehe01,N,1456.0,5304.0,695.0,1566.0,275.0,105.0,58.0,...,0.0,0.0,176.0,71.0,R,R,1907-09-08,1919-09-10,Baseball Magazine All-Star,0
37,44130,zimmedo01,N,1095.0,3283.0,353.0,773.0,130.0,22.0,91.0,...,14.0,99.0,165.0,69.0,R,R,1954-07-02,1965-10-02,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20307,24161,adairje01,N,1165.0,4019.0,378.0,1022.0,163.0,19.0,57.0,...,30.0,149.0,175.0,72.0,R,R,1958-09-02,1970-05-03,,0
20331,24134,abreujo02,N,1113.0,4353.0,612.0,1262.0,263.0,16.0,228.0,...,45.0,148.0,235.0,75.0,R,R,2014-03-31,2021-10-02,TSN All-Star,3.0
20334,24125,abreubo01,N,2425.0,8480.0,1453.0,2470.0,574.0,59.0,288.0,...,85.0,165.0,220.0,72.0,L,R,1996-09-01,2014-09-28,Gold Glove,2.0
20362,24096,abbated01,N,855.0,3044.0,355.0,772.0,99.0,43.0,11.0,...,0.0,0.0,170.0,71.0,R,R,1897-09-04,1910-09-15,,0


In [15]:
no_null_df = filtered_df.dropna()

In [16]:
no_null_df['Inducted into HOF'].value_counts()

N    1726
Y      44
Name: Inducted into HOF, dtype: int64

In [17]:
hof_binary_encoded = pd.get_dummies(no_null_df, columns=["Batting Hand", "Throwing Hand", "Award Name"])
hof_binary_encoded

Unnamed: 0,index,Player ID,Inducted into HOF,G,AB,R,H,2B,3B,HR,...,Award Name_Roberto Clemente Award,Award Name_Rookie of the Year,Award Name_SIlver Slugger,Award Name_Silver Slugger,Award Name_TSN All-Star,Award Name_TSN Guide MVP,Award Name_TSN Major League Player of the Year,Award Name_TSN Player of the Year,Award Name_Triple Crown,Award Name_World Series MVP
15,44162,zobribe01,N,1651.0,5880.0,884.0,1566.0,349.0,44.0,167.0,...,0,0,0,0,0,0,0,0,0,1
19,17832,ziskri01,N,1453.0,5144.0,681.0,1477.0,245.0,26.0,207.0,...,0,0,0,0,1,0,0,0,0,0
28,44147,zimmery01,N,1799.0,6654.0,963.0,1846.0,417.0,22.0,284.0,...,0,0,0,0,0,0,0,0,0,0
35,44136,zimmehe01,N,1456.0,5304.0,695.0,1566.0,275.0,105.0,58.0,...,0,0,0,0,0,0,0,0,0,0
37,44130,zimmedo01,N,1095.0,3283.0,353.0,773.0,130.0,22.0,91.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20307,24161,adairje01,N,1165.0,4019.0,378.0,1022.0,163.0,19.0,57.0,...,0,0,0,0,0,0,0,0,0,0
20331,24134,abreujo02,N,1113.0,4353.0,612.0,1262.0,263.0,16.0,228.0,...,0,0,0,0,1,0,0,0,0,0
20334,24125,abreubo01,N,2425.0,8480.0,1453.0,2470.0,574.0,59.0,288.0,...,0,0,0,0,0,0,0,0,0,0
20362,24096,abbated01,N,855.0,3044.0,355.0,772.0,99.0,43.0,11.0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
final_hof_df = hof_binary_encoded.drop(['index', 'Player ID', 'MLB Debut', 'Final MLB Game'], axis=1).reset_index(drop=True)

In [19]:
final_hof_df.columns

Index(['Inducted into HOF', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB',
       'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'Weight', 'Height',
       'All Star Games Played', 'Batting Hand_B', 'Batting Hand_L',
       'Batting Hand_R', 'Throwing Hand_L', 'Throwing Hand_R',
       'Award Name_ALCS MVP', 'Award Name_All-Star Game MVP',
       'Award Name_Babe Ruth Award', 'Award Name_Baseball Magazine All-Star',
       'Award Name_Branch Rickey Award',
       'Award Name_Comeback Player of the Year', 'Award Name_Gold Glove',
       'Award Name_Hank Aaron Award', 'Award Name_Hutch Award',
       'Award Name_Lou Gehrig Memorial Award',
       'Award Name_Most Valuable Player', 'Award Name_NLCS MVP',
       'Award Name_None', 'Award Name_Outstanding DH Award',
       'Award Name_Roberto Clemente Award', 'Award Name_Rookie of the Year',
       'Award Name_SIlver Slugger', 'Award Name_Silver Slugger',
       'Award Name_TSN All-Star', 'Award Name_TSN Guide MVP',
       'Award 

In [20]:
final_hof_df

Unnamed: 0,Inducted into HOF,G,AB,R,H,2B,3B,HR,RBI,SB,...,Award Name_Roberto Clemente Award,Award Name_Rookie of the Year,Award Name_SIlver Slugger,Award Name_Silver Slugger,Award Name_TSN All-Star,Award Name_TSN Guide MVP,Award Name_TSN Major League Player of the Year,Award Name_TSN Player of the Year,Award Name_Triple Crown,Award Name_World Series MVP
0,N,1651.0,5880.0,884.0,1566.0,349.0,44.0,167.0,768.0,116.0,...,0,0,0,0,0,0,0,0,0,1
1,N,1453.0,5144.0,681.0,1477.0,245.0,26.0,207.0,792.0,8.0,...,0,0,0,0,1,0,0,0,0,0
2,N,1799.0,6654.0,963.0,1846.0,417.0,22.0,284.0,1061.0,43.0,...,0,0,0,0,0,0,0,0,0,0
3,N,1456.0,5304.0,695.0,1566.0,275.0,105.0,58.0,796.0,175.0,...,0,0,0,0,0,0,0,0,0,0
4,N,1095.0,3283.0,353.0,773.0,130.0,22.0,91.0,352.0,45.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1765,N,1165.0,4019.0,378.0,1022.0,163.0,19.0,57.0,366.0,29.0,...,0,0,0,0,0,0,0,0,0,0
1766,N,1113.0,4353.0,612.0,1262.0,263.0,16.0,228.0,788.0,11.0,...,0,0,0,0,1,0,0,0,0,0
1767,N,2425.0,8480.0,1453.0,2470.0,574.0,59.0,288.0,1363.0,400.0,...,0,0,0,0,0,0,0,0,0,0
1768,N,855.0,3044.0,355.0,772.0,99.0,43.0,11.0,324.0,142.0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Create our features
X = final_hof_df.drop('Inducted into HOF', axis=1)

# Create our target
y = final_hof_df['Inducted into HOF']

In [22]:
y.value_counts()

N    1726
Y      44
Name: Inducted into HOF, dtype: int64

In [23]:
# filtered_df.to_csv('filtered_df.csv', index=False)

# HOF Resampling

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [25]:
from pathlib import Path
from collections import Counter

# Split the Data into Training and Testing

In [26]:
# Create our features
X = final_hof_df.drop("Inducted into HOF", axis=1)
X = pd.get_dummies(X)


# Create our target
y = final_hof_df["Inducted into HOF"]

In [27]:
X.describe()

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,...,All Star Games Played_11.0,All Star Games Played_12.0,All Star Games Played_13.0,All Star Games Played_14.0,All Star Games Played_15.0,All Star Games Played_16.0,All Star Games Played_18.0,All Star Games Played_24.0,All Star Games Played_26.0,All Star Games Played_0
count,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,...,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0,1770.0
mean,1489.511864,5229.311299,749.091525,1444.553672,252.476271,49.783051,127.163842,685.333898,121.871186,38.581921,...,0.00452,0.003955,0.002825,0.001695,0.00113,0.00226,0.001695,0.00113,0.000565,0.479096
std,457.957942,1780.286002,333.922193,560.47247,109.877869,37.509439,115.448824,339.049914,130.852207,36.905503,...,0.067096,0.06278,0.053089,0.041146,0.033605,0.047498,0.041146,0.033605,0.023769,0.499704
min,745.0,3001.0,138.0,516.0,45.0,1.0,1.0,162.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1140.0,3840.25,502.25,1011.0,174.0,24.0,41.0,443.0,34.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1392.0,4812.5,665.0,1308.0,226.0,39.0,95.0,587.0,76.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1759.75,6234.5,913.75,1743.0,307.0,66.0,170.0,848.75,167.0,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,3562.0,14053.0,2295.0,4256.0,792.0,309.0,762.0,2297.0,1406.0,335.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
# Check the balance of our target values
y.value_counts()

N    1726
Y      44
Name: Inducted into HOF, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Oversampling

In this section, we will compare two oversampling algorithms to determine which algorithm results in the best performance. 

### Naive Random Oversampling

In [30]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'N': 1295, 'Y': 1295})

In [31]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [34]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
naive_score = balanced_accuracy_score(y_test, y_pred)
naive_score

0.8448375870069605

In [33]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[405,  26],
       [  3,   9]])

In [35]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix_df = pd.DataFrame(
    confusion_matrix, index=["Actual HOF", "Actual NoHOF"], columns=["Predicted HOF", "Predicted NoHOF"]
)

print(f'Naive Random Oversampling')
print(f'--------------------------------------------------------------------------')
print(f'Accuracy Score: ')
display(naive_score)
print(f'--------------------------------------------------------------------------')
print(f'Confusion Matrix:')
display(confusion_matrix_df)
print(f'--------------------------------------------------------------------------')
print(f'Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Naive Random Oversampling
--------------------------------------------------------------------------
Accuracy Score: 


0.8448375870069605

--------------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted HOF,Predicted NoHOF
Actual HOF,405,26
Actual NoHOF,3,9


--------------------------------------------------------------------------
Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

          N       0.99      0.94      0.75      0.97      0.84      0.72       431
          Y       0.26      0.75      0.94      0.38      0.84      0.69        12

avg / total       0.97      0.93      0.76      0.95      0.84      0.72       443



### SMOTE Oversampling

In [36]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)
Counter(y_resampled)

Counter({'N': 1295, 'Y': 1295})

In [37]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [38]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
SMOTE_score = balanced_accuracy_score(y_test, y_pred)
SMOTE_score

0.8483178654292343

In [39]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[408,  23],
       [  3,   9]])

In [40]:
# Print the imbalanced classification report
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix_df = pd.DataFrame(
    confusion_matrix, index=["Actual HOF", "Actual NoHOF"], columns=["Predicted HOF", "Predicted NoHOF"]
)

print(f'SMOTE Oversampling')
print(f'--------------------------------------------------------------------------')
print(f'Accuracy Score: ')
display(SMOTE_score)
print(f'--------------------------------------------------------------------------')
print(f'Confusion Matrix:')
display(confusion_matrix_df)
print(f'--------------------------------------------------------------------------')
print(f'Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

SMOTE Oversampling
--------------------------------------------------------------------------
Accuracy Score: 


0.8483178654292343

--------------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted HOF,Predicted NoHOF
Actual HOF,408,23
Actual NoHOF,3,9


--------------------------------------------------------------------------
Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

          N       0.99      0.95      0.75      0.97      0.84      0.72       431
          Y       0.28      0.75      0.95      0.41      0.84      0.70        12

avg / total       0.97      0.94      0.76      0.95      0.84      0.72       443



# Undersampling

In this section, we will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. 

In [41]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete

from collections import Counter
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'N': 32, 'Y': 32})

In [42]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [43]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
cluster_centroids_score = balanced_accuracy_score(y_test, y_pred)
cluster_centroids_score

0.8483178654292343

In [44]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[367,  64],
       [  3,   9]])

In [45]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix_df = pd.DataFrame(
    confusion_matrix, index=["Actual HOF", "Actual NoHOF"], columns=["Predicted HOF", "Predicted NoHOF"]
)

print(f'Cluster Centroids Undersampling')
print(f'--------------------------------------------------------------------------')
print(f'Accuracy Score: ')
display(cluster_centroids_score)
print(f'--------------------------------------------------------------------------')
print(f'Confusion Matrix:')
display(confusion_matrix_df)
print(f'--------------------------------------------------------------------------')
print(f'Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Cluster Centroids Undersampling
--------------------------------------------------------------------------
Accuracy Score: 


0.8483178654292343

--------------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted HOF,Predicted NoHOF
Actual HOF,367,64
Actual NoHOF,3,9


--------------------------------------------------------------------------
Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

          N       0.99      0.85      0.75      0.92      0.80      0.65       431
          Y       0.12      0.75      0.85      0.21      0.80      0.63        12

avg / total       0.97      0.85      0.75      0.90      0.80      0.64       443



# Combination (Over and Under) Sampling

In this section, we will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. 

In [46]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=70)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'N': 1572, 'Y': 1705})

In [47]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [48]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
smoteenn_score = balanced_accuracy_score(y_test, y_pred)
smoteenn_score

0.9675174013921113

In [49]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[403,  28],
       [  0,  12]])

In [50]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix_df = pd.DataFrame(
    confusion_matrix, index=["Actual HOF", "Actual NoHOF"], columns=["Predicted HOF", "Predicted NoHOF"]
)

print(f'Combination Sampling (SMOTEENN)')
print(f'--------------------------------------------------------------------------')
print(f'Accuracy Score: ') 
display(smoteenn_score)
print(f'--------------------------------------------------------------------------')
print(f'Confusion Matrix:')
display(confusion_matrix_df)
print(f'--------------------------------------------------------------------------')
print(f'Classification Report:')
print(classification_report_imbalanced(y_test, y_pred))

Combination Sampling (SMOTEENN)
--------------------------------------------------------------------------
Accuracy Score: 


0.9675174013921113

--------------------------------------------------------------------------
Confusion Matrix:


Unnamed: 0,Predicted HOF,Predicted NoHOF
Actual HOF,403,28
Actual NoHOF,0,12


--------------------------------------------------------------------------
Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

          N       1.00      0.94      1.00      0.97      0.97      0.93       431
          Y       0.30      1.00      0.94      0.46      0.97      0.94        12

avg / total       0.98      0.94      1.00      0.95      0.97      0.93       443

