In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import seaborn as sn
import matplotlib.pyplot as plt

In [2]:
# Read in batting and fielding dataframe.
BatField_df = pd.read_csv(r'data\ml_BattingFielding.csv')
BatField_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data\\ml_BattingFielding.csv'

In [None]:
# Check shape of df.
BatField_df.shape

In [None]:
# Check for null values.
BatField_df.isnull().sum()

In [None]:
# Drop null values.
BatField_df = BatField_df.dropna(subset=['AVG'])

In [None]:
# Double check df shape
BatField_df.shape

In [None]:
# Read in pitching dataframe.
Pitch_df = pd.read_csv(r'data\ml_Pitching.csv')
Pitch_df.head()

In [None]:
# Check dataframe shape.
Pitch_df.shape

In [None]:
# Check for null values.
Pitch_df.isnull().sum()

In [None]:
# Drop the rows with null values
Pitch_df = Pitch_df.dropna(subset=['ERA','BAOpp'])

In [None]:
# Double check shape of df.
Pitch_df.shape

In [None]:
# define which dataframe will be used in machine learning model
# e.g. pitching or batting/fielding

# COMMENT OUT WHICHEVER YOU DO NOT WANT TO RUN!

df = BatField_df

# df = Pitch_df

In [None]:
# test conditional to compare / identify df's (for use later in code)
if df.equals(Pitch_df):
    print('Pitching')
if df.equals(BatField_df):
    print('Batting/Fielding')

In [None]:
# encode inducted column from Y/N to numeric 0/1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = df.copy()
df2['inducted'] = le.fit_transform(df2['inducted'])

In [None]:
# encode playerID to numeric
# possibly remove if we want to keep playerID's as index to later reference player names

# df2['playerID'] = le.fit_transform(df2['playerID'])

In [None]:
# define input (X) and output (y) variables
# drop columns of unnecessary features in X 
# batting/fielding 1st iteration showed the following features contributed <3%:
# SF, CS, IBB, SB

y = df2["inducted"]
if df.equals(BatField_df):
    X = df2.drop(columns=["inducted", "playerID", "SF", "CS", "IBB", "SB"])  # Batters/Fielders
if df.equals(Pitch_df):
    X = df2.drop(columns=["inducted", "playerID"]) # Pitchers

In [None]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Calculate feature importance in the Random Forest model & sort largest to smallest
importances = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances

In [None]:
# sort the features by their importance.
sorted(zip(X.columns), reverse=True)

In [None]:
rf_model.feature_importances_.sum()

In [None]:
x = [lis[-1] for lis in importances]
x

In [None]:
y = [lis[0] for lis in importances]
y

In [None]:
# DANE'S CODE:
# %matplotlib inline
# plt.style.use('ggplot')
# x = ['R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SH', 'AVG', 'PO', 'A', 'E', 'DP', 'award_count']
# y = importances
# x_pos = [i for i, _ in enumerate(x)]

# plt.bar(x_pos, importances, color='green')
# plt.xlabel('Features Importance')
# plt.show()

# NICK's comments
# would like to investigate how to auto generate x list instead of manually entering
# Need to label x ticks
# would like to reorder, largest to smallest

# TRY THIS CODE:
%matplotlib inline
plt.style.use('ggplot')
x = [lis[-1] for lis in importances]
y = [lis[0] for lis in importances]
x_pos = [i for i, _ in enumerate(x)]

plt.bar(x_pos, y, color='green')
plt.title('Features Importance')
plt.xlabel('Features')
plt.ylabel('Importance [%]')

plt.xticks(x_pos,x,rotation=90,ha='center')
plt.show()

In [None]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [None]:
print (X_test) #test dataset (without the actual outcome)
print (y_pred) #predicted values

In [None]:
df2

In [None]:
df2.columns

In [None]:
# create df with all data to investigate statistical trends
df3 = pd.merge(X_test, df2[["inducted","playerID"]], how="inner", left_index=True, right_index=True)

if df.equals(BatField_df):
    # This one is specific to Batting/Fielding!!!
    df3 = df3[['playerID','R','H','2B','3B','HR','RBI','BB','SO','SH','GIDP','AVG','PO','A','E','DP','award_count','inducted']]
    
if df.equals(Pitch_df):
    # This one is specific to Pitching
    df3 = df3[['playerID','W','L','CG','SHO','SV','H_allowed','ER','HR_allowed','BB_allowed','StruckOut','BAOpp','ERA','WP','HitBatsmen','BK','R_allowed','SH_allowed','SF_allowed','GIDP_pitcher','award_count','inducted']]
    
df3["predicted"] = y_pred   
df3

In [None]:
# Append dataframe with column comparing actual inducted to predicted accuracy.
df3['Accuracy'] = np.where(df3['inducted'] == df3['predicted'], 'Yes', 'No')
df3.head()

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
plt.suptitle('')
df3.boxplot(column='award_count', by='Accuracy', ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
plt.suptitle('')
df3.boxplot(column='R', by='Accuracy', ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
plt.suptitle('')
df3.boxplot(column='AVG', by='Accuracy', ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
plt.suptitle('')
df3.boxplot(column='H', by='Accuracy', ax=ax)

In [None]:
# Create dataframe of only inaccurate predictions.
accuracy = df3.groupby('Accuracy')
no_df = accuracy.get_group('No')
no_df.head()

In [None]:
# Create dataframe of only accurate predictions.
yes_df = accuracy.get_group('Yes')
yes_df.head()

In [None]:
# Show mean data for each feature grouped by accuracy of prediction.
mean_summary = df3.groupby('Accuracy').mean()
mean_summary