# EDA STARTER

In [None]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 

from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler

# Raise pandas' display limits so wide/long frames render in the notebook
# without being truncated too early.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [None]:
import warnings

# Install a blanket "ignore" filter: every warning category, from every
# module, is silenced for the rest of the kernel session.
warnings.simplefilter('ignore')

In [None]:
# Load the hitters salary table from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Hitters_Adjusted_Salary.csv")

In [None]:
# Drop the unnamed leading column (presumably a saved row index -- confirm
# against the CSV). `columns=` already implies the axis, so `axis=1` is not needed.
df = df.drop(columns=["Unnamed: 0"])

# Best features according to correlation matrix
# (.copy() so the column assignment below writes to an independent frame
# rather than to a slice of the previous one.)

df = df[["yearID", "playerID", "teamID", "GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]].copy()

# Best features from our group EDA discussion

# df = df[["yearID", "playerID", "teamID", "G", "AB", "R", "H", "RBI", "ADJ Salary"]]

# Round to whole dollars BEFORE casting. Casting first truncates toward zero,
# which makes a following round() a no-op (e.g. 999999.7 -> 999999, not 1000000).
df["ADJ Salary"] = df["ADJ Salary"].round().astype("int")

# Keep only rows with a positive salary.
df = df.loc[(df["ADJ Salary"] > 0), :]

# One row per player per season; the first occurrence is kept.
df = df.drop_duplicates(subset=["yearID", "playerID"]).reset_index(drop=True)

# Sanity check: the de-duplication above must leave (yearID, playerID) unique.
assert not df.duplicated(subset=["yearID", "playerID"]).any()

# Independent copy used by the salary-binning section.
new_df = df.copy()

In [None]:
# Display the cleaned copy (new_df) that the salary-binning section builds on.
new_df

# EDA - you can skip if you just want to jump to model creation below

In [None]:
# Column dtypes and non-null counts for the cleaned frame.
df.info()

In [None]:
# Random 25-row sample as a quick spot check of the values.
df.sample(25)

In [None]:
# Pairwise correlations between the numeric columns only. The frame also holds
# string identifiers (playerID, teamID); pandas >= 2.0 raises on those instead
# of silently skipping them, so they are excluded explicitly.
corr_matrix = df.select_dtypes(include="number").corr()

# Shade each cell by magnitude; `corr` ends up as the styled table, as before.
corr = corr_matrix.style.background_gradient(cmap='Purples')
corr

In [None]:
# One histogram per column on a 15x15 inch figure.
df.hist(figsize = (15, 15))  

In [None]:
# Scatter plots for every pair of the numeric stats and the salary, drawn on
# the upper triangle of the grid only.
pair_columns = ["GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]
pair_grid = sns.PairGrid(df[pair_columns])
pair_grid.map_upper(plt.scatter)

In [None]:
# Persist the cleaned frame (without the row index) next to the notebook.
df.to_csv(path_or_buf="cleaned_hitter.csv", index=False)

In [None]:
# Column dtypes and non-null counts for the cleaned frame.
df.info()

In [None]:
# Number of rows per season.
df["yearID"].value_counts()

In [None]:
# Number of rows per player.
df["playerID"].value_counts()

In [None]:
# Number of rows per team.
df["teamID"].value_counts()

In [None]:
# Frequency of each distinct value in column "H".
df["H"].value_counts() 

In [None]:
# Frequency of each distinct value in column "R".
df["R"].value_counts() 

In [None]:
# Frequency of each distinct value in column "RBI".
df["RBI"].value_counts() 

In [None]:
# Frequency of each distinct value in column "AB".
df["AB"].value_counts() 

In [None]:
# Frequency of each distinct adjusted-salary value.
df["ADJ Salary"].value_counts() 

# Binning salaries

In [None]:
# Display new_df before the salary-group column is added to it.
new_df

In [None]:
# Bin edges (in dollars) for ADJ Salary. pd.cut uses right-closed intervals:
# (0, 999999], (999999, 5999999], (5999999, 10000000], (10000000, inf].
# The top edge is open-ended so a salary above 40,000,000 still lands in the
# highest group instead of becoming NaN (NaN targets break the classifiers).
bins = [0, 999999, 5999999, 10000000, np.inf]

# Human-readable names for these bins, kept for reporting; the column below
# stores the ordinal codes 1-4 instead.
group_labels = ["< 1 mill", "1 mill to 5 mill", "6 mill to 10 mill", "> 10 mill"]

# Slice the data and place it into bins (ordinal codes 1..4, categorical dtype).
new_df["ADJ Salary Group"] = pd.cut(new_df["ADJ Salary"], bins, labels=[1, 2, 3, 4])

# Every row must have received a group; fail loudly here rather than deep
# inside a model fit.
assert new_df["ADJ Salary Group"].notna().all(), "some salaries fall outside the bins"

# Create a GroupBy object based upon ADJ Salary Group
salary_group = new_df.groupby("ADJ Salary Group")

# Find how many rows fall into each bin
print(salary_group["playerID"].count())

# One indicator column per salary group ("ADJ Salary Group_1" ... "_4").
# NOTE: these columns encode the target itself, so they must never be part of
# a feature matrix that predicts "ADJ Salary Group".
one_hots = pd.get_dummies(new_df["ADJ Salary Group"], prefix="ADJ Salary Group", prefix_sep="_")

# Modelling frame: all cleaned columns, the group code and its one-hot columns.
ml_df = pd.concat([new_df, one_hots], axis=1)
ml_df

# Vanilla LR

In [None]:
# Work on a copy so ml_df itself is left unchanged by the modelling cells.
df = ml_df.copy()
df

In [None]:
# Assign X and y

# Target: the ordinal salary group (1-4).
target_col = "ADJ Salary Group"

# The frame also carries one-hot copies of the target ("ADJ Salary Group_1"...).
# Leaving them in X leaks the answer to the model, so they are dropped along
# with the raw salary and the identifier columns.
one_hot_target_cols = df.columns[df.columns.str.startswith(f"{target_col}_")].tolist()

X = df.drop(columns=["ADJ Salary", target_col, "yearID", "playerID", "teamID"] + one_hot_target_cols)
y = df[target_col]

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target (salary group per row).
y

In [None]:
# Split the data into X_train, X_test, y_train, y_test

# Hold out a test set (default split proportions, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize the features; the scaler learns its statistics from the training
# rows only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Linear regression on the standardized features -------------------------
model = LinearRegression()
model.fit(X_train_scaled, y_train)

scaled_train_score = model.score(X_train_scaled, y_train)
scaled_test_score = model.score(X_test_scaled, y_test)
print(f"STD SCALER Linear Regression Training Data Score: {scaled_train_score}")
print(f"STD SCALER Linear Regression Testing Data Score: {scaled_test_score}")

# --- Same model on the raw (unscaled) features, for comparison --------------
model = LinearRegression()
model.fit(X_train, y_train)

raw_train_score = model.score(X_train, y_train)
raw_test_score = model.score(X_test, y_test)
print(f"NO SCALER Linear Regression Training Data Score: {raw_train_score}")
print(f"NO SCALER Linear Regression Testing Data Score: {raw_test_score}")

# Improved LR

In [None]:
# NOTE(review): the steps in this cell currently match the "Vanilla LR"
# section; nothing here differs from that configuration yet.
df = ml_df.copy()

# Assign X and y
#
# The frame carries one-hot copies of the target ("ADJ Salary Group_1"...).
# Leaving them in X leaks the answer to the model, so they are dropped along
# with the raw salary and the identifier columns.

target_col = "ADJ Salary Group"
one_hot_target_cols = df.columns[df.columns.str.startswith(f"{target_col}_")].tolist()

X = df.drop(columns=["ADJ Salary", target_col, "yearID", "playerID", "teamID"] + one_hot_target_cols)
y = df[target_col]

# Split the data into X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create a scaler to standardize the data

scaler = StandardScaler()

# Train the scaler with the X_train data only, so no test-set statistics leak in.

scaler.fit(X_train)

# Transform X_train and X_test.

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression().fit(X_train_scaled, y_train)

# Score the model (LinearRegression.score reports R^2)

print(f"STD SCALER Linear Regression Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"STD SCALER Linear Regression Testing Data Score: {model.score(X_test_scaled, y_test)}")

# Same model on the unscaled features, for comparison.

model = LinearRegression().fit(X_train, y_train)

# Score the model

print(f"NO SCALER Linear Regression Training Data Score: {model.score(X_train, y_train)}")
print(f"NO SCALER Linear Regression Testing Data Score: {model.score(X_test, y_test)}")

# K-nearest neighbors

In [None]:
df = ml_df.copy()

# Assign X and y
#
# The frame carries one-hot copies of the target ("ADJ Salary Group_1"...).
# Leaving them in X leaks the answer to every classifier trained below, so
# they are dropped along with the raw salary and the identifier columns.

target_col = "ADJ Salary Group"
one_hot_target_cols = df.columns[df.columns.str.startswith(f"{target_col}_")].tolist()

X = df.drop(columns=["ADJ Salary", target_col, "yearID", "playerID", "teamID"] + one_hot_target_cols)
y = df[target_col].values

# Split the data into X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create a scaler to standardize the data

scaler = StandardScaler()

# Train the scaler with the X_train data only, so no test-set statistics leak in.

scaler.fit(X_train)

# Transform X_train and X_test.

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Sweep odd neighbour counts (odd so a vote cannot tie) and record the
# train/test accuracy of a KNN classifier for each.
k_values = list(range(1, 20, 2))
train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores += [train_score]
    test_scores += [test_score]
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Train accuracy is drawn with 'o' markers, test accuracy with 'x' markers.
plt.plot(k_values, train_scores, marker='o')
plt.plot(k_values, test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

In [None]:
# List the feature names currently in X.
X.columns

In [None]:
# k=5 is the value picked from the sweep above (noted there as the point
# where the accuracy starts to stabilize); refit with it and report test accuracy.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

k5_test_accuracy = knn.score(X_test_scaled, y_test)
print('k=5 Test Acc: %.3f' % k5_test_accuracy)

In [None]:
# Predicted salary group for every test row.
y_pred_knn = knn.predict(X_test_scaled)

# Rows of the matrix are true groups, columns are predicted groups.
cm_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
print(cm_knn)

# Per-class precision / recall / F1.
print(classification_report(y_true=y_test, y_pred=y_pred_knn))

# For the first ten test rows, the probability KNN assigns to the class in
# column index 1 of predict_proba (i.e. knn.classes_[1]). Only that one column
# is kept; the full row of class probabilities is what sums to 1.
first_ten_rows = X_test_scaled[0:10]
y_pred_proba_knn = knn.predict_proba(first_ten_rows)[:, 1]
print(y_pred_proba_knn)

In [None]:
# Fit a 200-tree random forest (seeded) on the standardized features.
clf = RandomForestClassifier(random_state=0, n_estimators=200)
clf.fit(X_train_scaled, y_train)

print(f"RandomForestClassifier Training Score: {clf.score(X_train_scaled, y_train)}")
print(f"RandomForestClassifier Testing Score: {clf.score(X_test_scaled, y_test)}")

# Importance of each feature, in the column order of X.
feature_importances = clf.feature_importances_
print(feature_importances)

# Pair each feature name with its importance and order from least to most important.
features = sorted(zip(X.columns, feature_importances), key=lambda pair: pair[1])
cols = [name for name, _ in features]
width = [importance for _, importance in features]

# Horizontal bar chart of the importances on a 10x50 inch figure.
fig, ax = plt.subplots(figsize=(10, 50))
ax.margins(y=0.01)
ax.barh(y=cols, width=width)

plt.show()

In [None]:
# Now try with the selected features

# Fit a selector that keeps the features the random forest ranks above its
# default importance threshold.
sel = SelectFromModel(clf).fit(X_train_scaled, y_train)

# Apply the selector to the EXISTING train/test split. Re-splitting the
# training rows here would (a) score the model on data carved out of the
# training set rather than on the held-out test set and (b) overwrite
# y_train / y_test, leaving them out of step with X_train_scaled /
# X_test_scaled for every later cell.
X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)

# The inputs are already standardized and selecting columns does not change
# their scale, so no second scaler is fitted; the *_scaled names are kept for
# any code that refers to them.
X_selected_train_scaled = X_selected_train
X_selected_test_scaled = X_selected_test

clf = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_selected_train_scaled, y_train)

print(f"SelectFromModel RandomForestClassifier Training Score: {clf.score(X_selected_train_scaled, y_train)}")
print(f"SelectFromModel RandomForestClassifier Testing Score: {clf.score(X_selected_test_scaled, y_test)}")

In [None]:
# Extra-trees classifier on the standardized features. The seed is fixed
# (matching the random forest cell) so scores and importances are the same on
# every run.
model = ExtraTreesClassifier(random_state=0).fit(X_train_scaled, y_train)

print(f"\n\nExtraTreesClassifier Training Score: {model.score(X_train_scaled, y_train)}")
print(f"ExtraTreesClassifier Testing Score: {model.score(X_test_scaled, y_test)}")

# Importance of each feature, in the column order of X.
feature_importances = model.feature_importances_

# Pair each feature name with its importance and order from least to most important.
features = sorted(zip(X.columns, feature_importances), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

# Horizontal bar chart of the importances on a 10x50 inch figure.
fig, ax = plt.subplots()

fig.set_size_inches(10,50)
plt.margins(y=0.01)

ax.barh(y=cols, width=width)

plt.show()

In [None]:
# Now try with the selected features

# Fit a selector that keeps the features the extra-trees model ranks above its
# default importance threshold.
sel = SelectFromModel(model).fit(X_train_scaled, y_train)

# Apply the selector to the EXISTING train/test split. Re-splitting the
# training rows here would (a) score the model on data carved out of the
# training set rather than on the held-out test set and (b) overwrite
# y_train / y_test, leaving them out of step with X_train_scaled /
# X_test_scaled for every later cell.
X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)

# The inputs are already standardized and selecting columns does not change
# their scale, so no second scaler is fitted; the *_scaled names are kept for
# any code that refers to them.
X_selected_train_scaled = X_selected_train
X_selected_test_scaled = X_selected_test

# Seeded so the reported scores are reproducible.
model = ExtraTreesClassifier(random_state=0).fit(X_selected_train_scaled, y_train)

print(f"SelectFromModel ExtraTreesClassifier Training Score: {model.score(X_selected_train_scaled, y_train)}")
print(f"SelectFromModel ExtraTreesClassifier Testing Score: {model.score(X_selected_test_scaled, y_test)}")

In [None]:
# Multiclass logistic regression; max_iter is raised well above the default so
# the solver has room to converge.
lr_model = LogisticRegression(max_iter=10000)

# Train the logistic regression model on the STANDARDIZED features. The cells
# that follow call predict / predict_proba on X_test_scaled, so the model must
# be fitted in that same feature space; fitting on raw X_train and predicting
# on scaled data gives meaningless predictions.

lr_model.fit(X_train_scaled, y_train)

# Print the training and testing scores (mean accuracy)

print(f"Training Data Score: {lr_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lr_model.score(X_test_scaled, y_test)}")

In [None]:
# Class probabilities for every test row; only column index 1 (the class at
# lr_model.classes_[1]) is kept and printed.
lr_class_probabilities = lr_model.predict_proba(X_test_scaled)
y_pred_proba_lr = lr_class_probabilities[:, 1]

print(y_pred_proba_lr)

In [None]:
 # Show the confusion matrix for the logistic regression model
    
# Predicted salary group for every test row.
y_pred_lr = lr_model.predict(X_test_scaled)

# Rows of the matrix are true groups, columns are predicted groups.
cm_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

print(cm_lr)

In [None]:
# Per-class precision / recall / F1 for the logistic regression predictions.
lr_report = classification_report(y_true=y_test, y_pred=y_pred_lr)

print(lr_report)