# EDA STARTER

In [None]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Raise pandas' display limits so long / wide frames are printed without
# being truncated in the notebook output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# EDA 

In [None]:
import warnings

# Ignore every warning category from here on. This keeps cell output clean,
# but it also hides deprecation notices from the libraries used below.
warnings.simplefilter('ignore')

In [None]:
# Read the hitters CSV from the working directory and show the resulting
# frame as the cell output.
df = pd.read_csv(filepath_or_buffer="Hitters_Adjusted_Salary.csv")
df

## Can skip these if just running models =>

In [None]:
# Reload the raw data so the exploratory section starts from the file contents.
df = pd.read_csv("Hitters_Adjusted_Salary.csv")
df.info()

# Correlate the numeric columns only. The frame also carries identifier
# columns (playerID / teamID, presumably strings), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 instead of dropping it.
corr = df.select_dtypes(include="number").corr()
corr = corr.style.background_gradient(cmap='Purples')
corr

# Histogram of every numeric column.
df.hist(figsize = (15, 15))  

# Pairwise scatter plots (upper triangle only) for the candidate features
# and the salary column.
sns.PairGrid(df[["GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]]).map_upper(plt.scatter)

# Frequency tables for the identifier columns ...
df["yearID"].value_counts()

df["playerID"].value_counts()

df["teamID"].value_counts()

# ... and for a few of the batting statistics and the salary column.
df["H"].value_counts() 

df["R"].value_counts() 

df["RBI"].value_counts() 

df["AB"].value_counts() 

df["ADJ Salary"].value_counts() 

## Can skip these if just running models <=

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV).
df = df.drop(columns=["Unnamed: 0"])

# Short aliases for the raw counting stats used in the derived features.
totalhits = df["H"]
doubles = df["2B"]
triples = df["3B"]
homeruns = df["HR"]
atbats = df["AB"]
walks = df["BB"]
hit_by_pitch = df["HBP"]
sac_hits = df["SH"]
sac_flies = df["SF"]
singles = totalhits - homeruns - triples - doubles

# Derived rate stats. A 0/0 division (e.g. no at-bats) produces NaN, which is
# replaced by 0 so the columns stay fully numeric.
df["slug_%"] = ((singles + 2*doubles + 3*triples + 4*homeruns) / atbats).fillna(0)
df["avg"] = (totalhits / atbats).fillna(0)
df["plate_appearances"] = atbats + walks + hit_by_pitch + sac_hits + sac_flies
df["on_base_%"] = (
    (totalhits + walks + hit_by_pitch) / (atbats + walks + hit_by_pitch + sac_flies)
).fillna(0)
df["1B"] = singles.fillna(0)

# Round to the nearest whole number BEFORE casting to int. Casting first
# truncates the fractional part, which leaves round() with nothing to do.
df["ADJ Salary"] = df["ADJ Salary"].round().astype("int")

# Keep only rows with a positive salary.
df = df.loc[(df["ADJ Salary"] > 0), :]

# keep=False drops EVERY row of a (yearID, playerID) pair that occurs more
# than once, not just the repeats.
df = df.drop_duplicates(subset=["yearID", "playerID"], keep=False).reset_index(drop=True)

# Sanity check on remaining duplicates (only shown if run as its own cell).
df.duplicated(subset=["yearID", "playerID", "teamID"]).value_counts()

# Best features according to correlation matrix

df = df[["yearID", "playerID", "teamID", "GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]]

# Best features from our group EDA discussion

# df = df[["yearID", "playerID", "teamID", "G", "AB", "R", "H", "RBI", "ADJ Salary"]]

# df = df[["yearID", "playerID", "teamID", "RBI", "avg", "on_base_%", "ADJ Salary"]]

# Weed out rookies: keep only players that have more than three rows
# (one row per yearID after the de-duplication above).

no_rookies_df = df.groupby(["playerID"]).filter(lambda g: g["yearID"].count() > 3)

no_rookies_df = no_rookies_df.reset_index(drop=True)
no_rookies_df

In [None]:
# Bin ADJ Salary into four ordinal groups. pd.cut uses right-inclusive
# intervals, so the integer labels mean:
#   0: (0, 999,999]             -> under 1 million
#   1: (999,999, 5,999,999]     -> 1 million up to just under 6 million
#   2: (5,999,999, 10,000,000]  -> 6 million to 10 million
#   3: (10,000,000, inf)        -> above 10 million
# The top edge is open-ended: with a fixed 40,000,000 cap, any larger salary
# would fall outside every bin and receive a NaN group, which the models
# further down cannot be trained on.
bins = [0, 999999, 5999999, 10000000, np.inf]

no_rookies_df["ADJ Salary Group"] = pd.cut(no_rookies_df["ADJ Salary"], bins, labels=np.arange(4))

In [None]:
# Take an independent copy; every model section below starts from new_df.
new_df = no_rookies_df.copy()

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
new_df.to_csv("cleaned_hitter_no_rookies.csv", index=False)

In [None]:
# Show the cleaned frame as this cell's output.
new_df

# Linear Regression Benchmark

In [None]:
df = new_df.copy()
df

# Assign X and y: the features are every column except the salary, its
# binned group and the identifier columns. The target is the ordinal salary
# group, so the scores printed below are R^2 values on that group label.

X = df.drop(columns=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"])
y = df["ADJ Salary Group"]

# Split the data into X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 

# Create a scaler that rescales each feature to the [0, 1] range

scaler = MinMaxScaler()

# Train the scaler with the X_train data only, so nothing from the test set
# leaks into the scaling parameters.

scaler.fit(X_train)

# Transform X_train and X_test.

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression().fit(X_train_scaled, y_train)

# Score the model. The label names the scaler actually used above; it
# previously said "STD SCALER" although the data is min-max scaled.

print(f"MINMAX SCALER Linear Regression Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"MINMAX SCALER Linear Regression Testing Data Score: {model.score(X_test_scaled, y_test)}\n\n") 

# Same model on the raw features, for comparison.

model = LinearRegression().fit(X_train, y_train)

# Score the model

print(f"NO SCALER Linear Regression Training Data Score: {model.score(X_train, y_train)}")
print(f"NO SCALER Linear Regression Testing Data Score: {model.score(X_test, y_test)}")

# K-means benchmark

In [None]:
#Windows KMeans bug fix
# NOTE(review): OpenMP runtimes usually read OMP_NUM_THREADS when they are
# first loaded, and scikit-learn is already imported at this point. Confirm
# the setting takes effect here; it may need to move above the imports.

import os
os.environ["OMP_NUM_THREADS"] = '1'

df = new_df.copy()
df

# Assign X and y

X = df.drop(columns=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"])
y = df["ADJ Salary Group"]

# Split the data into X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 

# Create a scaler to standardize the data

scaler = StandardScaler()

# Train the scaler with the X_train data.

scaler.fit(X_train)

# Transform X_train and X_test.

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Finding the best value for k using the Elbow Curve
inertia = []
k = list(range(1, 9))

# Calculate the inertia for the range of k values.
# KMeans is distance based, so it is fitted on the standardized features.
# Fitting on the raw X (as before) ignores the scaler prepared above and
# lets the largest-magnitude columns dominate the clustering.
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X_train_scaled)
    inertia.append(km.inertia_)

# Collect the elbow-curve data in a frame
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow

# Plot the elbow curve to find the best candidate(s) for k
plt.plot(df_elbow['k'], df_elbow['inertia'])
# Tick exactly the k values that were evaluated.
plt.xticks(k)
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# K-nearest neighbors

In [None]:
df = new_df.copy()
df

# Features are all columns except the salary, its binned group and the
# identifier columns; the target is the binned salary group.
X = df.drop(labels=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"], axis=1)
y = df["ADJ Salary Group"]

# Hold out a test set (default split proportions, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardize using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sweep odd neighbour counts (odd values avoid tied votes) and record the
# train / test accuracy for each one.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Accuracy versus k: circles mark training scores, crosses mark test scores.
plt.plot(k_values, train_scores, marker='o')
plt.plot(k_values, test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

In [None]:
# Final KNN model. The neighbour count lives in one variable so the fitted
# model and the printed label cannot disagree (the label previously said
# k=7 while the model was built with 9 neighbours).
# NOTE(review): 9 is the value the code was actually running with; confirm
# against the k sweep whether 7 or 9 is the intended choice.

best_k = 9
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)

print(f"k={best_k} Test Acc: {knn.score(X_test_scaled, y_test):.3f}")

# Confusion matrix and per-class metrics on the held-out test set.
y_pred_knn = knn.predict(X_test_scaled)
cm_knn = confusion_matrix(y_test, y_pred_knn)

print(cm_knn)
print(classification_report(y_test, y_pred_knn))

# predict_proba returns one column per class in knn.classes_, and each row
# sums to 1. Only column 1 (the class at knn.classes_[1]) is kept here, for
# the first ten test rows.

y_pred_proba_knn = knn.predict_proba(X_test_scaled[0:10])[:, 1]
print(f"Prediction probabilities: \n\n{y_pred_proba_knn}")

# Random Forest Benchmark

In [None]:
# See what Random Forest offers

df = new_df.copy()
df

# Assign X and y

X = df.drop(columns=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"])
y = df["ADJ Salary Group"]

# Split the data into X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 

# Create a scaler to standardize the data

scaler = StandardScaler()

# Train the scaler with the X_train data.

scaler.fit(X_train)

# Transform X_train and X_test.

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=7).fit(X_train_scaled, y_train)

print(f"RandomForestClassifier Training Score: {clf.score(X_train_scaled, y_train)}")
print(f"RandomForestClassifier Testing Score: {clf.score(X_test_scaled, y_test)}\n\n")

# Now try with the selected features

sel = SelectFromModel(clf).fit(X_train_scaled, y_train)

# Apply the selector to the EXISTING train / test split. Re-splitting the
# training rows (as before) overwrote y_train / y_test and scored the model
# on rows the selector had already seen, so the result was not comparable
# with the full-feature test score above. The inputs are already scaled, so
# no second scaler is needed.
X_selected_train_scaled = sel.transform(X_train_scaled)
X_selected_test_scaled = sel.transform(X_test_scaled)

clf = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=7).fit(X_selected_train_scaled, y_train)

print(f"SelectFromModel RandomForestClassifier Training Score: {clf.score(X_selected_train_scaled, y_train)}")
print(f"SelectFromModel RandomForestClassifier Testing Score: {clf.score(X_selected_test_scaled, y_test)}")

# Extra Trees Benchmark

In [None]:
df = new_df.copy()
df

# Assign X and y

X = df.drop(columns=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"])
y = df["ADJ Salary Group"]

# Split the data into X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 

# Create a scaler that rescales each feature to the [0, 1] range

scaler = MinMaxScaler()

# Train the scaler with the X_train data.

scaler.fit(X_train)

# Transform X_train and X_test.

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# random_state is fixed so repeated runs give the same scores, matching the
# random forest benchmark.
model = ExtraTreesClassifier(max_depth=7, random_state=0).fit(X_train_scaled, y_train)

print(f"ExtraTreesClassifier Training Score: {model.score(X_train_scaled, y_train)}")
print(f"ExtraTreesClassifier Testing Score: {model.score(X_test_scaled, y_test)}\n\n")


# Now try with the selected features

sel = SelectFromModel(model).fit(X_train_scaled, y_train)

# Apply the selector to the EXISTING train / test split. Re-splitting the
# training rows (as before) overwrote y_train / y_test and scored the model
# on rows the selector had already seen, so the result was not comparable
# with the full-feature test score above. The inputs are already scaled, so
# no second scaler is needed.
X_selected_train_scaled = sel.transform(X_train_scaled)
X_selected_test_scaled = sel.transform(X_test_scaled)

model = ExtraTreesClassifier(max_depth=7, random_state=0).fit(X_selected_train_scaled, y_train)

print(f"SelectFromModel ExtraTreesClassifier Training Score: {model.score(X_selected_train_scaled, y_train)}")
print(f"SelectFromModel ExtraTreesClassifier Testing Score: {model.score(X_selected_test_scaled, y_test)}")

# SVM Benchmark

In [None]:
# Linear-kernel support vector classifier benchmark

from sklearn.svm import SVC 

df = new_df.copy()
df

# Features are all columns except the salary, its binned group and the
# identifier columns; the target is the binned salary group.
X = df.drop(labels=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"], axis=1)
y = df["ADJ Salary Group"]

# Hold out a test set (default split proportions, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardize using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the standardized training data.
model = SVC(kernel="linear").fit(X_train_scaled, y_train)

# Accuracy on the training and the held-out data.
svc_train_score = model.score(X_train_scaled, y_train)
svc_test_score = model.score(X_test_scaled, y_test)
print(f"SVC Training Score: {svc_train_score}")
print(f"SVC Testing Score: {svc_test_score}")

# Per-class precision / recall / F1 on the held-out data.
predictions = model.predict(X_test_scaled)

print(classification_report(y_test, predictions))