# EDA STARTER

In [None]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Raise pandas' display limits so wide/long frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the hitters data set (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="Hitters_Adjusted_Salary.csv")
# Bare last expression: render the loaded frame as the cell output.
df

In [None]:
# Clean the loaded table and build `no_rookies_df` (players with more than
# three player-season rows).

# Drop the stray index column written by an earlier to_csv. errors="ignore"
# keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Best features according to correlation matrix.
# (Alternative set from the group EDA discussion:
#  yearID, playerID, teamID, G, AB, R, H, RBI, ADJ Salary.)
# .copy() makes the selection an independent frame, so the column assignments
# below do not write into a slice of the previous frame.
df = df[["yearID", "playerID", "teamID", "GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]].copy()

# Round to whole units BEFORE casting to int: casting first truncates the
# fraction, which turns a later round() into a no-op.
df["ADJ Salary"] = df["ADJ Salary"].round()

# Keep positive salaries only. Missing values compare False here, so they are
# removed before the integer cast (which cannot represent NaN).
df = df.loc[df["ADJ Salary"] > 0].copy()
df["ADJ Salary"] = df["ADJ Salary"].astype("int")

# One row per (yearID, playerID); drop_duplicates keeps the first occurrence.
df = df.drop_duplicates(subset=["yearID", "playerID"]).reset_index(drop=True)

# Sanity check that the de-duplication held (fails loudly instead of
# computing a value that is never displayed).
assert not df.duplicated(subset=["yearID", "playerID", "teamID"]).any()

# Keep players that appear in more than three rows (i.e. at least four
# player-season records after de-duplication).
seasons_per_player = df.groupby("playerID")["yearID"].transform("count")
no_rookies_df = df.loc[seasons_per_player > 3].reset_index(drop=True)
no_rookies_df

In [None]:
# Bin ADJ Salary into four ordinal groups used as the classification target.
# pd.cut builds right-closed intervals, so with whole-number salaries:
#   0: under 1 million
#   1: 1 million up to (but not including) 6 million
#   2: 6 million through 10 million
#   3: above 10 million
# The top edge is open-ended (np.inf). A fixed 40,000,000 cap would map any
# larger salary to NaN, and a NaN target breaks the model cells that consume
# this column.
bins = [0, 999999, 5999999, 10000000, np.inf]

# Integer labels 0..3 so the groups can be fed straight to the estimators.
no_rookies_df["ADJ Salary Group"] = pd.cut(no_rookies_df["ADJ Salary"], bins, labels=np.arange(4))

In [None]:
# Independent snapshot of the binned frame; the model cells start from copies of it.
new_df = no_rookies_df.copy(deep=True)

# Linear Regression Benchmark

In [None]:
# Linear-regression baseline: regress the ordinal salary-group code on the
# batting statistics, once on min-max scaled features and once on raw ones.
df = new_df.copy()

# Assign X and y: identifiers and both salary columns are excluded from the
# features; the target is the salary-group code.
X = df.drop(columns=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"])
y = df["ADJ Salary Group"]

# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Min-max scaling (each feature mapped to [0, 1] on the training range).
# The scaler is fitted on the training split only, so nothing from the test
# split leaks into the transform.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaled features. LinearRegression.score returns R^2.
model = LinearRegression().fit(X_train_scaled, y_train)

# The label names the scaler that is actually used (MinMaxScaler, not
# StandardScaler).
print(f"MINMAX SCALER Linear Regression Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"MINMAX SCALER Linear Regression Testing Data Score: {model.score(X_test_scaled, y_test)}\n\n")

# Raw (unscaled) features, for comparison.
model = LinearRegression().fit(X_train, y_train)

print(f"NO SCALER Linear Regression Training Data Score: {model.score(X_train, y_train)}")
print(f"NO SCALER Linear Regression Testing Data Score: {model.score(X_test, y_test)}")

# K-means Benchmark

In [None]:
# K-means elbow analysis: fit k = 1..8 and plot inertia against k.
df = new_df.copy()

# Windows KMeans bug fix: restrict OpenMP to one thread.
# NOTE(review): thread-count environment variables are usually read when the
# numeric libraries are loaded, so this may need to move above the sklearn
# imports to have any effect — confirm.
import os
os.environ["OMP_NUM_THREADS"] = '1'

# Assign X and y (y is not used by the clustering itself).
X = df.drop(columns=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"])
y = df["ADJ Salary Group"].values

# K-means is distance based: standardize first so that columns with large
# values do not dominate columns with small ones.
X_scaled = StandardScaler().fit_transform(X)

# Finding the best value for k using the Elbow Curve
k = list(range(1, 9))
inertia = []

# Calculate the inertia for the range of k values.
# n_init is pinned because its default differs between scikit-learn releases.
for n_clusters in k:
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

# Collect the curve in a frame (plotted with matplotlib below).
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})

# Plot the elbow curve to find the best candidate(s) for k
plt.plot(df_elbow['k'], df_elbow['inertia'], marker='o')
plt.xticks(k)  # ticks only at the k values that were actually fitted
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# K-nearest neighbors

In [None]:
# K-nearest-neighbours classifier on the four salary groups.
df = new_df.copy()

# Assign X and y
X = df.drop(columns=["ADJ Salary", "ADJ Salary Group", "yearID", "playerID", "teamID"])
y = df["ADJ Salary Group"].values

# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize the features. The scaler is fitted on the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Loop through different k values to find which has the highest accuracy.
# Note: We use only odd numbers because we don't want any ties.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share the axis, so label them and describe the axis generically.
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

# Note that k: 9 provides the best accuracy where the classifier starts to stabilize
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_scaled, y_train)

print('k=9 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

y_pred_knn = knn.predict(X_test_scaled)
cm_knn = confusion_matrix(y_test, y_pred_knn)

print(cm_knn)
print(classification_report(y_test, y_pred_knn))

# Class probabilities for the first ten test rows. This is a multi-class
# problem, so show the full matrix: one column per class (ordered as in
# knn.classes_), and each ROW sums to 1. A single column on its own does not.
y_pred_proba_knn = knn.predict_proba(X_test_scaled[0:10])
print(knn.classes_)
print(y_pred_proba_knn)

# Random Forest Benchmark

In [None]:
# Random-forest baseline on all scaled features, then again on the features
# chosen by SelectFromModel.

# See what Random Forest offers
clf = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_train_scaled, y_train)

print(f"RandomForestClassifier Training Score: {clf.score(X_train_scaled, y_train)}")
print(f"RandomForestClassifier Testing Score: {clf.score(X_test_scaled, y_test)}")

feature_importances = clf.feature_importances_

print(feature_importances)

# Sort ascending so the most important feature ends up at the top of the bar chart.
features = sorted(zip(X.columns, feature_importances), key=lambda pair: pair[1])
cols = [name for name, _ in features]
width = [importance for _, importance in features]

fig, ax = plt.subplots()

# Scale the figure height with the number of bars instead of a fixed 50 inches.
fig.set_size_inches(10, max(4, 0.4 * len(cols)))
plt.margins(y=0.01)

ax.barh(y=cols, width=width)
ax.set_xlabel("Feature importance")
ax.set_title("RandomForestClassifier feature importances")

plt.show()

# Now try with the selected features
sel = SelectFromModel(clf).fit(X_train_scaled, y_train)

# Reduce the EXISTING train and test splits to the selected columns.
# Re-splitting the training data here would (a) score on rows the selector was
# fitted on, (b) never touch the real test set, and (c) overwrite
# y_train / y_test so they no longer line up with X_train_scaled / X_test_scaled.
# The inputs are already standardized, so no second scaler is needed.
X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)

clf = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_selected_train, y_train)

print(f"SelectFromModel RandomForestClassifier Training Score: {clf.score(X_selected_train, y_train)}")
print(f"SelectFromModel RandomForestClassifier Testing Score: {clf.score(X_selected_test, y_test)}")

# Extra Trees Benchmark

In [None]:
# Extra-trees baseline on all scaled features, then again on the features
# chosen by SelectFromModel. random_state is fixed so the scores are repeatable.
model = ExtraTreesClassifier(random_state=0).fit(X_train_scaled, y_train)

print(f"\n\nExtraTreesClassifier Training Score: {model.score(X_train_scaled, y_train)}")
print(f"ExtraTreesClassifier Testing Score: {model.score(X_test_scaled, y_test)}")

feature_importances = model.feature_importances_

# Sort ascending so the most important feature ends up at the top of the bar chart.
features = sorted(zip(X.columns, feature_importances), key=lambda pair: pair[1])
cols = [name for name, _ in features]
width = [importance for _, importance in features]

fig, ax = plt.subplots()

# Scale the figure height with the number of bars instead of a fixed 50 inches.
fig.set_size_inches(10, max(4, 0.4 * len(cols)))
plt.margins(y=0.01)

ax.barh(y=cols, width=width)
ax.set_xlabel("Feature importance")
ax.set_title("ExtraTreesClassifier feature importances")

plt.show()

# Now try with the selected features
sel = SelectFromModel(model).fit(X_train_scaled, y_train)

# Reduce the EXISTING train and test splits to the selected columns.
# Re-splitting the training data here would score on rows the selector was
# fitted on, ignore the real test set, and overwrite y_train / y_test so they
# no longer match X_train_scaled / X_test_scaled. The inputs are already
# standardized, so no second scaler is needed.
X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)

model = ExtraTreesClassifier(random_state=0).fit(X_selected_train, y_train)

print(f"SelectFromModel ExtraTreesClassifier Training Score: {model.score(X_selected_train, y_train)}")
print(f"SelectFromModel ExtraTreesClassifier Testing Score: {model.score(X_selected_test, y_test)}")

# PCA/T-SNE Benchmark

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Put every feature on zero mean / unit variance so that columns holding large
# values do not outweigh columns holding small ones.
clean_scaled = StandardScaler().fit(X).transform(X)
print(clean_scaled)

In [None]:
# Reduce dimensionality with PCA. A float n_components in (0, 1) tells PCA to
# keep as many components as are needed to explain that share of the variance
# (here 90%), so the number of output columns is decided by the data.
pca = PCA(n_components=0.90)

# Fit on the standardized features and project them onto the kept components.
clean_pca = pca.fit_transform(clean_scaled)

In [None]:
# Wrap the PCA output in a frame (columns get default integer names).
clean_df_pca = pd.DataFrame(clean_pca)
clean_df_pca

In [None]:
# Share of the total variance explained by each kept principal component.
pca.explained_variance_ratio_

In [None]:
# Further reduce the PCA output with t-SNE for plotting.

# t-SNE is stochastic: fix random_state so the embedding (and every plot built
# from it) is the same on each run. n_components is left at its default of 2.
tsne = TSNE(learning_rate=50, random_state=0)

# Reduce dimensions
tsne_features = tsne.fit_transform(clean_pca)

# (n_samples, 2): one 2-D point per row of clean_pca
tsne_features.shape

In [None]:
# Attach the two t-SNE coordinates to the PCA frame as columns "x" and "y".
for axis_index, axis_name in enumerate(("x", "y")):
    clean_df_pca[axis_name] = tsne_features[:, axis_index]

# Scatter the embedding to look for visible clusters.
plt.scatter(x=clean_df_pca["x"], y=clean_df_pca["y"])
plt.show()

In [None]:
# Colour each t-SNE point by the row's ADJ Salary.
labels = new_df["ADJ Salary"]

# Visualize the clusters with color.
# The colour encodes a continuous value, so a colorbar is the right key;
# plt.legend() has nothing to show because no artist carries a label.
points = plt.scatter(clean_df_pca["x"], clean_df_pca["y"], c=labels)
plt.colorbar(points, label="ADJ Salary")
plt.show()

# EDA - optional; the model benchmarks above do not depend on this section

In [None]:
# Print a summary of df: column names, dtypes and non-null counts.
df.info()

In [None]:
# Show 25 randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(25)

In [None]:
# Pairwise correlation of the numeric columns, shaded by strength.
# df also holds string and categorical columns; restrict to numeric ones
# explicitly, because recent pandas raises on non-numeric columns instead of
# silently dropping them.
corr_matrix = df.select_dtypes(include="number").corr()

# Keep the numbers (corr_matrix) and the styled view (corr) under separate names.
corr = corr_matrix.style.background_gradient(cmap='Purples')
corr

In [None]:
# Histogram grid of df's columns on a 15x15 inch figure.
df.hist(figsize = (15, 15))  

In [None]:
# Pairwise scatter plots (upper triangle only) of the batting stats and salary.
pair_columns = ["GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]
sns.PairGrid(df[pair_columns]).map_upper(plt.scatter)

In [None]:
# Write the current df to cleaned_hitter.csv in the working directory, without the index column.
df.to_csv("cleaned_hitter.csv", index=False)

In [None]:
# Summary of df again (column names, dtypes, non-null counts).
df.info()

In [None]:
# Number of rows per distinct yearID value, most frequent first.
df["yearID"].value_counts()

In [None]:
# Number of rows per distinct playerID value, most frequent first.
df["playerID"].value_counts()

In [None]:
# Number of rows per distinct teamID value, most frequent first.
df["teamID"].value_counts()

In [None]:
# Number of rows per distinct H value, most frequent first.
df["H"].value_counts() 

In [None]:
# Number of rows per distinct R value, most frequent first.
df["R"].value_counts() 

In [None]:
# Number of rows per distinct RBI value, most frequent first.
df["RBI"].value_counts() 

In [None]:
# Number of rows per distinct AB value, most frequent first.
df["AB"].value_counts() 

In [None]:
# Number of rows per distinct ADJ Salary value, most frequent first.
df["ADJ Salary"].value_counts() 

In [None]:
# Per-salary-group views of the batting statistics.
#
# Fixes relative to the earlier version of this cell:
#   * it read `ml_df`, which is never defined in this notebook; `new_df` is the
#     frame that carries the "ADJ Salary Group" column;
#   * it plotted column "G", which is not among the columns kept during
#     cleaning — "GS" is;
#   * seaborn renamed the `size` argument to `height`;
#   * sns.distplot is deprecated; histplot with kde=True / stat="density"
#     draws the equivalent histogram-plus-density figure.
stat_columns = ["GS", "AB", "R", "H", "RBI"]

# Distribution of each statistic (and of salary itself), one colour per group.
for column in stat_columns + ["ADJ Salary"]:
    grid = sns.FacetGrid(new_df, hue="ADJ Salary Group", height=5)
    grid.map(sns.histplot, column, kde=True, stat="density").add_legend()

plt.show()

# Box plot of each statistic by salary group.
for column in stat_columns:
    sns.boxplot(x="ADJ Salary Group", y=column, data=new_df)
    plt.show()

# Pairwise relationships between the statistics, coloured by salary group.
sns.set_style("whitegrid")
sns.pairplot(new_df[stat_columns + ["ADJ Salary Group"]], hue="ADJ Salary Group", height=3);
plt.show()