# EDA STARTER

In [None]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler

# Raise the pandas display limits so long/wide frames are shown in the
# notebook without being truncated. set_option accepts (name, value) pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [None]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings, which can mask real problems in later cells.
warnings.simplefilter('ignore')

In [None]:
# Load the hitters salary dataset; the path is relative, so the CSV must be in
# the notebook's working directory.
df = pd.read_csv("Hitters_Adjusted_Salary.csv")

In [None]:
# Print the frame summary: column names, non-null counts, dtypes, memory usage.
df.info()

In [None]:
# Show 25 rows drawn at random; no random_state is passed, so the rows shown
# differ between runs.
df.sample(25)

In [None]:
# Pairwise correlation of the numeric columns, rendered as a shaded table.
# The frame is restricted to numeric dtypes first: on pandas >= 2.0 a bare
# df.corr() raises ValueError when any column is non-numeric (playerID and
# teamID are presumably string identifiers here).
corr = df.select_dtypes(include="number").corr()
# Rebind corr to the Styler so the cell output is the colour-graded view.
corr = corr.style.background_gradient(cmap='Purples')
corr

In [None]:
# Keep only the identifier columns, the batting statistics used as model
# features, and the salary target. .copy() makes the result an independent
# frame, so the later in-place assignment to df["ADJ Salary"] does not write
# into a slice of the originally loaded data.
df = df[["yearID", "playerID", "teamID", "GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]].copy()

In [None]:
# Display the current contents of df as the cell output.
df

In [None]:
# Convert the salary to whole numbers: round to the nearest integer value
# first, then cast. Casting before rounding would truncate the fractional part
# and leave round() with nothing to do.
df["ADJ Salary"] = df["ADJ Salary"].round().astype("int")

In [None]:
# Keep one row per (yearID, playerID): the first occurrence wins and any later
# rows with the same pair are discarded.
df = df.drop_duplicates(subset=["yearID", "playerID"], keep="first")
# Renumber the index 0..n-1 and discard the old index values.
df = df.reset_index(drop=True)
df

In [None]:
# Count rows flagged as repeats of an earlier (yearID, playerID, teamID) row.
# If df has already been de-duplicated on (yearID, playerID), this can only
# report False, because any repeat on three keys is also a repeat on two.
df.duplicated(subset=["yearID", "playerID", "teamID"]).value_counts()

In [None]:
# Write the current df to cleaned_hitter.csv in the working directory, without
# the index column.
df.to_csv("cleaned_hitter.csv", index=False)

In [None]:
# Re-print the frame summary (columns, non-null counts, dtypes) for the
# current df.
df.info()

In [None]:
# Number of rows per distinct yearID, most frequent first.
df["yearID"].value_counts()

In [None]:
# Number of rows per distinct playerID, most frequent first.
df["playerID"].value_counts()

In [None]:
# Number of rows per distinct teamID, most frequent first.
df["teamID"].value_counts()

In [None]:
# Frequency of each distinct value in the H column, most frequent first.
df["H"].value_counts() 

In [None]:
# Frequency of each distinct value in the R column, most frequent first.
df["R"].value_counts() 

In [None]:
# Frequency of each distinct value in the RBI column, most frequent first.
df["RBI"].value_counts() 

In [None]:
# Frequency of each distinct value in the AB column, most frequent first.
df["AB"].value_counts() 

In [None]:
# Frequency of each distinct ADJ Salary value, most frequent first.
df["ADJ Salary"].value_counts() 

In [None]:
# Snapshot of the current df; the "Improved LR" cell restores df from this
# copy before modelling.
new_df = df.copy()

# Visualizations

In [None]:
# Pairwise correlation of the numeric columns of the current df, rendered as a
# shaded table. Non-numeric columns are excluded up front: on pandas >= 2.0 a
# bare df.corr() raises ValueError if any column is non-numeric (playerID and
# teamID are presumably string identifiers).
corr = df.select_dtypes(include="number").corr()
# Rebind corr to the Styler so the cell output is the colour-graded view.
corr = corr.style.background_gradient(cmap='Purples')
corr

In [None]:
# One histogram per numeric column of df, laid out on a 15x15 inch figure.
df.hist(figsize = (15, 15))  

In [None]:
# Pairwise scatter plots of the batting statistics and salary. Only the upper
# triangle of the grid is drawn (map_upper); the diagonal and lower triangle
# are left empty.
sns.PairGrid(df[["GS", "AB", "R", "H", "2B", "GIDP", "IBB", "BB", "RBI", "HR", "ADJ Salary"]]).map_upper(plt.scatter)

# Vanilla LR

In [None]:
# Baseline linear regression: predict "ADJ Salary" from every remaining column
# once the identifier columns and the target itself are removed.
X = df.drop(columns=["ADJ Salary", "yearID", "playerID", "teamID"])
y = df["ADJ Salary"]

# Hold out a test partition (default split size, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Learn the standardization parameters from the training partition only, then
# apply that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the standardized features and report the score on each partition
# (LinearRegression.score returns the coefficient of determination, R^2).
model = LinearRegression()
model.fit(X_train_scaled, y_train)
print(f"STD SCALER Linear Regression Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"STD SCALER Linear Regression Testing Data Score: {model.score(X_test_scaled, y_test)}")

# Refit on the raw, unscaled features for comparison; `model` is rebound, so
# after this cell it refers to the unscaled fit.
model = LinearRegression()
model.fit(X_train, y_train)
print(f"NO SCALER Linear Regression Training Data Score: {model.score(X_train, y_train)}")
print(f"NO SCALER Linear Regression Testing Data Score: {model.score(X_test, y_test)}")

# Improved LR

In [None]:
# Start again from the saved snapshot so edits made to df by earlier cells do
# not leak into this experiment.
df = new_df.copy()

# Target is the adjusted salary; features are all other columns except the
# identifiers.
y = df["ADJ Salary"]
X = df.drop(columns=["ADJ Salary", "yearID", "playerID", "teamID"])

# Train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize: fit on the training partition and transform it in one step,
# then reuse the fitted scaler on the test partition.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear regression on standardized features; score() reports R^2.
model = LinearRegression()
model.fit(X_train_scaled, y_train)
print(f"STD SCALER Linear Regression Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"STD SCALER Linear Regression Testing Data Score: {model.score(X_test_scaled, y_test)}")

# Same model on the unscaled features; `model` ends the cell bound to this fit.
model = LinearRegression()
model.fit(X_train, y_train)
print(f"NO SCALER Linear Regression Training Data Score: {model.score(X_train, y_train)}")
print(f"NO SCALER Linear Regression Testing Data Score: {model.score(X_test, y_test)}")

# New take on modeling