# **LINEAR REGRESSION BLOCK START**

In [None]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Raise pandas' display limits (rows, columns, console width) so large
# DataFrames are shown in the notebook without being truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
import warnings
# NOTE: with no category/module filter this silences every warning for the
# rest of the session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

In [None]:
# Load the adjusted-salary hitters table, then replace the abbreviated stat
# headers with descriptive column names.
filepath = "Hitters_Adjusted_Salary.csv"
df = pd.read_csv(filepath, encoding="utf-8", low_memory=False)
abbr_dict = {
    "GS": "games_started",
    "InnOuts": "inning_outs",
    "G": "games_played",
    "AB": "at_bats",
    "R": "runs",
    "H": "hits",
    "2B": "doubles",
    "3B": "triples",
    "HR": "home_runs",
    "RBI": "runs_batted_in",
    "SB": "stolen_bases",
    "CS": "caught_stealing",
    "BB": "base_on_balls",
    "SO": "strike_outs",
    "IBB": "intentional_walks",
    "HBP": "hit_by_pitch",
    "SH": "sacrifice_hits",
    "SF": "sacrifice_flies",
    "GIDP": "ground_into_double_play",
}

df = df.rename(columns=abbr_dict)
df.head()

# Duplicate years code
This section investigates the duplicate-years issue: some `playerID`/`yearID` pairs appear in more than one row of the dataset.

In [None]:
# Tally the rows for each (playerID, yearID) pair into a 'counts' column.
year_count = pd.DataFrame(
    df[['playerID', 'yearID']]
    .groupby(['playerID', 'yearID'])
    .value_counts()
    .sort_values()
    .reset_index(name='counts')
)
year_count.head()

In [None]:
# Attach the per-pair row count to every record and keep only the
# player/year pairs that occur exactly once.
single_years_df = (
    df.merge(year_count, on=['playerID', 'yearID'])
    .query('counts == 1')
)

single_years_df.head()

In [None]:
# Remove identifier, raw-salary and fielding columns, then renumber the rows.
cleaned_df = single_years_df.drop(
    columns=["Unnamed: 0", "yearID", "playerID", "teamID", "lgID", "salary", "PO", "A", "E", "DP"]
).reset_index(drop=True)
cleaned_df.head()
    

In [None]:
df.shape

In [None]:
# Fraction of rows lost: (15023 - 12322) / 15023.
# NOTE(review): presumably 15023 is the row count of `df` and 12322 the row
# count after removing duplicate player/years -- the values are hard-coded,
# so confirm them against the current data before relying on this number.
(15023-12322)/15023

In [None]:

df.query('playerID=="whitema01" & yearID==1996').sort_values('teamID')


In [None]:
# code below requires downloading the original data from here: https://www.seanlahman.com/baseball-archive/statistics/
# The path is relative to the notebook's working directory ("core/" folder).
filepath = "core/Salaries.csv"
salary_df = pd.read_csv(filepath, encoding="utf-8", low_memory=False)
salary_df.head()

In [None]:
# Load the batting table and keep only the seasons from 1985 onward.
filepath = "core/Batting.csv"
batting_df = pd.read_csv(filepath, encoding="utf-8", low_memory=False)
batting_df = batting_df.loc[batting_df['yearID'].ge(1985)]
batting_df.head()

In [None]:
# Inner-join salaries to batting rows on player, season, team and league.
combined_df = salary_df.merge(
    batting_df,
    on=['playerID', 'yearID', 'teamID', 'lgID'],
)
combined_df.head()

In [None]:
combined_df.query('playerID=="whitema01" & yearID==1996').sort_values('teamID')


In [None]:
# Keep only the salary columns of the working dataset, then right-join them
# onto the salary/batting table so every combined row is retained.
red_df = df.loc[:, ['playerID', 'yearID', 'salary', "ADJ Salary"]]
red_comb_df = red_df.merge(
    combined_df,
    on=['playerID', 'yearID', 'salary'],
    how="right",
)
red_comb_df.head()

In [None]:
red_comb_df.query('playerID=="whitema01" & yearID==1996').sort_values('teamID')

# END duplicate years code

In [None]:
def load_dataset(filepath="Hitters_Adjusted_Salary.csv"):
    """Read the hitters dataset CSV and return it as a pandas DataFrame.

    Args:
        filepath: Path of the CSV file to read. Defaults to
            "Hitters_Adjusted_Salary.csv" (relative to the working
            directory), so existing zero-argument calls behave as before.

    Returns:
        pandas.DataFrame holding the contents of the file.
    """
    # low_memory=False makes pandas parse the whole file at once instead of
    # in chunks, which avoids mixed-dtype inference within a column.
    return pd.read_csv(filepath, encoding="utf-8", low_memory=False)

## Keep this code!

In [None]:
def clean_dataset(a_df):
    """Return a de-duplicated, NaN-free, index-reset copy of ``a_df``.

    Processing steps:
      1. Discard every row whose (playerID, yearID) pair occurs more than
         once in ``a_df`` -- all rows of a repeated pair are removed, not
         just the extra ones.
      2. Drop rows containing any missing value.
      3. Drop the identifier, raw-salary and fielding columns.
      4. Reset the index to 0..n-1.

    Only ``a_df`` is read; the function does not depend on any module-level
    DataFrame and does not modify its argument.

    Args:
        a_df: DataFrame containing at least the columns "Unnamed: 0",
            "yearID", "playerID", "teamID", "lgID", "salary", "PO", "A",
            "E" and "DP".

    Returns:
        A new DataFrame with the remaining columns and a fresh RangeIndex.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    key_columns = ["playerID", "yearID"]

    # keep=False flags *every* member of a repeated pair, so negating the
    # mask keeps exactly the player/years that appear once.
    repeated_pair = a_df.duplicated(subset=key_columns, keep=False)
    cleaned = a_df.loc[~repeated_pair]

    # No separate drop_duplicates() is needed: rows that are identical in
    # full also share their key pair and were removed above.
    cleaned = cleaned.dropna()

    cleaned = cleaned.drop(
        columns=["Unnamed: 0", "yearID", "playerID", "teamID", "lgID",
                 "salary", "PO", "A", "E", "DP"]
    )

    return cleaned.reset_index(drop=True)

In [None]:
def rename_columns(a_df):
    """Swap abbreviated batting-stat headers for descriptive column names.

    Args:
        a_df: DataFrame whose columns may include the abbreviations below.

    Returns:
        A renamed DataFrame; columns that are not in the mapping keep their
        current names and the input frame is not modified.
    """
    readable_names = {
        "GS": "games_started",
        "InnOuts": "inning_outs",
        "G": "games_played",
        "AB": "at_bats",
        "R": "runs",
        "H": "hits",
        "2B": "doubles",
        "3B": "triples",
        "HR": "home_runs",
        "RBI": "runs_batted_in",
        "SB": "stolen_bases",
        "CS": "caught_stealing",
        "BB": "base_on_balls",
        "SO": "strike_outs",
        "IBB": "intentional_walks",
        "HBP": "hit_by_pitch",
        "SH": "sacrifice_hits",
        "SF": "sacrifice_flies",
        "GIDP": "ground_into_double_play",
    }
    return a_df.rename(columns=readable_names)

In [None]:
def examine_dataset(a_df):
    """Print a summary of the DataFrame and draw histograms of its columns.

    Args:
        a_df: DataFrame to inspect.

    Returns:
        None. Output is the printed ``info()`` report and the histogram
        figure created by ``DataFrame.hist``.
    """
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    a_df.info()

    a_df.hist(figsize=(15, 15))
    

In [None]:
def round_salaries(a_df):
    """Round the "ADJ Salary" column to whole numbers and store it as int.

    The column is updated in place on ``a_df`` and the same DataFrame object
    is returned, so both ``round_salaries(df)`` and
    ``df = round_salaries(df)`` work.

    Args:
        a_df: DataFrame with a numeric "ADJ Salary" column.

    Returns:
        ``a_df`` itself, with "ADJ Salary" rounded and cast to int.
    """
    # Round first, then cast: casting to int first truncates the fraction
    # (1.6 -> 1) and leaves nothing for round() to do.
    a_df["ADJ Salary"] = a_df["ADJ Salary"].round().astype("int")

    return a_df

In [None]:
def run_regression(a_df):
    """Fit ordinary least squares on ``log_of_salary`` and report the results.

    The features are every column except "log_of_salary" and "ADJ Salary".
    They are split into train/test sets (random_state=0), standardized with
    a scaler fitted on the training portion only, and used to fit a
    LinearRegression model. The function prints the feature names, test MSE
    and R2, the train/test scores, and shows a bar chart of the coefficients.

    Args:
        a_df: DataFrame containing "log_of_salary", "ADJ Salary" and the
            feature columns.

    Returns:
        None.
    """
    # Features / target
    X = a_df.drop(columns=["log_of_salary", "ADJ Salary"])
    y = a_df["log_of_salary"]
    print(X.columns)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Standardize using statistics learned from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Hold-out error metrics
    predicted = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    print(f"MSE: {mse}")
    print(f"R2: {r2}")

    # R2 on both partitions
    train_score = model.score(X_train_scaled, y_train)
    print(f"Linear Regression Training Data Score: {train_score}")
    test_score = model.score(X_test_scaled, y_test)
    print(f"Linear Regression Testing Data Score: {test_score}")

    # One bar per fitted coefficient, in feature order.
    coefficients = model.coef_
    plt.bar(np.arange(len(coefficients)), coefficients)
    plt.title('Linear Regression coefficient plot')
    plt.show()

In [None]:
def run_LASSO(a_df):
    """Fit a LASSO regression on ``log_of_salary`` and report the results.

    Stage 1: split the features (all columns except "log_of_salary" and
    "ADJ Salary") with random_state=0, standardize them with a scaler fitted
    on the training rows, fit ``Lasso()`` on the *scaled* training data and
    print test MSE/R2 plus train/test scores and a coefficient bar chart.

    Stage 2: use ``SelectFromModel`` (which refits a clone of the estimator
    on the scaled training data) to pick features, re-split the selected
    columns with random_state=1, standardize again and print the test score
    of a plain ``LinearRegression`` trained on the reduced feature set.

    Args:
        a_df: DataFrame containing "log_of_salary", "ADJ Salary" and the
            feature columns.

    Returns:
        None.
    """
    # Assign X and y
    X = a_df.drop(["log_of_salary", "ADJ Salary"], axis=1)
    y = a_df["log_of_salary"]

    # Split the data into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Standardize with statistics learned from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit on the scaled training data: the model is evaluated on scaled
    # inputs below, so it must be trained in the same feature space.
    lasso_reg = Lasso().fit(X_train_scaled, y_train)

    predicted = lasso_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    print(f"MSE: {mse}")
    print(f"R2: {r2}")

    # Score the model
    print(f"LASSO Regression Training Data Score: {lasso_reg.score(X_train_scaled, y_train)}")
    print(f"LASSO Regression Testing Data Score: {lasso_reg.score(X_test_scaled, y_test)}")

    plt.bar(np.arange(len(lasso_reg.coef_)), lasso_reg.coef_)
    plt.title('LASSO Regression coefficient plot')
    plt.show()

    # Feature selection driven by the LASSO coefficients.
    sel = SelectFromModel(lasso_reg)
    sel.fit(X_train_scaled, y_train)

    # Re-split only the selected columns (note: a different random_state, so
    # this is not the same partition as stage 1).
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(
        sel.transform(X), y, random_state=1
    )

    selected_scaler = StandardScaler().fit(X_selected_train)
    X_selected_train_scaled = selected_scaler.transform(X_selected_train)
    X_selected_test_scaled = selected_scaler.transform(X_selected_test)

    refit_model = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {refit_model.score(X_selected_test_scaled, y_test)}")

In [None]:
def run_Ridge(a_df):
    """Fit a Ridge regression on ``log_of_salary`` and report the results.

    Stage 1: split the features (all columns except "log_of_salary" and
    "ADJ Salary") with random_state=0, standardize them with a scaler fitted
    on the training rows, fit ``Ridge()`` on the *scaled* training data and
    print the feature names, test MSE/R2, train/test scores and a
    coefficient bar chart.

    Stage 2: use ``SelectFromModel`` (which refits a clone of the estimator
    on the scaled training data) to pick features, print the selection mask,
    re-split the selected columns with random_state=1, standardize again and
    print the test score of a plain ``LinearRegression`` trained on the
    reduced feature set.

    Args:
        a_df: DataFrame containing "log_of_salary", "ADJ Salary" and the
            feature columns.

    Returns:
        None.
    """
    # Assign X and y
    X = a_df.drop(["log_of_salary", 'ADJ Salary'], axis=1)
    y = a_df["log_of_salary"]
    print(X.columns)

    # Split the data into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Standardize with statistics learned from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit on the scaled training data: the model is evaluated on scaled
    # inputs below, so it must be trained in the same feature space.
    ridge_reg = Ridge().fit(X_train_scaled, y_train)

    predicted = ridge_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    print(f"MSE: {mse}")
    print(f"R2: {r2}")

    # Score the model
    print(f"Ridge Regression Training Data Score: {ridge_reg.score(X_train_scaled, y_train)}")
    print(f"Ridge Regression Testing Data Score: {ridge_reg.score(X_test_scaled, y_test)}")

    plt.bar(np.arange(len(ridge_reg.coef_)), ridge_reg.coef_)
    plt.title('Ridge Regression coefficient plot')
    plt.show()

    # Feature selection driven by the Ridge coefficients.
    sel = SelectFromModel(ridge_reg)
    sel.fit(X_train_scaled, y_train)
    print(sel.get_support())

    # Re-split only the selected columns (note: a different random_state, so
    # this is not the same partition as stage 1).
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(
        sel.transform(X), y, random_state=1
    )

    selected_scaler = StandardScaler().fit(X_selected_train)
    X_selected_train_scaled = selected_scaler.transform(X_selected_train)
    X_selected_test_scaled = selected_scaler.transform(X_selected_test)

    refit_model = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {refit_model.score(X_selected_test_scaled, y_test)}")

In [None]:
def run_ElasticNet(a_df):
    """Fit an ElasticNet regression on ``log_of_salary`` and report the results.

    Stage 1: split the features (all columns except "log_of_salary" and
    "ADJ Salary") with random_state=0, standardize them with a scaler fitted
    on the training rows, fit ``ElasticNet()`` on the *scaled* training data
    and print test MSE/R2 plus train/test scores and a coefficient bar chart.

    Stage 2: use ``SelectFromModel`` (which refits a clone of the estimator
    on the scaled training data) to pick features, re-split the selected
    columns with random_state=1, standardize again and print the test score
    of a plain ``LinearRegression`` trained on the reduced feature set.

    Args:
        a_df: DataFrame containing "log_of_salary", "ADJ Salary" and the
            feature columns.

    Returns:
        None.
    """
    # Assign X and y
    X = a_df.drop(["log_of_salary", "ADJ Salary"], axis=1)
    y = a_df["log_of_salary"]

    # Split the data into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Standardize with statistics learned from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit on the scaled training data: the model is evaluated on scaled
    # inputs below, so it must be trained in the same feature space.
    elasticnet_reg = ElasticNet().fit(X_train_scaled, y_train)

    predicted = elasticnet_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    print(f"MSE: {mse}")
    print(f"R2: {r2}")

    # Score the model
    print(f"ElasticNet Regression Training Data Score: {elasticnet_reg.score(X_train_scaled, y_train)}")
    print(f"ElasticNet Regression Testing Data Score: {elasticnet_reg.score(X_test_scaled, y_test)}")

    plt.bar(np.arange(len(elasticnet_reg.coef_)), elasticnet_reg.coef_)
    plt.title('ElasticNet Regression coefficient plot')
    plt.show()

    # Feature selection driven by the ElasticNet coefficients.
    sel = SelectFromModel(elasticnet_reg)
    sel.fit(X_train_scaled, y_train)

    # Re-split only the selected columns (note: a different random_state, so
    # this is not the same partition as stage 1).
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(
        sel.transform(X), y, random_state=1
    )

    selected_scaler = StandardScaler().fit(X_selected_train)
    X_selected_train_scaled = selected_scaler.transform(X_selected_train)
    X_selected_test_scaled = selected_scaler.transform(X_selected_test)

    refit_model = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {refit_model.score(X_selected_test_scaled, y_test)}")

In [None]:
loaded_df = load_dataset()

In [None]:
# Store the adjusted salary as an integer column.
loaded_df["ADJ Salary"] = loaded_df["ADJ Salary"].astype(int)

# Keep only rows with a strictly positive adjusted salary; the logarithm
# taken later in the notebook is undefined for values <= 0.
loaded_df = loaded_df.loc[~(loaded_df["ADJ Salary"] <= 0), :]

In [None]:
clean_df = clean_dataset(loaded_df)

In [None]:
clean_df = rename_columns(clean_df)

In [None]:
clean_df.head(10)


In [None]:
examine_dataset(clean_df)

In [None]:
clean_df = round_salaries(clean_df)

In [None]:
# Regression target: natural logarithm of the adjusted salary.
clean_df["log_of_salary"] = np.log(clean_df["ADJ Salary"])

In [None]:
clean_df["log_of_salary"].describe()

In [None]:
run_regression(clean_df)

In [None]:
clean_df.head()

In [None]:
run_regression(clean_df)

In [None]:
run_LASSO(clean_df)

In [None]:
run_Ridge(clean_df)

In [None]:
clean_df.head()

In [None]:
run_ElasticNet(clean_df)

# Results:

- Using the logarithm of the ADJ Salary column values improved the regression score
    - I had to eliminate any salary values <= 0 to make this work, because the logarithm is undefined for non-positive values
- Ridge regression achieved the best score: 0.7291495360671353

# To Do:

- Add / remove features from the dataset?


# **LINEAR REGRESSION BLOCK END**

# **PCA START BLOCK**

In [None]:
clean_df.head(10)

In [None]:
clean_df.columns

In [None]:
#import dependencies
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Work on a copy of the cleaned data without the raw "ADJ Salary" column
# (the "log_of_salary" column is still part of this frame).
clean_df2 = clean_df.drop(columns=["ADJ Salary"])
clean_df2.head()

In [None]:
clean_df2.shape

In [None]:
# Standardize every column (zero mean, unit variance) so that columns with
# larger values do not influence the outcome more than columns with smaller
# values. The result is a NumPy array, not a DataFrame.
clean_scaled = StandardScaler().fit_transform(clean_df2)
print(clean_scaled)

In [None]:
# Perform dimensionality reduction with PCA, keeping as many components as
# are needed to preserve 90% of the explained variance (n_components=0.90).
# Initialize PCA model
pca = PCA(n_components=0.90)

# Fit PCA on the standardized features and project them onto the retained
# principal components.
clean_pca = pca.fit_transform(clean_scaled)

In [None]:
clean_pca.shape

In [None]:
clean_df_pca = pd.DataFrame(data=clean_pca)
clean_df_pca

In [None]:
# Transform PCA data to a DataFrame.
# PCA(n_components=0.90) chooses the number of components from the data, so
# the column names are generated from the actual width of clean_pca instead
# of being hard-coded to eight (which raises if the count ever differs).
clean_df_pca = pd.DataFrame(
    data=clean_pca,
    columns=[
        f"principal component {number}"
        for number in range(1, clean_pca.shape[1] + 1)
    ],
)
clean_df_pca.head()

In [None]:
clean_df_pca.columns

In [None]:
# Fetch the explained variance
pca.explained_variance_ratio_

In [None]:
# Further reduce the dataset dimensions with t-SNE

# Initialize t-SNE model
# NOTE(review): no random_state is set, so the embedding may differ from run
# to run -- pass one if the plots need to be reproducible.
tsne = TSNE(learning_rate = 50)

# Reduce dimensions
tsne_features = tsne.fit_transform(clean_pca)

# The embedded dataset has 2 columns (TSNE's default n_components)
tsne_features.shape

In [None]:
# Prepare to plot the dataset
# NOTE: this adds "x" and "y" columns to clean_df2 in place, so re-running the
# earlier scaling/PCA cells afterwards would treat them as features.

# The first column of transformed features
clean_df2["x"] = tsne_features[:,0]

# The second column of transformed features
clean_df2["y"] = tsne_features[:,1]

# Scatter plot of the two t-SNE dimensions
plt.scatter(clean_df2["x"],clean_df2["y"])
plt.show()

In [None]:
labels = clean_df["ADJ Salary"]
labels.value_counts()

In [None]:
# Visualize the clusters with color
plt.scatter(clean_df2["x"],clean_df2["y"], c=labels)
plt.show()

# Perform a Cluster Analysis with K-means

In [None]:
#Windows KMeans bug fix.
# NOTE(review): this is set after numpy/scikit-learn were already imported;
# the variable may need to be set before those imports to take effect --
# TODO confirm.
import os
os.environ["OMP_NUM_THREADS"] = '1'

In [None]:
# Finding the best value for k using the Elbow Curve
inertia = []
k = list(range(1, 9))

# Fit K-Means on the PCA components for each k (1 through 8) and record the
# inertia of each fitted model.
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(clean_df_pca)
    inertia.append(km.inertia_)

# Collect the (k, inertia) pairs in a DataFrame for plotting the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow

In [None]:
# Plot the elbow curve to find the best candidate(s) for k
plt.plot(df_elbow['k'], df_elbow['inertia'])
# Tick marks are placed at 0 through 10 regardless of the k values plotted.
plt.xticks(list(range(11)))
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# If possible, determine where the elbow of the plot is, and at which value of k it appears.
# Create a function called `get_clusters(k, data)` that finds the `k` clusters using K-Means on `data`. The function should return a DataFrame copy of `Data` that should include a new column containing the clusters found.

def get_clusters(k, data):
    """Cluster ``data`` with K-Means and return a labelled copy.

    Args:
        k: Number of clusters to find.
        data: DataFrame of numeric features to cluster.

    Returns:
        A copy of ``data`` with an extra "class" column holding the cluster
        label assigned to each row. ``data`` itself is left unchanged, so
        calling the function again does not feed a previous "class" column
        back in as a feature.
    """
    # Initialize the K-Means model (fixed seed for repeatable clusters)
    model = KMeans(n_clusters=k, random_state=0)

    # Train the model; the labels of the training rows are kept in
    # model.labels_, so no separate predict() pass is needed.
    model.fit(data)

    # Attach the labels to a copy rather than to the caller's DataFrame.
    clustered = data.copy()
    clustered["class"] = model.labels_

    return clustered

In [None]:
clusters = get_clusters(4, clean_df_pca)
clusters

In [None]:
def show_clusters(df):
    """Scatter-plot the first two principal components, colored by cluster.

    Args:
        df: DataFrame with the columns 'principal component 1',
            'principal component 2' and 'class'.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    x_name = 'principal component 1'
    y_name = 'principal component 2'

    plt.scatter(df[x_name], df[y_name], c=df['class'])
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title("ADJ Salary clusters")
    plt.show()

In [None]:
show_clusters(clusters)

# **PCA END BLOCK**