# **LINEAR REGRESSION BLOCK START**

In [1]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [15]:
df = pd.read_csv("Hitters_Adjusted_Salary.csv")

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15023 entries, 0 to 15022
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   yearID      15023 non-null  int64  
 1   ADJ Salary  15023 non-null  float64
 2   GS          15023 non-null  float64
 3   InnOuts     15023 non-null  float64
 4   PO          15023 non-null  int64  
 5   A           15023 non-null  int64  
 6   E           15023 non-null  float64
 7   DP          15023 non-null  int64  
 8   G           15023 non-null  int64  
 9   AB          15023 non-null  int64  
 10  R           15023 non-null  int64  
 11  H           15023 non-null  int64  
 12  2B          15023 non-null  int64  
 13  3B          15023 non-null  int64  
 14  HR          15023 non-null  int64  
 15  RBI         15023 non-null  float64
 16  SB          15023 non-null  float64
 17  CS          15023 non-null  float64
 18  BB          15023 non-null  int64  
 19  SO          15023 non-nul

In [5]:
df.sample(25)

Unnamed: 0.1,Unnamed: 0,yearID,playerID,salary,ADJ Salary,GS,InnOuts,PO,A,E,DP,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
12166,12166,2010,schiena01,416500,494344.5,41.0,1627.0,118,7,1.0,1,SFN,NL,137,227,34,55,13,3,3,17.0,4.0,5.0,20,38.0,5.0,3.0,1.0,1.0,3.0
1827,1827,1989,phelpke01,655000,1367106.0,5.0,141.0,47,1,1.0,4,OAK,AL,11,9,0,1,1,0,0,0.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0,0.0
6570,6570,1999,velarra01,1600000,2485580.0,156.0,4075.0,298,493,14.0,104,ANA,AL,95,376,57,115,15,4,9,48.0,13.0,4.0,43,56.0,1.0,4.0,2.0,0.0,8.0
5642,5642,1997,nieveme01,335000,540197.4,84.0,2275.0,187,4,4.0,1,DET,AL,116,359,46,82,18,1,20,64.0,1.0,7.0,39,157.0,6.0,5.0,0.0,2.0,3.0
5800,5800,1997,francma01,160000,258004.7,23.0,723.0,61,52,4.0,11,NYN,NL,112,163,21,45,5,0,5,21.0,1.0,0.0,13,23.0,4.0,0.0,0.0,0.0,4.0
270,270,1985,kearnbo01,183000,440171.1,95.0,2442.0,529,50,3.0,7,SEA,AL,108,305,24,74,14,1,6,27.0,1.0,1.0,11,59.0,1.0,4.0,5.0,1.0,7.0
6297,6297,1998,newfima01,318750,506110.5,49.0,1107.0,73,3,3.0,0,MIL,NL,93,186,15,44,7,0,3,25.0,0.0,1.0,19,29.0,1.0,1.0,0.0,3.0,7.0
7449,7449,2000,hernara02,211000,317126.1,118.0,3188.0,764,43,13.0,7,OAK,AL,143,419,52,101,19,0,14,62.0,1.0,0.0,38,64.0,1.0,7.0,10.0,5.0,14.0
8868,8868,2003,perezti01,312500,439556.7,85.0,2233.0,180,6,2.0,1,NYN,NL,127,346,32,93,21,0,4,42.0,5.0,6.0,18,29.0,1.0,2.0,7.0,9.0,5.0
9990,9990,2006,lopezja01,9000000,11554060.0,34.0,926.0,222,14,1.0,2,BAL,AL,76,279,30,74,15,1,8,31.0,0.0,0.0,18,60.0,0.0,2.0,0.0,0.0,5.0


In [16]:
df = df.drop(columns=["Unnamed: 0", "playerID", "teamID", "lgID", "salary"])

In [17]:
df

Unnamed: 0,yearID,ADJ Salary,GS,InnOuts,PO,A,E,DP,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,1985,1.310892e+06,67.0,1698.0,314,35,4.0,1,70,208,12,42,6,0,0,20.0,0.0,1.0,22,12.0,1.0,1.0,4.0,2.0,8.0
1,1985,1.503317e+06,76.0,2097.0,384,48,6.0,4,96,282,15,61,9,0,3,25.0,0.0,3.0,29,25.0,1.0,1.0,0.0,4.0,15.0
2,1985,1.924245e+06,27.0,814.0,299,25,1.0,31,101,170,16,40,7,0,3,21.0,0.0,0.0,18,22.0,4.0,0.0,0.0,1.0,5.0
3,1985,6.013267e+05,124.0,3299.0,215,10,5.0,0,138,492,58,130,15,2,17,72.0,9.0,9.0,44,76.0,4.0,3.0,1.0,2.0,13.0
4,1985,3.607960e+06,125.0,3196.0,917,119,11.0,111,130,483,61,129,25,3,27,89.0,1.0,1.0,50,57.0,4.0,1.0,0.0,6.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15018,2016,6.739673e+06,80.0,2141.0,170,2,2.0,1,103,350,44,76,9,7,2,24.0,14.0,5.0,18,34.0,0.0,3.0,2.0,2.0,12.0
15019,2016,5.768082e+05,39.0,1071.0,285,12,2.0,30,104,196,16,46,4,0,5,26.0,0.0,0.0,20,38.0,0.0,2.0,1.0,5.0,4.0
15020,2016,5.650542e+05,48.0,1376.0,125,5,1.0,0,76,221,28,51,11,0,7,16.0,14.0,3.0,14,77.0,0.0,1.0,0.0,1.0,2.0
15021,2016,2.343639e+07,133.0,3474.0,201,5,1.0,1,143,525,84,128,28,0,21,69.0,5.0,1.0,71,139.0,0.0,4.0,0.0,6.0,17.0


In [None]:
df["ADJ Salary"] = df["ADJ Salary"].

In [None]:
def load_dataset():
    """ Reads dataset csv and returns pandas dataframe """
    
    filepath = "Hitters_Adjusted_Salary.csv"

    df = pd.read_csv(filepath, encoding="utf-8", low_memory=False)
    
    return df

In [None]:
def clean_dataset(a_df):
    """ Returns deduped, na-dropped, index-reset dataframe """    
    
    a_df = a_df.drop_duplicates()   
        
    a_df = a_df.dropna()
    
    a_df = a_df.drop(["Unnamed: 0", "yearID", "playerID", "teamID", "lgID", "salary", "PO", "A", "E", "DP"], axis=1)
        
    a_df = a_df.reset_index(drop=True)
    
    return a_df   

In [None]:
def rename_columns(a_df):
    """ Returns dataframe with meaningful column names """    
    
    abbr_dict = {"GS": "games_started", "InnOuts": "inning_outs", "G": "games_played", "AB": "at_bats",\
                "R": "runs", "H": "hits", "2B": "doubles", "3B": "triples", "HR": "home_runs",\
                "RBI": "runs_batted_in", "SB": "stolen_bases", "CS": "caught_stealing", "BB": "base_on_balls",\
                "SO": "strike_outs", "IBB": "intentional_walks", "HBP": "hit_by_pitch", "SH": "sacrifice_hits",\
                "SF": "sacrifice_flies", "GIDP": "ground_into_double_play"}
    
    a_df = a_df.rename(columns=abbr_dict)
    
    return a_df   

In [None]:
def examine_dataset(a_df):
    """ Provides summary info and visualizations of dataset """
    
    print(a_df.info())
    
    a_df.hist(figsize = (15, 15))  
    

In [None]:
def round_salaries(a_df):
    """ Returns dataframe with salary column made into int and rounded """

    a_df["ADJ Salary"] = a_df["ADJ Salary"].astype("int").round()

    return a_df

In [None]:
def run_regression(a_df, salary_col_name="ADJ Salary"):
    """ Runs linear regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop([salary_col_name], axis=1)
    y = a_df[salary_col_name]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = LinearRegression().fit(X_train_scaled, y_train)
    
    predicted = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 
    
    # Score the model

    print(f"Linear Regression Training Data Score: {model.score(X_train_scaled, y_train)}")
    print(f"Linear Regression Testing Data Score: {model.score(X_test_scaled, y_test)}")  
    
    plt.bar(np.arange(len(model.coef_)), model.coef_)
    plt.title(f'Linear Regression coefficient plot')
    plt.show()  

In [None]:
def run_LASSO(a_df):
    """ Runs LASSO regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop(["log_of_salary"], axis=1)
    y = a_df["log_of_salary"]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    lasso_reg = Lasso().fit(X_train, y_train)

    predicted = lasso_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 

    # Score the model

    print(f"LASSO Regression Training Data Score: {lasso_reg.score(X_train_scaled, y_train)}")
    print(f"LASSO Regression Testing Data Score: {lasso_reg.score(X_test_scaled, y_test)}")
    
    plt.bar(np.arange(len(lasso_reg.coef_)), lasso_reg.coef_)
    plt.title(f'LASSO Regression coefficient plot')
    plt.show()  
    
    sel = SelectFromModel(lasso_reg)
    sel.fit(X_train_scaled, y_train)
    SelectFromModel(estimator=Lasso())
    
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
    
    scaler = StandardScaler().fit(X_selected_train)
    
    X_selected_train_scaled = scaler.transform(X_selected_train)
    X_selected_test_scaled = scaler.transform(X_selected_test)

    new_lasso_reg = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {new_lasso_reg.score(X_selected_test_scaled, y_test)}")

In [None]:
def run_Ridge(a_df):
    """ Runs Ridge regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop(["log_of_salary"], axis=1)
    y = a_df["log_of_salary"]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ridge_reg = Ridge().fit(X_train, y_train)

    predicted = ridge_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 

    # Score the model

    print(f"Ridge Regression Training Data Score: {ridge_reg.score(X_train_scaled, y_train)}")
    print(f"Ridge Regression Testing Data Score: {ridge_reg.score(X_test_scaled, y_test)}")

    plt.bar(np.arange(len(ridge_reg.coef_)), ridge_reg.coef_)
    plt.title(f'Ridge Regression coefficient plot')
    plt.show()     
    
    sel = SelectFromModel(ridge_reg)
    sel.fit(X_train_scaled, y_train)
    SelectFromModel(estimator=Ridge())
    
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
    
    scaler = StandardScaler().fit(X_selected_train)
    
    X_selected_train_scaled = scaler.transform(X_selected_train)
    X_selected_test_scaled = scaler.transform(X_selected_test)

    new_ridge_reg = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {new_ridge_reg.score(X_selected_test_scaled, y_test)}")

In [None]:
def run_ElasticNet(a_df):
    """ Runs ElasticNet regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop(["log_of_salary"], axis=1)
    y = a_df["log_of_salary"]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    elasticnet_reg = ElasticNet().fit(X_train, y_train)
      
    predicted = elasticnet_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 

    # Score the model

    print(f"ElasticNet Regression Training Data Score: {elasticnet_reg.score(X_train_scaled, y_train)}")
    print(f"ElasticNet Regression Testing Data Score: {elasticnet_reg.score(X_test_scaled, y_test)}")
    
    plt.bar(np.arange(len(elasticnet_reg.coef_)), elasticnet_reg.coef_)
    plt.title(f'ElasticNet Regression coefficient plot')
    plt.show()  
    
    sel = SelectFromModel(elasticnet_reg)
    sel.fit(X_train_scaled, y_train)
    SelectFromModel(estimator=ElasticNet())
    
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
    
    scaler = StandardScaler().fit(X_selected_train)
    
    X_selected_train_scaled = scaler.transform(X_selected_train)
    X_selected_test_scaled = scaler.transform(X_selected_test)

    new_elasticnet_reg = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {new_elasticnet_reg.score(X_selected_test_scaled, y_test)}")

In [None]:
loaded_df = load_dataset()

In [None]:
loaded_df["ADJ Salary"] = loaded_df["ADJ Salary"].astype(int)

loaded_df = loaded_df.loc[~(loaded_df["ADJ Salary"] <= 0), :]

In [None]:
clean_df = clean_dataset(loaded_df)

In [None]:
clean_df = rename_columns(clean_df)

In [None]:
clean_df.head(10)

In [None]:
examine_dataset(clean_df)

In [None]:
clean_df = round_salaries(clean_df)

In [None]:
clean_df["log_of_salary"] = np.log(clean_df["ADJ Salary"])

In [None]:
clean_df["log_of_salary"].describe()

In [None]:
run_regression(clean_df, "ADJ Salary")

In [None]:
run_regression(clean_df, "log_of_salary")

In [None]:
run_LASSO(clean_df)

In [None]:
run_Ridge(clean_df)

In [None]:
run_ElasticNet(clean_df)

# Results:

- Using the logarithm of the ADJ Salary column values improved the regression score
    - I had to eliminate any salary values <= 0 to make this work
- Ridge regression achieved the best score: 0.7291495360671353

# To Do:

- Add / remove features from the dataset?


# **LINEAR REGRESSION BLOCK END**

# **PCA START BLOCK**

In [None]:
clean_df.head(10)

In [None]:
clean_df.columns

In [None]:
#import dependencies
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Drop ADJ Salary column
clean_df2= clean_df.drop(["ADJ Salary"], axis=1)
clean_df2.head()

In [None]:
clean_df2.shape

In [None]:
# Standardize the dataset so that columns that contain larger values do not influence the outcome more than columns with smaller values.
clean_scaled = StandardScaler().fit_transform(clean_df2)
print(clean_scaled)

In [None]:
# Perform dimensionality reduction with PCA preserving 90% of the explained variance ( n_components=0.90)
# Initialize PCA model
pca = PCA(n_components=0.90)

# Get two principal components for the iris data.
clean_pca = pca.fit_transform(clean_scaled)

In [None]:
clean_pca.shape

In [None]:
clean_df_pca = pd.DataFrame(data=clean_pca)
clean_df_pca

In [None]:
# Transform PCA data to a DataFrame
clean_df_pca = pd.DataFrame(data=clean_pca, columns=["principal component 1", "principal component 2",
                                                                "principal component 3","principal component 4",
                                                                 "principal component 5","principal component 6",
                                                                 "principal component 7","principal component 8"])
clean_df_pca.head()

In [None]:
clean_df_pca.columns

In [None]:
# Fetch the explained variance
pca.explained_variance_ratio_

In [None]:
# Further reduce the dataset dimensions with t-SNE

# Initialize t-SNE model
tsne = TSNE(learning_rate = 50)

# Reduce dimensions
tsne_features = tsne.fit_transform(clean_pca)

# The dataset has 2 columns
tsne_features.shape

In [None]:
# Prepare to plot the dataset

# The first column of transformed features
clean_df2["x"] = tsne_features[:,0]

# The second column of transformed features
clean_df2["y"] = tsne_features[:,1]

# Visualize the clusters
plt.scatter(clean_df2["x"],clean_df2["y"])
plt.show()

In [None]:
labels = clean_df["ADJ Salary"]
labels.value_counts()

In [None]:
# Visualize the clusters with color
plt.scatter(clean_df2["x"],clean_df2["y"], c=labels)
plt.show()

# Perform a Cluster Analysis with K-means

In [None]:
#Windows KMeans bug fix. 
import os
os.environ["OMP_NUM_THREADS"] = '1'

In [None]:
# Finding the best value for k using the Elbow Curve
inertia = []
k = list(range(1, 9))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(clean_df_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow

In [None]:
# Plot the elbow curve to find the best candidate(s) for k
plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(list(range(11)))
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# If possible, determine where the elbow of the plot is, and at which value of k it appears.
# Create a function called `get_clusters(k, data)` that finds the `k` clusters using K-Means on `data`. The function should return a DataFrame copy of `Data` that should include a new column containing the clusters found.

def get_clusters(k, data):
    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Train the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_

    return data

In [None]:
clusters = get_clusters(2, clean_df_pca)
clusters

In [None]:
def show_clusters(df):
    plt.scatter(df['principal component 1'], df['principal component 2'], c=df['class'])
    plt.xlabel('principal component 1')
    plt.ylabel('principal component 2')
    plt.title("ADJ Salary clusters")
    plt.show()

In [None]:
show_clusters(clusters)

# **PCA END BLOCK**