# **LINEAR REGRESSION BLOCK START**

In [1]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def load_dataset():
    """ Reads dataset csv and returns pandas dataframe """
    
    filepath = "Hitters_Adjusted_Salary.csv"

    df = pd.read_csv(filepath, encoding="utf-8", low_memory=False)
    
    return df

In [4]:
def clean_dataset(a_df):
    """ Returns deduped, na-dropped, index-reset dataframe """    
    
    a_df = a_df.drop_duplicates()   
        
    a_df = a_df.dropna()
    
    a_df = a_df.drop(["Unnamed: 0", "playerID", "teamID", "lgID", "salary", "PO", "A", "E", "DP"], axis=1)
    
    totalhits = a_df["H"]
    doubles = a_df["2B"]
    triples = a_df["3B"]
    homeruns = a_df["HR"]
    atbats = a_df["AB"]
    walks = a_df["BB"]
    hit_by_pitch = a_df["HBP"]
    sac_hits = a_df["SH"]
    sac_flies = a_df["SF"]
    singles = (totalhits - homeruns - triples - doubles)
        
    a_df["slug_%"] = (singles + 2*doubles + 3*triples + 4*homeruns) / atbats
    a_df["slug_%"] = a_df["slug_%"].fillna(0)
    a_df["avg"] = totalhits / atbats
    a_df["avg"] = a_df["avg"].fillna(0)
    a_df["plate_appearances"] = atbats + walks + hit_by_pitch + sac_hits + sac_flies
    a_df["avg"] = a_df["avg"].fillna(0)
    a_df["on_base_%"] = (totalhits + walks + hit_by_pitch) / (atbats + walks + hit_by_pitch + sac_flies)
    a_df["on_base_%"] = a_df["on_base_%"].fillna(0)
    a_df["1B"] = singles
    a_df["1B"] = a_df["1B"].fillna(0)
    a_df["year_for_salary"] = a_df["yearID"] + 1 
    a_df = a_df.drop(["yearID"], axis=1)
     
    a_df = a_df.reset_index(drop=True)
    
    return a_df  

In [5]:
def rename_columns(a_df):
    """ Returns dataframe with meaningful column names """    
    
    abbr_dict = {"GS": "games_started", "InnOuts": "inning_outs", "G": "games_played", "AB": "at_bats",\
                "R": "runs", "H": "total_hits", "1B": "singles", "2B": "doubles", "3B": "triples", "HR": "home_runs",\
                "RBI": "runs_batted_in", "SB": "stolen_bases", "CS": "caught_stealing", "BB": "base_on_balls",\
                "SO": "strike_outs", "IBB": "intentional_walks", "HBP": "hit_by_pitch", "SH": "sacrifice_hits",\
                "SF": "sacrifice_flies", "GIDP": "ground_into_double_play"}
    
    a_df = a_df.rename(columns=abbr_dict)
    
    return a_df   

In [6]:
def examine_dataset(a_df):
    """ Provides summary info and visualizations of dataset """
    
    print(a_df.info())
          
    a_df.hist(figsize = (15, 15))  

In [7]:
def round_salaries(a_df):
    """ Returns dataframe with salary column made into int and rounded """

    a_df["ADJ Salary"] = a_df["ADJ Salary"].astype("int").round()

    return a_df

In [8]:
def run_regression(a_df, salary_col_name="ADJ Salary"):
    """ Runs linear regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop([salary_col_name], axis=1)
    y = a_df[salary_col_name]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = LinearRegression().fit(X_train_scaled, y_train)
    
    predicted = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 
    
    # Score the model

    print(f"Linear Regression Training Data Score: {model.score(X_train_scaled, y_train)}")
    print(f"Linear Regression Testing Data Score: {model.score(X_test_scaled, y_test)}")  
    
    plt.bar(np.arange(len(model.coef_)), model.coef_)
    plt.title(f'Linear Regression coefficient plot')
    plt.show()  

In [9]:
def run_LASSO(a_df):
    """ Runs LASSO regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop(["log_of_salary"], axis=1)
    y = a_df["log_of_salary"]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    lasso_reg = Lasso().fit(X_train, y_train)

    predicted = lasso_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 

    # Score the model

    print(f"LASSO Regression Training Data Score: {lasso_reg.score(X_train_scaled, y_train)}")
    print(f"LASSO Regression Testing Data Score: {lasso_reg.score(X_test_scaled, y_test)}")
    
    plt.bar(np.arange(len(lasso_reg.coef_)), lasso_reg.coef_)
    plt.title(f'LASSO Regression coefficient plot')
    plt.show()  
    
    sel = SelectFromModel(lasso_reg)
    sel.fit(X_train_scaled, y_train)
    SelectFromModel(estimator=Lasso())
    
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
    
    scaler = StandardScaler().fit(X_selected_train)
    
    X_selected_train_scaled = scaler.transform(X_selected_train)
    X_selected_test_scaled = scaler.transform(X_selected_test)

    new_lasso_reg = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {new_lasso_reg.score(X_selected_test_scaled, y_test)}")

In [10]:
def run_Ridge(a_df):
    """ Runs Ridge regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop(["log_of_salary"], axis=1)
    y = a_df["log_of_salary"]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ridge_reg = Ridge().fit(X_train, y_train)

    predicted = ridge_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 

    # Score the model

    print(f"Ridge Regression Training Data Score: {ridge_reg.score(X_train_scaled, y_train)}")
    print(f"Ridge Regression Testing Data Score: {ridge_reg.score(X_test_scaled, y_test)}")

    plt.bar(np.arange(len(ridge_reg.coef_)), ridge_reg.coef_)
    plt.title(f'Ridge Regression coefficient plot')
    plt.show()     
    
    sel = SelectFromModel(ridge_reg)
    sel.fit(X_train_scaled, y_train)
    SelectFromModel(estimator=Ridge())
    
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
    
    scaler = StandardScaler().fit(X_selected_train)
    
    X_selected_train_scaled = scaler.transform(X_selected_train)
    X_selected_test_scaled = scaler.transform(X_selected_test)

    new_ridge_reg = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {new_ridge_reg.score(X_selected_test_scaled, y_test)}")

In [11]:
def run_ElasticNet(a_df):
    """ Runs ElasticNet regression on dataframe, prints model scores """

    # Assign X and y

    X = a_df.drop(["log_of_salary"], axis=1)
    y = a_df["log_of_salary"]
    
    # Split the data into X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a scaler to standardize the data

    scaler = StandardScaler()

    # Train the scaler with the X_train data.

    scaler.fit(X_train)

    # Transform X_train and X_test.

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    elasticnet_reg = ElasticNet().fit(X_train, y_train)
      
    predicted = elasticnet_reg.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    
    print(f"MSE: {mse}")
    print(f"R2: {r2}") 

    # Score the model

    print(f"ElasticNet Regression Training Data Score: {elasticnet_reg.score(X_train_scaled, y_train)}")
    print(f"ElasticNet Regression Testing Data Score: {elasticnet_reg.score(X_test_scaled, y_test)}")
    
    plt.bar(np.arange(len(elasticnet_reg.coef_)), elasticnet_reg.coef_)
    plt.title(f'ElasticNet Regression coefficient plot')
    plt.show()  
    
    sel = SelectFromModel(elasticnet_reg)
    sel.fit(X_train_scaled, y_train)
    SelectFromModel(estimator=ElasticNet())
    
    X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
    
    scaler = StandardScaler().fit(X_selected_train)
    
    X_selected_train_scaled = scaler.transform(X_selected_train)
    X_selected_test_scaled = scaler.transform(X_selected_test)

    new_elasticnet_reg = LinearRegression().fit(X_selected_train_scaled, y_train)
    print(f"New linear regression score: {new_elasticnet_reg.score(X_selected_test_scaled, y_test)}")

In [12]:
loaded_df = load_dataset()

In [13]:
loaded_df["ADJ Salary"] = loaded_df["ADJ Salary"].astype(int)

loaded_df = loaded_df.loc[~(loaded_df["ADJ Salary"] <= 0), :]

In [14]:
loaded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15022 entries, 0 to 15022
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  15022 non-null  int64  
 1   yearID      15022 non-null  int64  
 2   playerID    15022 non-null  object 
 3   salary      15022 non-null  int64  
 4   ADJ Salary  15022 non-null  int32  
 5   GS          15022 non-null  float64
 6   InnOuts     15022 non-null  float64
 7   PO          15022 non-null  int64  
 8   A           15022 non-null  int64  
 9   E           15022 non-null  float64
 10  DP          15022 non-null  int64  
 11  teamID      15022 non-null  object 
 12  lgID        15022 non-null  object 
 13  G           15022 non-null  int64  
 14  AB          15022 non-null  int64  
 15  R           15022 non-null  int64  
 16  H           15022 non-null  int64  
 17  2B          15022 non-null  int64  
 18  3B          15022 non-null  int64  
 19  HR          15022 non-nul

In [15]:
loaded_df.sample(100)

Unnamed: 0.1,Unnamed: 0,yearID,playerID,salary,ADJ Salary,GS,InnOuts,PO,A,E,DP,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
7184,7184,2000,offerjo01,5750000,8642063,102.0,2642.0,403,231,11.0,62,BOS,AL,116,451,73,115,14,3,9,41.0,0.0,8.0,70,70.0,0.0,1.0,2.0,3.0,9.0
8832,8832,2003,chaveen01,300000,421974,112.0,3100.0,279,9,3.0,2,MON,NL,141,483,66,121,25,5,5,47.0,18.0,7.0,31,59.0,3.0,0.0,9.0,3.0,7.0
12468,12468,2011,navardi01,1000000,1150583,46.0,1284.0,421,27,7.0,5,LAN,NL,64,176,13,34,6,1,5,17.0,0.0,0.0,20,35.0,4.0,1.0,3.0,2.0,3.0
3618,3618,1993,puckeki01,5300000,9492721,139.0,3667.0,312,13,2.0,2,MIN,AL,156,622,89,184,39,3,22,89.0,8.0,6.0,47,93.0,7.0,7.0,1.0,5.0,15.0
7793,7793,2001,aloumo01,5250000,7672262,130.0,3350.0,205,10,2.0,3,HOU,NL,136,513,79,170,31,1,27,108.0,5.0,1.0,57,57.0,14.0,3.0,0.0,8.0,18.0
8515,8515,2002,polanpl01,1750000,2517616,129.0,3610.0,120,313,9.0,47,PHI,NL,53,206,28,61,13,1,4,22.0,2.0,2.0,14,14.0,0.0,4.0,4.0,0.0,3.0
13908,13908,2014,mccanbr01,17000000,18585204,112.0,2976.0,959,71,3.0,11,NYA,AL,140,495,57,115,15,1,23,75.0,0.0,0.0,32,77.0,1.0,7.0,0.0,4.0,16.0
87,87,1985,oestero01,550000,1322918,148.0,3964.0,366,457,9.0,100,CIN,NL,152,526,59,155,26,3,1,34.0,5.0,0.0,51,65.0,17.0,0.0,2.0,5.0,13.0
4438,4438,1995,devermi01,1350000,2292617,88.0,2553.0,228,4,3.0,1,ATL,NL,29,55,7,14,3,0,1,8.0,2.0,0.0,2,11.0,0.0,0.0,0.0,0.0,1.0
14121,14121,2015,cunnito01,507500,554165,19.0,552.0,47,1,0.0,1,ATL,NL,39,86,13,19,4,0,0,4.0,2.0,1.0,5,17.0,1.0,2.0,0.0,0.0,1.0


In [None]:
clean_df = clean_dataset(loaded_df)

In [None]:
clean_df = rename_columns(clean_df)

In [None]:
clean_df.head(10)

In [None]:
examine_dataset(clean_df)

In [None]:
clean_df = round_salaries(clean_df)

In [None]:
clean_df["log_of_salary"] = np.log(clean_df["ADJ Salary"])

In [None]:
clean_df["log_of_salary"].describe()

In [None]:
run_regression(clean_df, "ADJ Salary")

In [None]:
run_regression(clean_df, "log_of_salary")

In [None]:
run_LASSO(clean_df)

In [None]:
run_Ridge(clean_df)

In [None]:
run_ElasticNet(clean_df)

# Results:

- Using the logarithm of the ADJ Salary column values improved the regression score
    - I had to eliminate any salary values <= 0 to make this work
- Ridge regression achieved the best score: 0.7291495360671353

# To Do:

- Add / remove features from the dataset?


# **LINEAR REGRESSION BLOCK END**

# **PCA START BLOCK**

# **PCA END BLOCK**
