In [None]:
# Import dependencies

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize

# Configure how much of a dataframe pandas prints before truncating.
DISPLAY_OPTIONS = {
    "display.max_rows": 500,
    "display.max_columns": 500,
    "display.width": 1000,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [None]:
def load_dataset(filepath="Hitters_Adjusted_Salary.csv"):
    """Read a CSV file into a pandas dataframe.

    Args:
        filepath: Path of the CSV file to read. Defaults to
            "Hitters_Adjusted_Salary.csv", so existing calls without an
            argument keep loading the same file as before.

    Returns:
        pandas.DataFrame holding the contents of the file.
    """
    # low_memory=False makes pandas read the file in one pass instead of in
    # chunks, so each column's dtype is inferred from the full column.
    return pd.read_csv(filepath, encoding="utf-8", low_memory=False)

In [None]:
def clean_dataset(a_df):
    """Return a deduplicated, NaN-free copy of ``a_df`` without the unused columns.

    Steps, in order: drop exact duplicate rows, drop every row containing a
    missing value (checked across all original columns), remove the columns
    listed in ``unused_columns``, then renumber the index from 0.
    The input dataframe is not modified.
    """
    unused_columns = [
        "Unnamed: 0",
        "yearID",
        "playerID",
        "teamID",
        "lgID",
        "salary",
        "PO",
        "A",
        "E",
        "DP",
    ]

    return (
        a_df
        .drop_duplicates()
        .dropna()
        .drop(columns=unused_columns)
        .reset_index(drop=True)
    )

In [None]:
def rename_columns(a_df):
    """Return a copy of ``a_df`` whose abbreviated column headers are spelled out.

    Columns that do not appear in the mapping keep their current name, and
    mapping entries without a matching column are ignored.
    """
    readable_names = {
        "GS": "games_started",
        "InnOuts": "inning_outs",
        "G": "games_played",
        "AB": "at_bats",
        "R": "runs",
        "H": "hits",
        "2B": "doubles",
        "3B": "triples",
        "HR": "home_runs",
        "RBI": "runs_batted_in",
        "SB": "stolen_bases",
        "CS": "caught_stealing",
        "BB": "base_on_balls",
        "SO": "strike_outs",
        "IBB": "intentional_walks",
        "HBP": "hit_by_pitch",
        "SH": "sacrifice_hits",
        "SF": "sacrifice_flies",
        "GIDP": "ground_into_double_play",
    }

    return a_df.rename(columns=readable_names)

In [None]:
def examine_dataset(a_df):
    """Print summary information and draw exploratory plots for the dataset.

    Expects a dataframe that contains an "ADJ Salary" column and the
    spelled-out column names produced by ``rename_columns`` ("hits",
    "runs_batted_in", "stolen_bases", "runs", "ground_into_double_play").

    Args:
        a_df: Dataframe to inspect. It is only read, never modified.

    Returns:
        None. Output is printed and plotted as a side effect.
    """
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    a_df.info()

    print(f'\n\nADJ SALARY VALUE COUNTS: \n {a_df["ADJ Salary"].value_counts()}\n\n')

    # One histogram per column.
    a_df.hist(figsize=(15, 15))

    # Pairwise scatter plots (upper triangle only) for a hand-picked subset of columns.
    pair_columns = ["hits", "runs_batted_in", "stolen_bases", "runs", "ground_into_double_play"]
    sns.PairGrid(a_df[pair_columns]).map_upper(plt.scatter)

In [None]:
def scale_dataset(a_df):
    """Split off the target column and build scaled and normalized feature sets.

    Args:
        a_df: Dataframe containing an "ADJ Salary" column; every other column
            is treated as a feature and must be numeric for scikit-learn.

    Returns:
        A 4-tuple ``(no_target_df, scaled_data, normalized_data, salary_labels)``:
        the feature dataframe without "ADJ Salary", the features after
        ``StandardScaler.fit_transform``, the features after
        ``sklearn.preprocessing.normalize``, and the "ADJ Salary" column.
    """
    salary_labels = a_df["ADJ Salary"]
    no_target_df = a_df.drop(columns=["ADJ Salary"])

    # Standardize each feature column using statistics from this same data.
    scaled_data = StandardScaler().fit_transform(no_target_df)
    print(f"SHAPE AFTER SCALING: {scaled_data.shape}")

    # normalize is sklearn.preprocessing.normalize (imported at the top of the
    # notebook); it is applied to the unscaled features, not to scaled_data.
    normalized_data = normalize(no_target_df)
    print(f"SHAPE AFTER NORMALIZING: {normalized_data.shape}")

    return no_target_df, scaled_data, normalized_data, salary_labels

In [None]:
# Read the raw CSV into a dataframe.
loaded_df = load_dataset()

In [None]:
# Remove duplicate rows, rows with missing values, and the unused columns.
clean_df = clean_dataset(loaded_df)

In [None]:
# Replace the abbreviated column headers with spelled-out names.
clean_df = rename_columns(clean_df)

In [None]:
# Preview the first 10 rows of the cleaned, renamed dataframe.
clean_df.head(10)

In [None]:
# Print summary info and draw the exploratory plots.
examine_dataset(clean_df)

In [None]:
# Round to the nearest whole number *before* casting to int: casting first
# truncates the fractional part and leaves round() with nothing to do.
clean_df["ADJ Salary"] = clean_df["ADJ Salary"].round().astype("int")

In [None]:
# Show the target column after the integer conversion.
clean_df["ADJ Salary"]

# Fit model

In [None]:
# Separate the target (y) from the feature columns (X).

y = clean_df["ADJ Salary"]
X = clean_df.drop(columns=["ADJ Salary"])

In [None]:
# Split the data into X_train, X_test, y_train, y_test.
# random_state=0 pins the shuffle so the same split is produced on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Create the standardizer and fit it on the training split only, so the
# scaling statistics are not influenced by the test split.

scaler = StandardScaler().fit(X_train)

# Display the fitted scaler as the cell output.
scaler

In [None]:
# Apply the already-fitted scaler to both the training and the test features.

X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Fit a linear regression on the standardized training features.
model = LinearRegression().fit(X_train_scaled, y_train)

In [None]:
# Score the model on both splits; LinearRegression.score reports R^2.

train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)

print(f"Linear Regression Training Data Score: {train_score}")
print(f"Linear Regression Testing Data Score: {test_score}")