In [None]:
# Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Widen pandas' display limits so large frames are shown with less truncation:
# up to 500 rows, up to 500 columns, and a 1000-character output width.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [None]:
# Silence every warning category for the remainder of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and data-quality
# warnings raised by the libraries used below — consider narrowing the filter.
import warnings

warnings.filterwarnings(action='ignore')

In [None]:
# Load the dataset CSV from disk into a pandas DataFrame.
filepath = "cleaned_hitter.csv"

# low_memory=False has pandas read the file in one pass instead of in chunks,
# which avoids mixed-type column inference.
df = pd.read_csv(
    filepath,
    low_memory=False,
    encoding="utf-8",
)

df.head()

In [None]:
# Produce a deduplicated, NA-free frame with a fresh index.
#   1. drop rows that are exact duplicates of an earlier row
#   2. drop rows that contain any missing value
#   3. renumber the index 0..n-1 — without this step the index keeps the gaps
#      left by the removed rows, so the frame is not actually "index-reset"
df = (
    df.drop_duplicates()
      .dropna()
      .reset_index(drop=True)
)

df.head()

In [None]:
# Swap the abbreviated column headers for descriptive snake_case names.
abbr_dict = {
    "G": "games_played",
    "AB": "at_bats",
    "R": "runs",
    "H": "hits",
    "RBI": "runs_batted_in",
    "yearID": "year",
    "teamID": "team",
    "playerID": "player_id",
}

# Columns that are not keys of abbr_dict keep their current names.
df = df.rename(columns=abbr_dict)

df.head()

In [None]:
# Store the adjusted salary as a whole number, rounded to the nearest unit.
# Round BEFORE casting: astype("int") truncates the fractional part, so casting
# first would turn e.g. 1999999.9 into 1999999 and leave round() with nothing
# to do.
df["ADJ Salary"] = df["ADJ Salary"].round().astype("int")

df.head()

In [None]:
# Summary statistics of the numeric columns, shown as whole numbers.
# Round first and cast second; casting to int first truncates the decimals
# (e.g. a mean of 2.9 would display as 2) and makes round() a no-op.
df.describe().round().astype("int")

In [None]:
# Pull out the rows whose adjusted salary is exactly 0 so they can be inspected.
zero = df[df["ADJ Salary"].eq(0)]
zero

In [None]:
# Remove every row whose adjusted salary is 0.
# The result is reassigned instead of using inplace=True, matching the
# `df = df.<step>()` pattern of the other cleaning cells.
df = df.drop(index=df.loc[df["ADJ Salary"] == 0].index)

In [None]:
# Show up to the first 3395 rows of the frame.
# NOTE(review): 3395 is a hard-coded count — presumably the row total when the
# notebook was written; confirm, or inspect df.shape instead.
df.iloc[:3395]

In [None]:
# Summary statistics of the numeric columns, shown as whole numbers.
# Round first and cast second; casting to int first truncates the decimals
# (e.g. a mean of 2.9 would display as 2) and makes round() a no-op.
df.describe().round().astype("int")

In [None]:
# Rows with an adjusted salary strictly below 1,000,000.
bin_one = df[df["ADJ Salary"].lt(1000000)]

# Number of non-null values in each column of that subset.
bin_one.count()

In [None]:
# Bin edges for grouping rows by ADJ Salary, in ascending order.
# Five edges define four consecutive intervals.
# NOTE(review): the top edge is capped at 40,000,000 — a salary above it would
# fall outside every interval; confirm against the data's maximum.
bins = [
    0,
    999999,
    5999999,
    10000000,
    40000000,
]

# One display label per interval, in the same order as the intervals.
# NOTE(review): the second interval runs up to 5,999,999 although its label
# reads "1 mill to 5 mill" — confirm the intended wording.
group_labels = [
    "< 1 mill",
    "1 mill to 5 mill",
    "6 mill to 10 mill",
    "> 10 mill",
]

In [None]:
# Preview the salary group that each of the first few rows falls into.
pd.cut(
    x=df["ADJ Salary"],
    bins=bins,
    labels=group_labels,
).head()

In [None]:
# Attach each row's salary group as a new labelled (categorical) column.
# A salary lying outside the outermost bin edges gets a missing value here.
df["ADJ Salary Group"] = pd.cut(
    x=df["ADJ Salary"],
    bins=bins,
    labels=group_labels,
)
df.head()

In [None]:
# Group the rows by their salary bin.
salary_group = df.groupby(by="ADJ Salary Group")

# Report the number of non-null player_id entries in each bin.
print(
    salary_group["player_id"].count()
)

