## Project Description

## Notebook Description

### Initialize packages and read in pickled data

In [2]:
# One-time environment setup, kept commented out.
# Uncomment a line below to install a package that is missing from the kernel.
# ! pip install scrapy
# ! pip install psycopg2
# ! pip install sqlalchemy
# ! pip install missingno --quiet
# ! pip install scipy

In [3]:
% run __init__.py

In [4]:
cd ..

/home/jovyan


In [5]:
# Load the pickled modelling dataframe; the relative path is resolved against
# the current working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
df_model = pd.read_pickle('data/df_model.p')

In [6]:
df_model.shape

(127052, 88)

### Set up target and predictors

In [7]:
# Remove the player identifier column before splitting into target and predictors.
# Reassigning (instead of inplace=True) and ignoring an already-missing column
# keeps this cell safe to re-run without a KeyError.
df_model = df_model.drop('player_id', axis=1, errors='ignore')

In [8]:
# Separate the label column from the feature columns; df_model is left unchanged.
predictors = df_model.drop('hit_flag', axis=1)
target = df_model.loc[:, 'hit_flag']

Box-Cox requires strictly positive values, so I'll start this workflow by rescaling my data with a `MinMaxScaler` whose lower bound sits just above zero.

### `MinMaxScaler`

In [9]:
# Work on a copy of the feature frame; `predictors` itself is left as-is.
df_model_proc_all = predictors.copy()

In [10]:
# The lower bound is a tiny positive number rather than 0 so that every scaled
# value is strictly positive, which the Box-Cox step that follows requires.
min_max = MinMaxScaler(feature_range=(1E-10,1))

In [11]:
# Fit the scaler and wrap the resulting array back into a dataframe that keeps
# the original row index and column labels.
df_model_mm = pd.DataFrame(
    data=min_max.fit_transform(df_model_proc_all),
    columns=df_model_proc_all.columns,
    index=df_model_proc_all.index,
)

### Skew-Normalize Features

#### `box_cox`

In [12]:
def box_cox(predictors):
    '''Apply a Box-Cox power transform to every column of a dataframe.

    Each column is transformed independently; ``boxcox`` is called without a
    fixed lambda, so it fits one per column. The fitted lambdas are discarded.

    Parameters
    ----------
    predictors : pandas.DataFrame
        Numeric feature columns. Box-Cox needs strictly positive input, so
        rescale beforehand if the data can contain zeros or negatives.

    Returns
    -------
    pandas.DataFrame
        Transformed values with the same index and column order as
        ``predictors``. A frame without columns yields an empty frame that
        still carries the original index.
    '''
    # Gather the transformed columns first and build the frame in one go:
    # this avoids growing a dataframe column by column and keeps the original
    # index attached from the start instead of re-indexing afterwards.
    transformed = {}
    for col in predictors.columns:
        values, _lmbda = boxcox(predictors[col])
        transformed[col] = values

    # Passing `columns` pins the column order to that of the input frame.
    return pd.DataFrame(transformed,
                        index=predictors.index,
                        columns=predictors.columns)

In [13]:
# Box-Cox every min-max-scaled column.
# NOTE(review): this includes the zone_* / full_pitch_* / pitch_rollup_* indicator
# columns, whose transformed values reach extreme magnitudes (see the head() output
# further down) -- consider restricting the transform to the continuous columns.
df_model_skewnorm = box_cox(df_model_mm)

  llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. / N, axis=0))


In [14]:
df_model_skewnorm.head(3)

Unnamed: 0_level_0,mph,ev_mph,dist,spin_rate,launch_angle,zone_1.0,zone_11.0,zone_12.0,zone_13.0,zone_14.0,...,full_pitch_Knuckle-curve,full_pitch_Knuckleball,full_pitch_Pitch out,full_pitch_Screwball,full_pitch_Slider,full_pitch_Two-Seam Fastball,full_pitch_Unidentified,pitch_rollup_fastball,pitch_rollup_offspeed,pitch_rollup_other
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
434378-8,-0.18044,-0.090954,-0.468592,-0.302719,-0.225151,-112843700.0,-1165759000.0,-14457840000.0,-183800400.0,-13685550.0,...,-1.275003e+21,-3.990265e+98,-1.341265e+154,-1.340852e+154,0.0,-1496.250076,-5.213913e+128,-10.838925,0.0,-6.834172e+115
434378-14,-0.116816,-0.191493,-0.571264,-0.328504,-0.204774,0.0,-1165759000.0,-14457840000.0,-183800400.0,-13685550.0,...,-1.275003e+21,-3.990265e+98,-1.341265e+154,-1.340852e+154,-2602.974371,-1496.250076,-5.213913e+128,0.0,-65.018139,-6.834172e+115
434378-16,-0.161768,-0.261724,-0.721032,-0.320188,-0.241653,-112843700.0,-1165759000.0,-14457840000.0,-183800400.0,-13685550.0,...,-1.275003e+21,-3.990265e+98,-1.341265e+154,-1.340852e+154,0.0,-1496.250076,-5.213913e+128,-10.838925,0.0,-6.834172e+115


### Standardize Features

**`StandardScaler()`**

In [15]:
# Standardize the Box-Cox output, then restore the row index and column labels
# that fit_transform drops when it returns a plain array.
standardized = StandardScaler().fit_transform(df_model_skewnorm)
df_standardized = pd.DataFrame(
    data=standardized,
    index=df_model_skewnorm.index,
    columns=df_model_skewnorm.columns,
)

In [16]:
df_standardized.shape

(127052, 86)

In [17]:
target.shape

(127052,)

### Feature Selection

#### `SelectFromModel` with L1 penalty estimator

In [19]:
# Select features whose L1-penalised logistic-regression coefficient magnitude is
# at least the mean across features (threshold='mean').
# The solver is pinned to 'liblinear' (the one shown in the fitted estimator's repr)
# because newer scikit-learn versions default to a solver that rejects penalty='l1'.
sfm = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'), threshold='mean')
sfm.fit(df_standardized, target)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        prefit=False, threshold='mean')

In [20]:
# np.where on the 1-D support mask returns a 1-tuple; unpack it to get the
# positional indices of the selected features.
(sfm_feats,) = np.where(sfm.get_support())
sfm_feats

array([ 1,  2,  4,  8,  9, 18])

In [22]:
# Translate the selected positional indices into their column labels.
columns = list(df_standardized.columns)

sfm_feats_names = [columns[idx] for idx in sfm_feats]

sfm_feats_names

['ev_mph', 'dist', 'launch_angle', 'zone_13.0', 'zone_14.0', 'zone_unknown']

### Create dataframe with only selected features

In [23]:
# Keep only the columns chosen by SelectFromModel.
df_slim = df_standardized[sfm_feats_names]

In [24]:
df_slim.shape, target.shape

((127052, 6), (127052,))

### K Neighbors Classifier - Cross Validation

In [31]:
# Default-parameter classifiers compared with 10-fold cross-validation in the
# cells that follow.
knn = KNeighborsClassifier()
logreg = LogisticRegression()
# Seed the tree so its cross-validation score is repeatable between runs.
dtree = DecisionTreeClassifier(random_state=42)

In [26]:
# Mean 10-fold cross-validated accuracy for K Neighbors on the selected features.
# NOTE(review): the scalers, Box-Cox transform and feature selection earlier in the
# notebook were fit on the full dataset, so the folds are not fully independent of
# the preprocessing.
scores = cross_val_score(estimator=knn, X=df_slim, y=target, scoring='accuracy', cv=10)
scores.mean()

0.80237220839127377

### Logistic Regression - Cross Validation

In [32]:
# Mean 10-fold cross-validated accuracy for logistic regression on the selected features.
logreg_scores = cross_val_score(estimator=logreg, X=df_slim, y=target, scoring='accuracy', cv=10)
logreg_scores.mean()

0.74770958453183334

### Decision Tree Classifier - Cross Validation

In [33]:
# Mean 10-fold cross-validated accuracy for the decision tree on the selected features.
dtree_scores = cross_val_score(estimator=dtree, X=df_slim, y=target, scoring='accuracy', cv=10)
dtree_scores.mean()

0.76027135491869713

After all preprocessing steps, cross-validation confirms our findings from single runs through our models: K Neighbors is the best performer for this data, with a mean 10-fold accuracy of about 0.80 versus about 0.75 for Logistic Regression and 0.76 for the Decision Tree. For the remaining notebooks up until PowerHouse_Models, only K Neighbors will be used.