# Hyper Parameter Optimaztion

In [25]:
import os
import numpy as np
import pandas as pd
%matplotlib inline

In [26]:
data_path = os.path.join(os.pardir, 'data', 'processed')

test_data_path = os.path.join(data_path, 'test.csv')
train_data_path = os.path.join(data_path, 'train.csv')

In [27]:
test_df = pd.read_csv(test_data_path, index_col="PassengerId")
train_df = pd.read_csv(train_data_path, index_col="PassengerId")

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
X = train_df.loc[:, :"AgeState_Child"].as_matrix().astype('float')
Y = train_df.Survived.ravel()

In [30]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, random_state=0)

In [31]:
print("Shape of Input(training) : {}".format(train_X.shape))
print("Shape of Output(training) : {}".format(train_Y.shape))
print("Shape of Input(testing) : {}".format(test_X.shape))
print("Shape of Output(testing) : {}".format(test_Y.shape))

Shape of Input(training) : (712, 23)
Shape of Output(training) : (712,)
Shape of Input(testing) : (179, 23)
Shape of Output(testing) : (179,)


In [32]:
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression(random_state=0)

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
parameters = {'C':[1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty':['l1', 'l2']}

# cv= K-fold, in this case k = 3
# C = lambda in regularization
grid_search = GridSearchCV(model_LR, param_grid=parameters, cv=3)

In [35]:
grid_search.fit(train_X, train_Y)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
grid_search.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [37]:
print("Best score is : {}".format(grid_search.best_score_))

Best score is : 0.824438202247191


In [38]:
print("Logistic Regression Testing Score : {}".format(grid_search.score(test_X, test_Y)))

Logistic Regression Testing Score : 0.8268156424581006


In [39]:
def get_submission_file(model, filename):
    # create a test matrix with float values
    test_X = test_df.as_matrix().astype("float")
    print(test_X.shape)
    # make prediction on the model
    predictions = model.predict(test_X)
    # create a dataframe to submit
    df_submit = pd.DataFrame({'PassengerId':test_df.index, 'Survived': predictions})
    # define submit path and save the data frame in csv format
    submit_path = os.path.join(os.pardir, "data", "external")
    submit_file_path = os.path.join(submit_path, filename)
    # index=False --> so that no extra column is added
    df_submit.to_csv(submit_file_path, index=False)

In [40]:
get_submission_file(grid_search, '03_LR_GridSearch.csv')

(418, 23)


## Feature Normalization & Feature Standardization

In [41]:
# Feature Standardization is similar to feature normalization.
# In this case, instead of scaling our feature, e scale our distiribution of each feature
# such that each one has a mean = 0, var = 1.0

from sklearn.preprocessing import MinMaxScaler, StandardScaler

### Feature Normalization

In [42]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(train_X)

In [43]:
# Check min and max value for column 1
X_train_scaled[:, 0:].min(), X_train_scaled[:, 0:].max()

(0.0, 1.0)

In [44]:
# Scale the test data
X_test_scaled = scaler.fit_transform(test_X)

### Feature Standardization

In [45]:
distro_scaler = StandardScaler()

X_train_scaled = distro_scaler.fit_transform(train_X)
X_test_scaled = distro_scaler.fit_transform(test_X)

#### Create Model after standardization

In [46]:
model_LR = LogisticRegression(random_state=0)

parameters = {'C':[1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty':['l1', 'l2']}

# cv= K-fold, in this case k = 3
# C = lambda in regularization
grid_search = GridSearchCV(model_LR, param_grid=parameters, cv=3)

grid_search.fit(train_X, train_Y)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [47]:
grid_search.best_score_

0.824438202247191

In [48]:
print("Logistic Regression Testing Score : {}".format(grid_search.score(test_X, test_Y)))

Logistic Regression Testing Score : 0.8268156424581006


# Model Persistence

In [49]:
import pickle

In [50]:
model_file_path = os.path.join(os.pardir, 'models', 'LR_MODEL.pkl')

In [51]:
model_file_pickel = open(model_file_path, 'wb')

In [52]:
pickle.dump(grid_search, model_file_pickel)

In [53]:
model_file_pickel.close()

## Check Persisted file

In [54]:
model_file_pickel = open(model_file_path, 'rb')

grid_search_loaded = pickle.load(model_file_pickel)

model_file_pickel.close()

In [55]:
grid_search_loaded

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [56]:
print("Logistic Score : {}".format(grid_search_loaded.score(test_X, test_Y)))

Logistic Score : 0.8268156424581006


# -------> The End <--------