In [40]:
import numpy as np
from numpy import mean,std
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,train_test_split,cross_val_score,cross_validate

In [41]:
# Read the pre-processed feature/target CSVs. Each file has a header row, and
# its first column is used as the row index.
csv_options = {'header': 0, 'index_col': 0}

# Training data: split further into train/test partitions later on.
X = pd.read_csv('../data/processed/X_train', **csv_options)
y = pd.read_csv('../data/processed/y_train', **csv_options)

# Presumably the final hold-out set, kept apart from model development — TODO confirm.
X_final_test = pd.read_csv('../data/processed/X_final_test', **csv_options)
y_final_test = pd.read_csv('../data/processed/y_final_test', **csv_options)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits
# Shapes are reported train pair first, then test pair.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

# Feature Scaling

In [43]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
for scaled_split in (X_train, X_test):
    print(scaled_split)

# LogisticRegression Model Building

In [None]:
# Train a baseline logistic regression on the scaled training split.
# y_train is a DataFrame (presumably a single target column), so it is flattened
# to 1-D here; passing the column vector as-is makes scikit-learn emit a
# DataConversionWarning.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, np.ravel(y_train))
# Predict labels for the held-out test split.
y_pred = classifier.predict(X_test)
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred)*100,'%')
print(classification_report(y_test,y_pred))

# K-Fold Cross-Validation

In [None]:
# Mean 10-fold cross-validated accuracy of a default logistic regression on the
# training split. Shuffling with a fixed seed keeps the fold assignment reproducible.
cv = KFold(n_splits=10, random_state=42, shuffle=True)
model = LogisticRegression()
# Flatten the target to 1-D so that no fold triggers a DataConversionWarning.
cv_accuracy = cross_val_score(model, X_train, np.ravel(y_train), scoring='accuracy', cv=cv, n_jobs=-1)
print(cv_accuracy.mean())
# NOTE(review): X_train was standardised before this cell, so every validation
# fold has already contributed to the scaler statistics. Wrapping the scaler and
# the model in a Pipeline would remove that (small) leakage.

In [None]:
# Error of the default logistic regression expressed as RMSE over the class labels.
# NOTE(review): RMSE treats the labels as numbers; for a classifier, accuracy or
# log-loss is usually the more meaningful metric — confirm this is what is wanted.
import sklearn.base

model = sklearn.base.clone(model)
y_train_1d = np.ravel(y_train)  # 1-D target avoids the column-vector warning

# Cross-validated RMSE on the training split. This defines `scores`, which the
# last print relies on. scikit-learn negates error scorers so that "greater is
# better", hence the minus sign when reporting.
scores = cross_validate(model, X_train, y_train_1d, scoring='neg_root_mean_squared_error', cv=cv)

model.fit(X_train, y_train_1d)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is not accepted by recent scikit-learn releases.
print("Test set RMSE:", np.sqrt(mean_squared_error(np.ravel(y_test), model.predict(X_test))))
print("Mean validation RMSE:", -scores["test_score"].mean())

# Hyperparameter tuning for logistic regression model
Here, we use grid search for hyperparameter tuning. The following hyperparameters are chosen for tuning:
solver [‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’]
We try five of the solvers available in sklearn. Each solver is an optimization algorithm used to find the parameter weights that minimize the cost function.
penalty [‘none’, ‘l1’, ‘l2’, ‘elasticnet’]
Regularization addresses the problem of over-fitting by penalizing large parameter values. We try four penalty settings; note that not every solver supports every penalty.
C [100, 10, 1.0, 0.1, 0.01]
The C parameter controls the penalty strength (it is the inverse of the regularization strength, so smaller values regularize more), which can also affect model performance.
max_iter [20, 50, 100, 200, 500, 1000]
We try a range of values for max_iter to find out how many iterations the solvers need in order to converge.

In [49]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive grid search over solver, penalty, C and max_iter for logistic regression.
# define models and parameters
model = LogisticRegression()
solvers = ['lbfgs','newton-cg','liblinear','sag','saga']
# None (not the string 'none') requests an unpenalised model: the string form was
# deprecated in scikit-learn 1.2 and later removed.
penalty = ['l1', 'l2', 'elasticnet', None]
c_values = [100, 10, 1.0, 0.1, 0.01]
max_iteration = [20, 50, 100, 200, 500, 1000]
# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c_values, max_iter=max_iteration)
cv = KFold(n_splits=10, random_state=42, shuffle=True)
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Not every solver supports every penalty. With error_score=0, a combination
# whose fit fails is recorded with accuracy 0 instead of aborting the search.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
# Flatten the target to 1-D so the fits do not emit a DataConversionWarning.
grid_result = grid_search.fit(X_train, np.ravel(y_train))
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variables are deliberately not called `mean`/`std`: those names are
# imported from numpy at the top of the notebook and would be overwritten.
for mean_score, std_score, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, std_score, param))

# Result Summary:

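First, our logistic regression model reaches 77.08% accuracy on the held-out test split created with the train_test_split method.
Secondly, we apply 10-fold cross-validation to the model on the training split and find a mean accuracy of 82.57%.
Finally, we tune the model's hyperparameters with grid search and find a best cross-validated accuracy of 84.71%, which is an improvement over the previous results. Note that the last two figures are cross-validation scores computed on the training split, so they are not directly comparable with the 77.08% measured on the test split.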