In [None]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the comma-separated file into a NumPy array, then wrap it in a
# DataFrame whose three columns are named at construction time.
fname = "data2.txt"
A = np.loadtxt(fname, delimiter=",")

data = pd.DataFrame(A, columns=['test1', 'test2', 'released'])

# Summary of column dtypes and non-null counts.
data.info()

In [None]:
# Split the frame by position: the first two columns become the feature
# matrix and the third column becomes the target vector.
# `.iloc` replaces the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0 (where `data.ix` raises AttributeError).
X = data.iloc[:, :2].values
y = data.iloc[:, 2].values

In [None]:
# Scatter the rows of X by class of y: rows with y == 1 in green ("ok"),
# rows with y == 0 in red ("fail").
for class_value, colour, legend_name in ((1, 'green', 'ok'), (0, 'red', 'fail')):
    in_class = y == class_value
    plt.scatter(X[in_class, 0], X[in_class, 1], c=colour, label=legend_name)
plt.xlabel("test1")
plt.ylabel("test2")
plt.legend();

In [None]:
def plot_boundary(clf, X, y, grid_step=.01, poly_featurizer=None):
    """Draw the decision boundary of ``clf`` over the first two columns of ``X``.

    A regular grid is built over the bounding box of ``X[:, 0]`` and
    ``X[:, 1]`` (padded by 0.1 on every side), every grid point is classified
    with ``clf.predict`` and the predictions are drawn with ``plt.contour``.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict``.
    X : array-like indexable as ``X[:, 0]`` / ``X[:, 1]``
        Points whose extent defines the plotted region.
    y : any
        Unused; kept so existing calls keep working.
    grid_step : float, default 0.01
        Spacing between neighbouring grid points along both axes.
    poly_featurizer : object or None, default None
        Optional transformer exposing ``transform``; when given, grid points
        are passed through it before prediction. When ``None`` the raw grid
        coordinates are passed to ``clf.predict`` directly.

    Returns
    -------
    None
    """
    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_step),
                         np.arange(y_min, y_max, grid_step))

    # One row per grid point: (x coordinate, y coordinate).
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # The featurizer is optional; previously the default of None crashed here
    # with AttributeError because `.transform` was called unconditionally.
    if poly_featurizer is not None:
        grid_points = poly_featurizer.transform(grid_points)

    # Fold the flat predictions back into the grid layout expected by contour.
    Z = np.asarray(clf.predict(grid_points)).reshape(xx.shape)
    plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

In [None]:
# Build the degree-10 polynomial feature expansion of X. Fitting and
# transforming are written as two explicit steps on the same featurizer.
poly = PolynomialFeatures(degree=10).fit(X)
X_poly = poly.transform(X)

In [None]:
C = 1  # inverse regularisation strength: C = 1 / lambda
logit = LogisticRegression(C=C, n_jobs=-1, random_state=17)
logit.fit(X_poly, y)

# Draw the decision boundary first, then the training points on top of it.
plot_boundary(logit, X, y, grid_step=.01, poly_featurizer=poly)

plt.scatter(X[y == 1, 0], X[y == 1, 1], c='green', label='ok')
plt.scatter(X[y == 0, 0], X[y == 0, 1], c='red', label='fail')
# Both axes use the same "test N" spelling (the x label used to read "test1"
# next to a y label of "test 2").
plt.xlabel("test 1")
plt.ylabel("test 2")
plt.legend();

# score() returns mean accuracy as a fraction in [0, 1], so the message names
# it an accuracy instead of a percentage.
print("Accuracy of the classifier on the training set:",
      round(logit.score(X_poly, y), 3))

In [None]:
C = 220  # large C means weak regularisation (C = 1 / lambda)
logit = LogisticRegression(C=C, n_jobs=-1, random_state=17)
logit.fit(X_poly, y)

# Draw the decision boundary first, then the training points on top of it.
plot_boundary(logit, X, y, grid_step=.005, poly_featurizer=poly)

plt.scatter(X[y == 1, 0], X[y == 1, 1], c='green', label='ok')
plt.scatter(X[y == 0, 0], X[y == 0, 1], c='red', label='fail')
plt.xlabel("test 1")
plt.ylabel("test 2")
plt.legend();

# score() returns mean accuracy as a fraction in [0, 1], so the message names
# it an accuracy instead of a percentage.
print("Accuracy of the classifier on the training set:",
      round(logit.score(X_poly, y), 3))

In [None]:
# Five shuffled, stratified folds with a fixed seed for the cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

# 500 candidate values of C, log-spaced from 10**-2 to 10**3.
c_values = np.logspace(-2, 3, num=500)

logit_searcher = LogisticRegressionCV(
    Cs=c_values,
    cv=skf,
    verbose=1,
    n_jobs=-1,
)
logit_searcher.fit(X_poly, y)

In [None]:
# Largest single entry of the score grid stored under class label 1.
# NOTE(review): this is the maximum over individual grid entries, not the
# maximum of the axis-0 mean that the later cells plot — confirm which one
# is intended.
np.max(logit_searcher.scores_[1])

In [None]:
# Average the stored scores along axis 0 and plot one value per candidate C.
mean_cv_scores = np.mean(logit_searcher.scores_[1], axis=0)
plt.plot(c_values, mean_cv_scores)
# Label the axes so the figure can be read on its own; LogisticRegressionCV
# scores with accuracy when no `scoring` argument is given.
plt.xlabel("C")
plt.ylabel("Mean CV accuracy");

In [None]:
# Same averaged curve as before, zoomed to the C range 50-300.
mean_cv_scores = np.mean(logit_searcher.scores_[1], axis=0)
plt.plot(c_values, mean_cv_scores)
# Label the axes so the figure can be read on its own; LogisticRegressionCV
# scores with accuracy when no `scoring` argument is given.
plt.xlabel("C")
plt.ylabel("Mean CV accuracy")
plt.xlim(50, 300);