In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, SGDClassifier

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [None]:
y = all_banks_2010Q4_df['Target']
X = all_banks_2010Q4_df.drop(['Target', 'AsOfDate'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [None]:
# Logistic regression on the standardized training data. C is scikit-learn's
# inverse regularization strength, so C=1000 means very weak regularization.
log_reg = LogisticRegression(
    random_state=1,
    C=1000.0,
)
log_reg.fit(X_train_std, y_train)

In [None]:
# Inspect the coefficients learned by the fitted logistic regression.
log_reg.coef_

In [None]:
# Score of the logistic regression on the (standardized) training split.
log_reg.score(X_train_std, y_train)

In [None]:
# Score of the logistic regression on the held-out (standardized) test split.
log_reg.score(X_test_std, y_test)

In [None]:
# Regularization path: refit the logistic regression for a range of C values
# and record the learned coefficients for each fit.
weights, params = [], []
for c in np.arange(0.01, 10):  # exponents 0.01, 1.01, ..., 9.01
    c_value = 2**c
    lr = LogisticRegression(C=c_value, random_state=0, solver='lbfgs')
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[0])  # first row of the coefficient array
    # Record the C that was actually used for this fit. (Previously 10**c was
    # stored here while the model was fit with 2**c, so the x-axis of the plot
    # below did not match the models it described.)
    params.append(c_value)

weights = np.array(weights)

# Plot the first three coefficients against C on a log-scaled x-axis.
# (pyplot is already imported as plt in the notebook's import cell.)
plt.plot(params, weights[:, 0], color='blue', marker='x', label='first')
plt.plot(params, weights[:, 1], color='green',  marker='o', label='second')
plt.plot(params, weights[:, 2], color='r',  marker='>', label='third')
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.legend(loc='right')
plt.xscale('log')
plt.show()

In [None]:

# Second model for comparison: an SGD-trained linear classifier with default
# settings (seed fixed for reproducibility), fitted on the standardized training data.
sgd_clf = SGDClassifier(random_state=1)
sgd_clf.fit(X_train_std, y_train)

In [None]:
# Score of the SGD classifier on the (standardized) training split.
sgd_clf.score(X_train_std, y_train)

In [None]:
# Score of the SGD classifier on the held-out (standardized) test split.
sgd_clf.score(X_test_std, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the SGD classifier on the training split.
cross_val_score(sgd_clf, X_train_std, y_train, cv=3, scoring='accuracy')

In [None]:
# 3-fold cross-validated accuracy of the logistic regression on the training split.
cross_val_score(log_reg, X_train_std, y_train, cv=3, scoring='accuracy')

In [None]:
# Accuracy is generally not a good performance measure for classifiers, especially when dealing with
# skewed datasets (where some classes are much more frequent than others) -- check with the confusion matrix below:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
# Out-of-fold predictions (3-fold CV) for every training row, compared with the true labels.
sgd_y_train_pred = cross_val_predict(sgd_clf, X_train_std, y_train, cv=3)
confusion_matrix(y_train, sgd_y_train_pred)

In [None]:
# Same terrible results for logistic regression:
# out-of-fold predictions (3-fold CV) compared with the true training labels.
log_y_train_pred = cross_val_predict(log_reg, X_train_std, y_train, cv=3)
confusion_matrix(y_train, log_y_train_pred)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Precision of the SGD classifier, computed from its cross-validated predictions.
precision_score(y_train, sgd_y_train_pred) #SGD precision

In [None]:
# Precision of the logistic regression, computed from its cross-validated predictions.
precision_score(y_train, log_y_train_pred) #Logit precision

In [None]:
recall_score(y_train, sgd_y_train_pred) #SGD recall

In [None]:
# Recall of the logistic regression, computed from its cross-validated predictions.
recall_score(y_train, log_y_train_pred) #Logit recall

In [None]:
# F1 score of the logistic regression, computed from its cross-validated predictions.
f1_score(y_train, log_y_train_pred) #Logit F1

In [None]:
# Precision/Recall tradeoff
from sklearn.metrics import precision_recall_curve

# Get out-of-fold decision scores (rather than hard class predictions) so that
# precision and recall can be evaluated at every candidate threshold.
sgd_y_scores = cross_val_predict(sgd_clf, X_train_std, y_train, cv=3, method='decision_function')
sgd_precisions, sgd_recalls, sgd_thresholds = precision_recall_curve(y_train, sgd_y_scores)

In [None]:
 #Represent Precision and Recall graphically
def plot_precision_recall(precisions, recalls, thresholds):
    """Draw precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : sequence
        Precision / recall values; the last element of each is dropped so the
        remaining values line up one-to-one with ``thresholds``.
    thresholds : sequence
        Threshold values used for the x-axis.

    Draws on the current matplotlib figure; does not call ``plt.show()``.
    """
    # (values, line format, legend label) for each curve, in drawing order.
    curves = (
        (precisions, 'b--', 'Precision'),
        (recalls, 'g-', 'Recall'),
    )
    for values, line_format, legend_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=legend_label)

    plt.xlabel('Threshold')
    plt.legend(loc='center left')
    plt.ylim([0,1])

# Precision and recall versus threshold for the SGD classifier.
plot_precision_recall(sgd_precisions, sgd_recalls, sgd_thresholds)
plt.show()

In [None]:
# Logit needs predict_proba output first

# log_y_scores = cross_val_predict(log_reg, X_train_std, y_train, cv=3, method='decision_function')
# log_precisions, log_recalls, log_thresholds = precision_recall_curve(y_train, log_y_scores)
# plot_precision_recall(log_precisions, log_recalls, log_thresholds)
# plt.show()

In [None]:
from sklearn.metrics import roc_curve
# ROC curve points for the SGD classifier, from its cross-validated decision scores.
fpr, tpr, thresholds = roc_curve(y_train, sgd_y_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve together with the diagonal reference line.

    Parameters
    ----------
    fpr : sequence
        False positive rates (x-axis values).
    tpr : sequence
        True positive rates (y-axis values).
    label : str, optional
        Legend label for the curve.

    Draws on the current matplotlib figure.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0,1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    # The y-axis shows tpr, i.e. the true POSITIVE rate (the label previously
    # read 'True Negatove Rate').
    plt.ylabel('True Positive Rate')