- [Load dataset](#Load-dataset)
- [The Default data set](#Figure-4.1---Default-data-set)
- [4.3 Logistic Regression](#4.3-Logistic-Regression)
- [4.4 Linear Discriminant Analysis](#4.4-Linear-Discriminant-Analysis)
- [Lab: 4.6.3 Linear Discriminant Analysis](#4.6.3-Linear-Discriminant-Analysis)
- [Lab: 4.6.4 Quadratic Discriminant Analysis](#4.6.4-Quadratic-Discriminant-Analysis)
- [Lab: 4.6.5 K-Nearest Neighbors](#4.6.5-K-Nearest-Neighbors)
- [Lab: 4.6.6 An Application to Caravan Insurance Data](#4.6.6-An-Application-to-Caravan-Insurance-Data)

# Chapter 4 - Classification

In [None]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors

%matplotlib inline
plt.style.use('seaborn-white')

## Bug fixes - statsmodels not compatible with current stable version of scipy/pandas

In [None]:
# Statsmodels bug fix:
from pandas.core import datetools

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Workaround to fix bug in statsmodels .summary() - missing stats.chisqprob function
# https://github.com/statsmodels/statsmodels/issues/3931
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

### Load dataset

In [None]:
df = pd.read_excel('../../_data/Default.xlsx')

# Note: factorize() returns two objects: a label array and an array with the unique values.
# We are only interested in the first object. 
df['default2'] = df.default.factorize()[0]
df['student2'] = df.student.factorize()[0]
df.head(3)

In [None]:
df.isnull().any().sum()
df.info()

### Stratify labels

In [None]:
df.default.value_counts()

In [None]:
# Take a random 10% of the samples where the target code (default2) is 0,
# i.e. the rows the original comment calls 'no'.
df_no = df[df.default2==0].sample(frac=0.1)

# Take all samples where the target code is 1 ('yes')
df_yes = df[df.default2==1]

# pd.concat replaces DataFrame.append, which newer pandas releases no longer
# provide; the row order (sampled 'no' rows first, then 'yes' rows) is unchanged.
df_ = pd.concat([df_no, df_yes])

###  Figure 4.1 - Default data set

In [None]:
fig = plt.figure(figsize=(12,5))
gs = mpl.gridspec.GridSpec(1, 4)

# The scatter plot spans the first two grid columns; each boxplot gets one.
ax1 = plt.subplot(gs[0, :-2])
ax2 = plt.subplot(gs[0, -2])
ax3 = plt.subplot(gs[0, -1])

# df_ is the down-sampled frame, so the scatter is not swamped by 'No' rows.
non_defaulters = df_[df_.default == 'No']
defaulters = df_[df_.default == 'Yes']

ax1.scatter(non_defaulters.balance, non_defaulters.income, s=40, marker='o',
            edgecolor='lightblue', facecolor='None', alpha=1) # linewidths and facecolor='None' don't go together
ax1.scatter(defaulters.balance, defaulters.income, s=40, c='orange', marker='+',
            linewidths=1)

ax1.set_ylim(ymin=0)
ax1.set_ylabel('Income')
ax1.set_xlim(xmin=-100)
ax1.set_xlabel('Balance')

c_palette = {'No':'lightblue', 'Yes':'orange'}
# x and y are passed by keyword: recent seaborn releases treat the first
# positional argument as `data` and reject positional x/y.
# The boxplots use the full data set (df), not the down-sampled one.
sns.boxplot(x='default', y='balance', data=df, orient='v', ax=ax2, palette=c_palette)
sns.boxplot(x='default', y='income', data=df, orient='v', ax=ax3, palette=c_palette)
gs.tight_layout(plt.gcf());

## 4.3 Logistic Regression
### Figure 4.2

In [None]:
X_train = df.balance.values.reshape(-1,1) 
y = df.default2

# Create array of test data
X_test = np.arange(df.balance.min(), df.balance.max()).reshape(-1,1)

In [None]:
clf = skl_lm.LogisticRegression(solver='newton-cg').fit(X_train, y)

# Calculate the classification probability and predicted classification.
prob = clf.predict_proba(X_test)

### Visualise regression value, label, probability(logit)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

# Left plot: straight-line (order=1) regression of the 0/1 label on balance.
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally.
_ = sns.regplot(x=df.balance, y=df.default2, order=1, ci=None,
            scatter_kws={'color':'orange'},
            line_kws={'color':'lightblue', 'lw':2}, ax=ax1)

# Middle plot: hard class predictions of the fitted logistic model
_ = ax2.scatter(X_train, y, color='orange')
_ = ax2.plot(X_test, clf.predict(X_test), color='lightblue')


# Right plot: second column of predict_proba
_ = ax3.scatter(X_train, y, color='orange')
_ = ax3.plot(X_test, prob[:, 1], color='lightblue')

for ax in fig.axes:
    # Dashed guide lines at 1 and 0 across the range of the plotted data
    x_lo, x_hi = ax.xaxis.get_data_interval()
    for level in (1, 0):
        _ = ax.hlines(level, xmin=x_lo, xmax=x_hi, linestyles='dashed', lw=1)
    _ = ax.set_ylabel('Probability of default')
    _ = ax.set_xlabel('Balance')
    _ = ax.set_yticks([0, 0.25, 0.5, 0.75, 1.])
    _ = ax.set_xlim(xmin=-100)

### Table 4.1

In [None]:
y = df.default2
X_train = df.balance.values.reshape(-1,1)

In [None]:
X_train[:10]

### Add a bias/intercept (column of ones) to an array

`sm.add_constant` returns the original values with a constant (column of ones) added as the first or last column.  
Return type: array, recarray or DataFrame, depending on the type of the input.

In [None]:
X_train = sm.add_constant(df.balance)
X_train[:10]
type(X_train)

#### scikit-learn

In [None]:
# Using the newton-cg solver, the coefficients are equal/closest to the ones in the book.
# The differences between the available solvers are not investigated here.
# NOTE(review): X_train at this point is the output of sm.add_constant(df.balance),
# so it already carries a column of ones; LogisticRegression presumably also fits
# its own intercept by default - confirm that passing the constant column is intended.
clf = skl_lm.LogisticRegression(solver='newton-cg').fit(X_train, y)

# Show the estimator configuration and the fitted parameters
print(clf)
print('classes: ',clf.classes_)
print('coefficients: ',clf.coef_)
print('intercept :', clf.intercept_)

#### statsmodels

In [None]:
# X_train = sm.add_constant(df.balance)  ## Adds a column of ones to an array
# The Logit class is taken from statsmodels.api (sm): statsmodels.formula.api is
# meant for the lowercase formula interface and does not reliably expose the class.
# y.values hands the model a plain array, as y.ravel() did.
est = sm.Logit(y.values, X_train).fit()
est.summary().tables[1]
# const = intercept = bias

### Table 4.2

In [None]:
# Design matrix: constant column plus the 0/1 student code
X_train = sm.add_constant(df.student2)
X_train[:3]
y = df.default2

# The Logit class is taken from statsmodels.api (sm): statsmodels.formula.api is
# meant for the lowercase formula interface and does not reliably expose the class.
est = sm.Logit(y, X_train).fit()
est.summary().tables[1]

### Table 4.3 - Multiple Logistic Regression

In [None]:
# Design matrix: constant column plus all three predictors
X_train = sm.add_constant(df[['balance', 'income', 'student2']])
# The Logit class is taken from statsmodels.api (sm): statsmodels.formula.api is
# meant for the lowercase formula interface and does not reliably expose the class.
est = sm.Logit(y, X_train).fit()
est.summary().tables[1]

### Figure 4.3 - Confounding

In [None]:
# Split the data once by student status
students = df[df.student == 'Yes']
non_students = df[df.student == 'No']

# Students: balance as a single-column matrix, default code as target
X_train = students.balance.values.reshape(-1, 1)
y = students.default2

# Non-students: same layout
X_train2 = non_students.balance.values.reshape(-1, 1)
y2 = non_students.default2

# Grid of balance values (step 1, from min to max) used for plotting
X_test = np.arange(df.balance.min(), df.balance.max()).reshape(-1,1)

# One logistic model per group
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(X_train, y)
clf2 = skl_lm.LogisticRegression(solver='newton-cg')
clf2.fit(X_train2, y2)

# Class probabilities of each model over the plotting grid
prob = clf.predict_proba(X_test)
prob2 = clf2.predict_proba(X_test)

### Confusion matrix / pivot

In [None]:
def pivot(y, y_pred, labels=('Negative', 'Positive')):
    """Cross-tabulate two label vectors into a count table.

    Parameters
    ----------
    y : array-like or pandas.Series
        Values that become the rows of the table.
    y_pred : array-like or pandas.Series
        Values that become the columns of the table.
    labels : sequence of str
        Display names paired, in order, with the sorted unique values of ``y``.
        The resulting value -> name mapping is applied to BOTH columns, so
        values of ``y_pred`` that equal a value of ``y`` are renamed as well.

    Returns
    -------
    pandas.DataFrame
        Counts per (y, y_pred) combination; combinations that never occur
        are NaN.

    Notes
    -----
    Row/column axis names are taken from the ``name`` attributes of the
    inputs. If either input has no usable name, or both share the same name
    (which would collapse them into a single column), the generic names
    'True label' and 'Predicted label' are used instead.
    """
    y_name = getattr(y, 'name', None)
    y_pred_name = getattr(y_pred, 'name', None)
    if y_name is None or y_pred_name is None or y_name == y_pred_name:
        y_name, y_pred_name = 'True label', 'Predicted label'

    dict_labels = dict(zip(np.unique(y), labels))
    frame = pd.DataFrame({y_name: y, y_pred_name: y_pred})
    frame = frame.replace(to_replace=dict_labels)
    return frame.groupby([y_name, y_pred_name]).size().unstack(y_pred_name)

In [None]:
pivot(df.student, df.default)

In [None]:
df_p = pivot(df.student, df.default)
df_p['Support'] = df_p.sum(1)
# df_p.append(df_p.sum(0), ignore_index=True)
df_p

In [None]:
# creating plot
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,5))

# Left plot: fitted default probability against balance, one curve per group
ax1.plot(X_test, pd.DataFrame(prob)[1], color='orange', label='Student')
ax1.plot(X_test, pd.DataFrame(prob2)[1], color='lightblue', label='Non-student')

# Overall default rate of each group = share of 'Yes' among ALL members of the
# group. It is computed from the data instead of hard-coded counts; the earlier
# constants divided defaulters by non-defaulters, which is an odds, not a rate.
is_default = df.default == 'Yes'
student_rate = is_default[df.student == 'Yes'].mean()
non_student_rate = is_default[df.student == 'No'].mean()

# Dashed horizontal lines across the range of the plotted curves
x_lo, x_hi = ax1.xaxis.get_data_interval()
ax1.hlines(student_rate, colors='orange', label='Overall Student',
           xmin=x_lo, xmax=x_hi, linestyles='dashed')
ax1.hlines(non_student_rate, colors='lightblue', label='Overall Non-Student',
           xmin=x_lo, xmax=x_hi, linestyles='dashed')

ax1.set_ylabel('Default Rate')
ax1.set_xlabel('Credit Card Balance')
ax1.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.])
ax1.set_xlim(450,2500)
ax1.legend(loc=2)

# Right plot
# x and y are passed by keyword: recent seaborn releases reject positional x/y.
sns.boxplot(x='student', y='balance', data=df, orient='v', ax=ax2,  palette=c_palette);

## 4.4 Linear Discriminant Analysis
### Table 4.4 

In [None]:
# .values replaces .as_matrix(), which newer pandas releases no longer provide;
# both yield the underlying NumPy array.
X = df[['balance', 'income', 'student2']].values
y = df.default2.values

# Fit LDA on the full data set and predict on the same rows
lda = LinearDiscriminantAnalysis(solver='svd')
y_pred = lda.fit(X, y).predict(X)

df_ = pd.DataFrame({'True default status': y,
                    'Predicted default status': y_pred})
X.shape, y.shape, df_.shape

In [None]:
pivot(df_['Predicted default status'], df_['True default status'], ['No', 'Yes']).T

In [None]:
pivot(y, y_pred, ['No', 'Yes'])

In [None]:
print(classification_report(y, y_pred, target_names=['No', 'Yes']))

### Table 4.5
Instead of using a probability of 50% as the decision boundary, we classify an observation as 'Yes' whenever its predicted probability of default exceeds 20%.

In [None]:
decision_prob = 0.2
y_prob = lda.fit(X, y).predict_proba(X)

df_ = pd.DataFrame({'True default status': y,
                    'Predicted default status': y_prob[:,1] > decision_prob})

In [None]:
pivot(df_['Predicted default status'], df_['True default status'], ['No', 'Yes'])

#  Lab

### 4.6.3 Linear Discriminant Analysis

Linear Discriminant Analysis (LDA) tries to identify attributes that account for the most variance between classes. In particular, LDA, in contrast to PCA, is a supervised method, using known class labels.

LDA is a classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.
The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.

The fitted model can also be used to reduce the dimensionality of the input by projecting it to the most discriminative directions.

In [None]:
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.decomposition import PCA
# from sklearn.lda import LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

# Unsupervised projection onto two principal components
pca = PCA(n_components=2)
pca.fit(X)
X_r = pca.transform(X)

# Supervised projection onto two discriminant directions
lda = LDA(n_components=2)
lda.fit(X, y)
X_r2 = lda.transform(X)

# Percentage of variance explained for each components
variance_text = str(pca.explained_variance_ratio_)
print('explained variance ratio (first two components): {}'.format(variance_text))

# One figure per projection, one colour per class
for projected, figure_title in ((X_r, 'PCA of IRIS dataset'),
                                (X_r2, 'LDA of IRIS dataset')):
    plt.figure()
    for i, (c, target_name) in enumerate(zip("rgb", target_names)):
        in_class = y == i
        plt.scatter(projected[in_class, 0], projected[in_class, 1], c=c, label=target_name)
    plt.legend()
    plt.title(figure_title)

plt.show();

In [None]:
df = pd.read_csv('../../_data/Smarket.csv', usecols=range(1, 10), index_col=0, parse_dates=True) ## parse datetime
df.sample(10)
df.info()

In [None]:
# Rows up to and including 2004 are used for fitting, 2005 onwards for testing
train_rows = df[:'2004']
test_rows = df['2005':]
lag_columns = ['Lag1','Lag2']

X_train, y_train = train_rows[lag_columns], train_rows['Direction']
X_test, y_test = test_rows[lag_columns], test_rows['Direction']

# Fit on the training rows, then predict the direction for the test rows
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
pred = lda.predict(X_test)

#### Priors - class proportions

In [None]:
np.mean(y_train=='Down'), np.mean(y_train=='Up')

In [None]:
lda.priors_

#### Feature means per class label

In [None]:
# Mean of Lag1 and Lag2 per Direction label, computed on the training rows
# (through 2004) only. Grouping the training slice avoids applying a mask
# built from that slice to the full frame, whose length differs, and shows
# all four means at once for comparison with lda.means_ below.
train = df[:'2004']
train.groupby('Direction')[['Lag1', 'Lag2']].mean()

In [None]:
lda.means_

#### Model coefficients

In [None]:
# These do not seem to correspond to the values from the R output in the book?
lda.coef_

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred, digits=3))

In [None]:
pred_p = lda.predict_proba(X_test)

In [None]:
np.unique(pred_p[:, 1]>0.5, return_counts=True)

In [None]:
np.unique(pred_p[:, 1]>0.9, return_counts=True)

### LDA as dimension reduction

In [None]:
lag_columns = ['Lag1', 'Lag2','Lag3','Lag4','Lag5']

# Training rows: up to and including 2004. Target is 1 for 'Down', 0 otherwise.
X_train = df[:'2004'][lag_columns]
y_train = (df[:'2004']['Direction']=='Down').values*1
# y_train

# Test rows: 2005 onwards. The target uses the same row slice as X_test;
# df['2005'] (no slice) is a column lookup in current pandas and fails.
X_test = df['2005':][lag_columns]
y_test = (df['2005':]['Direction']=='Down').values*1
# y_test

# With two classes there is at most n_classes - 1 = 1 discriminant component;
# asking for 2 is rejected by recent scikit-learn releases.
lda2 = LinearDiscriminantAnalysis(n_components=1).fit(X_train, y_train)
pred = lda2.predict(X_test)

In [None]:
lda2.priors_

In [None]:
lda2.means_

In [None]:
# These do not seem to correspond to the values from the R output in the book?
lda2.coef_

In [None]:
lda2.get_params

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred, digits=3))

In [None]:
lda2.predict_proba(X_test)[:10]

### Number of predictions around decision boundary # TODO

In [None]:
margin = 0.03
# pred_p holds the probabilities of the two-lag `lda` model computed earlier
# (it is not recomputed for lda2).
p_second = pred_p[:, 1]

# A prediction lies inside the band when it is on the near side of BOTH
# limits. Adding the separate count vectors of the two one-sided tests (as
# done before) does not give that number.
inside_band = (p_second < 0.5 + margin) & (p_second > 0.5 - margin)

# bincount with minlength=2 always yields [outside, inside], even when one
# of the two groups is empty.
'Predictions between margin({}) [false true] : {}'.format(
    margin, np.bincount(inside_band.astype(int), minlength=2))

In [None]:
np.mean(pred_p[:, 1]), np.var(pred_p[:, 1])**.5

### 4.6.4 Quadratic Discriminant Analysis

In [None]:
qda = QuadraticDiscriminantAnalysis()
pred = qda.fit(X_train, y_train).predict(X_test)

In [None]:
qda.priors_

In [None]:
qda.means_

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred, digits=3))

### 4.6.5 K-Nearest Neighbors

In [None]:
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
pred = knn.fit(X_train, y_train).predict(X_test)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=3))

In [None]:
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
pred = knn.fit(X_train, y_train).predict(X_test)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=3))

### 4.6.6 An Application to Caravan Insurance Data

#### K-Nearest Neighbors

In [None]:
# In R, I exported the dataset from package 'ISLR' to a csv file
df = pd.read_csv('../../_data/Caravan.csv')
y = df.Purchase
X = df.drop('Purchase', axis=1).astype('float64')
X_scaled = preprocessing.scale(X)

X_train = X_scaled[1000:,:]
y_train = y[1000:]
X_test = X_scaled[:1000,:]
y_test = y[:1000]

In [None]:
def KNN(n_neighbors=1, weights='uniform'):
    """Fit a k-nearest-neighbours classifier and evaluate it on the test split.

    Uses the notebook-level X_train, y_train, X_test and y_test.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours passed to KNeighborsClassifier.
    weights : str
        Weighting scheme passed to KNeighborsClassifier.

    Returns
    -------
    tuple
        (predictions for X_test, clf.score(X_test, y_test), clf.classes_)
    """
    # Both arguments go by keyword: `weights` is keyword-only in recent
    # scikit-learn releases, so passing it positionally raises a TypeError.
    clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = clf.score(X_test, y_test)
    return pred, score, clf.classes_

In [None]:
def plot_confusion_matrix(cm, classes, n_neighbors, title='Confusion matrix (Normalized)',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-coded image and show it.

    Parameters
    ----------
    cm : 2-D array
        Matrix to draw. The axis labels assume rows are predicted labels and
        columns are true labels.
    classes : sequence
        Tick labels, one per class, used on both axes.
    n_neighbors : int
        Number of neighbours, shown in the plot title.
    title : str
        Accepted for backward compatibility; the plotted title is always
        built from ``n_neighbors``.
    cmap : matplotlib colormap
        Colormap used for the image.
    """
    # One tick per class instead of a fixed two, so more classes also work
    tick_positions = np.arange(len(classes))

    # Honour the cmap argument (previously the default was always used)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title('Normalized confusion matrix: KNN-{}'.format(n_neighbors))
    plt.colorbar()
    plt.xticks(tick_positions, classes)
    plt.yticks(tick_positions, classes)
    plt.tight_layout()
    plt.xlabel('True label',rotation='horizontal', ha='right')
    plt.ylabel('Predicted label')
    plt.show()

In [None]:
# Evaluate KNN for k = 1, 3 and 5: plot, count table and per-class precision
for i in (1, 3, 5):
    pred, score, classes = KNN(i)
    cm = confusion_matrix(y_test, pred)

    # Divide every row by its total
    row_totals = cm.sum(axis=1).reshape(-1, 1)
    cm_normalized = cm.astype('float') / row_totals
    plot_confusion_matrix(cm_normalized.T, classes, n_neighbors=i)

    # Transposed counts: rows are predictions, columns are true labels
    cm_df = pd.DataFrame(cm.T,
                         index=pd.Index(classes, name='Predicted'),
                         columns=pd.Index(classes, name='True'))
    print(cm_df)

    per_class_precision = precision_score(y_test, pred, average=None)
    print(pd.DataFrame(per_class_precision, index=classes, columns=['Precision']))

####  Logistic Regression

In [None]:
regr = skl_lm.LogisticRegression()
regr.fit(X_train, y_train)

In [None]:
pred = regr.predict(X_test)
# confusion_matrix(y_true, y_pred) puts true labels on the rows. Transpose it so
# that rows are predictions and columns are true labels, matching the axis
# names set below (the KNN cell above transposes in the same way).
cm_df = pd.DataFrame(confusion_matrix(y_test, pred).T, index=regr.classes_,
                     columns=regr.classes_)
cm_df.index.name = 'Predicted'
cm_df.columns.name = 'True'
print(cm_df)
print(classification_report(y_test, pred))

In [None]:
pred_p = regr.predict_proba(X_test)

# Label a row 'Yes' when the second predict_proba column exceeds 0.25.
# The labels are built directly with np.where; the earlier chained
# `cm_df.Pred.replace(..., inplace=True)` may act on a temporary copy and
# leave the frame unchanged in newer pandas.
cm_df = pd.DataFrame({'True': y_test,
                      'Pred': np.where(pred_p[:,1] > .25, 'Yes', 'No')})

# Count table with true labels on the rows and predictions on the columns
print(cm_df.groupby(['True', 'Pred']).size().unstack('True').T)

print(classification_report(y_test, cm_df.Pred))