# Supervised Learning

## Parametric models

### Logistic Regression

# 5 Logistic Regression with `statsmodels`

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
import sklearn as sk
import matplotlib.pyplot as plt

<a href='http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data'>dataset</a>


<a href='http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.names'>dataset description</a>

## 5.1 data cleaning and EDA

In [None]:
# Read the credit-approval file. Because `names` is supplied, the first line of
# the file is treated as data and the labels below become the column headers.
# NOTE(review): check the linked dataset description for how missing values are
# marked -- if the CSV uses a textual marker it has to be passed via `na_values`,
# otherwise the later `dropna()` will not remove those rows. TODO confirm.
cred_columns = [
    'sex', 'age', 'debt', 'married', 'bank_customer', 'edu', 'ethnicity',
    'yrs_employed', 'prior_default', 'employed', 'cscore', 'driver_lic',
    'citizen', 'zipcode', 'income', 'approval',
]
cred = pd.read_csv('data/credit approval.csv', names = cred_columns)

In [None]:
cred.shape

In [None]:
# Discard every row that holds a missing value, then renumber the index from 0
# and show the resulting dimensions.
cred = cred.dropna()
cred = cred.reset_index(drop = True)
cred.shape

In [None]:
cred.columns

every variable in this dataset is encoded to hide the actual information

In [None]:
# Columns that are not carried forward into the analysis.
unused_columns = ['sex', 'married', 'bank_customer', 'edu', 'ethnicity',
                  'driver_lic', 'citizen', 'zipcode']
cred = cred.drop(labels = unused_columns, axis = 'columns')
cred.head()

In [None]:
# Recode the string flags of three columns as 0/1 integers.
# NOTE(review): `prior_default` maps 't' -> 0 whereas `employed` maps 't' -> 1;
# confirm that the inverted coding of `prior_default` is intentional.
binary_codes = {
    'prior_default': {'t': 0, 'f': 1},
    'employed': {'t': 1, 'f': 0},
    'approval': {'-': 0, '+': 1},
}
cred = cred.replace(to_replace = binary_codes)

In [None]:
# Store the two recoded flag columns with the pandas `category` dtype.
for flag in ('prior_default', 'employed'):
    cred[flag] = cred[flag].astype('category')

In [None]:
cred.head()

In [None]:
cred.dtypes

&nbsp; 

* **unalikability** is a measure of variability in a categorical variable.  
* based on the following <a href = 'http://ww2.amstat.org/publications/jse/v15n2/kader.html'>publication</a> (sec 2.5) a coefficient of variability for a categorical variable can be calculated as 

$$u = 1 - \sum_i p_i^2$$

with $p_i=\frac{k_i}{n}$ where $k_i$ is the count of observations for a single category and $n$ is the total number of observations for that variable.

* variables with a low coefficient do not have a lot of variability in the data and can be ignored or dropped from the table. 


In [None]:
def unalike(c_var):
    """Return the coefficient of unalikeability of a categorical variable.

    The coefficient is ``u = 1 - sum_i(p_i ** 2)``, where ``p_i`` is the share
    of observations that fall into category ``i``.

    Parameters
    ----------
    c_var : array-like
        Observed category labels; anything ``np.asarray`` can convert.

    Returns
    -------
    float
        ``0`` when every observation belongs to the same category, larger
        values for more evenly spread categories, and ``nan`` when ``c_var``
        is empty (no proportions can be formed from zero observations).
    """
    counts = np.unique(np.asarray(c_var), return_counts = True)[1]
    total = counts.sum()
    if total == 0:
        # No observations: report "undefined" rather than maximal variability.
        return np.nan
    shares = counts / total
    return 1 - np.sum(shares ** 2)

In [None]:
# Unalikeability coefficient of each categorical flag, plus a parallel Series
# holding the matching variable names.
flag_columns = ['prior_default', 'employed']

un = pd.Series([unalike(cred[col]) for col in flag_columns], name = 'coef')

nm = pd.Series(flag_columns, name = 'variable')

In [None]:
# Put names and coefficients side by side, most variable column first.
unalike_table = pd.concat([nm, un], axis = 1)
unalike_table.sort_values(by = 'coef', ascending = False)

In [None]:
cred.head()

* normalize variables

#### EDA

In [None]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, scale

In [None]:
# Rescale age by its standard deviation (`with_mean = False`: no centring).
age_norm = scale(cred.age, with_mean = False)

# Two panels: raw vs. scaled age. The size is passed straight to `subplots`;
# an additional `plt.figure(...)` call here would only open a second, empty
# figure below the plots.
fig, ax = plt.subplots(1, 2, figsize = (12, 5))
sns.distplot(cred.age, ax = ax[0])
sns.distplot(age_norm, ax = ax[1])
ax[0].set_title('age')
ax[1].set_title('age (scaled)');

In [None]:
# Swap exact zeros for a tiny positive number in the four columns that are
# log-transformed in the following cells, so that the logarithm stays finite.
# NOTE(review): log(1e-16) is a large negative value, so former zeros end up far
# from the rest of the data -- confirm this is the intended treatment.
zero_columns = ('debt', 'yrs_employed', 'cscore', 'income')
cred = cred.replace({column: {0: 10**-16} for column in zero_columns})

In [None]:
# Log-transform debt, then rescale the logged values by their standard
# deviation (`with_mean = False`: no centring).
debt_log = np.log(cred.debt)
debt_norm = scale(debt_log, with_mean = False)

# Three panels: raw, logged, logged + scaled. The size is passed straight to
# `subplots`; an additional `plt.figure(...)` call would only open a second,
# empty figure.
fig, ax = plt.subplots(1, 3, figsize = (12, 5))
sns.distplot(cred.debt, ax = ax[0])
sns.distplot(debt_log, ax = ax[1])
sns.distplot(debt_norm, ax = ax[2])
ax[0].set_title('debt')
ax[1].set_title('log(debt)')
ax[2].set_title('log(debt), scaled');

In [None]:
# Log-transform years employed, then rescale the logged values by their
# standard deviation (`with_mean = False`: no centring).
yrs_employed_log = np.log(cred.yrs_employed)
yrs_employed_norm = scale(yrs_employed_log, with_mean = False)

# Three panels: raw, logged, logged + scaled. The size is passed straight to
# `subplots`; an additional `plt.figure(...)` call would only open a second,
# empty figure.
fig, ax = plt.subplots(1, 3, figsize = (12, 5))
sns.distplot(cred.yrs_employed, ax = ax[0])
sns.distplot(yrs_employed_log, ax = ax[1])
sns.distplot(yrs_employed_norm, ax = ax[2])
ax[0].set_title('yrs_employed')
ax[1].set_title('log(yrs_employed)')
ax[2].set_title('log(yrs_employed), scaled');

In [None]:
# Log-transform the credit score, then rescale the logged values by their
# standard deviation (`with_mean = False`: no centring).
cscore_log = np.log(cred.cscore)
cscore_norm = scale(cscore_log, with_mean = False)

# Three panels: raw, logged, logged + scaled. The size is passed straight to
# `subplots`; an additional `plt.figure(...)` call would only open a second,
# empty figure.
fig, ax = plt.subplots(1, 3, figsize = (12, 5))
sns.distplot(cred.cscore, ax = ax[0])
sns.distplot(cscore_log, ax = ax[1])
sns.distplot(cscore_norm, ax = ax[2])
ax[0].set_title('cscore')
ax[1].set_title('log(cscore)')
ax[2].set_title('log(cscore), scaled');

In [None]:
# Log-transform income, then rescale the logged values by their standard
# deviation (`with_mean = False`: no centring).
income_log = np.log(cred.income)
income_norm = scale(income_log, with_mean = False)

# Three panels: raw, logged, logged + scaled. The size is passed straight to
# `subplots`; an additional `plt.figure(...)` call would only open a second,
# empty figure.
fig, ax = plt.subplots(1, 3, figsize = (12, 5))
sns.distplot(cred.income, ax = ax[0])
sns.distplot(income_log, ax = ax[1])
sns.distplot(income_norm, ax = ax[2])
ax[0].set_title('income')
ax[1].set_title('log(income)')
ax[2].set_title('log(income), scaled');

In [None]:
# Overwrite the five numeric columns with their transformed counterparts
# computed in the preceding cells.
scaled_columns = {
    'age': age_norm,
    'debt': debt_norm,
    'yrs_employed': yrs_employed_norm,
    'cscore': cscore_norm,
    'income': income_norm,
}
for column, values in scaled_columns.items():
    cred[column] = values

In [None]:
cred.head()

In [None]:
cred.shape

In [None]:
np.unique(cred.approval, return_counts = True)

In [None]:
# Save the cleaned dataset to disk; `index = False` keeps the row index out of
# the file.
cred.to_csv('data/cred_clean.csv', index = False)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, confusion_matrix, log_loss, roc_curve

import statsmodels.api as sm
import statsmodels.formula.api as smf


%matplotlib inline

## 5.2 model estimation

In [None]:
# Hold out 20% of the rows as a test set; the fixed `random_state` makes the
# split repeatable.
cred_parts = train_test_split(cred, random_state = 65, test_size = 0.2)
cred_train, cred_test = cred_parts

In [None]:
# Renumber both partitions from 0 and discard the shuffled original index.
cred_train, cred_test = (
    part.reset_index(drop = True) for part in (cred_train, cred_test)
)

In [None]:
# Build the model formula: `approval` on the left, every column except the last
# one (the target itself) on the right.
predictors = cred_train.columns[:-1]
function_call = 'approval ~ {}'.format(' + '.join(predictors))
function_call

In [None]:
# Fit a GLM with a binomial family on the training partition, using the formula
# assembled above.
binom_family = sm.families.Binomial()
binom_model = smf.glm(function_call, family = binom_family, data = cred_train)
binom_fit = binom_model.fit()

In [None]:
# Keep the summary object (its `.tables` are reused further down) and print it.
results = binom_fit.summary()
print(results)

In [None]:
%load udf/signif.py 

In [None]:
signif(results.tables)

In [None]:
# Refit the binomial GLM with three predictors only; `binom_fit` now refers to
# this reduced model for the validation cells below.
reduced_formula = 'approval ~ prior_default + yrs_employed + cscore'
binom_fit = smf.glm(
    reduced_formula,
    family = sm.families.Binomial(),
    data = cred_train,
).fit()
results_1 = binom_fit.summary()
signif(results_1.tables)

## 5.3 model validation

In [None]:
# Report deviance, AIC and log-likelihood of the current `binom_fit`.
# Null-model degrees of freedom = model df + residual df.
null_df = binom_fit.df_model + binom_fit.df_resid
null_line = 'Null deviance: {} on {} degrees of freedom'.format(binom_fit.null_deviance, null_df)
resid_line = 'Residual deviance: {} on {} degrees of freedom'.format(binom_fit.deviance, binom_fit.df_resid)
print(null_line)
print(resid_line)
print('aic: ', binom_fit.aic)
print('log lik: ', binom_fit.llf)

In [None]:
predicted_prob = binom_fit.predict(cred_test)

In [None]:
# Turn probabilities into class labels: below 0.5 -> 0, everything else -> 1.
predicted_thresh = np.where(predicted_prob < 0.5, 0, 1).tolist()

In [None]:
pd.DataFrame(confusion_matrix(y_true = cred_test.approval.astype(np.int64), y_pred = predicted_thresh))

In [None]:
# False/true positive rates over all thresholds; the thresholds themselves are
# not needed.
roc_points = roc_curve(y_true = cred_test.approval, y_score = predicted_prob)
fpr, tpr, _ = roc_points

In [None]:
# Area under the ROC curve computed from the points above.
AUC = auc(fpr, tpr)

In [None]:
# ROC curve of the GLM on the test set, with the diagonal as a visual baseline.
plt.figure(figsize = (10, 7))
auc_label = 'AUC = {}'.format(AUC)
plt.plot(fpr, tpr, color = 'orange', label = auc_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc = "lower right")

In [None]:
log_loss(y_true = cred_test.approval, y_pred = predicted_prob)

&nbsp;

Is this a good or a bad log loss? To judge it, compare the value with the curve of $-\log(p)$ plotted below.

In [None]:
# 1000 evenly spaced probabilities just inside (0, 1) and the negative natural
# log of each one.
prob = np.linspace(start = .0001, stop = .9999999, num = 1000)
log_prob = np.negative(np.log(prob))

In [None]:
plt.figure(figsize = (15,7))

# The model's test-set log loss is a single number: compute it once instead of
# re-evaluating it for every point of the grid.
model_log_loss = log_loss(y_true = cred_test.approval, y_pred = predicted_prob)

# Curve of -log(p) over the probability grid, with the model's log loss drawn
# as a horizontal reference line of the same length as the grid.
plt.plot(prob, log_prob,'-')
plt.plot(prob, [model_log_loss] * len(prob), '--', color = 'coral')
plt.xlabel('probability')
plt.ylabel('-log(probability)')
plt.legend(['log_loss','logistic regression'], handlelength = 5, prop = {'size': 15} )

&nbsp;

# 6 Logistic Regression with `scikit-learn`

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

## 6.1 OneHotEncoder

In [None]:
cred.head()

In [None]:
cred_categ = cred.loc[:,['prior_default', 'employed']]

In [None]:
# One-hot encode the columns at positions 3 and 4 of `cred`
# (`prior_default` and `employed` after the earlier column drop).
# NOTE(review): `categorical_features` looks like a legacy argument that recent
# scikit-learn releases no longer accept -- confirm against the installed
# version before re-running.
enc = OneHotEncoder(categorical_features = [3,4])
enc.fit(cred)

In [None]:
enc.feature_indices_

In [None]:
cred_dummy = pd.DataFrame(enc.transform(cred).toarray())

In [None]:
cred_dummy.head()

&nbsp;

drop one (the first) dummy column for every variable according to the `feature_indices_` obtained from the fitted `enc` object

In [None]:
# Drop the columns whose integer labels equal every entry of `feature_indices_`
# except the last one, then preview the result.
# NOTE(review): presumably those entries are the start offsets of each encoded
# variable, i.e. the first dummy of each -- confirm against the encoder docs.
cred_dummy.drop(enc.feature_indices_[:-1], axis = 1, inplace = True)
cred_dummy.head()

In [None]:
# Name the columns: the two remaining dummies first, then the columns of `cred`
# at positions 0-2 and 5 onwards (the ones that were not encoded).
# NOTE(review): presumably `pdT` is the indicator for prior_default == 1, which
# the earlier recoding assigns to 'f' -- confirm the label is not inverted.
passthrough_names = cred.columns[0:3].tolist() + cred.columns[5:].tolist()
cred_dummy.columns = ['pdT', 'empT'] + passthrough_names

# Target and dummy indicators are stored as 64-bit integers.
for flag in ('approval', 'pdT', 'empT'):
    cred_dummy[flag] = cred_dummy[flag].astype(np.int64)


In [None]:
cred_dummy.head()

In [None]:
cred.head()

In [None]:
cred_dummy.to_csv('data/cred_ohe.csv', index = False)

&nbsp;

## 6.2 model estimation

In [None]:
# Features are all columns but the last; the last column is the target.
features_ohe = cred_dummy.iloc[:, :-1]
target_ohe = cred_dummy.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    features_ohe, target_ohe, test_size = 0.2, random_state = 65
)

In [None]:
log_reg = LogisticRegression(fit_intercept = True)
log_reg.fit(x_train, y_train)

In [None]:
log_reg.intercept_

In [None]:
log_reg.coef_

as a reminder we can compare to the `results.tables` table:

In [None]:
signif(results.tables)

* `scikit-learn` does not return any model diagnostics that will help us assess the importance of variables.    
* one way around this is to build different models using combinations of independent variables and compare the confusion matrix and ROC curve for each one.   
* another way to assess variable importance is by using lasso regression.   

&nbsp;

## 6.3 lasso and ridge regression 

In [None]:
# Grid of 20 log-spaced values between 1e-5 and 1e5.
# NOTE(review): the grid is called `alphas`, but each value is handed to
# `LogisticRegression` as `C` -- keep that in mind when reading the plot.
n_alphas = 20
alphas = np.logspace(-5, 5, num = n_alphas)

# Fit one L2-penalised model per grid value and collect its coefficient vector.
coefs = []
for c in alphas:
    clf = LogisticRegression(penalty = 'l2', C = c).fit(x_train, y_train)
    coefs.append(clf.coef_[0])

In [None]:
alphas

In [None]:
# Coefficient paths over the grid of `C` values, on a log-scaled x axis that is
# drawn in reverse order.
ax = plt.figure(figsize = (15,10)).gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('C')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
# One legend entry per plotted coefficient: the models were fitted on every
# column of `cred_dummy` except the last (the target), so the target name must
# not be passed as a label.
plt.legend(cred_dummy.columns[:-1], loc = 'best', handlelength = 5, prop={'size': 15})
plt.axis('tight')
plt.show()

In [None]:
signif(results_1.tables)

In [None]:
# Reduced modelling table: the two dummy indicators from `cred_dummy` next to
# `cscore` and the target taken from `cred`.
# NOTE(review): the reduced statsmodels formula above uses `yrs_employed`, not
# the employment indicator -- confirm the two reduced models are meant to differ.
dummy_part = cred_dummy.loc[:, ['pdT', 'empT']]
cred_part = cred.loc[:, ['cscore', 'approval']]
cred_mod = pd.concat([dummy_part, cred_part], axis = 1)

cred_mod.head()

In [None]:
# Same split settings as before, applied to the reduced table: all columns but
# the last are features, the last column is the target.
features_mod = cred_mod.iloc[:, :-1]
target_mod = cred_mod.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    features_mod, target_mod, test_size = 0.2, random_state = 65
)

In [None]:
log_reg = LogisticRegression(fit_intercept = True)
log_reg.fit(x_train, y_train)

In [None]:
# Class labels and class probabilities for the test set; `y_probone` keeps only
# the second probability column of each row.
y_pred = log_reg.predict(x_test)
y_prob = log_reg.predict_proba(x_test)
y_probone = [row[1] for row in y_prob]

In [None]:
pd.DataFrame(confusion_matrix(y_true = y_test, y_pred = y_pred))

In [None]:
# ROC curve and AUC of the scikit-learn model on the test set, with the diagonal
# as a visual baseline.
roc_points = roc_curve(y_true = y_test, y_score = y_probone)
fpr, tpr, _ = roc_points
AUC = auc(fpr, tpr)

plt.figure(figsize = (10, 7))
auc_label = 'AUC = {}'.format(AUC)
plt.plot(fpr, tpr, color = 'orange', label = auc_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc = "lower right")

In [None]:
log_loss(y_true = y_test, y_pred = y_probone)