<a href="https://colab.research.google.com/github/ath0217/hello-github/blob/main/Lab_Session_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!mkdir data

In [None]:
import gdown

# Google Drive download links, paired by position with the file name each one is saved under.
urls = [
    'https://drive.google.com/uc?export=download&id=1LE4HPyxDcx3-QB-tTPUft8RTlzgybn7k', # Default data  https://drive.google.com/file/d/1LE4HPyxDcx3-QB-tTPUft8RTlzgybn7k/view?usp=sharing
]
outputs = ['Default.csv']

# Fetch every file into the data/ directory, showing gdown's progress output.
for output, url in zip(outputs, urls):
  gdown.download(url, f'data/{output}', quiet=False)

**Default data**

In [None]:
import sklearn.linear_model as skl_lm
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Load the downloaded CSV and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='data/Default.csv')
df.head(n=3)

In [None]:
# factorize() returns a pair: integer codes for each row and the unique labels they refer to.
df['default'].factorize()

In [None]:
# Add integer-coded copies of the two label columns (codes only; the unique labels are dropped).
# Codes are assigned in order of first appearance in each column.
df['default2'] = pd.factorize(df['default'])[0]
df['student2'] = pd.factorize(df['student'])[0]
df.head(n=3)

In [None]:
# Take a fraction of the samples where target value (default) is 'no'
df_no = df[df.default2 == 0].sample(frac=0.15, random_state=714)
# Take all samples  where target value is 'yes'
df_yes = df[df.default2 == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way and keeps the original row index.
df2 = pd.concat([df_no, df_yes])

# One wide scatter panel followed by two narrow boxplot panels.
fig, axes = plt.subplots(ncols=3,figsize=(12,5),gridspec_kw={'width_ratios': [3, 1,1]})

# Colour per value of the 'default' label.
c_palette = {'No':'lightblue', 'Yes':'orange'}

# Scatter uses the down-sampled frame (df2); the boxplots use the full frame (df).
sns.scatterplot(data=df2, x='balance',y='income',hue='default', style='default',palette=c_palette,alpha=0.6,ax=axes[0])
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(x='default', y='balance', data=df, orient='v', ax=axes[1], palette=c_palette)
sns.boxplot(x='default', y='income', data=df, orient='v', ax=axes[2], palette=c_palette)

plt.tight_layout()

In [None]:
# Three views of how 'balance' is distributed within each default2 class.
fig, axes = plt.subplots(ncols=3, figsize=(18, 5))

# Panel 1: class counts stacked on top of each other.
sns.histplot(data=df, x='balance', hue='default2', multiple='stack', element='step', ax=axes[0])
axes[0].set_title('Stacked histogram')

# Panel 2: kernel density estimate with multiple='fill'.
sns.kdeplot(data=df, x='balance', hue='default2', multiple='fill', ax=axes[1])
axes[1].set_title('Conditional density plot (kde)')

# Panel 3: histogram with multiple='fill'.
sns.histplot(data=df, x='balance', hue='default2', multiple='fill', element='step', ax=axes[2])
axes[2].set_title('Conditional density plot (histogram)')

**Logistic Regression**

In [None]:
# Logistic fit of default2 on balance, drawn as a line only (no scatter, no confidence band).
sns.regplot(
    data=df, x='balance', y='default2',
    order=1, logistic=True, ci=None, n_boot=5,
    scatter=False,
    line_kws={'color': 'blue', 'lw': 2},
)
# Overlay the observations, coloured by the original 'default' label.
sns.scatterplot(data=df, x='balance', y='default2', hue='default')

In [None]:
# Single-feature design matrix (kept 2-D for scikit-learn) and the integer-coded target.
X_train = df[['balance']]
y = df.default2

# Create array of test data. Calculate the classification probability
# and predicted classification.
X_test = np.arange(df.balance.min(), df.balance.max()).reshape(-1,1)

# Unpenalised logistic regression. The string 'none' is no longer accepted by
# current scikit-learn releases; penalty=None is the supported spelling.
clf = skl_lm.LogisticRegression(penalty=None)
clf.fit(X_train,y)
prob = clf.predict_proba(X_test)

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,5))
# Left plot: straight-line fit (order=1, logistic not enabled) over the data.
sns.histplot(x=df.balance, hue=df.default2, multiple='fill', element='step',ax=ax1)
sns.scatterplot(x=df.balance, y=df.default2, hue=df.default,ax=ax1)
sns.regplot(x=df.balance, y=df.default2, order=1, ci=None, scatter=False,
            line_kws={'color':'blue', 'lw':2}, ax=ax1)
# Right plot: same layers, but the fitted curve uses logistic=True.
sns.histplot(x=df.balance, hue=df.default2, multiple='fill', element='step',ax=ax2)
sns.scatterplot(x=df.balance, y=df.default2, hue=df.default,ax=ax2)
sns.regplot(x=df.balance, y=df.default2, order=1, logistic=True, ci=None, n_boot=5,
            scatter=False,
            line_kws={'color':'blue', 'lw':2}, ax=ax2)

# Same y-axis label on both panels.
for ax in fig.axes:
    ax.set_ylabel('Probability of default')

**Using Scikit-learn**

In [None]:
# Unpenalised logistic regression of default2 on balance.
# penalty=None replaces the string 'none', which current scikit-learn releases reject.
clf = skl_lm.LogisticRegression(penalty=None)#,solver='newton-cg')
X_train = df[['balance']]
clf.fit(X_train,y)

# Report the fitted estimator, its class labels and the learned parameters.
print(clf)
print('classes: ',clf.classes_)
print('coefficients: ',clf.coef_)
print('intercept :', clf.intercept_)

In [None]:
# Preview the first three rows of the training features.
X_train.head(n=3)

**Using statsmodels**

In [None]:
# Logit fit on X_train as-is: no constant column is added here, so the model has no intercept.
# Series.ravel() is deprecated in pandas; to_numpy() yields the same 1-D array.
est = sm.Logit(y.to_numpy(), X_train).fit()
# Second table of summary2(): the per-coefficient results.
est.summary2().tables[1]

In [None]:
# Prepend a constant column so the fitted model includes an intercept.
X_train = sm.add_constant(X_train['balance'])
# Series.ravel() is deprecated in pandas; to_numpy() yields the same 1-D array.
est = sm.Logit(y.to_numpy(), X_train).fit()
# Second table of summary2(): the per-coefficient results.
est.summary2().tables[1]

In [None]:
# Same kind of fit via the formula interface: the model is given as a formula over df's columns.
est = smf.logit(formula='default2 ~ balance', data=df).fit()
est.summary2().tables[1]

In [None]:
# Logit of default2 on the integer-coded student indicator alone.
est = smf.logit(formula='default2 ~ student2', data=df).fit()
est.summary2().tables[1]

In [None]:
# Logit of default2 on balance, income and the student indicator together.
est = smf.logit(formula='default2 ~ balance + income + student2', data=df).fit()
est.summary2().tables[1]

**Confounding**

In [None]:
# Fit default2 ~ balance separately on the rows where student is 'Yes' and where it is 'No',
# printing each coefficient table.
est_student = smf.logit(formula='default2 ~ balance ', data=df[df['student'] == 'Yes']).fit()
print(est_student.summary2().tables[1])

est_no_student = smf.logit(formula='default2 ~ balance ', data=df[df['student'] == 'No']).fit()
print(est_no_student.summary2().tables[1])

In [None]:
# Grid of balance values in unit steps from the observed minimum up to (not including) the maximum,
# stored under the column name the fitted formulas use.
X_test = pd.DataFrame({'balance': np.arange(df.balance.min(), df.balance.max())})
X_test.head(n=3)

In [None]:
# creating plot
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,5))

# Observed default rate within each student group: defaulters / all customers in the group.
# Computed from the data so the dashed lines are true rates, not defaulters / non-defaulters.
default_rate = df['default'].eq('Yes').groupby(df['student']).mean()

# Left plot: predicted default probability versus balance for each group's model.
ax1.plot(X_test, est_student.predict(X_test), color='orange', label='Student')
ax1.plot(X_test, est_no_student.predict(X_test), color='lightblue', label='Non-student')
# Dashed horizontal lines at each group's overall rate, spanning the x-range of the plotted data.
ax1.hlines(default_rate['Yes'], colors='orange', label='Overall Student',
           xmin=ax1.xaxis.get_data_interval()[0],
           xmax=ax1.xaxis.get_data_interval()[1], linestyles='dashed')
ax1.hlines(default_rate['No'], colors='lightblue', label='Overall Non-Student',
           xmin=ax1.xaxis.get_data_interval()[0],
           xmax=ax1.xaxis.get_data_interval()[1], linestyles='dashed')
ax1.set_ylabel('Default Rate')
ax1.set_xlabel('Credit Card Balance')
ax1.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.])
ax1.set_xlim(450,2500)
ax1.legend(loc=2)

# Right plot: balance distribution by student status.
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(x='student', y='balance', data=df, orient='v', ax=ax2,  palette=c_palette);

**Decision boundaries**

In [None]:
from mlxtend.plotting import plot_decision_regions

In [None]:
# Plain NumPy arrays for the two features and the integer-coded target.
X_train=df[['balance','income']].values
y_train=df['default2'].values

# Unpenalised logistic regression.
# penalty=None replaces the string 'none', which current scikit-learn releases reject.
clf = skl_lm.LogisticRegression(penalty=None)
clf.fit(X_train,y_train)

In [None]:
# Draw the fitted classifier's decision regions over the training points.
fig = plt.figure(figsize=(6, 4))
scatter_kwargs = dict(s=20, edgecolor=None, alpha=0.8)
plot_decision_regions(
    X=X_train,
    y=y_train,
    clf=clf,
    scatter_kwargs=scatter_kwargs,
)

**k-NN Classifier**


In [None]:
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Synthetic two-feature classification data, generated with a fixed seed.
X, y = make_classification(
    n_samples=300,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=714,
)
# Shift every coordinate by seeded uniform noise scaled by 2.
rng = np.random.RandomState(2)
X = X + 2 * rng.uniform(size=X.shape)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, style=y, alpha=0.6)

In [None]:
%%time
K=[3,10,50,100]
fig,axes = plt.subplots(ncols=len(K),figsize=(6*len(K),4))
for i, k in enumerate(K):
  clf = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
  clf.fit(X,y)
  plot_decision_regions(X=X, y=y, clf=clf, ax=axes[i])
  axes[i].set_title(f'{k}-NN classification')

**Linear Discriminant Analysis (LDA)**

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

**Decision boundaries**

**2 classes**

In [None]:
# Synthetic two-class, two-feature data, generated with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=714,
)
# Shift every coordinate by seeded uniform noise scaled by 2.
rng = np.random.RandomState(2)
X = X + 2 * rng.uniform(size=X.shape)

# Colour per class label.
c_palette = {0: 'red', 1: 'blue', 2: 'green'}
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, style=y, alpha=0.6, palette=c_palette)

In [None]:
# Fit LDA, QDA and Gaussian naive Bayes on the same data and draw each decision region side by side.
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
nb = GaussianNB()
clfs = [lda, qda, nb]

fig, axes = plt.subplots(ncols=len(clfs), figsize=(6 * len(clfs), 4))
for panel, clf in zip(axes, clfs):
  clf.fit(X, y)
  plot_decision_regions(X=X, y=y, clf=clf, ax=panel, colors='red,blue,green')
  # Panel title is the estimator's string representation.
  panel.set_title(str(clf))

**3 classes**

In [None]:
# Synthetic three-class, two-feature data, generated with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_classes=3,
    n_features=2,
    n_redundant=0,
    class_sep=0.8,
    n_clusters_per_class=1,
    random_state=714,
)
# Shift every coordinate by seeded uniform noise scaled by 2.
rng = np.random.RandomState(2)
X = X + 2 * rng.uniform(size=X.shape)

# Colour per class label.
c_palette = {0: 'red', 1: 'blue', 2: 'green'}
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, style=y, alpha=0.6, palette=c_palette)

In [None]:
# Fit LDA, QDA and Gaussian naive Bayes on the same data and draw each decision region side by side.
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
nb = GaussianNB()
clfs = [lda, qda, nb]

fig, axes = plt.subplots(ncols=len(clfs), figsize=(6 * len(clfs), 4))
for panel, clf in zip(axes, clfs):
  clf.fit(X, y)
  plot_decision_regions(X=X, y=y, clf=clf, ax=panel, colors='red,blue,green')
  # Panel title is the estimator's string representation.
  panel.set_title(str(clf))

**Default data**

In [None]:
# Features and integer-coded target from the Default data.
X = df[['balance', 'income', 'student2']]
y = df['default2']

# Fit LDA and predict on the same rows it was fitted on.
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
y_pred = lda.predict(X)

# Print the confusion matrix and also draw it.
cm = confusion_matrix(y, y_pred)
print(cm)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lda.classes_).plot()

In [None]:
# Probability cut-off applied to the second predict_proba column.
alpha = 0.2

# Refit LDA and get the per-class probabilities for the same rows.
lda.fit(X, y)
y_prob = lda.predict_proba(X)

# Label a row 1.0 when its second-column probability exceeds the cut-off, otherwise 0.0.
y_pred_02 = (y_prob[:, 1] > alpha).astype('float')

# Print the confusion matrix and also draw it.
cm = confusion_matrix(y, y_pred_02)
print(cm)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lda.classes_).plot()