# Logistic Regression

### Table of Contents

1. [EDA](#1.-EDA)
2. [Logistic regression](#2.-Logistic-regression)
3. [Reference](#3.-Reference)

### 1. EDA

In [None]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset from the relative data directory
default_csv_path = '../data/default.csv'
defaults = pd.read_csv(default_csv_path)

# Preview the first rows to check that the file parsed as expected
defaults.head()

In [None]:
# Encode the 'student' column as a numeric 0/1 indicator for plotting and modelling.
# pandas >= 2.0 returns boolean dummy columns by default; casting to int keeps
# 'student_yes' numeric regardless of the installed pandas version.
student_dummies = pd.get_dummies(defaults['student'], prefix='student')
defaults['student_yes'] = student_dummies['student_Yes'].astype(int)
defaults.head()

In [None]:
# Response variable used by the plots and the model below
target = 'default'

# Explanatory variables kept alongside the target
features = [
    'balance',
    'income',
    'student_yes',
]

In [None]:
# Splitting the data into train (70%) and test (30%) sets.
# random_state pins the shuffle so the split is reproducible.
#
# train_test_split already returns DataFrames with the selected columns, so no
# re-wrapping is needed. (An earlier loop here only rebound its own loop
# variable and never changed defaults_train / defaults_test.)
defaults_train, defaults_test = train_test_split(defaults[features + [target]],
                                                 test_size=0.3,
                                                 random_state=1)

# NOTE: depending on the pandas / scikit-learn versions, the splits may be
# flagged as derived from `defaults`. Explicit copies make later column
# assignments on a split (e.g. adding predictions) unambiguous.
defaults_train = defaults_train.copy()
defaults_test = defaults_test.copy()

#### Histogram of all variables along with target variable for training data

In [None]:
# Histograms of every feature plus the target, training data only.

# plt.subplots() returns the overall figure and a 2x2 array of axes;
# each entry of the array is one individual plot.
fig, ax = plt.subplots(2, 2)

# Grid position (row, col) -> column drawn in that panel
panel_layout = {
    (0, 0): 'balance',
    (1, 0): 'income',
    (0, 1): 'student_yes',
    (1, 1): 'default',
}

for (grid_row, grid_col), column_name in panel_layout.items():
    panel = ax[grid_row, grid_col]
    panel.hist(defaults_train[column_name])
    panel.set_title(column_name)

# tight_layout() keeps titles and tick labels from bleeding into neighbouring panels
plt.tight_layout()
plt.show()

#### Histogram of all variables along with target variable for testing data

Since we're repeating this process, let's create a reusable function.

In [None]:
def plot_histograms_columns(df, columns, nrows, ncols):
    """
    Plot one histogram per column on an nrows x ncols grid of subplots.

    Panels are filled column by column: down the first grid column, then
    down the next one. Grid cells without a matching entry in `columns`
    are left as empty axes.

    Parameters
    ----------
    df : pandas DataFrame
    columns : list
        Columns of df to plot in each histogram
    nrows : int
        Number of rows to use for subplot arrangement
    ncols : int
        Number of columns to use for subplot arrangement

    Raises
    ------
    AssertionError
        If `columns` has more entries than the grid has cells.
    """
    # Validate before creating the figure so a bad call does not leave an
    # empty figure behind.
    assert len(columns) <= nrows * ncols, 'There are too many columns: len(columns) > nrows * ncols'

    # squeeze=False always returns a 2-D array of axes, so ax[row, col] also
    # works for single-row and single-column grids.
    fig, ax = plt.subplots(nrows, ncols, squeeze=False)

    for position, column in enumerate(columns):
        # Column-major placement: advance down the rows first, then move right.
        col, row = divmod(position, nrows)
        ax[row, col].hist(df[column])
        ax[row, col].set_title(column)

    plt.tight_layout()
    plt.show()

In [None]:
# Training split: one histogram per feature plus the target, on a 2x2 grid
plot_histograms_columns(df=defaults_train,
                        columns=features + [target],
                        nrows=2,
                        ncols=2)

In [None]:
# Test split: same 2x2 histogram grid, for comparison with the training split
plot_histograms_columns(df=defaults_test,
                        columns=features + [target],
                        nrows=2,
                        ncols=2)

#### Scatter plot: balance and income for training data

In [None]:
# Scatter plot of income against balance for the training split
scatter_ax = defaults_train.plot(kind='scatter',
                                 x='balance',
                                 y='income',
                                 alpha=0.3)

# Fix the viewing window, then title and label the axes returned by pandas
scatter_ax.set_ylim([0, 80000])
scatter_ax.set_xlim([0, 2800])

scatter_ax.set_title("Scatter plot: balance and income, training data")
scatter_ax.set_xlabel("Balance")
scatter_ax.set_ylabel("Income")

plt.show()

#### Scatter plot: balance and income for training data, non-default vs. default records

In [None]:
# Split the training data by outcome so each group gets its own color and symbol
defaults_train_nd = defaults_train.loc[defaults_train['default'] == 0]
defaults_train_d = defaults_train.loc[defaults_train['default'] == 1]

plt.figure()

# Labels are attached to each scatter call so the legend cannot get out of
# sync with the order in which the groups are drawn.
plt.scatter(defaults_train_nd['balance'],
            defaults_train_nd['income'],
            alpha=0.3,
            marker='+',
            c='green',
            label='no default')

plt.scatter(defaults_train_d['balance'],
            defaults_train_d['income'],
            marker='o',
            edgecolors='red',
            facecolors='none',
            label='default')

plt.ylim([0,80000])
plt.xlim([0, 2800])

plt.title("Scatter plot between Balance and Income for Non-Default and Default training data")
plt.xlabel("Balance")
plt.ylabel("Income")

plt.legend(loc='upper right')
plt.show()

# What can you infer from this plot?
# It appears that the balance is more correlated with default than income

### 2. Logistic regression

Run a logistic regression to predict the variable `default` using only `balance`
- What are the beta values?

In [None]:
# Fit a single-feature logistic regression (default ~ balance) on the training split.
# The double brackets keep the feature as a one-column DataFrame.
balance_train = defaults_train[['balance']]
default_train = defaults_train['default']

lr = LogisticRegression()
lr.fit(balance_train, default_train)

# Pull out the fitted intercept (B0) and the single slope (B1)
B0 = lr.intercept_[0]
B1 = lr.coef_[0][0]

print('Coefficient and Intercept for LR fit between balance and default data are {}, {}'.format(B1, B0))

Predict the probability of default for two people, one with a balance of `$1700` and another with a balance of `$2500`

In [None]:
# Build the two hypothetical customers once and reuse the frame for both calls
new_balances = pd.DataFrame({'balance': [1700, 2500]})

# One row per customer, one column per class (ordered as in lr.classes_)
pred_prob = lr.predict_proba(new_balances)

pred_class = lr.predict(new_balances)

# predict() returns the class with the highest predicted probability, so the
# row maximum is the probability of the predicted class. Indexing a fixed
# column would only be right if the prediction happened to match that column.
print('Predicted class and its probability for a balance of $1700 are {}, {}'.format(pred_class[0],
                                                                                    pred_prob[0].max()))

print('Predicted class and its probability for a balance of $2500 are {}, {}'.format(pred_class[1],
                                                                                    pred_prob[1].max()))

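What does beta mean? $\beta_0$ and $\beta_1$ define the log-odds of default as a straight line in the balance $x$; exponentiating the log-odds gives the odds, and odds / (1 + odds) gives the probability. Let's create some plots to find out!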

In [None]:
# 500 evenly spaced balances spanning the range observed in the test split
balance_min = defaults_test['balance'].min()
balance_max = defaults_test['balance'].max()
x = np.linspace(balance_min, balance_max, 500)

beta = [B0, B1]
intercept, slope = beta

# Build the three views of the fitted model from the linear predictor up:
# log-odds -> odds -> probability
log_odds = intercept + slope*x
odds = np.exp(log_odds)
y = odds / (1 + odds)

In [None]:
# One stacked panel per view of the fitted model.
# Each entry: (subplot position, curve, line format, y-label,
#              annotation x, annotation y, annotation text, annotation font size)
panel_specs = [
    (311, y, 'r', 'Probability',
     500, 0.7, r'$\frac{e^{\beta_o + \beta_1x}}{1+e^{\beta_o + \beta_1x}}$', 25),
    (312, odds, 'k', 'Odds',
     500, 10, r'$e^{\beta_o + \beta_1x}$', 20),
    (313, log_odds, 'c', 'Log(Odds)',
     500, 1, r'$\beta_o + \beta_1x$', 15),
]

plt.figure(figsize=(7, 8))

for position, curve, line_format, y_label, text_x, text_y, formula, font_size in panel_specs:
    plt.subplot(position)
    plt.plot(x, curve, line_format, linewidth=2)
    plt.ylabel(y_label)
    plt.text(text_x, text_y, formula, fontsize=font_size)

# The bottom panel (313) is still the current axes, so only it gets the x-label
plt.xlabel('x')

plt.show()

In [None]:
# Overlay the fitted logistic curve on the observed test-set outcomes
fit_fig, fit_ax = plt.subplots()

# Low alpha so dense regions of overlapping points show up darker
fit_ax.scatter(defaults_test['balance'], defaults_test['default'], alpha=0.1)
fit_ax.plot(x, y, 'r', linewidth=2)

fit_ax.set_xlabel("Balance")
fit_ax.set_ylabel("Probability of Default")

# Small vertical margin so the points at 0 and 1 are not clipped by the frame
fit_ax.set_ylim([-0.05, 1.05])
fit_ax.set_xlim([0, 2800])

plt.show()

In [None]:
# Score the test split with the balance-only model and store the predicted
# class next to the observed values
test_balance = defaults_test[['balance']]
defaults_test['pred_class'] = lr.predict(test_balance)

### 3. Reference
- [Logistic regression](https://towardsdatascience.com/logistic-regression-detailed-overview-46c4da4303bc)
- [`scikit-learn` logistic regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- [Scatter plots](https://matplotlib.org/api/_as_gen/matplotlib.pyplot.scatter.html)