## import necessary packages

In [1]:
## basics
import seaborn as sns
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

## data preprocessing
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

## models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
## ensemble
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
# import catboost as cb
from mlxtend.classifier import StackingClassifier

## model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV


## model evaluation metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

## Read the data

In [2]:
df = pd.read_csv('/Volumes/Backup Plus/Documents/Data Science/Projects/fannieMae_project/processed_data/Fannie_loans.csv')

In [3]:
df.isnull().sum()

Unnamed: 0            0
OrInterestRate        0
OrUnpaidPrinc         0
OrLoanTerm            0
OrLTV                 0
OrCLTV                0
NumBorrowers          0
DTIRat                0
CreditScore           0
NumUnits              0
MortInsPerc           0
CoCreditScore         0
MortInsType           0
Default               0
Channel               0
SellerName            0
FTHomeBuyer           0
LoanPurpose           0
PropertyType          0
OccType               0
PropertyState         0
ProductType           0
RelocationMortgage    0
Zip                   0
first_payment_year    0
loan_year             0
dtype: int64

In [4]:
df["Default"].mean()

0.016885376097890665

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3194421 entries, 0 to 3194420
Data columns (total 26 columns):
Unnamed: 0            int64
OrInterestRate        float64
OrUnpaidPrinc         int64
OrLoanTerm            int64
OrLTV                 float64
OrCLTV                float64
NumBorrowers          float64
DTIRat                float64
CreditScore           float64
NumUnits              int64
MortInsPerc           float64
CoCreditScore         float64
MortInsType           float64
Default               int64
Channel               object
SellerName            object
FTHomeBuyer           object
LoanPurpose           object
PropertyType          object
OccType               object
PropertyState         object
ProductType           object
RelocationMortgage    object
Zip                   int64
first_payment_year    int64
loan_year             int64
dtypes: float64(9), int64(8), object(9)
memory usage: 633.7+ MB


In [7]:
df_new=df.loc[df['loan_year']==2007]

In [8]:
df_new.shape

(92340, 26)

In [9]:
df_new = pd_new.get_dummies(df, drop_first=True)
df_new.shape

(3194421, 179)

In [None]:
# Get column names first
names = df_new.columns
# Create the Scaler object
scaler = StandardScaler()
# Fit your data on the scaler object
scaled_df = scaler.fit_transform(df_new)
scaled_df = pd.DataFrame(df_new, columns=names)

In [None]:
y = scaled_df['Default'].values
X = scaled_df.drop(['Default'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

### KNN model 

In [None]:
knn = KNeighborsClassifier(5)
knn.fit(X_train, y_train)
knn_pred=knn.predict(X_test)

### decision tree model

In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred=dt.predict(X_test)

### regular logistic model

In [None]:
lr = LogisticRegression(random_state=1, class_weight="balanced")
lr.fit(X_train, y_train)
lr_pred=lr.predict(X_test)

### accuracy scores

In [None]:
a_score_knn=accuracy_score(y_test, knn_pred)
a_score_dt=accuracy_score(y_test, dt_pred)
a_score_lr=accuracy_score(y_test, lr_pred)

# Print the accuracy scores
print(a_score_knn)
print(a_score_dt)
print(a_score_lr)

### F1 Score:
The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:

$$ F1 = 2 * \frac{precision * recall}{precision + recall} $$

In [None]:
from sklearn.metrics import f1_score

score_knn = f1_score(y_test, knn_pred)
score_dt = f1_score(y_test, dt_pred)
score_lr = f1_score(y_test, lr_pred)

# Print the f1 scores
print(score_knn)
print(score_dt)
print(score_lr)

## heterogeneous ensemble 
* wisedom of the crowd
* use fine-tuned models
* small amount of estimators
* **voting and average**

### 1) voting:

In [None]:
from sklearn.ensemble import VotingClassifier

# Instantiate the individual models

knn = KNeighborsClassifier(5)
dt = DecisionTreeClassifier()
lr = LogisticRegression(random_state=1, class_weight="balanced")

# Create and fit the voting classifier
clf_vote = VotingClassifier(
    estimators=[('knn', knn), ('dt', dt), ('lr', lr)]
)
clf_vote.fit(X_train, y_train)

# Calculate the predictions using the voting classifier
pred_vote = clf_vote.predict(X_test)

# Calculate the accuracy score of the voting classifier
score_vote = accuracy_score(y_test, pred_vote)
print('voting accuracy-Score: {:}'.format(score_vote))


### 2) averaging (soft voting):

In [None]:
from sklearn.ensemble import VotingClassifier

# Instantiate the individual models

knn = KNeighborsClassifier(5)
rf = DecisionTreeClassifier()
lr = LogisticRegression(random_state=1, class_weight="balanced")

# Create and fit the voting classifier
clf_avg = VotingClassifier(
    estimators=[('knn', knn), ('dt', dt), ('lr', lr)],
    voting='soft',
    weights=[2, 1, 1]
)


clf_avg.fit(X_train, y_train)

# Calculate the predictions using the voting classifier
pred_avg = clf_avg.predict(X_test)

# Evaluate model performance
acc_avg = accuracy_score(y_test,  pred_avg)
print('averaging accuracy: {:}'.format(acc_avg))

## homogeneous ensemble

* use the small model (weak model) 
* large amount of estimators
* **bagging and boosting**
* **random forest** is a special case of bagging

Condorcet's jury theorm:
1) models are independent
2) models are slightly better than random guess
3) all individual models have similar performance

weak model satisfies 2) and 3), bagging algorithm trains individual models using a random subsample for each which guarantee 1). Bootsraping guarantees some of the characteristics of the crowd. Wisedom of the crowd needs to be divers, through using either different algorithms or datasets.

Boostrapping requires:
* random subsamples
* using replacement

Boostrapping guarantees:
* diverse crowd (different datasets)
* indepenent (separately sampled)

### why bagging?
pro: 
* bagging usually reduce variance
* Overfitting can be avoided by the ensemble itself

con:
* computational expensive: time and space

In [None]:
from sklearn.ensemble import BaggingClassifier
# Instantiate the base model
clf_dt = DecisionTreeClassifier(max_depth=4)

# Build and train the Bagging classifier
clf_bag = BaggingClassifier(
  base_estimator=clf_dt,
  n_estimators=21,
  random_state=500)
clf_bag.fit(X_train, y_train)

# Predict the labels of the test set
pred = clf_bag.predict(X_test)

# Show the accuracy score
print('decision tree bagging accuracy score: {:}'.format(accuracy_score(y_test, pred)))

In [None]:
rf = RandomForestClassifier(random_state=1, class_weight="balanced")
rf.fit(X_train, y_train)
rf_pred=rf.predict(X_test)

# Evaluate the performance on the test set to compare
print('randomforest accuracy: {:}'.format(accuracy_score(y_test, rf_pred)))

In [None]:
# Build a balanced logistic regression
clf_lr = LogisticRegression(class_weight='balanced')

# Build and fit a bagging classifier
clf_bag = BaggingClassifier(base_estimator=clf_lr, max_features=10, oob_score=True, random_state=500)
clf_bag.fit(X_train, y_train)

# Evaluate the accuracy on the test set and show the out-of-bag score
pred = clf_bag.predict(X_test)
print('logistic classifier accuracy:  {:}'.format(accuracy_score(y_test, pred)))
print('OOB-Score: {:}'.format(clf_bag.oob_score_))

## booster
gradual learning
* iterating learning
* dependent estimators
* learning different tasks for the same goal
* sequential building

Possible steps in gradual learning:
1. First attempt (initial model)
2. Feedback (model evaluation)
3. Correct errors (subsequent model)


### Adaptive boosting

Instances are drawn using a sample distribution
* Difcult instances have higher weights
* Initialized to be uniform

Estimators are combined with a weighted
* majority voting
* Good estimators are given higher weights

Guaranteed to improve

Classication and Regression

* base_estimator Default: Decision Tree (max_depth=1)
* n_estimators Default: 50
* learning_rate Default: 1.0
* loss default linear (can be change to square or exponential)
* Trade-off between n_estimators and learning_rate

In [None]:
# Build and fit a tree-based AdaBoost classifier
reg_ada = AdaBoostClassifier(n_estimators=12, random_state=500)
reg_ada.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)

# Evaluate the performance using the RMSE
print('Adaptive boosting classifier accuracy:  {:}'.format(accuracy_score(y_test, pred)))

### Gradient boosting

Objective: $$y=f(X) $$
1. Initial model (weak estimator): $y = f_{1}(X)$
2. New model ts to residuals: $y - f_{1}(X) = f_{2}(X)$
3. New additive model: $y = f_{1}(X) + f_{2}(X)$
4. Repeat n times or until error is small enough
5. Final additive model: $y = f_{1}(X) + f_{2}(X)... + f_{n}(X) = \Sigma_{1}^{n}f_{X}(X)$

**Equivalence to gradient descent**
Residual: $$y-f_{i}(X)$$

Gradient Descent:
$$ loss = \frac{(f_{i}(X)-y)^2}{2}$$

$$gradient = \frac{\partial loss}{\partial f_i} = f_{i}(X)-y$$

Residuals = Negative Gradient:
$$y-f_{i}(X) = - \frac{\partial loss}{\partial f_i(X)}$$


* n_estimators Default: 100
* learning_rate Default: 0.1
* max_depth Default: 3
* min_samples_split
* min_samples_leaf
* max_features

In [None]:
# Build and fit a Gradient Boosting classifier
clf_gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=500)
clf_gbm.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = clf_gbm.predict(X_test)

# Evaluate the performance based on the accuracy
acc = accuracy_score(y_test, pred)
print('Accuracy: {:}'.format(acc))

### Variations of gradient boosting
* Extreme Gradient Boosting: XGBoost
* Light Gradient Boosting Machine: LightGBM
* Categorical Boosting: CatBoost


### Extreme Gradient Boosting
* Optimized for distributed computing
* Paralleltraining by nature
* Scalable, portable, and accurate

### Light Gradient Boosting Machine
* Released by Microsoft (2017)
* Faster training and more efcient
* Lighter in terms of space
* Optimized for parallel and GPU processing
* Useful for problems with big datasets and constraints of speed or memory

### Categorical Boosting
* Open sourced by Yandex (April 2017)
* Built-in handling of categorical features
* Accurate and robust
* Fast and scalable
* User-friendly API


## XGBClassifier

In [None]:
model=xgb.XGBClassifier(max_depth=2, objective="reg:logistic")
model.fit(X_train, y_train)
pred=model.predict(X_test)

# Evaluate the performance based on the accuracy
acc = accuracy_score(y_test, pred)
print('Accuracy: {:}'.format(acc))

## stacking
* alike relay race

Very similar to the voting and averaging methods where the whole dataset is used by each model to make prediction, except instead of simply voting or averaging as the combiner to get the final prediction, stacking has a second layer of model which has all the predictions as the input features in addition to the original data. 

## mlxtend 

In [None]:
# Create the first-layer models
clf_knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree')
clf_dt = DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=15, random_state=500)
clf_nb = GaussianNB()

# Create the second-layer model (meta-model)
clf_lr = LogisticRegression()

# Create and fit the stacked model
clf_stack = StackingClassifier(classifiers=[clf_knn, clf_dt, clf_nb], meta_classifier=clf_lr)
clf_stack.fit(X_train, y_train)

# Evaluate the stacked model’s performance
print("Accuracy: {:}".format(accuracy_score(y_test, clf_stack.predict(X_test))))

## cross validation with Gradient Boosting

In [None]:
clf_gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=500)

# Perform cross-validation
cross_val_scores = cross_val_score(clf_gbm, X_train, y_train, scoring="accuracy", cv=3)

# Print avg. accuracy
print("3-fold accuracy:", np.mean(cross_val_scores))

## cross validation and parameter tuning with Gradient Boosting

In [None]:
clf_gbm = GradientBoostingClassifier(random_state=500)

# Create the parameter grid
gbm_param_grid = {
    'clf_gbm__learning_rate': np.arange(0.05, 1, 0.05),
    'clf_gbm__n_estimators': np.arange(50, 200, 50)
}

In [None]:
# Perform RandomizedSearchCV
randomized_acc = RandomizedSearchCV(estimator=clf_gbm,param_distributions=gbm_param_grid, n_iter=2, scoring="accuracy", cv=2,verbose=1)

# Fit the estimator
randomized_acc.fit(X_train,y_train)

# Compute metrics
print(randomized_acc.best_score_)
print(randomized_acc.best_estimator_)

In [None]:
pred = randomized_acc.predict(X_test)
# Evaluate the performance based on the accuracy
acc = accuracy_score(y_test, pred)
print('Accuracy: {:}'.format(acc))

In [None]:
estimator.get_params().keys()