In [56]:
# models
# from https://www.kaggle.com/vipulgandhi/a-comprehensive-guide-to-ensemble-learning

In [62]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from collections import Counter

In [7]:
df = pd.read_csv('data_uploan_nov2021.csv')
df.head(3)

Unnamed: 0,loan_number,loan_type,employer,employment_tenure,gross_monthly_income,loan_release_date,loan_release_mo,loan_release_year,payroll_cut-off,periodic_payment,...,_dpd2,_pd90,bucket,restructured_nov_30,_active_pd,status,_new_seasoned,delinquent_reason,last_repayment_paid_nov_20,principal_oustanding_par90
0,LN096458,LOAN ORIGINATIONS,WNS GLOBAL SERVICES,9,23764,12/30/2020,12,2020,13 AND 29,1729.22,...,Current,less than 90dpd\,Current,N,Active,SEPARATED,New Loan,unknown,1/13/2022,
1,LN053391,LOAN ORIGINATIONS,IBIDEN PHILS,6,23868,1/13/2021,1,2021,10 AND 25,2163.33,...,Current,less than 90dpd\,Current,N,Active,CURRENT,New Loan,unknown,10/25/2021,
2,LN096940,LOAN ORIGINATIONS,"METRO COMBINED LOGISTICS SOLUTIONS, INC.",11,53300,1/12/2021,1,2021,5 AND 20,4437.67,...,Current,less than 90dpd\,Current,N,Active,CURRENT,New Loan,unknown,10/25/2021,


In [63]:
# Select col types
cols_cat = df.select_dtypes(include='object').columns
cols_num = df.select_dtypes(include='number').columns

cols_X = ['employer', 'employment_tenure','gross_monthly_income', 
     'loan_release_mo', 'loan_release_year', 'payroll_cut-off', 'periodic_payment',
       'total_payable_assumed_balance', 'total_principal', 'total_interest',
       'no_of_remaining_repayments', 'outstanding_bal_nov_20', 'principal',
       'interest']
cols_y = ['target']

# Label encoding
ln_loan_type = LabelEncoder()
ln_employer = LabelEncoder()
ln_payroll = LabelEncoder()

df['loan_type'] = ln_loan_type.fit_transform(df['loan_type'])
df['employer'] = ln_employer.fit_transform(df['employer'])
df['payroll_cut-off'] = ln_payroll.fit_transform(df['payroll_cut-off'])

# Select X and y
X  = df[cols_X]
y = df.target

In [64]:
# Convert target to numeric
ln_target = LabelEncoder()
y = ln_target.fit_transform(df['target'])

In [65]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [66]:
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

Counter({2: 180, 0: 49, 1: 36})
Counter({2: 46, 0: 12, 1: 9})


In [72]:
# Scale
scaler_train = StandardScaler()
scaler_test = StandardScaler()

fit_scaler_X_test = scaler_test.fit(X_test)
fit_scaler_X_train = scaler_train.fit(X_train)

X_test_scaled = fit_scaler_X_test.transform(X_test)
X_train_scaled = fit_scaler_X_train.transform(X_train)

In [53]:
# Random Forest
# Random Forest is an ensemble of Decision Trees.
# If we look at a single Decision Tree, important features are likely to appear closer to the root of the 
# tree, while unimportant features will often appear closer to the leaves (or not at all). It is possible 
# to get an estimate of a feature’s importance by computing the average depth at which it appears across all 
# trees in the forest.

model_rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print('Accuracy: %s' %accuracy_score(y_test, y_pred_rf))

# Feature importance
print('Feature Importance:')
for name, score in zip(X.columns, model_rf.feature_importances_):
    print(name, score)

Accuracy: 0.7164179104477612
Feature Importance:
employer 0.11720558267413946
employment_tenure 0.08155488964468935
gross_monthly_income 0.04146031402187824
loan_release_mo 0.03847925041163239
loan_release_year 0.010766755124965692
payroll_cut-off 0.054076522383988926
periodic_payment 0.060437028160277906
total_payable_assumed_balance 0.08435470964960833
total_principal 0.060570438422571794
total_interest 0.15648496154440814
no_of_remaining_repayments 0.06318784697217597
outstanding_bal_nov_20 0.08447978835459351
principal 0.054575886218060865
interest 0.09236602641700957


In [54]:
# Ada Boost
# Boosting refers to a family of algorithms that are able to convert weak learners to strong learners. 
# The main principle of boosting is to fit a sequence of weak learners− models that are only slightly 
# better than random guessing, such as small decision trees to weighted versions of the data. More weight 
# is given to examples that were misclassified by earlier rounds.
# The predictions are then combined through a weighted majority vote (classification) or a weighted sum (regression)
# to produce the final prediction.

# Boosting technique cannot be parallelized (or only partially) because each predictor can only be trained after 
# the previous predictor has been trained and evaluated. As a result, it does not scale as well as bagging / pasting.

# The predictors(classifier/ regressor) fit the training set in sequence. The next predictor corrects its predecessor 
# by paying more attention to the training instances that the predecessor underfitted. This results in new predictors 
# focusing more and more on the hard cases.

# To build an AdaBoost classifier, each instance's weight is set to an initial value. A base classifier (eg. Decision Tree)
# is trained and makes predictions on the training set. The relative weight of misclassified training instances is then 
# increased. The second classifier is trained on the training set using the updated weights and again it makes predictions
# on the training set and update the weights. The algorithm stops when the desired number of predictors is reached, or
# when a perfect predictor is found.


model_ada = AdaBoostClassifier(n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
model_ada.fit(X_train, y_train)
y_pred_ada = model_ada.predict(X_test)
print('Accuracy: %s' %accuracy_score(y_test, y_pred_ada))

# Feature importance
print('Feature Importance:')
for name, score in zip(X.columns, model_ada.feature_importances_):
    print(name, score)

Accuracy: 0.5970149253731343
Feature Importance:
employer 0.075
employment_tenure 0.02
gross_monthly_income 0.035
loan_release_mo 0.055
loan_release_year 0.04
payroll_cut-off 0.015
periodic_payment 0.13
total_payable_assumed_balance 0.04
total_principal 0.065
total_interest 0.225
no_of_remaining_repayments 0.065
outstanding_bal_nov_20 0.03
principal 0.105
interest 0.1


In [57]:
# Gradient Boosting
# Gradient Boosting works by sequentially adding predictors to an ensemble, each one correcting its predecessor. 
# However, instead of tweaking the instance weights at every iteration like AdaBoost, Gradient Boosting tries to 
# fit the new predictor to the residual errors made by the previous predictor.

model_gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
model_gbrt.fit(X_train, y_train)

# ...read thru gbr further

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [170]:
# Stacking
# Stacking is an ensemble learning technique that uses predictions from multiple models 
# (for example decision tree, knn or svm) to build a new model. This model is used for making 
# predictions on the test set.

# First, the training set is split in two subsets. The first subset is used to train the predictors in the first 
# layer. Next, the predictors in the first layer are used to make predictions on the second(hold-out) set. 
# Now (in example above) for each instance in the hold-out set there are four predicted values. A new 
# training set is created using these predicted values as input features and keeping the target values. 
# The blender is trained on this new training set, it learns to predict the target value where inputs are 
# the the first layer’s predictions.

# It is possible to train several different blenders on the top of one another (e.g., one using Linear 
# Regression, another using Random Forest Regression etc). The training set should be divided equal to the number of layers(see image above).

model_rf = RandomForestClassifier(n_estimators=10, random_state=1)
model_et = ExtraTreesClassifier(n_estimators=10, random_state=1)
model_ada = AdaBoostClassifier(n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
# mlp_clf = MLPClassifier(random_state=42)

estimators = [model_rf, model_et, model_ada]

for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

[estimator.score(X_test, y_test) for estimator in estimators]

X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

model_rf_blend = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
model_rf_blend.fit(X_test_predictions, y_test)

print('OOB Score: %s' %model_rf_blend.oob_score_)

X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)
    
y_pred = model_rf_blend.predict(X_test_predictions)
print('Accuracy Score: %s' %accuracy_score(y_test, y_pred))

Training the RandomForestClassifier(n_estimators=10, random_state=1)
Training the ExtraTreesClassifier(n_estimators=10, random_state=1)
Training the AdaBoostClassifier(learning_rate=0.5, n_estimators=200)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

OOB Score: 0.7611940298507462


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Accuracy Score: 0.8059701492537313


In [89]:
# Linear Model
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

model_ln = linear_model.LinearRegression()
model_ln.fit(X_train, y_train).score(X_test, y_test)

predict_ln = model_ln.predict(X_test)

print('MSE: %s' %mean_squared_error(y_test, predict_ln))
print('R2: %s' %r2_score(y_test, predict_ln))

MSE: 0.49336356429261485
R2: 0.18875126735914005


In [106]:
# Ridge Regression
# Cannot handle multi-class
from sklearn import linear_model

model_ridge = linear_model.Ridge(alpha=0.5)
print('Score: {}'.format(model_ridge.fit(X_train, y_train).score(X_test, y_test)))

predict_ridge = model_ridge.predict(X_test)

model_ridge.coef_

# RidgeCV implements ridge regression with built-in cross-validation of the alpha parameter. 
# The object works in the same way as GridSearchCV except that it defaults to Leave-One-Out Cross-Validation:

model_ridge_cv = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
model_ridge_cv.fit(X_train, y_train).score(X_test, y_test)
model_ridge_cv.alpha_
np.logspace(-6, 6, 13)

Score: 0.18746924869674975


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06])

In [61]:
# RidgeClassifier!!!
#...fit transform first...

model_ridge_clf = linear_model.RidgeClassifier()
print(model_ridge_clf.fit(X_train_scaled, y_train).score(X_test_scaled, y_test))

predict_ridge_clf = model_ridge_clf.predict(X_test_scaled)
print('Accuracy: %s' %accuracy_score(y_test, predict_ridge_clf))
model_ridge_clf.coef_

0.7164179104477612


NameError: name 'accuracy_score' is not defined

In [48]:
# Stochastic Gradient Descent on Linear Models
import pandas as pd
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

model_sgb = make_pipeline(scaler, linear_model.SGDClassifier(max_iter=1000, tol=1e-3))
model_sgb.fit(X_train, y_train).score(X_test, y_test)
predict_sgb = model_sgb.predict(X_test)

model_ridge_clf = make_pipeline(scaler, linear_model.RidgeClassifier())
model_ridge_clf.fit(X_train, y_train).score(X_test, y_test)
predict_ridge_clf = model_ridge_clf.predict(X_test)

cols = ['predict_sgb','predict_ridge_clf']
foo = pd.DataFrame(zip(predict_sgb, predict_ridge_clf, y_test))
foo.rename(columns={0:'predict_sgb', 1:'predict_ridge_clf',2:'actual'}, inplace=True)

print(pd.crosstab(foo['actual'], [foo['predict_sgb']], aggfunc='count', values='actual'))
print('')
print(pd.crosstab(foo['actual'], [foo['predict_ridge_clf']], aggfunc='count', values='actual'))


predict_sgb    0    1     2
actual                     
0            5.0  1.0   6.0
1            1.0  2.0   6.0
2            NaN  5.0  41.0

predict_ridge_clf    0     2
actual                      
0                  2.0  10.0
1                  NaN   9.0
2                  NaN  46.0


In [106]:
# Perceptron
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_predict

model_per = make_pipeline(linear_model.Perceptron(random_state=1))
model_per.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)

cross_val_score(model_per, X_test_scaled, y_test, cv=5)

<IPython.core.display.Javascript object>

array([0.57142857, 0.35714286, 0.46153846, 0.53846154, 0.76923077])

In [122]:
predict_per = pd.DataFrame(cross_val_predict(model_per, X_test_scaled, y_test), columns=['predict_per'])
foo = pd.concat([predict_per, pd.Series(y_test)], axis=1)
foo.rename(columns={0:'actual'}, inplace=True)
pd.crosstab(foo['predict_per'], foo['actual'], aggfunc='count', values='actual')

actual,0,1,2
predict_per,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5.0,1.0,8.0
1,,1.0,8.0
2,7.0,7.0,30.0


In [145]:
# Polynomial regression: extending linear models with basis functions¶
# One common pattern within machine learning is to use linear models trained on nonlinear functions of the data. 
# This approach maintains the generally fast performance of linear methods, while allowing them to fit a much wider
# range of data. For example, a simple linear regression can be extended by constructing polynomial features 
# from the coefficients. In the standard linear regression case, you might have a model that looks like this 
# for two-dimensional data:

a = pd.Series([1,2,3,4,5,4,3,2,1,0])
b = pd.Series([0,0,1,1,1,1,1,0,0,0])
foo = pd.concat([a,b], axis=1)
foo.rename(columns={0:'a',1:'b'}, inplace=True)
a = foo[['a']]
b = foo.b

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
model_ln_poly = Pipeline([('poly', PolynomialFeatures(degree=2)),
                               ('model_ln',LinearRegression())])
model_ln_poly.fit(a,b).score(a,b)


<IPython.core.display.Javascript object>

0.7511111111111112

In [152]:
# LDA and QDA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model_lda = LinearDiscriminantAnalysis()
model_lda.fit(X_train, y_train).score(X_test, y_test)
model_lda.coef_

from sklearn import linear_model

model_e1 = make_pipeline(LinearDiscriminantAnalysis(), linear_model.RidgeClassifier())
model_e1.fit(X_train, y_train).score(X_test, y_test)

0.7313432835820896

In [162]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train).score(X_test, y_test)

0.7761194029850746

In [168]:
from scipy.stats import describe
describe(my_score)
model_rf_blend.predict_proba(X_test)



ValueError: X has 14 features, but RandomForestClassifier is expecting 3 features as input.

In [214]:
# Stacking 

from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

# Define models    
model_rf = RandomForestClassifier(n_estimators=10, random_state=1)
model_et = ExtraTreesClassifier(n_estimators=10, random_state=1)
model_ada = AdaBoostClassifier(n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
mlp_clf = MLPClassifier(random_state=i)

# Create estimators
estimators = [model_rf, model_et, model_ada, mlp_clf]

# Fit each estimator
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

[estimator.score(X_test, y_test) for estimator in estimators]

# Create X_test_predictions to be used by model_rf_blend below
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

# Fit    
model_rf_blend = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
model_rf_blend.fit(X_test_predictions, y_test)
model_rf_blend.predict(X_test_predictions)


Training the RandomForestClassifier(n_estimators=10, random_state=1)
Training the ExtraTreesClassifier(n_estimators=10, random_state=1)
Training the AdaBoostClassifier(learning_rate=0.5, n_estimators=200)
Training the MLPClassifier(random_state=9)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2])

In [245]:
# For predicting...

# Predict first row of X_test
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test.iloc[[50]])

y_pred = model_rf_blend.predict(X_test_predictions)
accuracy_score(y_test, y_pred)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

0.6865671641791045

In [246]:
X_test.iloc[[0]]

Unnamed: 0,employer,employment_tenure,gross_monthly_income,loan_release_mo,loan_release_year,payroll_cut-off,periodic_payment,total_payable_assumed_balance,total_principal,total_interest,no_of_remaining_repayments,outstanding_bal_nov_20,principal,interest
261,29,11,56500,2,2020,3,4550.64,159272.4,131603.48,27668.92,13,59158.36,54843.09,4315.27
