In [56]:
# models
# from https://www.kaggle.com/vipulgandhi/a-comprehensive-guide-to-ensemble-learning

In [76]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from collections import Counter

In [24]:
df = pd.read_csv('data_uploan_nov2021.csv')
df.head(3)

Unnamed: 0,loan_number,loan_type,employer,employment_tenure,gross_monthly_income,loan_release_date,loan_release_mo,loan_release_year,payroll_cut-off,periodic_payment,...,_dpd2,_pd90,bucket,restructured_nov_30,_active_pd,status,_new_seasoned,delinquent_reason,last_repayment_paid_nov_20,principal_oustanding_par90
0,LN096458,LOAN ORIGINATIONS,WNS GLOBAL SERVICES,9,23764,12/30/2020,12,2020,13 AND 29,1729.22,...,Current,less than 90dpd\,Current,N,Active,SEPARATED,New Loan,unknown,1/13/2022,
1,LN053391,LOAN ORIGINATIONS,IBIDEN PHILS,6,23868,1/13/2021,1,2021,10 AND 25,2163.33,...,Current,less than 90dpd\,Current,N,Active,CURRENT,New Loan,unknown,10/25/2021,
2,LN096940,LOAN ORIGINATIONS,"METRO COMBINED LOGISTICS SOLUTIONS, INC.",11,53300,1/12/2021,1,2021,5 AND 20,4437.67,...,Current,less than 90dpd\,Current,N,Active,CURRENT,New Loan,unknown,10/25/2021,


In [25]:
# Select col types
cols_cat = df.select_dtypes(include='object').columns
cols_num = df.select_dtypes(include='number').columns

cols_X = ['employer', 'employment_tenure','gross_monthly_income', 
     'loan_release_mo', 'loan_release_year', 'payroll_cut-off', 'periodic_payment',
       'total_payable_assumed_balance', 'total_principal', 'total_interest',
       'no_of_remaining_repayments', 'outstanding_bal_nov_20', 'principal',
       'interest']
cols_y = ['target']

# Label encoding
ln_loan_type = LabelEncoder()
ln_employer = LabelEncoder()
ln_payroll = LabelEncoder()

df['loan_type'] = ln_loan_type.fit_transform(df['loan_type'])
df['employer'] = ln_employer.fit_transform(df['employer'])
df['payroll_cut-off'] = ln_payroll.fit_transform(df['payroll_cut-off'])

# Select X and y
X  = df[cols_X]
y = df.target

In [50]:
# Convert target to numeric
ln_target = LabelEncoder()
y = ln_target.fit_transform(df['target'])

In [51]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [52]:
print(Counter(y_train))
print(Counter(y_test))

Counter({2: 180, 0: 49, 1: 36})
Counter({2: 46, 0: 12, 1: 9})


In [53]:
# Random Forest
# Random Forest is an ensemble of Decision Trees.
# If we look at a single Decision Tree, important features are likely to appear closer to the root of the 
# tree, while unimportant features will often appear closer to the leaves (or not at all). It is possible 
# to get an estimate of a feature’s importance by computing the average depth at which it appears across all 
# trees in the forest.

model_rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print('Accuracy: %s' %accuracy_score(y_test, y_pred_rf))

# Feature importance
print('Feature Importance:')
for name, score in zip(X.columns, model_rf.feature_importances_):
    print(name, score)

Accuracy: 0.7164179104477612
Feature Importance:
employer 0.11720558267413946
employment_tenure 0.08155488964468935
gross_monthly_income 0.04146031402187824
loan_release_mo 0.03847925041163239
loan_release_year 0.010766755124965692
payroll_cut-off 0.054076522383988926
periodic_payment 0.060437028160277906
total_payable_assumed_balance 0.08435470964960833
total_principal 0.060570438422571794
total_interest 0.15648496154440814
no_of_remaining_repayments 0.06318784697217597
outstanding_bal_nov_20 0.08447978835459351
principal 0.054575886218060865
interest 0.09236602641700957


In [54]:
# Ada Boost
# Boosting refers to a family of algorithms that are able to convert weak learners to strong learners. 
# The main principle of boosting is to fit a sequence of weak learners− models that are only slightly 
# better than random guessing, such as small decision trees to weighted versions of the data. More weight 
# is given to examples that were misclassified by earlier rounds.
# The predictions are then combined through a weighted majority vote (classification) or a weighted sum (regression)
# to produce the final prediction.

# Boosting technique cannot be parallelized (or only partially) because each predictor can only be trained after 
# the previous predictor has been trained and evaluated. As a result, it does not scale as well as bagging / pasting.

# The predictors(classifier/ regressor) fit the training set in sequence. The next predictor corrects its predecessor 
# by paying more attention to the training instances that the predecessor underfitted. This results in new predictors 
# focusing more and more on the hard cases.

# To build an AdaBoost classifier, each instance's weight is set to an initial value. A base classifier (eg. Decision Tree)
# is trained and makes predictions on the training set. The relative weight of misclassified training instances is then 
# increased. The second classifier is trained on the training set using the updated weights and again it makes predictions
# on the training set and update the weights. The algorithm stops when the desired number of predictors is reached, or
# when a perfect predictor is found.


model_ada = AdaBoostClassifier(n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
model_ada.fit(X_train, y_train)
y_pred_ada = model_ada.predict(X_test)
print('Accuracy: %s' %accuracy_score(y_test, y_pred_ada))

# Feature importance
print('Feature Importance:')
for name, score in zip(X.columns, model_ada.feature_importances_):
    print(name, score)

Accuracy: 0.5970149253731343
Feature Importance:
employer 0.075
employment_tenure 0.02
gross_monthly_income 0.035
loan_release_mo 0.055
loan_release_year 0.04
payroll_cut-off 0.015
periodic_payment 0.13
total_payable_assumed_balance 0.04
total_principal 0.065
total_interest 0.225
no_of_remaining_repayments 0.065
outstanding_bal_nov_20 0.03
principal 0.105
interest 0.1


In [57]:
# Gradient Boosting
# Gradient Boosting works by sequentially adding predictors to an ensemble, each one correcting its predecessor. 
# However, instead of tweaking the instance weights at every iteration like AdaBoost, Gradient Boosting tries to 
# fit the new predictor to the residual errors made by the previous predictor.

model_gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
model_gbrt.fit(X_train, y_train)

# ...read thru gbr further

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [77]:
# Stacking
# Stacking is an ensemble learning technique that uses predictions from multiple models 
# (for example decision tree, knn or svm) to build a new model. This model is used for making 
# predictions on the test set.

# First, the training set is split in two subsets. The first subset is used to train the predictors in the first 
# layer. Next, the predictors in the first layer are used to make predictions on the second(hold-out) set. 
# Now (in example above) for each instance in the hold-out set there are four predicted values. A new 
# training set is created using these predicted values as input features and keeping the target values. 
# The blender is trained on this new training set, it learns to predict the target value where inputs are 
# the the first layer’s predictions.

# It is possible to train several different blenders on the top of one another (e.g., one using Linear 
# Regression, another using Random Forest Regression etc). The training set should be divided equal to the number of layers(see image above).

model_rf = RandomForestClassifier(n_estimators=10, random_state=1)
model_et = ExtraTreesClassifier(n_estimators=10, random_state=1)
model_ada = AdaBoostClassifier(n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
mlp_clf = MLPClassifier(random_state=42)

estimators = [model_rf, model_et, model_xgb, mlp_clf]

for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

[estimator.score(X_test, y_test) for estimator in estimators]

X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

model_rf_blend = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
model_rf_blend.fit(X_test_predictions, y_test)

print('OOB Score: %s' %model_rf_blend.oob_score_)

X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)
    
y_pred = model_rf_blend.predict(X_test_predictions)
print('Accuracy Score: %s' %accuracy_score(y_test, y_pred))

TypeError: 'module' object is not callable

In [89]:
# Linear Model
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

model_ln = linear_model.LinearRegression()
model_ln.fit(X_train, y_train).score(X_test, y_test)

predict_ln = model_ln.predict(X_test)

print('MSE: %s' %mean_squared_error(y_test, predict_ln))
print('R2: %s' %r2_score(y_test, predict_ln))

MSE: 0.49336356429261485
R2: 0.18875126735914005


In [106]:
# Ridge Regression
# Cannot handle multi-class
from sklearn import linear_model

model_ridge = linear_model.Ridge(alpha=0.5)
print('Score: {}'.format(model_ridge.fit(X_train, y_train).score(X_test, y_test)))

predict_ridge = model_ridge.predict(X_test)

model_ridge.coef_

# RidgeCV implements ridge regression with built-in cross-validation of the alpha parameter. 
# The object works in the same way as GridSearchCV except that it defaults to Leave-One-Out Cross-Validation:

model_ridge_cv = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
model_ridge_cv.fit(X_train, y_train).score(X_test, y_test)
model_ridge_cv.alpha_
np.logspace(-6, 6, 13)

Score: 0.18746924869674975


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06])

In [177]:
# RidgeClassifier!!!
#...fit transform first...

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

fit_ss = scaler.fit(X_test)
X_train_scaled = pd.DataFrame(fit_ss.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(fit_ss.transform(X_test), columns=X_test.columns)

model_ridge_clf = linear_model.RidgeClassifier()
print(model_ridge_clf.fit(X_train_scaled, y_train).score(X_test_scaled, y_test))

predict_ridge_clf = model_ridge_clf.predict(X_test_scaled)
print('Accuracy: %s' %accuracy_score(y_test, predict_ridge_clf))
model_ridge_clf.coef_

0.7164179104477612
Accuracy: 0.7164179104477612


array([[-0.07379902, -0.16192602,  0.0464226 , -0.18013795, -0.44216748,
         0.05326037,  0.75546178, -0.86393837, -0.88852682, -0.70140485,
        -0.04682424,  0.60887214,  0.63777945,  0.4988921 ],
       [-0.19148143, -0.04393293,  0.0687405 , -0.02980247,  0.03522763,
        -0.10922648, -0.46143511,  0.33291434,  0.29341233,  0.34291441,
        -0.17037679, -0.13244197, -0.02855827, -0.33483381],
       [ 0.26528045,  0.20585895, -0.1151631 ,  0.20994042,  0.40693985,
         0.05596611, -0.29402667,  0.53102403,  0.59511449,  0.35849045,
         0.21720103, -0.47643017, -0.60922117, -0.16405829]])

In [187]:
# Stochastic Gradient Descent on Linear Models

from sklearn.pipeline import make_pipeline

model_sgb = make_pipeline(scaler, linear_model.SGDClassifier(max_iter=1000, tol=1e-3))
model_sgb.fit(X_train, y_train).score(X_test, y_test)

model_ridge_clf = make_pipeline(scaler, linear_model.RidgeClassifier())
model_ridge_clf.fit(X_train, y_train).score(X_test, y_test)

predict_ridge_clf = model_ridge_clf.predict(X_test)
accuracy_score(y_test, predict_ridge_clf)

0.7164179104477612

In [None]:
# test commit


'C:\\Users\\adria\\Documents\\agb_github\\sklearn'