# Libraries

In [1]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

# preprocessing
from sklearn import preprocessing


# estimators
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression

# tools
from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split

#model metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report  
from sklearn.metrics import confusion_matrix

#Hyper parameter tuning
from sklearn.model_selection import RandomizedSearchCV 

# Import Data

In [2]:
# import claned and pre-procesed data
df = pd.read_csv('DF_M2T2_Light.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ID,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,Y_default
0,1,1,20000.0,2.0,2.0,-1.0,-1.0,-2.0,-2.0,0.0,689.0,0.0,0.0,1
1,2,2,120000.0,-1.0,2.0,0.0,0.0,0.0,2.0,0.0,1000.0,1000.0,1000.0,1
2,3,3,90000.0,0.0,0.0,0.0,0.0,0.0,0.0,1518.0,1500.0,1000.0,1000.0,0
3,4,4,50000.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2019.0,1200.0,1100.0,0
4,5,5,50000.0,-1.0,0.0,-1.0,0.0,0.0,0.0,2000.0,36681.0,10000.0,9000.0,0


In [3]:
d = preprocessing.normalize(df)
scaled_df = pd.DataFrame(d)#, columns=names)
scaled_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,5e-05,5e-05,0.999407,0.0001,0.0001,-5e-05,-5e-05,-0.0001,-0.0001,0.0,0.03443,0.0,0.0,5e-05
1,1.7e-05,1.7e-05,0.999896,-8e-06,1.7e-05,0.0,0.0,0.0,1.7e-05,0.0,0.008332,0.008332,0.008332,8e-06
2,3.3e-05,3.3e-05,0.999596,0.0,0.0,0.0,0.0,0.0,0.0,0.01686,0.01666,0.011107,0.011107,0.0
3,8e-05,8e-05,0.997862,0.0,0.0,0.0,0.0,0.0,0.0,0.039914,0.040294,0.023949,0.021953,0.0
4,7.9e-05,7.9e-05,0.787573,-1.6e-05,0.0,-1.6e-05,0.0,0.0,0.0,0.031503,0.577779,0.157515,0.141763,0.0


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,Y_default
0,1,1,20000.0,2.0,2.0,-1.0,-1.0,-2.0,-2.0,0.0,689.0,0.0,0.0,1
1,2,2,120000.0,-1.0,2.0,0.0,0.0,0.0,2.0,0.0,1000.0,1000.0,1000.0,1
2,3,3,90000.0,0.0,0.0,0.0,0.0,0.0,0.0,1518.0,1500.0,1000.0,1000.0,0
3,4,4,50000.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2019.0,1200.0,1100.0,0
4,5,5,50000.0,-1.0,0.0,-1.0,0.0,0.0,0.0,2000.0,36681.0,10000.0,9000.0,0


## Define the features and Dependent Values

In [5]:
y = df[['Y_default']]
X = df.drop('Y_default', axis=1)
#y = scaled_df[13]
#X = scaled_df.drop(13, axis=1)

# Phase 0 >> Divide data into: Train - Validation - Test

Split data into: train, validation and test sets, or just train and test (depending on the function you will use).

In [6]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now <train_ratio> % of the entire data set the _junk suffix means that we drop that variable completely
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1-train_ratio) )

# test is now <test_ratio> % of the initial data set
# validation is now <validation_ratio> % of the initial data set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

print('DF: ', df.shape)
print('X:      ', X.shape, 'y:      ', y.shape)
print('X_Train:', X_train.shape,'y_Train:', y_train.shape)
print('X_Valid:', X_val.shape, ' y_Valid:', y_val.shape)
print('X_Test: ', X_test.shape,' y_Test: ', y_test.shape)

DF:  (30000, 14)
X:       (30000, 13) y:       (30000, 1)
X_Train: (22500, 13) y_Train: (22500, 1)
X_Valid: (4500, 13)  y_Valid: (4500, 1)
X_Test:  (3000, 13)  y_Test:  (3000, 1)


# Phase 1

Train the model on the training set and get the first performance measures on the validation set.

In [7]:
# list of possible candidates
algo_Candidates = []

algo_Candidates.append(('Random Forest Regressor', RandomForestRegressor()))
algo_Candidates.append(('Linear Regression', LinearRegression()))
algo_Candidates.append(('Suport Vector Regression', SVR()))
#algo_Candidates.append(('Logistic Regression', LogisticRegression()))
#algo_Candidates.append(('k-NN', KNeighborsClassifier()))

In [8]:
models=[]
performance=[]

for name, model in algo_Candidates:
    # train the model
    model = model.fit(X_train, y_train.values.ravel())
    
    # Get predictions and performance
    y_val_pred = model.predict(X_val)
    
    # Measure
    predictions = model.predict(X_val)
    predRsquared = r2_score(y_val,predictions)
    rmse = sqrt(mean_squared_error(y_val, predictions))
    
    # print results
    print('MODEL: ',name)
    print('R Squared: %.3f' % predRsquared)
    print('RMSE: %.3f' % rmse)
    print('--------------------------------')
    
    # save the models into a list of tuples
    models.append((name,model))

MODEL:  Random Forest Regressor
R Squared: 0.174
RMSE: 0.378
--------------------------------
MODEL:  Linear Regression
R Squared: 0.107
RMSE: 0.393
--------------------------------
MODEL:  Suport Vector Regression
R Squared: -0.087
RMSE: 0.433
--------------------------------


# Phase 2: 

Train with Cross-Validation and select the best candidates

### Cross Validation 

In [9]:
results = []
names = []
for name, model in models:
    result = cross_val_score(model, X_train, y_train.values.ravel(), cv=3, scoring='r2')
    names.append(name)
    results.append(result)

In [10]:
results

[array([0.15760292, 0.17746745, 0.15022825]),
 array([0.11465275, 0.11680465, 0.10910339]),
 array([-0.08732934, -0.08405847, -0.07955903])]

### Define the best candidates

As we used **r2** as scoring in the Cross-Val, we want to use the algorithms with values as close as possible to 1.  

In [11]:
for i in range(len(names)):
    
    print(names[i], ' r2 -> ', round(results[i].mean(),2))

Random Forest Regressor  r2 ->  0.16
Linear Regression  r2 ->  0.11
Suport Vector Regression  r2 ->  -0.08


So the best algorithm is **Random Forest Regressor**. 

The negative value of **SVR** means that this model is not performing better than the most stupid one that's an horizontal line simple line, wich predicts always the same value. 

# Phase 3:

Find a better hyper-parameters values combination

## Random Forest Regressor

In [12]:
model_RF = RandomForestRegressor()

# print the full list of parametrizable parameters
model_RF.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### using RandomizedSearchCV

GridSearchCV can be computationally expensive, especially if you are searching over a large hyperparameter space and dealing with multiple hyperparameters. A solution to this is to use RandomizedSearchCV, in which not all hyperparameter values are tried out. Instead, a fixed number of hyperparameter settings is sampled from specified probability distributions.

In [13]:
#Setup the parameters and distributions to sample from: param_dist

# numers of trees
n_estimators = [2,5,8,10]
# max number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# max number of levels in tree
max_depth = [5, 10, 15, 20]# 30, 40, 50]
max_depth.append(None)
# min number of samples required to split a node
min_samples_split = [2,5,10,20,30]
# min number of samples required at each leaf node
min_samples_leaf = [1,2,5,10,15]


param_dist = {#'bootstrap': True, 
              #'ccp_alpha': 0.0,
              #'criterion': 'mse',
              'n_estimators': n_estimators, 
              'max_features': max_features,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
              #'max_leaf_nodes': None,
              #'max_samples': None,
              #'min_impurity_decrease': 0.0, 
              #'min_impurity_split': None, 
              #'min_weight_fraction_leaf': 0.0,           
              #'n_jobs': None, 
              #'oob_score': False, 
              #'random_state': None,
              #'verbose': 0, 
              #'warm_start': False
             }

In [14]:
# instantiate the RandomizedSearchCV object: model_cv
model_CV1 = RandomizedSearchCV(model_RF, param_dist, cv=3)

In [15]:
# Fit it to the data
model_CV1.fit(X_train,y_train.values.ravel())

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [5, 10, 15, 20, None],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 5, 10, 15],
                                        'min_samples_split': [2, 5, 10, 20, 30],
                                        'n_estimators': [2, 5, 8, 10]})

In [16]:
# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(model_CV1.best_params_))

Tuned Decision Tree Parameters: {'n_estimators': 5, 'min_samples_split': 20, 'min_samples_leaf': 15, 'max_features': 'sqrt', 'max_depth': 5}


In [17]:
result_CV1 = cross_val_score(model_CV1, X_val, y_val.values.ravel(), cv=3, scoring='r2')
print(result.mean())

-0.08364894447276616


# Phase 4

In [20]:
 # train the model
model = model_CV1.fit(X_test, y_test.values.ravel())
name = 'RandomForestRegressor'

# Get predictions and performance
y_val_pred = model.predict(X_test)
    
# Measure
predictions = model.predict(X_test)
predRsquared = r2_score(y_test,predictions)
rmse = sqrt(mean_squared_error(y_test, predictions))
    
# print results
print('MODEL: ',name)
print('R Squared: %.3f' % predRsquared)
#print('RMSE: %.3f' % rmse)
print('--------------------------------')

MODEL:  RandomForestRegressor
R Squared: 0.324
--------------------------------
