In [None]:
# TRAINING SCRIPT
# Import and parameter section
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import glob
from imblearn.over_sampling import RandomOverSampler
import glob
import pickle
from sklearn import metrics
from scipy import stats

In [None]:
# Locate the input dataset: the first CSV, in sorted order, found in ../data.
# glob returns paths in arbitrary order, so sorting makes the choice deterministic.
# 'report.csv' is skipped because this notebook writes it into the same folder;
# a second run could otherwise load its own output as the training data.
file_name = sorted(
    path for path in glob.glob('../data/*.csv')
    if path.replace('\\', '/').rsplit('/', 1)[-1] != 'report.csv'
)
if not file_name:
    raise FileNotFoundError("No input CSV file found in '../data/'")
df = pd.read_csv(file_name[0], sep=",")

In [None]:
# Columns that are not used as model inputs.
cols_to_drop = [
    'id',
    'debt_requests_count',
    'housing_base_cost',
    'a_mal_count',
    'a_mal_active_amount',
    'e_mal_count',
    'e_mal_active_amount',
    'contact_channel',
    'blanco_amount',
    'inquiries_count',
    'credit_card_amount',
    'credit_used',
    'income_employment',
    'income_tax',
    'creditors_count',
    'salary_surplus',
    'capital_deduction',
    'credit_count',
    'income_gross',
    'loan_type',
    'customer_postal',
]

# Rows with any missing value are removed first (this check still includes the
# columns listed above), and only then are the unused columns discarded.
df = df.dropna().drop(columns=cols_to_drop)


In [None]:
# Convert the big_city flag from 'f'/'t' to 0/1
bc_dict = {'f': 0, 't': 1}
df['big_city'] = df['big_city'].map(bc_dict)

# Drop outliers: keep only rows whose continuous features all lie within
# 3 standard deviations of the column mean.
# The 0/1 columns are excluded from the check on purpose. For a binary column
# the rarer value has |z| > 3 whenever it makes up less than 10% of the rows,
# so z-scoring 'target' or 'big_city' could delete the entire minority class.
binary_cols = ['big_city', 'target']
zscore_cols = df.columns.difference(binary_cols)
z_scores = np.abs(stats.zscore(df[zscore_cols].to_numpy(dtype=float), axis=0))
df = df[(z_scores < 3).all(axis=1)]

In [None]:
# Prepare features and label.
# pop() removes the 'target' column from df itself (evaluated first), so both
# df and the copy taken right after it contain feature columns only.
y, x = df.pop('target'), df.copy()

In [None]:
# Divide data into test and training BEFORE oversampling.
# Oversampling duplicates minority-class rows. If it ran before the split,
# copies of the same row would land in both the training and the test set and
# the test scores would be inflated by leakage.
test_size = 0.25
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=seed, stratify=y)

# Perform oversampling to balance the classes of the training split only;
# the test split keeps the original class distribution.
ros = RandomOverSampler(random_state=0)
X, Y = ros.fit_resample(X_train, y_train)
X_train, y_train = X, Y

In [None]:
# # Parameter tuning

# param_grid = {'bootstrap': [True, False],
#  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 4],
#  'min_samples_split': [2, 5, 10],
#  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

# gsc = GridSearchCV(
#         estimator=RandomForestClassifier(),
#         param_grid=param_grid,
#         cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

# gsc.fit(X_train, y_train)

#print(gsc.best_params_)

In [None]:
# Train RF
# random_state is fixed (re-using the split seed defined earlier) so that the
# fitted forest, the printed accuracy and the saved model are reproducible.
model = RandomForestClassifier(bootstrap=True, max_depth=80, max_features='sqrt',
                               min_samples_leaf=1, min_samples_split=5,
                               n_estimators=1200, n_jobs= -1,
                               random_state=seed)
model.fit(X_train, y_train)

# Accuracy on the held-out test split
predictions = model.predict(X_test)
print(accuracy_score(y_test, predictions))

In [None]:
# Display confusion matrix to investigate result
# Rows are the true labels, columns the predicted labels.
df_cm = confusion_matrix(y_test, predictions)
fig, ax = plt.subplots(figsize=(10, 7))
# fmt='d' prints the integer counts in full; the default format would show
# larger counts in scientific notation (e.g. 1.2e+03).
sns.heatmap(df_cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix')
plt.show()

In [None]:
# Produce small report on performance
# NOTE(review): these are regression-style error measures computed on the
# predicted class labels; confirm they are the intended metrics for a classifier.
squared_error = metrics.mean_squared_error(y_test, predictions)
error_report = [
    ('Mean Squared Error:', squared_error),
    ('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions)),
    ('Root Mean Squared Error:', np.sqrt(squared_error)),
    ('R2:', metrics.r2_score(y_test, predictions)),
]
for label, value in error_report:
    print(label, value)

In [None]:
# Save the model
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open('../models/model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Create and save report csv
# One row per test sample: the true label and a predicted class probability.
prob = model.predict_proba(X_test)
# Column 1 of predict_proba belongs to the second entry of model.classes_
# (presumably the positive class, label 1 -- verify against model.classes_).
pr = pd.Series(prob[:, 1])
# Build the frame from plain arrays so rows are paired by position. y_test
# carries the shuffled index of the split, so an index-aligned concat with
# `pr` (indexed 0..n-1) would pair labels with the wrong probabilities.
report = pd.DataFrame({'target': np.asarray(y_test), 'pd': pr.to_numpy()})
# Write the report itself (previously the feature frame `df` was written).
report.to_csv('../data/report.csv', index=False)