# libraries

In [1]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

# estimators
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression

# tools
from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split

#model metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# import Data

In [2]:
# Load the dataset from the CSV in the working directory and preview the first rows
csv_path = 'DF_M2T2_Light.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,ID,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,Y_default
0,1,1,20000.0,2.0,2.0,-1.0,-1.0,-2.0,-2.0,0.0,689.0,0.0,0.0,1
1,2,2,120000.0,-1.0,2.0,0.0,0.0,0.0,2.0,0.0,1000.0,1000.0,1000.0,1
2,3,3,90000.0,0.0,0.0,0.0,0.0,0.0,0.0,1518.0,1500.0,1000.0,1000.0,0
3,4,4,50000.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0,2019.0,1200.0,1100.0,0
4,5,5,50000.0,-1.0,0.0,-1.0,0.0,0.0,0.0,2000.0,36681.0,10000.0,9000.0,0


# Select Features and dependent value

In [3]:
# Target is kept as a one-column DataFrame; features are every remaining column.
# NOTE(review): X still carries the 'Unnamed: 0.1', 'Unnamed: 0' and 'ID' columns
# shown by df.head() -- confirm these are really meant to be model features.
target_col = 'Y_default'
X = df.drop(columns=[target_col])
y = df[[target_col]]

# Hold out 20% of the rows for testing; seeded for repeatability, not stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Sanity check: target, feature matrix and full frame dimensions
for frame in (y, X, df):
    print(frame.shape)

(30000, 1)
(30000, 13)
(30000, 14)


# Cross-validation

In [4]:
# Candidate estimators as (display name, unfitted estimator) pairs:
# three regressors followed by two classifiers.
# NOTE(review): RandomForestRegressor is created without a random_state, so its
# cross-validation scores will differ between runs.
algoClass = [
    ('Random Forest Regressor', RandomForestRegressor()),
    ('Linear Regression', LinearRegression()),
    ('Suport Vector Regression', SVR()),
    ('Logistic Regression', LogisticRegression()),
    ('k-NN', KNeighborsClassifier()),
]

print(algoClass)

[('Random Forest Regressor', RandomForestRegressor()), ('Linear Regression', LinearRegression()), ('Suport Vector Regression', SVR()), ('Logistic Regression', LogisticRegression()), ('k-NN', KNeighborsClassifier())]


In [None]:
# 4-fold cross-validation of every candidate, scored with R^2.
# NOTE(review): the same R^2 scorer is applied to the two classifiers in
# algoClass as well as to the regressors.
target_1d = y.values.ravel()  # flatten the one-column target once, outside the loop

names, results = [], []
for label, estimator in algoClass:
    fold_scores = cross_val_score(estimator, X, target_1d, cv=4, scoring='r2')
    names.append(label)
    results.append(fold_scores)

In [None]:
# Report the mean cross-validation score for each candidate, in insertion order
for label, fold_scores in zip(names, results):
    print(label, fold_scores.mean())

OK, so the best algorithms for this problem seem to be **Linear Regression** and **Random Forest Regressor**.

# MODELS

## Linear Regression

In [None]:
# Fit a linear regression on the training split (target flattened to 1-D)
algo_LR = LinearRegression()
model_LR = algo_LR.fit(X_train, y_train.values.ravel())

# For a regressor, score() reports R^2 on the held-out split (not accuracy)
holdout_r2_LR = model_LR.score(X_test, y_test)
print(holdout_r2_LR)

### Evaluate the model

In [None]:
# Predict on the held-out split
predictions_LR = model_LR.predict(X_test)

# R^2 and root-mean-squared error of those predictions against the true labels
predRsquared_LR = r2_score(y_test, predictions_LR)
mse_LR = mean_squared_error(y_test, predictions_LR)
rmse_LR = sqrt(mse_LR)

for template, value in (('R Squared: %.3f', predRsquared_LR), ('RMSE: %.3f', rmse_LR)):
    print(template % value)

### Plot Results

In [None]:
# Predicted values against the true labels for the test split.
# alpha is very low so that only regions where many points overlap become visible;
# s=80 matches the marker size used by the Random Forest plot so the two figures
# can be compared directly.
plt.scatter(y_test, predictions_LR, alpha=0.002, s=80)
plt.xlabel('Ground Truth')
plt.ylabel('Predictions')
plt.show();

## Random Forest Regressor

In [None]:
# Fit a random forest regressor on the training split (target flattened to 1-D).
# random_state is fixed (same value as the train/test split) so that the fitted
# forest, and therefore the score below, is repeatable across runs.
algo_RFR = RandomForestRegressor(random_state=1)

model_RFR = algo_RFR.fit(X_train, y_train.values.ravel())
# For a regressor, score() reports R^2 on the held-out split (not accuracy)
print(model_RFR.score(X_test, y_test))

### Evaluate the model

In [None]:
#Make Predictions
predictions_RFR = model_RFR.predict(X_test)
predRsquared_RFR = r2_score(y_test,predictions_RFR)
rmse_RFR = sqrt(mean_squared_error(y_test, predictions_RFR))
print('R Squared: %.3f' % predRsquared_RFR)
print('RMSE: %.3f' % rmse_RFR)

### Plot Results

In [None]:
# Predicted values against the true labels for the test split; large, almost
# transparent markers so that only regions with many overlapping points show up
marker_style = {'alpha': 0.002, 's': 80}
plt.scatter(y_test, predictions_RFR, **marker_style)
plt.xlabel('Ground Truth')
plt.ylabel('Predictions')
plt.show();