# Give me some credit: model with sklearn (scikit-learn)

## Load prepared datasets

In [None]:
# Load the prepared train and validation splits from CSV.
# index_col=0 makes the first CSV column the DataFrame index instead of a feature column.
from pandas import read_csv
train_prepared = read_csv('../handson-ml2/train.csv', index_col=0)
val_prepared = read_csv('../handson-ml2/val.csv', index_col=0)

When using most ML libraries in Python, we need to present the data as separate variables (arrays) for inputs and outputs, e.g. `X_train` and `y_train` for the training data.

Let's start with outputs:

In [None]:
# Name of the column holding the label the model should predict.
target_column = 'SeriousDlqin2yrs'
# Extract that column from the training frame as an array of outputs.
y_train = train_prepared[target_column].values
print(y_train)

Inputs:

In [None]:
from pprint import pprint

# Every column except the label becomes an input feature.
X_train = train_prepared.drop(columns=target_column).values
pprint(X_train)

Likewise for the validation data:

In [None]:
# Split the validation frame the same way as the training frame:
# the label column becomes y, all remaining columns become X.
y_val = val_prepared[target_column].values
X_val = val_prepared.drop(columns=target_column).values

## Create model from train set

Initialize model by specifying which learning technique to use:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# RandomForestClassifier implements the Random Forest learning technique for
# classification. Training is randomized, so the seed is pinned to make a fresh
# "Restart & Run All" produce the same model every time.
# NOTE(review): the following cell rebinds `model` to an XGBClassifier, so on a
# top-to-bottom run this forest is never fitted — run only the cell for the
# learner you want to evaluate.
model = RandomForestClassifier(random_state=42)

In [None]:
# Alternative learner: XGBoost's classifier with its default hyperparameters.
# Running this cell rebinds `model`, replacing any estimator created in an earlier cell.
from xgboost import XGBClassifier
model = XGBClassifier()

This "model" is empty, since it hasn't seen any data yet. Train the model on the training set and save the fitted model to disk:

In [None]:
import pickle

# Fit the estimator in place on the training inputs and labels.
model.fit(X_train, y_train)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, even if pickling raises part-way through.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Apply model to val set

In [None]:
# Score the validation inputs with the fitted model.
# The result is 2-D (a later cell selects column 1 from it) — presumably one
# column of probabilities per class; confirm against the estimator's `classes_`.
y_val_proba = model.predict_proba(X_val)
print(y_val_proba)

In [None]:
# Keep only column 1 of the predicted probabilities (the probability of class 1,
# assuming the labels are 0/1 — verify with `model.classes_`).
# The column is taken from a fresh `predict_proba` call instead of slicing
# `y_val_proba` in place, so this cell is idempotent: re-running it no longer
# fails with an IndexError on an array that has already been reduced to 1-D.
y_val_proba = model.predict_proba(X_val)[:, 1]
print(y_val_proba)

## Compute AUC

In [None]:
from sklearn import metrics

# Area under the ROC curve: true validation labels vs. predicted scores.
metrics.roc_auc_score(y_true=y_val, y_score=y_val_proba)

## Compute confusion matrix

In [None]:
# Decision threshold: scores at or above it are labelled 1, the rest 0.
threshold = 0.5
# Vectorized comparison on the NumPy array instead of a Python-level loop.
# The result is an integer array, whose repr is abbreviated for large inputs,
# so printing it no longer dumps every single prediction into the output.
y_val_pred = (y_val_proba >= threshold).astype(int)
pprint(y_val_pred)

In [None]:
# Counts of true labels against thresholded predictions on the validation set.
metrics.confusion_matrix(y_true=y_val, y_pred=y_val_pred)

## Compute ROC curve

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt

# Compute fpr, tpr, thresholds and roc auc
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
%matplotlib inline
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')  # random predictions curve
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.show()

## Compute Learning Curve

In [None]:
from sklearn.model_selection import learning_curve
import numpy as np

# Cross-validated AUC of a fresh (unfitted) XGBClassifier at increasing
# training-set sizes. This refits the model once per fold and size, so it is
# by far the slowest cell in the notebook.
train_sizes, train_scores, test_scores = learning_curve(
    XGBClassifier(),
    X_train,
    y_train,
    cv=10,                                   # number of cross-validation folds
    scoring='roc_auc',                       # evaluation metric
    n_jobs=-1,                               # use all computer cores
    train_sizes=np.linspace(0.01, 1.0, 10),  # 10 sizes, from 1% to 100% of the training folds
)

# Score arrays have one row per training size and one column per fold,
# so aggregate across folds (axis=1).
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Draw the +/- 1 std bands semi-transparently and in the colour of their own
# curve: two opaque grey bands are indistinguishable and hide each other
# wherever they overlap.
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                 color='navy', alpha=0.15)
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std,
                 color='red', alpha=0.15)

# Draw the mean curves.
plt.plot(train_sizes, train_mean, '--', color='navy', label="Training score")
plt.plot(train_sizes, test_mean, color='red', label="Cross-validation score")

# Labels, legend and layout.
plt.title("Learning Curve")
plt.xlabel("Training Set Size")
plt.ylabel("AUC Score")
plt.legend(loc="best")
plt.tight_layout()
plt.show()