# Supervised Learning

## Non-parametric models

### CART  (Classification and Regression Trees)

# 7 classification trees

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import auc, confusion_matrix, log_loss, roc_curve
from sklearn.model_selection import cross_validate

**conda install -c conda-forge python-graphviz**     
**conda install -c conda-forge pydot**

In [None]:
import graphviz
import pydot
from IPython.display import Image

In [None]:
# load the credit dataset (presumably one-hot encoded, judging by the file name);
# header = 0 means the first row of the file supplies the column names
cred = pd.read_csv('data/cred_ohe.csv', header = 0)
cred.head()

In [None]:
# features = every column except the last, target = the last column;
# 20% of the rows are held out for testing and random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(cred.iloc[:,:-1], cred.iloc[:,-1], 
                                                    test_size = 0.2, random_state = 65) 

`DecisionTreeClassifier(criterion ='gini',    
                        splitter='best', 
                        max_depth=None,    
                        min_samples_split=2,    
                        min_samples_leaf=1,   
                        max_features=None,    
                        random_state=None,    
                        max_leaf_nodes=None)`

`criterion: {'gini', 'entropy'}` the function used to measure the quality of a split.     
  - `gini` (default): the Gini impurity, i.e. the probability that a randomly chosen element of the subset would be labeled incorrectly if it were labeled at random according to the distribution of labels in that subset. It is 0 for a pure node and at most 1 - 1/K for K classes (0.5 for two classes).
  

`max_depth` the maximum depth of the tree.   
`min_samples_split` the minimum number of samples required to split a node. `int: number` or `float: fraction`  
`min_samples_leaf` the minimum number of samples required to be in a leaf node. `int: number` or `float: fraction`  
`max_features` the maximum number of features to consider at each split: `int: number`, `float: fraction`, or `auto`, `sqrt`, `log2` or `None`    
`max_leaf_nodes` the maximum number of leaves. If `None`, the number of leaves is unlimited. 




In [None]:
# random_state pins the tie-breaking between equally good splits, so the fitted
# tree (and every result derived from it below) is identical on each run
d_tree = DecisionTreeClassifier(max_depth = 10, min_samples_split = 30, min_samples_leaf = 30,
                                random_state = 65).fit(x_train, y_train)

In [None]:
# importance of each feature in the fitted tree, labelled by feature name
varimp_d_tree = pd.DataFrame(d_tree.feature_importances_,
                             index = cred.columns[:-1],
                             columns = ['value'])
# most important feature first
varimp_d_tree = varimp_d_tree.sort_values(by = 'value', ascending = False)

varimp_d_tree

In [None]:
# With out_file=None export_graphviz RETURNS the DOT source as a string.
# When a file name is passed instead it writes the file and returns None,
# which would leave graphviz.Source() below with nothing to render.
dot_data = tree.export_graphviz(d_tree,
                                out_file=None,
                                filled=True, rounded=True,
                                feature_names = list(x_train.columns),
                                class_names = ['yes','no'],
                                special_characters=False)

# a later cell loads 'my_tree.dot' with pydot, so persist the DOT source too
with open('my_tree.dot', 'w') as dot_file:
    dot_file.write(dot_data)

graph = graphviz.Source(dot_data)

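assigning `out_file = None` makes `export_graphviz` return the DOT source as a string (instead of writing it to a file); that string can be passed to `graphviz.Source`, whose `render()` method produces a pdf by default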

In [None]:
import pydot

In [None]:
# parse the DOT file written earlier; the call's result is unpacked as a
# one-element sequence. NOTE: this rebinds `graph`, which previously held the
# graphviz.Source object.
(graph,) = pydot.graph_from_dot_file('my_tree.dot')

In [None]:
# render the parsed graph to the PNG that the next cell displays
# (left commented out, a fresh run has no 'my_tree.png' to show)
graph.write_png('my_tree.png')

In [None]:
# display the rendered tree; requires 'my_tree.png' to exist in the working directory
Image("my_tree.png") 

&nbsp;

&nbsp;


In [None]:
# class-membership probabilities for the held-out rows (one column per class);
# show the first 10 rows
predicted_prob_tree = d_tree.predict_proba(x_test)
predicted_prob_tree[:10,:]

on prediction, decision tree / random forest classifiers return an array with one column per class (two columns here)

in each row the two values sum to 1. The first column is the probability of failure and the second column is the probability of success (given that failure is `0` and success is `1`)   

In [None]:
# keep only the second column (probability of the class listed second) as a plain list
y_prob_tree  = predicted_prob_tree[:,1].tolist()

In [None]:
# hard class labels at a 0.5 probability cut-off: below 0.5 -> 0, otherwise 1
predicted_d_tree = np.where(np.asarray(y_prob_tree) < 0.5, 0, 1).tolist()

# confusion matrix of true vs. thresholded labels, shown as a DataFrame
pd.DataFrame(confusion_matrix(y_true = y_test, y_pred = predicted_d_tree))

In [None]:
# ROC curve of the decision tree on the held-out set
fpr, tpr, _ = roc_curve(y_true = y_test, y_score = y_prob_tree)
# area under that curve, shown in the legend
AUC = auc(x = fpr, y = tpr )
plt.figure(figsize = (10,7))
plt.plot(fpr,tpr,label='AUC = {}'.format(AUC), color = 'purple')
# dashed diagonal from (0,0) to (1,1) as a visual reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve Tree Classifier')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend(loc="lower right")

In [None]:
# log loss must be computed on predicted probabilities, not on thresholded 0/1
# labels: hard labels are treated as (clipped) probabilities of 0 or 1 and
# inflate the loss. This also matches how the random forest is scored below.
log_loss(y_true = y_test , y_pred = y_prob_tree)

&nbsp;

# 8 cross validation

the scoring parameter <a href='http://scikit-learn.org/stable/modules/model_evaluation.html'>link</a>


* cross validation is a method used for model validation, to assess how the model behaves on out-of-sample data.   

* there are many flavors of cross-validation, such as k-fold cross validation and Leave-One-Out cross validation. 

* the basic idea involves fitting the model several times on the same dataset, each time with a different split into train and test sets. 

* the behavior of the model is then evaluated using a chosen metric such as `log_loss` or `roc_auc` etc 

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

&nbsp;

`cross_val_score(estimator, 
                X, 
                y=None, 
                scoring=None, 
                cv=None, 
                n_jobs=1)`
                
                
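`estimator` the model to evaluate (it is cloned and re-fitted on every fold, so it does not need to be fitted beforehand)    
`X` the feature matrix   
`y` the target variable   
`scoring` evaluation metric   
`cv` an integer (k-fold) or a cross-validation generator    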

In [None]:
# cross-validated ROC AUC with cv=5 on the full dataset
# (features = all but the last column, target = the last column)
scores = cross_val_score(d_tree,
                         X = cred.iloc[:,:-1], 
                         y = cred.iloc[:,-1],
                         cv=5, 
                         scoring = 'roc_auc')
scores

&nbsp;

the class `ShuffleSplit` is a random permutation cross-validator that yields indices to split data into training and test sets.


`ShuffleSplit(n_splits=10,
                test_size='default',
                train_size=None, 
                random_state=None)`

In [None]:
# 5 random train/test splits, each holding out 30% of the rows; seeded for repeatability
cv_splits = ShuffleSplit(n_splits = 5, test_size = .3, random_state = 12)

In [None]:
# same evaluation as before, but the folds come from the ShuffleSplit object
# instead of an integer; this overwrites the earlier `scores`
scores = cross_val_score(d_tree,
                         cred.iloc[:,:-1],
                         cred.iloc[:,-1], 
                         cv=cv_splits, 
                         scoring = 'roc_auc')

In [None]:
scores

&nbsp;

`cv_splits`, which is an instance of the class `ShuffleSplit`, has an instance method `split` that returns a generator; we can expand it using `list` and retrieve the indices of the train and test folds. 

`cv_splits` is already defined to have `n_splits=5`, `test_size=0.3` and `random_state=12`; we can extract the indices using the method `split` as follows   

In [None]:
# lazily produces the folds for the dataset; nothing is computed until it is iterated
manual_split = cv_splits.split(X = cred.iloc[:,:-1],
                y = cred.iloc[:,-1])

In [None]:
# expand the splitter output into a list with one entry per split
# (presumably each entry is a (train indices, test indices) pair -- the name
# `train_folds` undersells that; confirm by inspecting an element)
train_folds = list(manual_split)

In [None]:
# inspect the second split
train_folds[1]

In [None]:
from matplotlib import colors as mcolors

streamline the extraction process

In [None]:
def cv_tree_roc(split_obj, df, random_state=None):
    '''
    Plot one ROC curve per cross-validation fold for a decision tree and
    print the mean AUC over all folds.

    split_obj: iterable of (train_indices, test_indices) pairs of positional
               row indices into df, e.g. the generator returned by
               ShuffleSplit.split(). It is consumed by this call.

    df: pandas DataFrame

    random_state: optional seed passed to DecisionTreeClassifier and used to
                  pick the curve colors. None (the default) leaves the trees
                  unseeded and draws colors from numpy's global random state.

    this method assumes the last column of df constitutes the dependent variable

    Returns None; the curves are drawn on a new matplotlib figure.
    '''

    # materialise the folds: a generator can only be iterated once
    fold_indices = list(split_obj)
    folds = len(fold_indices)

    # One distinct named color per fold. Sampling without replacement guarantees
    # uniqueness; repeats are allowed only if there are more folds than colors.
    palette = list(mcolors.CSS4_COLORS.keys())
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    picks = rng.choice(len(palette), size = folds, replace = folds > len(palette))

    plt.figure(figsize = (15,9))

    auc_ = []
    for fold_no, (train_idx, test_idx) in enumerate(fold_indices, start = 1):

        # features are all columns but the last, the target is the last column
        x_train, x_test = df.iloc[train_idx, :-1], df.iloc[test_idx, :-1]
        y_train, y_test = df.iloc[train_idx, -1], df.iloc[test_idx, -1]

        pine = DecisionTreeClassifier(max_depth = 10, min_samples_split = 30,
                                      min_samples_leaf = 30,
                                      random_state = random_state).fit(x_train, y_train)

        # probability of the class listed second, used as the ranking score
        y_prob = pine.predict_proba(x_test)[:, 1]

        fpr, tpr, _ = roc_curve(y_true = y_test, y_score = y_prob)
        fold_auc = round(auc(x = fpr, y = tpr), 4)
        auc_.append(fold_auc)

        # label every curve with its fold number and AUC so the legend is unambiguous
        plt.plot(fpr, tpr, color = palette[picks[fold_no - 1]],
                 label = 'fold {} (AUC = {})'.format(fold_no, fold_auc))

    # reference diagonal; it has no label, so it stays out of the legend
    plt.plot([0, 1], [0, 1], 'k--')

    plt.title('ROC curves for {}-fold cross validation'.format(folds))
    plt.ylabel('True positive rate')
    plt.xlabel('False positive rate')
    plt.legend(loc="best", prop = {'size':15})

    print('mean AUC for {}-fold cross validation: {}'.format(folds,round(np.mean(auc_),4)))
    
    
        


In [None]:
# fresh fold generator for cv_tree_roc, which expands it with list();
# re-create it before passing it to another call
splits = cv_splits.split(cred.iloc[:,:-1],cred.iloc[:,-1])

In [None]:
# draw the per-fold ROC curves and print the mean AUC
cv_tree_roc(splits, cred)

In [None]:
help(cv_tree_roc)

&nbsp;


# 9 Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

`RandomForestClassifier(n_estimators=10,   
                        criterion='gini',   
                        max_depth=None,    
                        min_samples_split=2,    
                        min_samples_leaf=1,    
                        min_weight_fraction_leaf=0.0,   
                        max_features='auto',    
                        max_leaf_nodes=None,     
                        min_impurity_decrease=0.0,    
                        min_impurity_split=None,     
                        bootstrap=True,    
                        oob_score=False,    
                        n_jobs=1,    
                        random_state=None,    
                        verbose=0,    
                        warm_start=False,    
                        class_weight=None)`

In [None]:
# a random forest is stochastic (bootstrap samples and random feature subsets);
# seeding it makes the importances, ROC curve and log loss below reproducible
rfc = RandomForestClassifier(max_depth = 3, min_samples_split = 30, min_samples_leaf = 30,
                             random_state = 65)

In [None]:
# fit the forest on the training split created at the top of the notebook
rfc.fit(x_train, y_train)

&nbsp;

check variable importance for the decision tree classifier

In [None]:
varimp_d_tree

In [None]:
# feature importances of the forest, labelled by feature name and sorted
# ascending (least important first), which suits the horizontal bar chart below
varimp_rf = (
    pd.DataFrame({'value': rfc.feature_importances_}, index = x_train.columns)
    .sort_values(by = 'value', ascending = True)
)
varimp_rf

In [None]:
# horizontal bar chart of the random-forest feature importances
width = .7                                # bar thickness
ind = np.arange(varimp_rf.shape[0])       # one bar position per feature
plt.figure(figsize = (10,5))
plt.tick_params(axis='y', which='major', labelsize=15)
plt.barh(ind ,varimp_rf['value'], width, color = 'deeppink' )
# label each bar with its feature name
plt.yticks(ind, varimp_rf.index)
plt.title('RF parameter importance', fontdict = {'fontsize':15})

In [None]:
# class probabilities from the forest; keep the second column as the score
predicted_prob_rf = rfc.predict_proba(x_test)
y_prob_rf = predicted_prob_rf[:,1]

# hard class labels at a 0.5 probability cut-off: below 0.5 -> 0, otherwise 1
predicted_thresh_rf = np.where(y_prob_rf < 0.5, 0, 1).tolist()
pd.DataFrame(confusion_matrix(y_true = y_test, y_pred = predicted_thresh_rf))

In [None]:
# ROC curve of the random forest on the held-out set
# (fpr, tpr and AUC overwrite the values computed for the decision tree)
fpr, tpr, _ = roc_curve(y_true = y_test, y_score = y_prob_rf)
AUC = auc(x = fpr, y = tpr )
plt.figure(figsize = (10,7))
plt.plot(fpr,tpr,label='AUC = {}'.format(AUC), color = 'purple')
# dashed diagonal from (0,0) to (1,1) as a visual reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve RF classifier')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend(loc="lower right")

In [None]:
# log loss of the forest, computed from its predicted probabilities
log_loss(y_true = y_test, y_pred = y_prob_rf)

&nbsp;

## 9.1 Grid Search and parameter tuning

* the principle behind grid search is to estimate the model using combinations of different parameters and find the combination that maximizes the reduction in error


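* grid search can be carried out manually for most models; however, random forests (in both Python and R) have dedicated tooling for it. In scikit-learn this is the `GridSearchCV` class, which works with any estimator and cross-validates every parameter combination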


to start define the grid parameters as a dictionary   

In [None]:
# candidate values per hyper-parameter; the grid has 3 * 5 * 4 * 4 = 240 combinations
grid_params = {'n_estimators': [10,20,50],
               'max_depth': [5,10,15,20,25],
               'min_samples_split': [10,15,20,30],
               'min_samples_leaf': [10,15,20,25]}

In [None]:
# seed the base estimator so the grid-search scores (and therefore the selected
# parameters) are reproducible across runs
rf = RandomForestClassifier(random_state = 65)
n_folds = 5   # number of cross-validation folds used by the grid search

In [None]:
#notice that we are passing the entire dataset
# NOTE(review): `cred` contains the rows of x_test, and later cells score the tuned
# forest on x_test, so the hyper-parameters are selected with knowledge of the test
# rows. Consider fitting the search on x_train / y_train instead -- confirm intent.

rf_cv = GridSearchCV(rf, 
                     grid_params, 
                     cv=n_folds,  # cv_splits 
                     refit=True, 
                     scoring = 'roc_auc').fit(cred.iloc[:,:-1], cred.iloc[:,-1])

In [None]:
# grid_scores_ was removed in scikit-learn 0.20; cv_results_ carries the same
# information (parameters plus mean / std of the test score for each combination)
pd.DataFrame(rf_cv.cv_results_)[['params', 'mean_test_score', 'std_test_score']].head(10)

In [None]:
rf_cv.best_params_

In [None]:
# rebuild the best configuration found by the grid search (best_params_ holds exactly
# the keys of grid_params) and fit it on the training split only; the seed makes the
# comparison with the earlier forest reproducible
rf_best = RandomForestClassifier(random_state = 65, **rf_cv.best_params_)
rf_best.fit(x_train, y_train)

In [None]:
# class probabilities from the tuned forest; keep the second column as the score
predicted_prob_cv = rf_best.predict_proba(x_test)
y_prob_cv = predicted_prob_cv[:,1]

# hard class labels at a 0.5 probability cut-off: below 0.5 -> 0, otherwise 1
predicted_thresh_cv = np.where(y_prob_cv < 0.5, 0, 1).tolist()
pd.DataFrame(confusion_matrix(y_true = y_test, y_pred = predicted_thresh_cv))

In [None]:
# ROC curve of the tuned random forest on the held-out set
# (fpr, tpr and AUC overwrite the values computed for the previous model)
fpr, tpr, _ = roc_curve(y_true = y_test, y_score = y_prob_cv)
AUC = auc(x = fpr, y = tpr )
plt.figure(figsize = (10,7))
plt.plot(fpr,tpr,label='AUC = {}'.format(AUC), color = 'purple')
# dashed diagonal from (0,0) to (1,1) as a visual reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve RF classifier')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend(loc="lower right")

In [None]:
# log loss of the tuned forest, computed from its predicted probabilities
log_loss(y_true = y_test, y_pred = y_prob_cv)

improved log_loss !