## SVMS, decision trees, Random Forests, and other Ensemble algorithms

The document provides implementation details in Python for some of the Machine learning algorithms.  

### SVMs

Support vector machines are great for classification problems. Details on algorithm and intuition can be found at [Andrew Ng's ML course](http://www.holehouse.org/mlclass/12_Support_Vector_Machines.html)

Importing necessary libraries

In [26]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix 
import warnings
pd.set_option('display.max_columns', 500)
from sklearn import tree


Using a simple Male/Female data to illustrate this model where sex of an individual is predicted from height and weight

In [3]:
#importing data as DataFrame
data = pd.read_csv("weight-height.csv")
data.head()


Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [4]:
data.shape

(10000, 3)

In [5]:
# for illustration just sampling just 500
data = data.sample(500, replace=False)
data.index = range(500)
print(data.head())
data.shape

   Gender     Height      Weight
0  Female  60.725153  122.507463
1  Female  62.934926  132.616525
2    Male  70.523225  207.532839
3  Female  64.962069  141.482514
4    Male  68.985554  186.341902


(500, 3)

Creating generic function for cross validation and GridSearchCV

In [6]:
def cross_valid(X, Y,clf, parameters, score_func, n_folds ):
    if score_func:
        model=  GridSearchCV(estimator=clf, param_grid=parameters, scoring = score_func, cv=n_folds, return_train_score=True)
    else:
        model= GridSearchCV(estimator=clf, param_grid=parameters, cv=n_folds, return_train_score=True)
    model.fit(X, Y)
   
    print("Best estimator")
    print(model.best_estimator_)
    print("BEST parameters")
    print(model.best_params_)

    print("BEST score")
    print(model.best_score_)
    
    return  model, model.best_estimator_

def classify(data, clf, features, label, label_one, test_size,  n_folds, parameters ,score_func =None):
    subdf = data[features]
    X = subdf.values
    Y = (data[label].values==label_one)*1
    itrain, itest = train_test_split(range(data.shape[0]), test_size = test_size,random_state = 43)
    xtrain = X[itrain,]
    ytrain = Y[itrain,]
    #print(X[itrain,].shape, Y[itrain,].shape)
    xtest = X[itest,]
    ytest = Y[itest, ]
    
    model_cv, clf_best =cross_valid(X=xtrain, Y= ytrain, clf=clf, parameters=parameters, score_func=score_func, n_folds=n_folds)
    
    # train on entire data
    clf_best.fit(xtrain,ytrain)
    
    ## scores
    clf=clf_best.fit(xtrain, ytrain)
   
    training_accuracy = clf.score(xtrain,ytrain)
    test_accuracy = clf.score(xtest,ytest)
    print ("############# based on standard predict ################")
    print( "Accuracy on training data: %0.2f" % (training_accuracy))
    print( "Accuracy on test data:     %0.2f" % (test_accuracy))
    print( confusion_matrix(ytest, clf.predict(xtest)))
    print ("########################################################")
    return model_cv, clf_best, xtrain, ytrain, xtest, ytest
    
    
    

In [8]:
## testing with Logsictregression firsy

lm = LogisticRegression()
parameters = {"C": [0.00001,0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}
features = ["Height", "Weight"]
label = "Gender"



model_cv,clf_best, xtrain, ytrain,xtest, ytest =classify(data = data,clf=lm,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="accuracy" )  
# ignore all caught warnings

	# execute code that will generate warnings


Best estimator
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
BEST parameters
{'C': 1}
BEST score
0.94
############# based on standard predict ################
Accuracy on training data: 0.94
Accuracy on test data:     0.94
[[89  8]
 [ 5 98]]
########################################################




In [9]:
clf_best

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
pd.DataFrame(model_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,mean_train_score,std_train_score
0,0.001831,0.00152,0.000475,0.000203,1e-05,{'C': 1e-05},0.529412,0.509804,0.52,0.52,0.510204,0.510204,0.516667,0.007278,9,0.514056,0.518072,0.516,0.516,0.517928,0.517928,0.516664,0.001464
1,0.00163,0.000684,0.000608,0.000298,0.001,{'C': 0.001},0.921569,0.921569,0.98,0.92,0.938776,0.918367,0.933333,0.021933,4,0.939759,0.935743,0.924,0.936,0.932271,0.940239,0.934669,0.005469
2,0.001053,0.000392,0.000553,0.000433,0.01,{'C': 0.01},0.901961,0.921569,0.98,0.94,0.959184,0.918367,0.936667,0.026379,2,0.939759,0.943775,0.928,0.94,0.932271,0.940239,0.937341,0.005411
3,0.000726,0.000196,0.000278,0.000101,0.1,{'C': 0.1},0.901961,0.921569,0.98,0.94,0.959184,0.918367,0.936667,0.026379,2,0.943775,0.943775,0.932,0.936,0.936255,0.940239,0.938674,0.004321
4,0.001553,0.001069,0.000432,0.000213,1.0,{'C': 1},0.901961,0.921569,0.98,0.96,0.959184,0.918367,0.94,0.027814,1,0.943775,0.943775,0.932,0.936,0.936255,0.944223,0.939338,0.004791
5,0.000684,6.8e-05,0.000245,3e-05,10.0,{'C': 10},0.901961,0.921569,0.98,0.92,0.959184,0.918367,0.933333,0.027003,4,0.939759,0.943775,0.932,0.94,0.936255,0.944223,0.939335,0.004233
6,0.000699,0.000109,0.000239,2e-05,100.0,{'C': 100},0.901961,0.921569,0.98,0.92,0.959184,0.918367,0.933333,0.027003,4,0.939759,0.939759,0.928,0.94,0.936255,0.944223,0.937999,0.005033
7,0.000717,3.5e-05,0.000227,6e-06,1000.0,{'C': 1000},0.901961,0.921569,0.98,0.9,0.959184,0.918367,0.93,0.029557,8,0.939759,0.939759,0.928,0.944,0.936255,0.944223,0.938666,0.005497
8,0.000709,5.7e-05,0.000232,7e-06,10000.0,{'C': 10000},0.901961,0.941176,0.98,0.9,0.959184,0.918367,0.933333,0.029524,4,0.939759,0.939759,0.928,0.944,0.936255,0.944223,0.938666,0.005497


Now comparing model for **SVM** for linear kernel

In [11]:
## testing with Logsictregression firsy

svm = SVC(kernel="linear")
parameters = {"C": [0.00001,0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000,100000]}
features = ["Height", "Weight"]
label = "Gender"


model_cv_svm,clf_best_svm, xtrain, ytrain,xtest, ytest =classify(data = data,clf=svm,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="accuracy" )  


Best estimator
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
BEST parameters
{'C': 0.1}
BEST score
0.9366666666666666
############# based on standard predict ################
Accuracy on training data: 0.94
Accuracy on test data:     0.94
[[89  8]
 [ 5 98]]
########################################################




In [18]:
svm = SVC()
parameters = {"C": [0.00001,0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000,100000], 'gamma' : [0.001, 0.01, 0.1, 1]}
features = ["Height", "Weight"]
label = "Gender"


model_cv_svm,clf_best_svm, xtrain, ytrain,xtest, ytest =classify(data = data,clf=svm,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="accuracy" )  


Best estimator
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
BEST parameters
{'C': 10, 'gamma': 0.001}
BEST score
0.93
############# based on standard predict ################
Accuracy on training data: 0.93
Accuracy on test data:     0.93
[[89  8]
 [ 6 97]]
########################################################




### Decision trees

In [14]:
parameters = {"max_depth": [1, 2, 3, 4, 5, 6, 7, 8,9], 'min_samples_leaf': [1, 2, 3, 4, 5, 6,8,9,10,12]}
clfTree1 = tree.DecisionTreeClassifier()


model_cv_dt1,clf_best_dt1, xtrain, ytrain,xtest, ytest =classify(data = data,clf=clfTree1,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="accuracy" )  


Best estimator
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
BEST parameters
{'max_depth': 5, 'min_samples_leaf': 6}
BEST score
0.9233333333333333
############# based on standard predict ################
Accuracy on training data: 0.95
Accuracy on test data:     0.88
[[83 14]
 [10 93]]
########################################################




Lets try with score function as f1 

In [15]:
features = ["Height", "Weight"]
label = "Gender"


parameters = {"max_depth": [1, 2, 3, 4, 5, 6, 7, 8,9], 'min_samples_leaf': [1, 2, 3, 4, 5, 6,8,9,10,12]}
clfTree1 = tree.DecisionTreeClassifier()


model_cv_dt1,clf_best_dt1, xtrain, ytrain,xtest, ytest =classify(data = data,clf=clfTree1,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="f1" )  


Best estimator
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
BEST parameters
{'max_depth': 4, 'min_samples_leaf': 6}
BEST score
0.9263276980415344
############# based on standard predict ################
Accuracy on training data: 0.95
Accuracy on test data:     0.88
[[83 14]
 [10 93]]
########################################################




### Random Forests

In [21]:
from sklearn.ensemble import RandomForestClassifier
features = ["Height", "Weight"]
label = "Gender"


parameters = {"n_estimators": range(1, 20)}
rfclassifier = RandomForestClassifier()


model_cv_rf,clf_best_rf, xtrain, ytrain,xtest, ytest =classify(data = data,clf=rfclassifier,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="accuracy" )  


Best estimator
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
BEST parameters
{'n_estimators': 10}
BEST score
0.9266666666666666
############# based on standard predict ################
Accuracy on training data: 0.99
Accuracy on test data:     0.91
[[88  9]
 [ 9 94]]
########################################################




#### Relative Importance
We can get a measure of how important a variable is from a random forest, it's essentially a measure of how well each particular variable is able to predict well when it is selected, for more on this and other details check out [this webpage](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp).

In [31]:
importance_list = clf_best_rf.feature_importances_
pd.concat([pd.Series(features), pd.Series(importance_list)], axis=1)

Unnamed: 0,0,1
0,Height,0.337085
1,Weight,0.662915


### Adaboost classifier

In [32]:
from sklearn.ensemble import AdaBoostClassifier

clfAda = AdaBoostClassifier()

parameters = {"n_estimators": range(10, 60)}


model_cv_ada,clf_best_ada, xtrain, ytrain,xtest, ytest =classify(data = data,clf=clfAda,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="accuracy" )  


Best estimator
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=18, random_state=None)
BEST parameters
{'n_estimators': 18}
BEST score
0.9133333333333333
############# based on standard predict ################
Accuracy on training data: 0.96
Accuracy on test data:     0.89
[[85 12]
 [10 93]]
########################################################


### Gradient boost classifier

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

clfGB = GradientBoostingClassifier()

parameters = {"n_estimators": range(30, 60), "max_depth": [1, 2, 3, 4, 5]}


model_cv_gboost,clf_best_gboost, xtrain, ytrain,xtest, ytest =classify(data = data,clf=clfGB,features=features, label = label, label_one="Male", n_folds=6, parameters=parameters, test_size=0.4, score_func="accuracy" )  


Best estimator
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
BEST parameters
{'max_depth': 3, 'n_estimators': 30}
BEST score
0.9266666666666666
############# based on standard predict ################
Accuracy on training data: 0.98
Accuracy on test data:     0.91
[[87 10]
 [ 9 94]]
########################################################


