## SVMs, Decision Trees, Random Forests, and Other Ensemble Algorithms

This document provides Python implementation details for several machine learning algorithms.  

### SVMs

Support vector machines (SVMs) work well for classification problems. Details on the algorithm and the intuition behind it can be found in [Andrew Ng's ML course](http://www.holehouse.org/mlclass/12_Support_Vector_Machines.html).

Importing necessary libraries

In [122]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix 
import warnings
# Show up to 500 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)
from sklearn import tree
# NOTE(review): clfTree1 is not used by any cell visible in this part of the
# notebook; presumably it belongs to the later "Decision trees" section — confirm.
clfTree1 = tree.DecisionTreeClassifier()

A simple male/female dataset is used to illustrate the model: the sex of an individual is predicted from their height and weight.

In [5]:
# Load the height/weight dataset into a DataFrame and preview the first rows.
data = pd.read_csv("weight-height.csv")
data.head()


Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [6]:
# (rows, columns) of the full dataset.
data.shape

(10000, 3)

In [34]:
# For illustration, work with a random subset of 500 rows only.
# The fixed random_state makes the subset -- and therefore every score reported
# below -- reproducible after a kernel restart; reset_index gives the sample a
# clean 0..N-1 index.
N_SAMPLES = 500
data = data.sample(N_SAMPLES, replace=False, random_state=43).reset_index(drop=True)
print(data.head())
data.shape

   Gender     Height      Weight
0    Male  71.318434  211.983344
1  Female  61.146344  105.382922
2  Female  58.187141  104.472382
3    Male  71.427575  201.842809
4  Female  60.550605  108.215883


(500, 3)

Creating generic functions for cross-validation with `GridSearchCV` and for classification

In [126]:
def cross_valid(X, Y, clf, parameters, score_func, n_folds):
    """Run a cross-validated grid search and print a summary of the winner.

    Parameters
    ----------
    X, Y : training features and targets, passed straight to ``GridSearchCV.fit``.
    clf : estimator whose hyper-parameters are searched.
    parameters : parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    score_func : value for ``scoring``; when falsy, ``scoring`` is not passed.
    n_folds : value for ``cv``.

    Returns
    -------
    tuple
        ``(search, search.best_estimator_)`` where ``search`` is the fitted
        ``GridSearchCV`` object.
    """
    search_kwargs = dict(
        estimator=clf,
        param_grid=parameters,
        cv=n_folds,
        return_train_score=True,
    )
    # Only forward an explicit scorer; otherwise leave GridSearchCV's default.
    if score_func:
        search_kwargs["scoring"] = score_func

    search = GridSearchCV(**search_kwargs)
    search.fit(X, Y)

    summary = (
        ("Best estimator", search.best_estimator_),
        ("BEST parameters", search.best_params_),
        ("BEST score", search.best_score_),
    )
    for heading, value in summary:
        print(heading)
        print(value)

    return search, search.best_estimator_

def classify(data, clf, features, label, label_one, test_size, n_folds, parameters, score_func=None):
    """Tune ``clf`` by grid search on a training split and report hold-out accuracy.

    The rows of ``data`` are split into train/test parts (fixed
    ``random_state=43``), the hyper-parameters are searched with
    ``cross_valid`` on the training part only, and the accuracy plus the
    confusion matrix of the best estimator are printed.

    Parameters
    ----------
    data : table indexed by column name (``data[features]``, ``data[label]``).
    clf : estimator to tune.
    features : list of column names used as predictors.
    label : name of the target column.
    label_one : value of ``label`` that is encoded as class 1; everything
        else becomes class 0.
    test_size : forwarded to ``train_test_split``.
    n_folds : number of cross-validation folds, forwarded to ``cross_valid``.
    parameters : parameter grid, forwarded to ``cross_valid``.
    score_func : optional scoring specification, forwarded to ``cross_valid``.

    Returns
    -------
    tuple
        ``(model_cv, clf_best, xtrain, ytrain, xtest, ytest)``.
    """
    X = data[features].values
    # Binary target: 1 where the label equals ``label_one``, 0 otherwise.
    Y = (data[label].values == label_one) * 1

    # Split row positions rather than the arrays so X and Y stay aligned.
    itrain, itest = train_test_split(range(data.shape[0]), test_size=test_size, random_state=43)
    xtrain, ytrain = X[itrain], Y[itrain]
    xtest, ytest = X[itest], Y[itest]

    model_cv, clf_best = cross_valid(
        X=xtrain, Y=ytrain, clf=clf, parameters=parameters,
        score_func=score_func, n_folds=n_folds,
    )

    # No extra fit is needed here: cross_valid leaves GridSearchCV's refit at
    # its default, so best_estimator_ has already been refitted on the full
    # training split with the winning parameters.
    training_accuracy = clf_best.score(xtrain, ytrain)
    test_accuracy = clf_best.score(xtest, ytest)
    print("############# based on standard predict ################")
    print("Accuracy on training data: %0.2f" % (training_accuracy))
    print("Accuracy on test data:     %0.2f" % (test_accuracy))
    print(confusion_matrix(ytest, clf_best.predict(xtest)))
    print("########################################################")
    return model_cv, clf_best, xtrain, ytrain, xtest, ytest
    
    
    

In [127]:
## Baseline first: logistic regression with a grid search over C

lm = LogisticRegression()
parameters = {"C": [0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}
features = ["Height", "Weight"]
label = "Gender"

model_cv, clf_best, xtrain, ytrain, xtest, ytest = classify(
    data=data,
    clf=lm,
    features=features,
    label=label,
    label_one="Male",
    test_size=0.4,
    n_folds=6,
    parameters=parameters,
    score_func="accuracy",
)


Best estimator
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
BEST parameters
{'C': 0.001}
BEST score
0.9366666666666666
############# based on standard predict ################
Accuracy on training data: 0.93
Accuracy on test data:     0.92
[[96 12]
 [ 4 88]]
########################################################




In [112]:
# Display the best estimator returned by classify() for the logistic regression search.
clf_best

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [128]:
# One row per candidate C: fit/score timings plus per-split and mean train/test scores.
pd.DataFrame(model_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,mean_train_score,std_train_score
0,0.001086,0.000342,0.000364,6.1e-05,1e-05,{'C': 1e-05},0.529412,0.5,0.5,0.5,0.5,0.510204,0.506667,0.010937,9,0.506024,0.508,0.508,0.508,0.508,0.505976,0.507333,0.000943
1,0.001453,0.000666,0.000553,0.000211,0.001,{'C': 0.001},0.980392,0.88,0.94,0.96,0.94,0.918367,0.936667,0.03177,1,0.927711,0.948,0.932,0.928,0.936,0.940239,0.935325,0.007164
2,0.00071,0.000153,0.000239,1.8e-05,0.01,{'C': 0.01},0.960784,0.88,0.96,0.96,0.9,0.938776,0.933333,0.032082,2,0.927711,0.944,0.928,0.928,0.94,0.932271,0.93333,0.006427
3,0.001202,0.000722,0.000279,4.7e-05,0.1,{'C': 0.1},0.960784,0.88,0.96,0.96,0.9,0.938776,0.933333,0.032082,2,0.927711,0.944,0.924,0.928,0.94,0.932271,0.932664,0.007116
4,0.000708,8.8e-05,0.000232,1.2e-05,1.0,{'C': 1},0.960784,0.88,0.96,0.96,0.9,0.938776,0.933333,0.032082,2,0.927711,0.944,0.924,0.928,0.94,0.932271,0.932664,0.007116
5,0.000702,8.7e-05,0.000227,8e-06,10.0,{'C': 10},0.960784,0.88,0.96,0.96,0.9,0.938776,0.933333,0.032082,2,0.927711,0.944,0.928,0.928,0.94,0.932271,0.93333,0.006427
6,0.000626,4.5e-05,0.000217,2e-06,100.0,{'C': 100},0.960784,0.88,0.96,0.96,0.9,0.938776,0.933333,0.032082,2,0.927711,0.944,0.928,0.928,0.94,0.932271,0.93333,0.006427
7,0.000674,6.7e-05,0.00023,9e-06,1000.0,{'C': 1000},0.960784,0.88,0.96,0.96,0.9,0.938776,0.933333,0.032082,2,0.927711,0.944,0.928,0.928,0.94,0.932271,0.93333,0.006427
8,0.000967,0.000357,0.000281,6.4e-05,10000.0,{'C': 10000},0.960784,0.88,0.96,0.96,0.9,0.938776,0.933333,0.032082,2,0.927711,0.944,0.928,0.928,0.94,0.932271,0.93333,0.006427


Now comparing with an **SVM** using a linear kernel

In [132]:
## SVM with a linear kernel, grid search over C

svm = SVC(kernel="linear")
parameters = {"C": [0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
features = ["Height", "Weight"]
label = "Gender"

model_cv_svm, clf_best_svm, xtrain, ytrain, xtest, ytest = classify(
    data=data,
    clf=svm,
    features=features,
    label=label,
    label_one="Male",
    test_size=0.4,
    n_folds=6,
    parameters=parameters,
    score_func="accuracy",
)


Best estimator
SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
BEST parameters
{'C': 10000}
BEST score
0.9366666666666666
############# based on standard predict ################
Accuracy on training data: 0.94
Accuracy on test data:     0.92
[[96 12]
 [ 4 88]]
########################################################


In [137]:
## SVM with the default kernel (reported as 'rbf' in the output below),
## grid search over both C and gamma

svm = SVC()
parameters = {
    "C": [0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000],
    "gamma": [0.001, 0.01, 0.1, 1],
}
features = ["Height", "Weight"]
label = "Gender"

model_cv_svm, clf_best_svm, xtrain, ytrain, xtest, ytest = classify(
    data=data,
    clf=svm,
    features=features,
    label=label,
    label_one="Male",
    test_size=0.4,
    n_folds=6,
    parameters=parameters,
    score_func="accuracy",
)


Best estimator
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
BEST parameters
{'C': 1, 'gamma': 0.1}
BEST score
0.93
############# based on standard predict ################
Accuracy on training data: 0.94
Accuracy on test data:     0.90
[[95 13]
 [ 8 84]]
########################################################


### Decision trees