In [40]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import scipy
from sklearn.metrics import accuracy_score
%matplotlib inline

In [41]:
#data_df = pd.read_csv('data4_cumm.csv')
data_df = pd.read_csv('data3.csv')
data_df.shape

(1144, 16)

In [42]:
dataset = data_df.as_matrix()

In [43]:
X = dataset[:,4:14]
y = dataset[:,15]
#X = dataset[:,4:24]
#y = dataset[:,25]
X.shape

(1144, 10)

In [44]:
standardized_X = preprocessing.scale(X)
seed = 58

In [45]:
# Divide records in training and testing sets of 80:20 
X_train, X_test, y_train, y_test = train_test_split(standardized_X, y, test_size=0.20, random_state=seed)

### Logistic Regression with 80-20 train test split

In [46]:
logsplitmodel = LogisticRegression()
logsplitmodel.fit(X_train,y_train)
logsplitmodelpredictions = logsplitmodel.predict(X_test)
print(classification_report(y_test,logsplitmodelpredictions))

             precision    recall  f1-score   support

        0.0       0.74      0.83      0.78       126
        1.0       0.75      0.65      0.70       103

avg / total       0.75      0.75      0.74       229



### Logistic Regression with Cross Validation

In [47]:
logcvmodel = LogisticRegression()
logcvmodel_y_pred = cross_val_predict(logcvmodel,standardized_X, y, cv=10)
print(metrics.classification_report(y, logcvmodel_y_pred))
print(metrics.confusion_matrix(y, logcvmodel_y_pred))

             precision    recall  f1-score   support

        0.0       0.73      0.80      0.76       650
        1.0       0.70      0.62      0.66       494

avg / total       0.72      0.72      0.72      1144

[[518 132]
 [187 307]]


In [48]:
# evaluate the model using 10-fold cross-validation
#For integer/None inputs, if the estimator is a classifier 
#and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used.

cv_scores = cross_val_score(LogisticRegression(), standardized_X, y, scoring='accuracy', cv=10)
print(cv_scores)
print("The Cross validation score on Logistic Regression:",cv_scores.mean())


[ 0.69565217  0.84347826  0.74782609  0.88695652  0.76315789  0.8245614
  0.60526316  0.52631579  0.70175439  0.61403509]
The Cross validation score on Logistic Regression: 0.720900076278


### Random Forest Classifier with 80-20 train test split

In [49]:
rfc = RandomForestClassifier(n_estimators=100,random_state=58)     
rfc = rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_test)
ac_rfc = accuracy_score(y_test,rfc_pred)
print('Random Forest Classifier:\nAccuracy is: ',ac_rfc)
print("*****************************************************************\nConfusion matrix:\n",confusion_matrix(y_test,rfc_pred))
print("*****************************************************************\nClassification Report",classification_report(y_test,rfc_pred))

Random Forest Classifier:
Accuracy is:  0.873362445415
*****************************************************************
Confusion matrix:
 [[114  12]
 [ 17  86]]
*****************************************************************
Classification Report              precision    recall  f1-score   support

        0.0       0.87      0.90      0.89       126
        1.0       0.88      0.83      0.86       103

avg / total       0.87      0.87      0.87       229



### Random Forest Classifier with Cross Validation 

In [50]:
rfccvmodel = RandomForestClassifier()
y_pred = cross_val_predict(rfccvmodel,standardized_X, y, cv=10)
print(metrics.classification_report(y, y_pred))
print(metrics.confusion_matrix(y, y_pred))

             precision    recall  f1-score   support

        0.0       0.76      0.75      0.75       650
        1.0       0.68      0.68      0.68       494

avg / total       0.72      0.72      0.72      1144

[[490 160]
 [159 335]]


### Feature ranking/Importance 

In [51]:
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))


Feature ranking:
1. feature 6 (0.188058)
2. feature 5 (0.186272)
3. feature 8 (0.138219)
4. feature 2 (0.094699)
5. feature 9 (0.091225)
6. feature 1 (0.071492)
7. feature 7 (0.061930)
8. feature 3 (0.061718)
9. feature 4 (0.059057)
10. feature 0 (0.047331)


### SVM: Linear Kernel with train test split

In [52]:
svmlinear = SVC(kernel='linear', C = 45.42)
svmlinear.fit(X_train, y_train)  
y_true, y_pred = y_test, svmlinear.predict(X_test)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

        0.0       0.78      0.80      0.79       126
        1.0       0.75      0.73      0.74       103

avg / total       0.77      0.77      0.77       229



### SVM: Linear Kernel with cross validation

In [53]:
svmcvlinear = SVC(kernel='linear', C=45.42)
linear_cv_error = cross_val_score(svmcvlinear, standardized_X, y, cv=10)
print(linear_cv_error,"\n***************************************************************************")
print('\nThe {}-fold cross-validation accuracy score for classifier is {:.2f}\n'.format(10, np.average(linear_cv_error)))

[ 0.71304348  0.87826087  0.73043478  0.86086957  0.77192982  0.8245614
  0.61403509  0.52631579  0.66666667  0.60526316] 
***************************************************************************

The 10-fold cross-validation accuracy score for classifier is 0.72



### SVM: RBF Kernel with train test split 

In [54]:
svmrbf = SVC(kernel='rbf', C = 16.57, gamma = 0.28383)
svmrbf.fit(X_train, y_train)  
y_true, y_pred = y_test, svmrbf.predict(X_test)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

        0.0       0.86      0.91      0.89       126
        1.0       0.89      0.83      0.85       103

avg / total       0.87      0.87      0.87       229



### SVM: RBF Kernel with cross validation

In [55]:
svmcvrbf = SVC(kernel='rbf', C= 31.21,gamma=0.00827)
rbf_cv_error = cross_val_score(svmcvrbf, standardized_X, y, cv=10)
print(rbf_cv_error,"\n***************************************************************************")
print('\nThe {}-fold cross-validation accuracy score for classifier is {:.2f}\n'.format(10, np.average(rbf_cv_error)))

[ 0.80869565  0.8173913   0.67826087  0.82608696  0.86842105  0.85964912
  0.62280702  0.57894737  0.75438596  0.62280702] 
***************************************************************************

The 10-fold cross-validation accuracy score for classifier is 0.74



### SVM Polynomial Kernel with train test split

In [56]:
svmpoly = SVC(kernel='poly', C = 328.342, gamma = 0.08480)
svmpoly.fit(X_train, y_train)  
y_true, y_pred = y_test, svmpoly.predict(X_test)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

        0.0       0.82      0.91      0.86       126
        1.0       0.88      0.76      0.81       103

avg / total       0.85      0.84      0.84       229



### SVM Polynomial Kernel with cross validation

In [57]:
svmcvpoly = SVC(kernel='poly',C= 35.673,gamma=0.0422)
poly_cv_error = cross_val_score(svmcvpoly, standardized_X, y, cv=10)
print(poly_cv_error,"\n***************************************************************************")
print('\nThe {}-fold cross-validation accuracy score for classifier is {:.2f}\n'.format(10, np.average(poly_cv_error)))

[ 0.72173913  0.87826087  0.65217391  0.86956522  0.73684211  0.72807018
  0.61403509  0.56140351  0.70175439  0.66666667] 
***************************************************************************

The 10-fold cross-validation accuracy score for classifier is 0.71



In [58]:
n_iter_search = 10

In [20]:
#linear kernel
C_range = scipy.stats.expon(scale=100)

# specify parameters and distributions to sample from
param_dist = {"C": C_range}

# run randomized search
linear = RandomizedSearchCV(SVC(kernel='linear'), param_distributions=param_dist,cv = 10, n_iter=n_iter_search,scoring='accuracy',random_state=58)
linear.fit(X_train, y_train)
print(linear.best_params_)

{'C': 45.429656639092947}


In [21]:
#linear kernel
C_range = scipy.stats.expon(scale=100)

# specify parameters and distributions to sample from
param_dist = {"C": C_range}

# run randomized search
linear = RandomizedSearchCV(SVC(kernel='linear'), param_distributions=param_dist,cv = 10, n_iter=n_iter_search,scoring='accuracy',random_state=58)
linear.fit(standardized_X, y)
print(linear.best_params_)

{'C': 45.429656639092947}


In [22]:
#RBF kernel
params = {'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1),'kernel': ['rbf'], 'class_weight':['balanced', None]}

# specify parameters and distributions to sample from
rbf = RandomizedSearchCV(SVC(), params, cv=10,n_iter=n_iter_search,scoring='accuracy',random_state=58)
rbf.fit(X_train, y_train)
print(rbf.best_params_)

{'C': 16.575011703131253, 'class_weight': 'balanced', 'gamma': 0.28383486379970485, 'kernel': 'rbf'}


In [23]:
#RBF kernel
params = {'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1),'kernel': ['rbf'], 'class_weight':['balanced', None]}

# specify parameters and distributions to sample from
rbf = RandomizedSearchCV(SVC(), params, cv=10,n_iter=n_iter_search,scoring='accuracy',random_state=58)
rbf.fit(standardized_X, y)
print(rbf.best_params_)

{'C': 31.219313005785988, 'class_weight': None, 'gamma': 0.0082794569823148745, 'kernel': 'rbf'}


In [24]:
#poly kernel
params = {'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1),'kernel': ['poly'], 'class_weight':['balanced', None]}

# specify parameters and distributions to sample from
polysearch = RandomizedSearchCV(SVC(), params, cv=10,n_iter=10,scoring='accuracy',random_state=58)
polysearch.fit(X_train, y_train)
print(polysearch.best_params_)


{'C': 328.3422467295191, 'class_weight': 'balanced', 'gamma': 0.084807518785928529, 'kernel': 'poly'}


In [25]:
#poly kernel
params = {'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1),'kernel': ['poly'], 'class_weight':['balanced', None]}

# specify parameters and distributions to sample from
polysearch = RandomizedSearchCV(SVC(), params, cv=10,n_iter=10,scoring='accuracy',random_state=58)
polysearch.fit(standardized_X, y)
print(polysearch.best_params_)

{'C': 35.673738407889488, 'class_weight': 'balanced', 'gamma': 0.042273875024883292, 'kernel': 'poly'}


In [39]:
from sklearn.ensemble import VotingClassifier
eclf = VotingClassifier(estimators=[('lr', logcvmodel), ('rf', rfcmodel)], voting='hard')
for clf, label in zip([logcvmodel, rfcmodel, eclf], ['Logistic Regression', 'Random Forest', 'Ensemble']):
    scores = cross_val_score(eclf, standardized_X, y, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.69 (+/- 0.11) [Logistic Regression]
Accuracy: 0.71 (+/- 0.09) [Random Forest]
Accuracy: 0.71 (+/- 0.08) [Ensemble]


In [69]:
# Voting Ensemble for Classification
from sklearn import model_selection

seed = 58
kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
estimators.append(('logistic', logcvmodel))
estimators.append(('randomforest', rfccvmodel))
estimators.append(('svmlinear', svmcvlinear))
estimators.append(('svmrbf', svmcvrbf))
estimators.append(('svmpoly', svmcvpoly))
# create the ensemble model
ensemble = VotingClassifier(estimators, voting='hard')
results = model_selection.cross_val_score(ensemble, standardized_X, y, cv=kfold)
print(results.mean())

0.739282990084
