In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
%matplotlib inline

RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

## 1. Dataset1-cardio

### Loading the Dataset

In [17]:
data_train = pd.read_csv("data/cardio_train.csv")
data_test = pd.read_csv("data/cardio_test.csv")

X_train = data_train.drop(data_train.columns[-1],axis=1)
y_train = data_train.iloc[:,-1]

X_test = data_test.drop(data_test.columns[-1],axis=1)
y_test = data_test.iloc[:,-1]
data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-2.720311,0.672779,-0.209259,0.573704,-0.679756,-0.058097,-0.280042,1.180919,-0.871294,-0.423521,...,0.343882,-0.564035,-0.40866,-0.503625,-1.437769,-1.085577,-1.54167,-0.224444,-2.143241,0.0
1,-1.455311,-0.496861,-0.209259,0.160018,-0.119417,-0.058097,-0.280042,-1.140345,-0.059491,0.949618,...,-0.653317,-0.232121,-0.749171,2.518126,-0.596508,-0.829241,-0.926272,-0.325692,-0.498742,0.0
2,-0.295728,-0.389596,0.051364,-1.65076,-0.679756,-0.058097,-0.280042,0.169086,-0.639351,-0.423521,...,-1.365602,-0.453397,0.612874,-0.503625,-0.175878,0.003852,-0.17412,-0.561939,1.145758,0.0
3,-0.928228,3.482341,-0.209259,-0.970991,-0.679756,-0.058097,-0.280042,-1.199865,0.868284,-0.423521,...,0.343882,-0.674673,-1.089682,-0.503625,0.004392,0.003852,-0.105742,-0.52819,-0.498742,0.0
4,0.547605,-0.447035,-0.170418,0.703482,1.193561,-0.058097,-0.280042,0.823802,0.404397,-0.423521,...,-1.080688,0.099793,0.612874,-0.503625,0.184662,0.003852,0.304523,0.045553,1.145758,0.0


In [18]:
total_data = data_train.shape[0]+data_test.shape[0]
print("Total dataset size: %d" % total_data)
print("Features: %d" % X_train.shape[1])
print("Classes: 2")

print("Training data size: %d" % data_train.shape[0])
print("Test data size: %d" % data_test.shape[0])

Total dataset size: 1831
Features: 21
Classes: 2
Training data size: 1464
Test data size: 367


### Training

In [101]:
kers = ["linear", "poly", "rbf", "sigmoid"]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
gammas = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': Cs, 'gamma' : gammas, "kernel": kers}
cv_test= KFold(n_splits=10)
# class_weight='balanced' makes the result worse
grid_search = GridSearchCV(SVC(degree=3), param_grid, cv=cv_test, 
                           scoring='average_precision', n_jobs=2 
                           ,pre_dispatch=4)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=2,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch=4, refit=True, return_train_score=False,
             scoring='average_precision', verbose=0)

In [102]:
print(grid_search.best_params_)

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


In [103]:
print("Best estimator found by grid search:")
print(grid_search.best_estimator_)

Best estimator found by grid search:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


### Making predictions

In [104]:
y_pred = grid_search.predict(X_test)

### Evaluating the results

In [105]:
print(classification_report(y_test, y_pred, digits=5))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         0.0    0.99699   0.99699   0.99699       332
         1.0    0.97143   0.97143   0.97143        35

    accuracy                        0.99455       367
   macro avg    0.98421   0.98421   0.98421       367
weighted avg    0.99455   0.99455   0.99455       367

[[331   1]
 [  1  34]]


NameError: name 'grid_search' is not defined

## 2.  Dataset2-Credit Card

### Loading the Dataset

In [106]:
data_train = pd.read_csv("data/credit_train.csv")
data_test = pd.read_csv("data/credit_test.csv")

X_train = data_train.drop(data_train.columns[-1],axis=1)
y_train = data_train.iloc[:,-1]

X_test = data_test.drop(data_test.columns[-1],axis=1)
y_test = data_test.iloc[:,-1]
data_train.head()

Unnamed: 0,0,1,2,3
0,-0.073142,0.186189,-0.483419,0.0
1,-0.073142,0.38941,-0.296934,0.0
2,-0.073142,-0.811171,0.81505,0.0
3,-0.073142,-0.51564,-0.130089,0.0
4,-0.073142,-0.214577,-0.092824,0.0


In [107]:
total_data = data_train.shape[0]+data_test.shape[0]
print("Total dataset size: %d" % total_data)
print("Features: %d" % X_train.shape[1])
print("Classes: 2")

print("Training data size: %d" % data_train.shape[0])
print("Test data size: %d" % data_test.shape[0])

Total dataset size: 567498
Features: 3
Classes: 2
Training data size: 453998
Test data size: 113500


### Training

In [110]:
kers = ["linear", "poly", "rbf", "sigmoid"]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
gammas = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': Cs, 'gamma' : gammas, "kernel": kers}
cv_test= KFold(n_splits=10)
# class_weight='balanced' makes the result worse
grid_search = GridSearchCV(SVC(degree=3), param_grid, cv=cv_test, 
                           scoring='average_precision', n_jobs=2 
                           ,pre_dispatch=4)
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print(grid_search.best_params_)

In [None]:
print("Best estimator found by grid search:")
print(grid_search.best_estimator_)

### Making predictions

In [None]:
y_pred = grid_search.predict(X_test)

### Evaluating the results

In [None]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred, digits=5))