In [1]:
# import Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
from itertools import chain
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve, train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score
import warnings
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

warnings.filterwarnings('ignore') #ignore warning messages

In [2]:
# Read data
data = pd.read_csv("dataR2.csv")

In [3]:
# Drop useless variables
#data = data.drop(['Age','BMI'],axis = 1)

# Reassign target
#data.diagnosis.replace(to_replace = dict(M = 1, B = 0), inplace = True)

In [4]:
# View first 5 data
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,diagnosis
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,0
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,0
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,0
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,0
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,0


In [5]:
# describe data
data.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,diagnosis
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,0.551724
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,0.499475
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,0.0
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474282,6.881763,269.97825,0.0
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,1.0
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,1.0
max,89.0,38.578759,201.0,58.46,25.050342,90.28,38.04,82.1,1698.44,1.0


In [6]:
# divide the first column into 2 datasets
M = data[(data['diagnosis'] != 0)]
B = data[(data['diagnosis'] == 0)]

In [7]:
# plot of count data
trace = go.Bar(x = (len(M), len(B)), y = ['presence', 'absence'], orientation = 'h', opacity = 0.8, marker=dict(
        color=[ 'red', 'blue'],
        line=dict(color='#000000',width=1.5)))

layout = dict(title =  'Count of diagnosis variable')
                    
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

# plot percentage of data
trace = go.Pie(labels = ['presence','absence'], values = data['diagnosis'].value_counts(), 
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['blue', 'red'], 
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Distribution of diagnosis variable')
           
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

In [8]:
#identify distribution plot
def plot_distribution(data_select, size_bin) :  
    tmp1 = M[data_select]
    tmp2 = B[data_select]
    hist_data = [tmp1, tmp2]
    
    group_labels = ['malignant', 'benign']
    colors = ['#FF0000', '#0000FF']

    fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = size_bin, curve_type='kde')
    
    fig['layout'].update(title = data_select)

    py.iplot(fig, filename = 'Density plot')

In [9]:
#plot distribution 'mean'
#plot_distribution('Age', 1)
#plot_distribution('BMI', 1)
plot_distribution('Glucose', 5)
plot_distribution('Insulin', 1)
plot_distribution('HOMA', .5)
plot_distribution('Leptin', 1)
plot_distribution('Adiponectin', 1)
plot_distribution('Resistin', 1)
plot_distribution('MCP.1', 100)

In [10]:
#define correlation
correlation = data.corr()
#tick labels
matrix_cols = correlation.columns.tolist()
#convert to array
corr_array  = np.array(correlation)

In [11]:
#plot correlation matrix
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   xgap = 2,
                   ygap = 2,
                   colorscale='Magma',
                   colorbar   = dict() ,
                  )
layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                     ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9)),
                       )
                  )
fig = go.Figure(data = [trace],layout = layout)
py.iplot(fig)

In [12]:
#compare correlation between 2 features
def plot_feat1_feat2(feat1, feat2) :  
    trace0 = go.Scatter(
        x = M[feat1],
        y = M[feat2],
        name = 'malignant',
        mode = 'markers', 
        marker = dict(color = '#FF0000',
            line = dict(
                width = 1)))

    trace1 = go.Scatter(
        x = B[feat1],
        y = B[feat2],
        name = 'benign',
        mode = 'markers',
        marker = dict(color = '#0000FF',
            line = dict(
                width = 1)))

    layout = dict(title = feat1 +" "+"vs"+" "+ feat2,
                  yaxis = dict(title = feat2,zeroline = False),
                  xaxis = dict(title = feat1, zeroline = False)
                 )

    plots = [trace0, trace1]

    fig = dict(data = plots, layout=layout)
    py.iplot(fig)

In [13]:
#4 samples of positive correlated features
#plot_feat1_feat2('perimeter_mean','radius_mean')
#plot_feat1_feat2('area_mean','perimeter_mean')
#plot_feat1_feat2('radius_worst','area_worst')
#plot_feat1_feat2('perimeter_worst','perimeter_mean')

In [14]:
#seaborn version : 
"""
palette ={0 : 'blue', 1 : 'red'}
edgecolor = 'grey'

# Plot +
fig = plt.figure(figsize=(12,12))

plt.subplot(221)
ax1 = sns.scatterplot(x = data['perimeter_mean'], y = data['radius_mean'], hue = "diagnosis",
                    data = data, palette = palette, edgecolor=edgecolor)
plt.title('perimeter mean vs radius mean')
plt.subplot(222)
ax2 = sns.scatterplot(x = data['area_mean'], y = data['perimeter_mean'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('area mean vs perimeter mean')
plt.subplot(223)
ax3 = sns.scatterplot(x = data['radius_worst'], y = data['area_worst'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('radius worst vs area worst')
plt.subplot(224)
ax4 = sns.scatterplot(x = data['perimeter_worst'], y = data['perimeter_mean'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('perimeter worst vs perimeter mean')

fig.suptitle('Positive correlated features', fontsize = 20)
plt.savefig('1')
plt.show()

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-14-ab35ffd6668f>, line 28)

In [None]:
#4 samples of uncorrelated features
"""
plot_feat1_feat2('symmetry_mean','texture_worst')
plot_feat1_feat2('radius_mean','fractal_dimension_worst')
plot_feat1_feat2('fractal_dimension_worst','area_worst')
plot_feat1_feat2('concavity_mean','texture_se')

In [None]:
# seaborn version : 
"""

fig = plt.figure(figsize=(12,12))

plt.subplot(221)
ax1 = sns.scatterplot(x = data['symmetry_mean'], y = data['texture_worst'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('symmetry mean vs texture worst')
plt.subplot(222)
ax2 = sns.scatterplot(x = data['radius_mean'], y = data['fractal_dimension_worst'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('radius mean vs fractal dimension worst')
plt.subplot(223)
ax3 = sns.scatterplot(x = data['fractal_dimension_worst'], y = data['area_worst'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('fractal dimension worst vs area worst')
plt.subplot(224)
ax4 = sns.scatterplot(x = data['concavity_mean'], y = data['texture_se'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('concavity mean vs texture se')

fig.suptitle('Uncorrelated features', fontsize = 20)
plt.savefig('2')
plt.show()

In [None]:
#4 samples of negative correlated features
"""
plot_feat1_feat2('smoothness_se','radius_mean')
plot_feat1_feat2('radius_worst','fractal_dimension_mean')
plot_feat1_feat2('texture_se','symmetry_worst')
plot_feat1_feat2('smoothness_se','radius_worst')

In [None]:
# seaborn version
"""
fig = plt.figure(figsize=(12,12))

plt.subplot(221)
ax1 = sns.scatterplot(x = data['smoothness_se'], y = data['radius_mean'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('smoothness se vs radius mean')
plt.subplot(222)
ax2 = sns.scatterplot(x = data['radius_worst'], y = data['fractal_dimension_mean'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('radius worst vs fractal dimension mean')
plt.subplot(223)
ax2 = sns.scatterplot(x = data['area_mean'], y = data['smoothness_se'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('area mean vs smoothness se')
plt.subplot(224)
ax2 = sns.scatterplot(x = data['smoothness_se'], y = data['perimeter_mean'], hue = "diagnosis",
                    data = data, palette =palette, edgecolor=edgecolor)
plt.title('smoothness se vs perimeter mean')

fig.suptitle('Negative correlated features', fontsize = 20)
plt.savefig('3')
plt.show()

In [None]:
# Define Confusion matrix 
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix"',
                          cmap = plt.cm.Reds) :
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])) :
        plt.text(j, i, cm[i, j],
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Show metrics 
def show_metrics():
    tp = cm[1,1]
    fn = cm[1,0]
    fp = cm[0,1]
    tn = cm[0,0]
    print('Accuracy  =     {:.3f}'.format((tp+tn)/(tp+tn+fp+fn)))
    print('Precision =     {:.3f}'.format(tp/(tp+fp)))
    print('Recall    =     {:.3f}'.format(tp/(tp+fn)))
    print('F1_score  =     {:.3f}'.format(2*(((tp/(tp+fp))*(tp/(tp+fn)))/
                                                 ((tp/(tp+fp))+(tp/(tp+fn))))))

In [None]:
# Define Precision-recall curve
def plot_precision_recall():
    plt.step(recall, precision, color = 'b', alpha = 0.2,
             where = 'post')
    plt.fill_between(recall, precision, step ='post', alpha = 0.2,
                 color = 'b')

    plt.plot(recall, precision, linewidth=2)
    plt.xlim([0.0,1])
    plt.ylim([0.0,1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve')
    plt.show();

In [None]:
# Define ROC curve
def plot_roc():
    plt.plot(fpr, tpr, label = 'ROC curve', linewidth = 2)
    plt.plot([0,1],[0,1], 'r--', linewidth = 2)
   # plt.xlim([0.0,0.001])
   # plt.ylim([0.0,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show();

In [None]:
#Define Learning curve
def plot_learning_curve(estimator, title, X, y, ylim = None, cv = None,
                        n_jobs = 1, train_sizes = np.linspace(.1, 1.0, 5)):
    """
    Plots a learning curve. http://scikit-learn.org/stable/modules/learning_curve.html
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv = cv, n_jobs = n_jobs, train_sizes = train_sizes)
    train_scores_mean = np.mean(train_scores, axis = 1)
    train_scores_std = np.std(train_scores, axis = 1)
    test_scores_mean = np.mean(test_scores, axis = 1)
    test_scores_std = np.std(test_scores, axis = 1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha = 0.1, color = "b")
    plt.plot(train_sizes, train_scores_mean, 'o-', color = "r",
             label = "Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color = "b",
             label = "Cross-validation score")
    plt.legend(loc = "best")
    return plt

In [None]:
# Define cross-validation metrics
def cross_val_metrics(model) :
    scores = ['f1','accuracy', 'precision', 'recall']
    for sc in scores:
        scores = cross_val_score(model, X, y, cv = 5, scoring = sc)
        print('[%s] : %0.5f (+/- %0.5f)'%(sc, scores.mean(), scores.std()))

In [None]:
#Prepare dataset
# Define X(features) and Y(diagnosis(M or B))
y = np.array(data.diagnosis.tolist())
data1 = data.drop('diagnosis', 1)
X = np.array(data1.values)

In [None]:
# Data standardization
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Split data to train and test set
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)

In [None]:
#Logistic Regression
# Find best hyperparameters by logistic regression and GridSearchCV (accuracy)
log_clf = LogisticRegression(random_state = random_state)
param_grid = {
            'penalty' : ['l2','l1'],  
            'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
            }

CV_log_clf = GridSearchCV(estimator = log_clf, param_grid = param_grid , scoring = 'accuracy', verbose = 1, n_jobs = -1)
CV_log_clf.fit(X_train, y_train)

best_parameters = CV_log_clf.best_params_
print('The best parameters for using this model is', best_parameters)

In [None]:
#Logistic Regression with best hyperparameters
CV_log_clf = LogisticRegression(C = best_parameters['C'], 
                                penalty = best_parameters['penalty'], 
                                random_state = random_state)

CV_log_clf.fit(X_train, y_train)
y_pred = CV_log_clf.predict(X_test)
y_score = CV_log_clf.decision_function(X_test)

# Confusion maxtrix & metrics, plot the matrix and show its value
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Logistic Confusion matrix')
plt.savefig('6')
plt.show()

show_metrics()

# ROC curve, plot ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

In [None]:
#Logistic regression with Recursive Feature Elimination (RFE), apply RFE to eliminate from 30 to 15 features
log_clf = LogisticRegression(C = best_parameters['C'], 
                                 penalty = best_parameters['penalty'], 
                                 random_state = random_state)

selector = RFE(log_clf)
selector = selector.fit(X_train, y_train)

y_pred = selector.predict(X_test)
y_score = selector.predict_proba(X_test)[:,1]


# Confusion maxtrix & metrics, plot the matrix and show its value
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Logistic Confusion matrix')
plt.show()

show_metrics()

# ROC curve, plot ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

In [None]:
# support and ranking of RFE
print(selector.support_)
print(selector.ranking_)

In [None]:
#Learning curve of Logistic Regression with best hyperparameter
plot_learning_curve(CV_log_clf, 'Learning Curve For Logistic Model', X, y, (0.5,1), 10)
plt.savefig('7')
plt.show()

In [None]:
#Learning curve of Logistic Regression with RFE
plot_learning_curve(selector, 'Learning Curve For Logistic Model with RFE', X, y, (0.5,1), 10)
plt.show()

In [None]:
# Cross-validation metrics of Logistic Regression 
cross_log = cross_val_metrics(CV_log_clf)

In [None]:
# Cross-validation metrics of Logistic Regression with RFE
cross_selector = cross_val_metrics(selector)

In [None]:
# Select threshold for 100% recall
thresholds_adj = [0.08,0.1,0.2,0.3,0.4,0.5]

plt.figure(figsize = (15,15))

j = 1
for i in thresholds_adj:
    y_score = CV_log_clf.predict_proba(X_test)[:,1] > i
    
    
    plt.subplot(3,3,j)
    j += 1
    
    cm = confusion_matrix(y_test, y_score)
    
    tp = cm[1,1]
    fn = cm[1,0]
    fp = cm[0,1]
    tn = cm[0,0]

    print('Recall w/ threshold = %s :'%i, (tp/(tp+fn)))
    
    class_names = [0,1]
    plot_confusion_matrix(cm, 
                          classes=class_names, 
                          title='Threshold = %s'%i) 

In [None]:
# Prediction with recall = 100%
y_score = CV_log_clf.predict_proba(X_test)[:,1] > 0.3
cm = confusion_matrix(y_test, y_score)
class_names = [0,1]
show_metrics()

In [None]:
#Ensemble classifier
#Logistic Regression 
# Find the best hyperparameters (recall)
log2_clf = LogisticRegression(random_state = random_state)
param_grid = {
            'penalty' : ['l2','l1'],  
            'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            }

CV_log2_clf = GridSearchCV(estimator = log2_clf, param_grid = param_grid , scoring = 'recall', verbose = 1, n_jobs = -1)
CV_log2_clf.fit(X_train, y_train)

best_parameters = CV_log2_clf.best_params_
print('The best parameters for using this model is', best_parameters)

In [None]:
# Logistic Regression with best hyperparameters (recall)
CV_log2_clf = LogisticRegression(C = best_parameters['C'], 
                                 penalty = best_parameters['penalty'], 
                                 random_state = random_state)


CV_log2_clf.fit(X_train, y_train)

y_pred = CV_log2_clf.predict(X_test)
y_score = CV_log2_clf.decision_function(X_test)
# Confusion maxtrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]

In [None]:
# Cross-validation metrics of logistic regression ensemble classifier
cross_val_metrics(CV_log2_clf)

In [None]:
#Voting Classifier
voting_clf = VotingClassifier (
        estimators = [('log1', CV_log_clf), ('log_2', CV_log2_clf)],
                     voting='soft', weights = [1, 1])
    
voting_clf.fit(X_train,y_train)

y_pred = voting_clf.predict(X_test)
y_score = voting_clf.predict_proba(X_test)[:,1]

# Confusion maxtrix
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
show_metrics()

In [None]:
# Cross-validation metrics of voting classifier
cross_voting = cross_val_metrics(voting_clf)

In [None]:
#Learning curve of Voting classifier
plot_learning_curve(voting_clf, 'Learning Curve For Voting clf', X, y, (0.5,1), 10)
plt.savefig('9')
plt.show()

In [None]:
# Select Threshold with recall = 100%
thresholds_adj = [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]

plt.figure(figsize = (15,15))

j = 1
for i in thresholds_adj:
    y_score = voting_clf.predict_proba(X_test)[:,1] > i
    
    
    plt.subplot(3,3,j)
    j += 1
    
    cm = confusion_matrix(y_test, y_score)
    
    tp = cm[1,1]
    fn = cm[1,0]
    fp = cm[0,1]
    tn = cm[0,0]

    print('Recall w/ threshold = %s :'%i, (tp/(tp+fn)))
    
    class_names = [0,1]
    plot_confusion_matrix(cm, 
                          classes=class_names, 
                          title='Threshold = %s'%i)

In [None]:
# Voting classifier with recall = 100%
y_score = voting_clf.predict_proba(X_test)[:,1] > 0.4
cm = confusion_matrix(y_test, y_score)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes = class_names, 
                      title = 'Ensemble Clf CM : recall = 100%')
plt.savefig('8')
plt.show()

show_metrics()

# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

In [None]:
#Support Vector Machine (SVM)
#Define variables
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Define output results
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [None]:
#Linear Kernel SVM
from sklearn.svm import LinearSVC

linear_kernel_SVM = LinearSVC(loss='hinge', dual=True)
linear_kernel_SVM.fit(X_train, y_train)

print_score(linear_kernel_SVM, X_train, y_train, X_test, y_test, train=True)
print_score(linear_kernel_SVM, X_train, y_train, X_test, y_test, train=False)

In [None]:
#Cross-validation metrics of linear kernel SVM
cross_val_metrics(linear_kernel_SVM)

In [None]:
#Polynomial Kernel SVM
from sklearn.svm import SVC

# The hyperparameter coef0 controls how much the model is influenced by high degree ploynomials 
polynomial_kernel_SVM = SVC(kernel='poly', degree=2, gamma='auto', coef0=1, C=5)
polynomial_kernel_SVM.fit(X_train, y_train)

print_score(polynomial_kernel_SVM, X_train, y_train, X_test, y_test, train=True)
print_score(polynomial_kernel_SVM, X_train, y_train, X_test, y_test, train=False)

In [None]:
#Cross-validation metrics of polynomial kernel SVM
cross_val_metrics(polynomial_kernel_SVM)

In [None]:
#Radial Kernel SVM
radial_kernel_SVM = SVC(kernel='rbf', gamma=0.5, C=0.1)
radial_kernel_SVM.fit(X_train, y_train)

print_score(radial_kernel_SVM, X_train, y_train, X_test, y_test, train=True)
print_score(radial_kernel_SVM, X_train, y_train, X_test, y_test, train=False)

In [None]:
#cross validation metrics of radial kernel SVM
cross_val_metrics(radial_kernel_SVM)

In [None]:
#Comparison of Cross-validation metrics
models_metrics = {'Logistic_regression': [0.986, 0.962,0.974,0.981], 
                 'Logistic_regression_with_RFE': [0.971, 0.939,0.955,0.966],
                 'Ensemble_logistic_regression' : [0.963,0.958,0.960,0.970],
                 'Voting_classifier' : [0.973,0.958,0.960,0.970],
                  'Linear_kernel_SVM' : [0.977,0.943,0.959,0.970],
                  'Polynomial_kernel_SVM' :[0.986,0.958,0.971,0.979],
                }
df = pd.DataFrame(data = models_metrics)
df.rename(index={0:'Precision',1:'Recall',2:'F1_Score', 3:'Accuracy'}, 
                 inplace=True)
ax = df.plot(kind='bar', figsize = (12,6), ylim = (0.92, 0.99), 
        color = ['blue', 'red', 'green', 'yellow','purple','orange'],
        rot = 0, title ='Comparison of Cross-Validation Metrics',
        edgecolor = 'grey', alpha = 0.5)
ax.legend(frameon=False, loc='upper center', ncol=2)
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.01, p.get_height() * 1.0005))
plt.show()