In [1]:
import numpy as np
import operator
import os
import pandas as pd

# Machine learning
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB  # naive bayes
from sklearn.svm import SVC, LinearSVC  # support vector machines

In [2]:
# Load the normalized feature table from the Data folder under the current
# working directory. os.path.join assembles the path with the platform's own
# separator instead of a hard-coded '/'.
normal_data = pd.read_csv(os.path.join(os.getcwd(), 'Data', 'brain_tumour_normalized.csv'))
normal_data.head()

Unnamed: 0,Image,Mean,Variance,Standard Deviation,Entropy,Skewness,Kurtosis,Contrast,Energy,ASM,Homogeneity,Dissimilarity,Correlation,Coarseness,Class
0,Image1,0.194705,0.212023,0.443074,0.274801,0.068211,0.010937,0.028236,0.47541,0.246092,0.603108,0.139694,0.981764,0.0,0
1,Image2,0.261489,0.276124,0.510114,0.674843,0.052278,0.007693,0.017951,0.797096,0.648383,0.7738,0.093527,0.997417,0.0,0
2,Image3,0.219003,0.392326,0.6142,0.001487,0.090618,0.016478,0.02328,0.012719,0.001173,0.23076,0.195261,0.972855,0.0,1
3,Image4,0.1773,0.329007,0.55975,0.001513,0.108202,0.021559,0.043805,0.012908,0.001192,0.196137,0.258588,0.941475,0.0,1
4,Image5,0.218223,0.24984,0.483677,0.370574,0.068403,0.011067,0.050836,0.56486,0.338854,0.560862,0.226679,0.960995,0.0,0


## Feature Importance

In [3]:
# Feature importance using an Extra Trees classifier (similar to Random Forest).
# Columns 1..13 of normal_data are the numeric features; 'Class' is the label.
cols = list(normal_data.columns[1 : 14])
X = normal_data[cols].values
Y = normal_data['Class'].values

# Fit the tree ensemble. Extra Trees is randomized, so a fixed random_state is
# needed for the importance ranking to be identical on every Restart & Run All.
model = ExtraTreesClassifier(n_estimators = 10, random_state = 1)
model.fit(X, Y)
scores = model.feature_importances_

# Pair each feature name with its importance score (same order as `cols`).
scores_dict = dict(zip(cols, scores))

# Display the features in descending order of importance.
scores_dict_sorted = sorted(scores_dict.items(), key = operator.itemgetter(1), reverse = True)
scores_dict_sorted

[('Entropy', 0.2864916083846457),
 ('Energy', 0.2513684740150152),
 ('Homogeneity', 0.1759162890497014),
 ('ASM', 0.15949605750921872),
 ('Variance', 0.024371690506652506),
 ('Kurtosis', 0.01968395620798589),
 ('Dissimilarity', 0.019014003031475015),
 ('Skewness', 0.01738275622991655),
 ('Mean', 0.01701186897705864),
 ('Contrast', 0.010465650983443655),
 ('Standard Deviation', 0.009911171229207634),
 ('Correlation', 0.007093757576436962),
 ('Coarseness', 0.0017927162992420673)]

## Create data partitions

In [4]:
# Dataset split: X keeps the first 14 columns (image name plus the features),
# Y is the last column (the class label). 20% of the rows are held out for
# testing, with a fixed seed so the partition is the same on every run.
X = normal_data.iloc[:, : 14]
Y = normal_data.iloc[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    random_state = 1,
)

# Shapes in the order: X_train, Y_train, X_test, Y_test.
print(*(part.shape for part in (X_train, Y_train, X_test, Y_test)))
X_test.head()

(3009, 14) (3009,) (753, 14) (753,)


Unnamed: 0,Image,Mean,Variance,Standard Deviation,Entropy,Skewness,Kurtosis,Contrast,Energy,ASM,Homogeneity,Dissimilarity,Correlation,Coarseness
3202,Image3203,0.411857,0.201923,0.431632,0.168981,0.009725,0.000958,0.01672,0.355911,0.145127,0.591544,0.101937,0.941657,0.02832
1135,Image1136,0.215528,0.428803,0.643585,0.008348,0.096743,0.01811,0.041743,0.052148,0.006699,0.266099,0.201322,0.965428,0.0
1501,Image1502,0.415565,0.517418,0.710152,0.036227,0.040976,0.005653,0.056216,0.141729,0.02988,0.392557,0.189193,0.929059,0.0
1301,Image1302,0.078619,0.114534,0.317399,0.004659,0.149218,0.034841,0.007935,0.033094,0.003671,0.273041,0.115757,0.921572,0.0
1220,Image1221,0.364182,0.21488,0.446262,0.288207,0.017398,0.002006,0.032085,0.487971,0.25823,0.69025,0.097081,0.916158,0.0


## Model 1: Logistic Regression

In [5]:
def logistic_regression(X_train, y_train, X_test):
    """Fit a logistic-regression classifier and label the test images.

    Parameters
    ----------
    X_train : DataFrame holding (at least) the feature columns listed below.
    y_train : class labels for the rows of ``X_train``.
    X_test : DataFrame holding the same feature columns plus an 'Image' column.

    Returns
    -------
    DataFrame indexed like ``X_test`` with two columns: 'Image' (copied from
    ``X_test``) and 'Class' (the predicted label for that row).
    """
    # Only these columns are fed to the model; 'Image' is an identifier.
    feature_names = [
        'Mean', 'Variance', 'Standard Deviation', 'Entropy', 'Skewness',
        'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity',
        'Dissimilarity', 'Correlation', 'Coarseness',
    ]

    classifier = LogisticRegression()
    classifier.fit(X_train[feature_names], y_train)

    # Predictions come back in row order, so they line up positionally with X_test.
    predicted_labels = list(classifier.predict(X_test[feature_names]))

    return pd.DataFrame({'Image': X_test['Image'], 'Class': predicted_labels})

# Train on the training partition and predict a class for every test image.
x = logistic_regression(X_train = X_train, y_train = Y_train, X_test = X_test)
x

Unnamed: 0,Image,Class
3202,Image3203,0
1135,Image1136,1
1501,Image1502,1
1301,Image1302,1
1220,Image1221,0
...,...,...
2594,Image2595,0
586,Image587,0
2818,Image2819,1
876,Image877,0


In [6]:
# Accuracy of the logistic-regression predictions against the held-out labels.
pred = x['Class']

print('Accuracy Score:', metrics.accuracy_score(Y_test, pred), sep = '\n')

Accuracy Score:
0.9933598937583001


## Model 2: SVM (Linear and RBF)

In [7]:
def support_vector_machines(svm_type, x_train, y_train, x_test):
    """Fit a support vector classifier and label the test images.

    Parameters
    ----------
    svm_type : 'linear' for ``LinearSVC`` or 'rbf' for ``SVC`` with an RBF kernel.
    x_train : DataFrame holding (at least) the feature columns listed below.
    y_train : class labels for the rows of ``x_train``.
    x_test : DataFrame holding the same feature columns plus an 'Image' column.

    Returns
    -------
    (df_res, svm_pred) where ``df_res`` is a DataFrame indexed like ``x_test``
    with 'Image' and 'Class' columns, and ``svm_pred`` is the array of
    predicted labels returned by the classifier.

    Raises
    ------
    ValueError if ``svm_type`` is neither 'linear' nor 'rbf'.
    """
    # Only these columns are fed to the model; 'Image' is an identifier.
    feature_names = ['Mean', 'Variance', 'Standard Deviation', 'Entropy', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']

    # linear SVM
    if svm_type == 'linear':
        svc = LinearSVC()
    # SVM with RBF kernel
    elif svm_type == 'rbf':
        svc = svm.SVC(kernel = 'rbf', C = 1, gamma = 'auto')
    else:
        # Fail loudly instead of hitting an unbound `svc` further down.
        raise ValueError("svm_type must be 'linear' or 'rbf', got {!r}".format(svm_type))

    svc.fit(x_train[feature_names], y_train)
    # Use the x_test argument, not the notebook-level X_test, so the function
    # predicts on whatever frame the caller actually passed in.
    svm_pred = svc.predict(x_test[feature_names])

    df_res = pd.DataFrame()
    df_res['Image'] = x_test['Image']
    df_res['Class'] = svm_pred

    return df_res, svm_pred

In [8]:
# Linear SVM: train on the training partition, predict on the test partition.
x, svm_pred = support_vector_machines(
    svm_type = 'linear',
    x_train = X_train,
    y_train = Y_train,
    x_test = X_test,
)
x

Unnamed: 0,Image,Class
3202,Image3203,0
1135,Image1136,1
1501,Image1502,1
1301,Image1302,1
1220,Image1221,0
...,...,...
2594,Image2595,0
586,Image587,0
2818,Image2819,1
876,Image877,0


In [9]:
# Accuracy of the linear SVM predictions against the held-out labels.
print('Accuracy Score:', metrics.accuracy_score(Y_test, svm_pred), sep = '\n')

Accuracy Score:
0.9933598937583001


In [10]:
# SVM with RBF kernel: train on the training partition, predict on the test partition.
y, rbf_svm_pred = support_vector_machines(
    svm_type = 'rbf',
    x_train = X_train,
    y_train = Y_train,
    x_test = X_test,
)
y

Unnamed: 0,Image,Class
3202,Image3203,0
1135,Image1136,1
1501,Image1502,1
1301,Image1302,1
1220,Image1221,0
...,...,...
2594,Image2595,0
586,Image587,0
2818,Image2819,1
876,Image877,0


In [11]:
# Accuracy of the RBF-kernel SVM predictions against the held-out labels.
print('Accuracy Score:', metrics.accuracy_score(Y_test, rbf_svm_pred), sep = '\n')

Accuracy Score:
0.9933598937583001


## Model 3: Naive Bayes

In [12]:
def naive_bayes(X_train, Y_train, X_test):
    """Fit a Gaussian naive Bayes classifier and label the test images.

    Parameters
    ----------
    X_train : DataFrame holding (at least) the feature columns listed below.
    Y_train : class labels for the rows of ``X_train``.
    X_test : DataFrame holding the same feature columns plus an 'Image' column.

    Returns
    -------
    (df_res, predicted) where ``df_res`` is a DataFrame indexed like ``X_test``
    with 'Image' and 'Class' columns, and ``predicted`` is the list of
    predicted labels in the row order of ``X_test``.
    """
    # Only these columns are fed to the model; 'Image' is an identifier.
    feature_names = [
        'Mean', 'Variance', 'Standard Deviation', 'Entropy', 'Skewness',
        'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity',
        'Dissimilarity', 'Correlation', 'Coarseness',
    ]

    classifier = GaussianNB()
    classifier.fit(X_train[feature_names], Y_train)

    # Convert the prediction array to a plain list before returning it.
    predicted = list(classifier.predict(X_test[feature_names]))

    df_res = pd.DataFrame({'Image': X_test['Image'], 'Class': predicted})
    return df_res, predicted

In [13]:
# Naive Bayes: train on the training partition, predict on the test partition.
z, nb_pred = naive_bayes(X_train = X_train, Y_train = Y_train, X_test = X_test)
z

Unnamed: 0,Image,Class
3202,Image3203,0
1135,Image1136,1
1501,Image1502,1
1301,Image1302,1
1220,Image1221,0
...,...,...
2594,Image2595,0
586,Image587,0
2818,Image2819,1
876,Image877,0


In [14]:
# Accuracy of the naive Bayes predictions against the held-out labels.
print('Accuracy Score:', metrics.accuracy_score(Y_test, nb_pred), sep = '\n')

Accuracy Score:
0.9907038512616202
