### Libraries

In [None]:
#########################################

import warnings
warnings.filterwarnings("ignore")

###############################

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 

#Performance metrics

from sklearn import metrics

#Classifiers

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#Mutual Information - Feature Selection

from sklearn.feature_selection import mutual_info_classif

#confusion matrix

from sklearn.metrics import confusion_matrix

#Cross Validation and Best Parameters search

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV

#Plots

from matplotlib import pyplot as plt
import seaborn as sb
import plotly.express as px

#Data Process

import zipfile
import imageio
from PIL import Image
from scipy.stats import skew
from scipy.stats import kurtosis

#Data Normalization

from sklearn import preprocessing

### Data generator from MRI images

In [None]:
# Open the MRI archive and list its member names; both `file` and
# `images_names` are reused by the cells that follow.
file = zipfile.ZipFile('MRI.zip')
images_names = file.namelist()

print(f'{len(images_names)} MRI')

In [None]:
# One MRI of class 0
# Pillow opens images lazily: only the header is read by Image.open().
# Force the pixel data to be read while the archive member is still open,
# otherwise `image` cannot be used once the member has been closed.
with file.open('brain_tumor_dataset/no/1 no.jpeg') as image_file:
    image = Image.open(image_file)
    image.load()

In [None]:
def data_generator(file_names):
    """Build a table of first-order intensity statistics, one row per image.

    Every name in *file_names* is opened from the module-level ``file``
    archive, decoded with Pillow, converted to an array of its pixel data
    and summarised by mean, variance, standard deviation, skewness and
    kurtosis.

    Parameters
    ----------
    file_names : iterable of str
        Archive member names. A name containing ``'no'`` is labelled 0;
        otherwise a name containing ``'yes'`` is labelled 1.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``Mean``, ``Variance``, ``Standard Deviation``,
        ``Skewness``, ``Kurtosis`` and ``Label`` (empty when *file_names* is
        empty). The string ``"error"`` is returned as soon as a name matches
        neither label.
    """
    df_columns = ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis', 'Label']
    arrays_values = []

    for name in file_names:
        # The 'no' test comes first, so a name containing both substrings
        # is labelled 0.
        if 'no' in name:
            label = 0
        elif 'yes' in name:
            label = 1
        else:
            return "error"

        # Read the pixel data while the archive member is open, then let the
        # context manager close it.
        with file.open(name) as image_file:
            image = Image.open(image_file)
            image_array = np.array(image.getdata())

        # skew/kurtosis return an array when the pixel data has more than one
        # column (e.g. multi-band images); only the first entry is kept.
        skewness = skew(image_array)
        if isinstance(skewness, np.ndarray):
            skewness = skewness[0]
        kurto = kurtosis(image_array)
        if isinstance(kurto, np.ndarray):
            kurto = kurto[0]

        arrays_values.append([
            np.mean(image_array),
            np.var(image_array),
            np.std(image_array),
            skewness,
            kurto,
            label,
        ])

    # Built once, after the loop, so an empty input still yields a frame.
    return pd.DataFrame(arrays_values, columns=df_columns)

In [None]:
# Compute the feature table for the archive members, then close the archive.
df = data_generator(images_names)
file.close()

### Plot the distribution of every feature 

In [None]:
# Histograms of the first-order features, one panel per feature, with the
# two classes overlaid via `hue`.
plt.figure(figsize=(12, 15))
plt.suptitle('Distributions of first order features for both classes', fontsize=14, y=0.91)

# (subplot position, column, extra histplot options). Kurtosis takes the
# whole third row and is drawn on a log scale.
first_order_panels = [
    ((3, 2, 1), 'Mean', {}),
    ((3, 2, 2), 'Variance', {}),
    ((3, 2, 3), 'Standard Deviation', {}),
    ((3, 2, 4), 'Skewness', {}),
    ((3, 1, 3), 'Kurtosis', {'log_scale': True}),
]

for panel_position, panel_column, panel_options in first_order_panels:
    plt.subplot(*panel_position)
    sb.histplot(data=df, x=panel_column, hue='Label', **panel_options)
    plt.legend(['Tumor', 'No Tumor'])


In [None]:
# Histograms of the second-order features, one panel per feature, with the
# two classes overlaid via `hue`.
plt.figure(figsize=(12, 15))
plt.suptitle('Distributions of second order features for both classes', fontsize=14, y=0.91)

# (subplot position, column, extra histplot options). Contrast is drawn on a
# log scale; Correlation takes the whole fourth row.
second_order_panels = [
    ((4, 2, 1), 'Contrast', {'log_scale': True}),
    ((4, 2, 2), 'Energy', {}),
    ((4, 2, 3), 'ASM', {}),
    ((4, 2, 4), 'Entropy', {}),
    ((4, 2, 5), 'Homogeneity', {}),
    ((4, 2, 6), 'Dissimilarity', {}),
    ((4, 1, 4), 'Correlation', {}),
]

for panel_position, panel_column, panel_options in second_order_panels:
    plt.subplot(*panel_position)
    sb.histplot(data=df, x=panel_column, hue='Label', **panel_options)
    plt.legend(['tumor', 'No Tumor'])

### Plot Functions

In [None]:
def Plot_3D(df):
    """Show a 3-D scatter plot of *df* and save it as an HTML file.

    Axes are picked by column position: first column on x, second on y,
    fourth on z, and the last column drives the colour. The plot is written
    to ``3dPlot_<name>.html`` where ``<name>`` is the first value of the
    ``'Name'`` column.
    """
    fig = px.scatter_3d(df, x=df.columns[0], y=df.columns[1], z=df.columns[3], color=df.columns[-1])
    fig.show()
    # Positional lookup: takes the first row whatever the index labels are
    # (a label lookup of 0 fails on filtered or re-indexed frames).
    fig.write_html("3dPlot_{}.html".format(df['Name'].iloc[0]))

In [None]:
def BarPlot(df):
    """Draw a seaborn bar plot of the 'Accuracy' column against 'Name'.

    *df* must provide the columns ``'Name'`` (plotted on y) and
    ``'Accuracy'`` (plotted on x). Nothing is returned; the bars are drawn
    on the current matplotlib axes.
    """
    sb.barplot(y="Name", x="Accuracy", data=df)

### Model

In [None]:
def feature_selection(X_train, y_train):
    """Rank the columns of *X_train* by mutual information with *y_train*.

    Returns the names of at most 13 columns, ordered from the highest
    mutual-information score to the lowest.
    """
    mi_scores = mutual_info_classif(X_train, y_train)

    # argsort is ascending: reverse it, then keep the leading 13 positions.
    top_positions = mi_scores.argsort()[::-1][:13]

    ranked_names = []
    for position in top_positions:
        ranked_names.append(X_train.columns[position])
    return ranked_names

In [None]:
def best_model(X,y,K,randomstate = None):
    """Grid-search KNN, Random Forest and SVM over several feature-subset sizes.

    The columns of *X* are ranked once with ``feature_selection``. Then, for
    every classifier and every subset size in ``[2, 5, 10, 13]``, a
    ``GridSearchCV`` scored on accuracy with stratified K-fold
    cross-validation is fitted on the top-ranked columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (columns are selected by name).
    y : array-like
        Training labels, aligned with the rows of *X*.
    K : int
        Number of stratified folds.
    randomstate : int or None, optional
        Passed as ``random_state`` to the shuffling ``StratifiedKFold``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per (classifier, subset size) pair, classifiers in the
        order KNN, Random Forest, SVM and sizes in ascending order. Each
        frame holds one row per parameter combination with the parameter
        columns, ``'Accuracy'`` (mean CV test score), ``'Name'`` and
        ``'n_features'``.
    """
    ranked_features = feature_selection(X, y)

    # (name, estimator, parameter grid) for every classifier that is tuned.
    search_space = [
        ("Nearest_Neighbors",
         KNeighborsClassifier(),
         {"n_neighbors": np.arange(1, 15),
          "weights": ["uniform", "distance"],
          "p": [1, 2]}),
        ("Random_Forest",
         RandomForestClassifier(),
         {"max_depth": range(2, 100, 1)}),
        # NOTE(review): np.arange(0.001, 0.11, 0.1) yields only two values of
        # C (0.001 and 0.101) - confirm this is the intended grid.
        ("SVM",
         SVC(),
         {"C": np.arange(0.001, 0.11, 0.1),
          "degree": np.arange(1, 6),
          "kernel": ["linear", "poly", "rbf", "sigmoid"]}),
    ]

    # Number of best features taken from the mutual-information ranking.
    feature_counts = [2, 5, 10, 13]

    all_results = []

    for name, clf, param_grid in search_space:
        for n_features in feature_counts:
            # Keep the n_features best-ranked columns.
            X_subset = X[ranked_features[:n_features]]

            # Stratified K-fold CV and hyper-parameter tuning. The splitter
            # is handed over as-is so the folds are built from the data
            # passed to fit(), i.e. this function's own X and y.
            grid = GridSearchCV(
                estimator=clf,
                param_grid=param_grid,
                scoring='accuracy',
                cv=StratifiedKFold(n_splits=K, shuffle=True, random_state=randomstate),
            )
            grid.fit(X_subset, y)

            grid_results = pd.concat(
                [pd.DataFrame(grid.cv_results_["params"]),
                 pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])],
                axis=1,
            )
            grid_results['Name'] = name
            grid_results['n_features'] = n_features

            print("For {}  classifier the best parameters are {} n_features = {} with a score of {}  ".format(name,grid.best_params_,n_features, grid.best_score_))

            all_results.append(grid_results)
    return all_results

### Results 

#### CV - Feature selection - Hyperparameter tuning results

In [None]:
# Separate the target column from the features, then hold out 30% of the
# rows as a test set.
y = scaled_df['Labels']
X = scaled_df.drop(columns=['Labels'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [None]:
# Run the grid search on the training split with K=5 folds.
results = best_model(X_train,y_train,K=5,randomstate= None)

In [None]:
# best_model returns 12 frames: four feature-subset sizes for each of KNN,
# Random Forest and SVM, in that order. The slices must be contiguous so
# that no frame is dropped.
knn_results = pd.concat(results[:4], ignore_index = True)
rf_results = pd.concat(results[4:8], ignore_index = True)
svm_results = pd.concat(results[8:], ignore_index = True)

In [None]:
# 3-D scatter of the Random Forest grid-search results.
Plot_3D(rf_results)

In [None]:
# 3-D scatter of the KNN grid-search results.
Plot_3D(knn_results)

In [None]:
# 3-D scatter of the SVM grid-search results.
Plot_3D(svm_results)

In [None]:
# For each classifier, keep the grid-search row with the highest CV accuracy
# as a one-row frame.
maxscore_knn, maxscore_rf, maxscore_svm = (
    pd.DataFrame(grid_frame.loc[grid_frame['Accuracy'].idxmax()]).T
    for grid_frame in (knn_results, rf_results, svm_results)
)

max_scores = [maxscore_knn, maxscore_rf, maxscore_svm]

# Summary table (KNN, SVM, Random Forest) limited to name and accuracy.
summary_columns = ['Name', 'Accuracy']
best_results = pd.concat(
    [maxscore_knn[summary_columns],
     maxscore_svm[summary_columns],
     maxscore_rf[summary_columns]],
    ignore_index=True,
)

BarPlot(best_results)

#### Test Data results

In [None]:
# Refit KNN with the best hyper-parameters from the grid search and score it
# on the held-out test split.
# NOTE(review): the classifier is fitted on every column of X_train, while
# the hyper-parameters were selected for the feature subset recorded in
# maxscore_knn['n_features'] - confirm this is intended.
knn_clf = KNeighborsClassifier(n_neighbors = maxscore_knn['n_neighbors'].iloc[0],
                               weights = maxscore_knn['weights'].iloc[0],
                               p = maxscore_knn['p'].iloc[0])
knn_clf.fit(X_train,y_train)

y_pred_knn = knn_clf.predict(X_test)

knn_accuracy = metrics.accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", knn_accuracy)

# Column labels follow the order of the row values: name first, then score.
a = [['Nearest_Neighbors', knn_accuracy]]
knn_test_result = pd.DataFrame(a , columns = ['Name','Accuracy'])

confusion_matrix(y_test, y_pred_knn)

In [None]:
# Refit the Random Forest with the best max_depth from the grid search and
# score it on the held-out test split.
# NOTE(review): the classifier is fitted on every column of X_train, while
# max_depth was selected for the feature subset recorded in
# maxscore_rf['n_features'] - confirm this is intended.
rf_clf = RandomForestClassifier(max_depth = maxscore_rf['max_depth'].iloc[0])

rf_clf.fit(X_train,y_train)

y_pred_rf = rf_clf.predict(X_test)

rf_accuracy = metrics.accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

# Column labels follow the order of the row values: name first, then score.
b = [['Random_Forest', rf_accuracy]]
rf_test_result = pd.DataFrame(b , columns = ['Name','Accuracy'])

confusion_matrix(y_test, y_pred_rf)

In [None]:
# Refit the SVM with the best hyper-parameters from the grid search and score
# it on the held-out test split.
# NOTE(review): the classifier is fitted on every column of X_train, while
# the hyper-parameters were selected for the feature subset recorded in
# maxscore_svm['n_features'] - confirm this is intended.
svm_clf = SVC( C = maxscore_svm['C'].iloc[0],
             degree = maxscore_svm['degree'].iloc[0],
             kernel = maxscore_svm['kernel'].iloc[0])

svm_clf.fit(X_train,y_train)

y_pred_svm = svm_clf.predict(X_test)

svm_accuracy = metrics.accuracy_score(y_test, y_pred_svm)
print("SVM  Accuracy:", svm_accuracy)

# Column labels follow the order of the row values: name first, then score.
c = [['SVM', svm_accuracy]]
svm_test_result = pd.DataFrame(c , columns = ['Name','Accuracy'])

confusion_matrix(y_test, y_pred_svm)

In [None]:
# Stack the per-classifier test-set results into a single table.
test_results = pd.concat([knn_test_result,rf_test_result,svm_test_result],ignore_index = True)

In [None]:
# Display the combined test-set results table.
test_results

In [None]:
# Bar plot of the test-set accuracy of each classifier.
BarPlot(test_results)

In [None]:
# Horizontal bar chart of the values in random_dict, one bar per key.
f, ax = plt.subplots(figsize=(10,5))
plt.barh(range(len(random_dict)), list(random_dict.values()), align='center',color=[ 'orange' ])
plt.yticks(range(len(random_dict)), list(random_dict.keys()))
plt.xlabel('Coefficients')
plt.ylabel('Features')
# Invert the y-axis exactly once so the first key is drawn at the top.
# plt.gca() is this same Axes, so inverting through both handles would
# cancel out and leave the axis in its default orientation.
ax.invert_yaxis()