# Klasyfikacja spektroskopii MRI guzów mózgu - wektory widma

In [0]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale, MultiLabelBinarizer
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'whitegrid' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [0]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = MultiLabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

def plot_histograms( df , variables , n_rows , n_cols ):
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax=fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) , ) ) ) # + ' ' + var_name ) #var_name+" Distribution")
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot, var , shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , row = row , col = col )
    facet.map( sns.barplot , cat , target  )
    facet.add_legend()

def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    var = [] ; l = [] ; t = []
    for x in df:
        var.append( x )
        l.append( len( pd.value_counts( df[ x ] ) ) )
        t.append( df[ x ].dtypes )
    levels = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )
    levels.sort_values( by = 'Levels' , inplace = True )
    return levels

def plot_variable_importance( X , y ):
    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    imp = pd.DataFrame( 
        model.feature_importances_  , 
        columns = [ 'Importance' ] , 
        index = X.columns 
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    imp[ : 10 ].plot( kind = 'barh' )
    print (model.score( X , y ))
    
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='blue', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
  

In [0]:
from google.colab import files
uploaded = files.upload()

Saving all_spectra.csv to all_spectra (1).csv


In [0]:
import io
patients = pd.read_csv(io.BytesIO(uploaded['all_spectra.csv']))
patients = patients.iloc[:,1:]
print ('Datasets:', patients.shape)

Datasets: (210, 995)


In [0]:
patients.head()

Unnamed: 0,4.00735,3.99966999999999,3.992,3.98433,3.97665,3.96898,3.96130999999999,3.95363,3.94596,3.93829,3.93060999999999,3.92294,3.91526999999999,3.90759,3.89992,3.89225,3.88456999999999,3.8769,3.86922,3.86155,3.85388,3.8462,3.83853,3.83086,3.82318,3.81550999999999,3.80784,3.80016,3.79249,3.78482,3.77714,3.76946999999999,3.7618,3.75412,3.74645,3.73878,3.7311,3.72343,3.71576,3.70808,...,0.490981,0.483308,0.475634,0.467960999999999,0.460287,0.452614,0.444939999999999,0.437267,0.429593,0.421919999999999,0.414246,0.406573,0.3989,0.391226,0.383553,0.375879,0.368206,0.360532,0.352859,0.345184999999999,0.337512,0.329838,0.322165,0.314490999999999,0.306818,0.299145,0.291471,0.283798,0.276124,0.268451,0.260777,0.253104,0.24543,0.237757,0.230082999999999,0.22241,0.214736,0.207063,0.199389,GROUP
0,60020.5,62941.5,61411.3,60243.2,53853.8,62858.0,52778.9,53241.0,52999.5,60708.4,54799.2,53031.0,55083.5,62493.1,58099.6,60783.2,64882.1,59415.0,64061.3,62793.7,61492.7,62761.2,65574.6,62845.5,76226.3,74759.8,68157.7,64115.1,72202.8,67203.9,77189.7,82432.2,77348.5,79834.8,75258.3,81067.0,87411.3,81955.1,86149.7,86053.1,...,5234.48,7483.08,4781.48,11353.2,15937.3,10455.5,3051.29,4244.13,6408.17,4960.11,7432.67,8833.03,7390.92,5876.44,3224.4,6652.17,11150.6,10277.1,7999.14,7335.15,4854.76,8217.5,5680.25,6734.34,8405.47,6060.81,4991.76,1516.91,2397.21,3094.87,3009.05,2780.07,1939.72,2628.07,3595.77,7416.75,5704.25,11161.6,13346.1,1
1,112417.0,116841.0,111266.0,110678.0,114144.0,125850.0,117406.0,116452.0,117306.0,119900.0,114514.0,124048.0,116240.0,120321.0,115291.0,124948.0,108573.0,123812.0,118897.0,119934.0,117362.0,110887.0,122271.0,102731.0,110689.0,113182.0,98854.1,105521.0,110745.0,114264.0,100474.0,110334.0,110976.0,110642.0,113286.0,104138.0,116238.0,105515.0,117549.0,109129.0,...,2782.33,4962.92,10323.8,2464.08,12557.6,14529.0,1034.98,7595.34,3652.98,7379.48,3837.53,14942.4,10421.6,4322.53,6001.98,6929.9,6858.85,4135.9,9613.21,4980.96,6306.68,5299.14,6679.51,8297.21,10399.7,6989.16,3733.01,4163.39,4201.41,5414.39,8343.4,15572.4,15878.8,2765.64,11573.8,11264.0,5990.72,2771.23,13638.8,1
2,27951.5,27496.6,31282.8,26040.2,23606.4,31893.2,20343.9,26532.8,34812.3,30573.6,29773.1,32680.1,29637.2,28455.8,24835.4,24892.9,29514.7,30936.5,28257.3,30327.0,24468.6,32951.6,26251.5,33308.6,32622.9,26214.0,29753.0,34704.1,23983.1,36066.7,33833.8,32540.5,28984.1,30110.5,28261.3,35042.9,32904.9,39135.2,28524.9,41938.1,...,9493.71,5125.33,4063.8,3834.41,3274.22,3977.55,3755.68,4963.32,5437.73,5596.13,1674.81,5219.51,4515.32,6172.47,8028.99,6903.93,2238.02,8900.22,4757.92,6763.76,6543.35,195.151,3720.11,4871.7,2817.07,542.851,2287.22,6541.02,2207.93,7662.67,9001.54,3375.24,7473.72,6247.66,11492.2,11365.1,9450.09,8436.91,4569.64,1
3,102738.0,99305.2,99138.5,113538.0,94162.8,101745.0,93956.1,93866.2,90506.0,96366.9,94243.4,89279.3,93755.2,80222.2,87206.3,84250.6,77852.1,81077.9,83956.1,77812.3,72751.5,76275.5,73175.5,73306.3,77251.2,68041.3,62198.3,66654.4,64285.5,57694.5,61834.4,69111.3,62946.5,68095.1,72891.6,67056.1,69521.3,62109.2,75045.8,69023.0,...,3769.77,5145.03,8966.27,7356.28,9073.99,7220.42,10414.5,5307.59,9011.29,12494.1,7036.59,5111.37,7596.22,4254.97,5991.72,1566.54,4255.19,3262.01,9438.56,6755.08,3227.51,647.248,4991.91,1400.83,5755.31,2634.58,5626.08,3339.59,3917.58,1554.44,10298.3,8083.7,9120.81,7240.97,7297.68,9289.24,10270.3,13817.3,3501.78,1
4,111617.0,107421.0,108820.0,103266.0,113830.0,106591.0,107624.0,113711.0,100982.0,108717.0,109950.0,99128.4,106156.0,100200.0,101442.0,107167.0,93999.5,94260.2,102563.0,92612.5,102428.0,101650.0,106469.0,103439.0,100632.0,116647.0,96536.3,98496.1,95783.0,94523.4,96981.9,89227.6,98467.3,97993.5,93900.3,82299.5,83661.3,95451.6,81037.0,91966.9,...,5652.42,6249.27,2145.33,792.804,1669.98,3111.19,10438.5,11215.0,4604.24,1107.22,2408.27,6732.28,7044.82,4394.0,2661.25,4822.74,8139.2,6490.98,5804.3,5177.41,3402.26,3585.48,4890.37,4415.69,2175.92,2009.88,3917.81,3330.26,3993.97,3848.81,4311.95,3438.56,3699.98,6966.99,5670.79,2663.03,8816.61,5712.42,5853.89,0


In [0]:
print(patients.isnull().sum())

4.00735             0
3.99966999999999    0
3.992               0
3.98433             0
3.97665             0
                   ..
0.22241             0
0.214736            0
0.207063            0
0.199389            0
GROUP               0
Length: 995, dtype: int64


In [0]:
# import KMeans
from sklearn.cluster import KMeans

In [0]:
X = patients.iloc[:,:-1]
Y = patients.iloc[:,-1]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [0]:
dt_model = DecisionTreeClassifier()
dt_model.fit( X_train , y_train )

print ('Training Accuracy ', dt_model.score( X_train , y_train ) , 'Validation Accuracy ', dt_model.score( X_test , y_test ))

Training Accuracy  1.0 Validation Accuracy  0.6190476190476191


In [0]:
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit( X_train , y_train )

print ('Training Accuracy ', dt_model.score( X_train , y_train ) , 'Validation Accuracy ', dt_model.score( X_test , y_test ))

Training Accuracy  1.0 Validation Accuracy  0.6190476190476191


In [0]:
gb_model = GradientBoostingClassifier()
gb_model.fit( X_train , y_train )

print ('Training Accuracy ', dt_model.score( X_train , y_train ) , 'Validation Accuracy ', dt_model.score( X_test , y_test ))

Training Accuracy  1.0 Validation Accuracy  0.6190476190476191
