## Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [5]:
df_train = pd.read_csv('training.csv')
df_train.shape

(4339, 6)

In [6]:
df_train.head()

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,w,120.362774,205.5,119.395349,416.581395,20.676318
1,w,124.739583,202.8,115.333333,354.333333,16.707151
2,w,134.691964,199.285714,116.857143,477.857143,22.496712
3,w,127.946309,178.368421,92.368421,278.473684,14.977453
4,w,135.431548,197.0,112.690476,532.952381,17.604193


In [7]:
df_test = pd.read_csv('testing.csv')
df_test.shape

(500, 6)

In [8]:
df_test.head()

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,n,109.828571,183.7,82.95,251.75,16.079412
1,n,130.284483,212.637931,96.896552,482.396552,21.210295
2,n,131.386555,185.466667,85.466667,419.666667,13.339998
3,n,141.345098,180.875,81.5,348.0625,18.213577
4,w,121.383408,218.357143,112.017857,426.607143,19.083196


## Create Train and Test data

In [9]:
X_train = df_train.iloc[:, 1:6].values
X_test = df_test.iloc[:, 1:6].values
Y_train = df_train.iloc[:, 0].values
Y_test = df_test.iloc[:, 0].values

In [10]:
X_train

array([[ 120.3627737 ,  205.5       ,  119.3953488 ,  416.5813953 ,
          20.67631835],
       [ 124.7395833 ,  202.8       ,  115.3333333 ,  354.3333333 ,
          16.70715083],
       [ 134.6919643 ,  199.2857143 ,  116.8571429 ,  477.8571429 ,
          22.49671178],
       ..., 
       [ 132.1238592 ,  465.34375   ,  295.3333333 ,  499.25      ,
          38.76296686],
       [ 124.4695364 ,  215.1526316 ,   98.48421053,  675.9157895 ,
          28.50654248],
       [ 125.171928  ,  559.048     ,  365.968     ,  439.272     ,
          15.39232155]])

In [11]:
X_test

array([[ 109.8285714 ,  183.7       ,   82.95      ,  251.75      ,
          16.0794123 ],
       [ 130.2844828 ,  212.637931  ,   96.89655172,  482.3965517 ,
          21.21029549],
       [ 131.3865546 ,  185.4666667 ,   85.46666667,  419.6666667 ,
          13.33999833],
       ..., 
       [ 119.0766871 ,  247.9512195 ,  113.3658537 ,  808.0243902 ,
          24.83005893],
       [ 107.9444444 ,  197.        ,   90.        ,  451.        ,
           8.2148874 ],
       [ 119.7319277 ,  182.2380952 ,   74.28571429,  301.6904762 ,
          22.94427836]])

In [12]:
Y_train

array(['w', 'w', 'w', ..., 'n', 'n', 'n'], dtype=object)

In [13]:
Y_test

array(['n', 'n', 'n', 'n', 'w', 'n', 'w', 'w', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'w', 'w', 'n', 'w', 'n', 'w', 'n', 'n', 'w', 'n',
       'w', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'w', 'n', 'n', 'n',
       'w', 'w', 'w', 'n', 'n', 'n', 'n', 'w', 'n', 'n', 'w', 'n', 'w',
       'w', 'w', 'n', 'n', 'n', 'n', 'w', 'n', 'w', 'n', 'w', 'w', 'w',
       'n', 'n', 'w', 'n', 'n', 'n', 'w', 'n', 'n', 'w', 'n', 'n', 'n',
       'w', 'n', 'w', 'w', 'n', 'w', 'n', 'w', 'w', 'w', 'n', 'w', 'w',
       'n', 'w', 'n', 'n', 'n', 'w', 'n', 'n', 'n', 'w', 'n', 'n', 'n',
       'w', 'n', 'w', 'w', 'w', 'w', 'n', 'w', 'w', 'w', 'w', 'w', 'w',
       'w', 'w', 'w', 'w', 'w', 'w', 'n', 'n', 'n', 'n', 'n', 'n', 'w',
       'n', 'n', 'n', 'w', 'n', 'n', 'w', 'w', 'w', 'w', 'w', 'n', 'n',
       'n', 'n', 'n', 'w', 'n', 'w', 'n', 'n', 'w', 'w', 'n', 'n', 'w',
       'n', 'n', 'n', 'n', 'n', 'n', 'w', 'w', 'w', 'n', 'n', 'w', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'w', 'w', 'w', 'n', 'w', 'n

## Preprocess the data

In [14]:
le_Y = LabelEncoder()
Y_train = le_Y.fit_transform(Y_train)
Y_test = le_Y.transform(Y_test)

In [15]:
Y_train

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [16]:
Y_test

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1,

In [17]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,4265
1,74


In [18]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,313
1,187


In [19]:
# Scale the Data
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [20]:
X_train

array([[-0.47097751, -0.46759814,  0.03464194, -0.76077836, -0.38595952],
       [-0.15229913, -0.51204208, -0.032273  , -1.16373661, -0.74656234],
       [ 0.57234026, -0.56988976, -0.00717077, -0.36411431, -0.22057497],
       ..., 
       [ 0.38535484,  3.80961698,  2.9329273 , -0.22562923,  1.25723046],
       [-0.17196142, -0.30870888, -0.30983424,  0.91800376,  0.32542412],
       [-0.12081983,  5.35205636,  4.09651583, -0.6138924 , -0.86601588]])

In [21]:
X_test

array([[-1.23797968, -0.82644183, -0.5657345 , -1.82780227, -0.80359301],
       [ 0.25142864, -0.35010266, -0.33598828, -0.33472878, -0.33744717],
       [ 0.33167121, -0.79736122, -0.5242766 , -0.74080608, -1.05247149],
       ..., 
       [-0.56461831,  0.23117948, -0.06468395,  1.77319907, -0.00858807],
       [-1.3751642 , -0.60751426, -0.44959748, -0.53797205, -1.5180929 ],
       [-0.51690982, -0.85050583, -0.70846418, -1.50451627, -0.1799131 ]])

## Decision Tree Classifier

In [22]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [23]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
Y_pred_dt = clf_dt.predict(X_test)

In [25]:
cm_dt = confusion_matrix(Y_pred_dt, Y_test)
cm_dt

array([[303,  66],
       [ 10, 121]], dtype=int64)

## Random Forest Classifier

In [26]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [27]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
Y_pred_rf = clf_rf.predict(X_test)

In [29]:
cm_rf = confusion_matrix(Y_pred_rf, Y_test)
cm_rf

array([[312, 110],
       [  1,  77]], dtype=int64)

## Naive Bayes Classifier

In [30]:
clf_nb = GaussianNB()

In [31]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [32]:
Y_pred_nb = clf_nb.predict(X_test)

In [33]:
cm_nb = confusion_matrix(Y_pred_nb, Y_test)
cm_nb

array([[304, 160],
       [  9,  27]], dtype=int64)

## KNN Classifier

In [34]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [35]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [36]:
Y_pred_knn = clf_knn.predict(X_test)

In [37]:
cm_knn = confusion_matrix(Y_pred_knn, Y_test)
cm_knn

array([[310, 160],
       [  3,  27]], dtype=int64)

## Logistic Regression

In [38]:
clf_lr = LogisticRegression()

In [39]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
Y_pred_lr = clf_lr.predict(X_test)

In [41]:
cm_lr = confusion_matrix(Y_pred_lr, Y_test)
cm_lr

array([[311, 181],
       [  2,   6]], dtype=int64)

## SVC Linear

In [42]:
clf_lsvc = SVC(kernel = "linear")

In [43]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [45]:
cm_lsvc = confusion_matrix(Y_pred_lsvc, Y_test)
cm_lsvc

array([[312, 182],
       [  1,   5]], dtype=int64)

## SVC Kernel

In [46]:
clf_ksvc = SVC(kernel = "rbf")

In [47]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [49]:
cm_ksvc = confusion_matrix(Y_pred_ksvc, Y_test)
cm_ksvc

array([[313, 148],
       [  0,  39]], dtype=int64)

## Accuracy of Various Classifiers

In [50]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.84799999999999998,
 'KNN': 0.67400000000000004,
 'KernelSVC': 0.70399999999999996,
 'LinearSVC': 0.63400000000000001,
 'LogReg': 0.63400000000000001,
 'NB': 0.66200000000000003,
 'RF': 0.77800000000000002}