## 0 - Import libraries

In [1]:
# import classical libraries
%matplotlib inline
%pylab inline
%autosave 60

Populating the interactive namespace from numpy and matplotlib


Autosaving every 60 seconds


In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
# import functions
from os import listdir
from os.path import isfile, join
from scipy.misc import imread
from IPython.display import Image

## 1 - Prepare the datasets

Now, we load the training (70%) and validation (30%) datasets.

In [7]:
# paths to the training and validation datasets
database = '/Users/adekunle/Projects/Others/Data_Science/Data_Science_for_Geosciences/Data/'
path_train = database + "training/"
path_validation = database + "validation/"

In [8]:
# class names
classes = ['F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O']

# initialization
X_train = []
y_train = []
X_validation = []
y_validation = []

In [9]:
# loop on images
for j in range(len(classes)):
    path = path_train + classes[j]
    files = [f for f in listdir(path) if isfile(join(path, f))]
    
    # training
    for i in range(len(files)):
        tmp = imread(path + '/' + files[i])
        X_train.append(ravel(tmp[0:450,0:450:,0]))
        y_train.append(classes[j])
    
    # validation
    path = path_validation + classes[j]
    files = [f for f in listdir(path) if isfile(join(path, f))]
    for i in range(len(files)):
        tmp = imread(path + '/' + files[i])
        X_validation.append(ravel(tmp[0:450,0:450:,0]))
        y_validation.append(classes[j])

`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.


In [10]:
# transform to array
X_train = asarray(X_train)
y_train = asarray(y_train)
X_validation = asarray(X_validation)
y_validation = asarray(y_validation)

## 2 - Datasets size

In [11]:
xt_len = len(X_train)
xv_len = len(X_validation)
print('X_train shape : ',X_train.shape)
print('y_train shape : ',y_train.shape)
print('X_validation shape : ',X_validation.shape)
print('y_validation shape : ',y_validation.shape)

X_train shape :  (1120, 202500)
y_train shape :  (1120,)
X_validation shape :  (480, 202500)
y_validation shape :  (480,)


## 3 - Multiple Classifier

In [12]:
# - import models
import scipy as sp
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [13]:
# Define the classifier list
classifiers = [NearestCentroid(),SVC(), KNeighborsClassifier(), QuadraticDiscriminantAnalysis(), 
               LinearDiscriminantAnalysis(solver="lsqr"),RandomForestClassifier()]
names = ["NC","SVM", "KNN", "QDA", "LDA","RDF"]

In [36]:
# Define the dictionnary of parameters to optimize
param_grids = [dict(shrink_threshold=np.arange(1,10,1)),
               dict(kernel=['rbf'], gamma=1.0*sp.arange(1,12,2), C= 1.0*sp.arange(1,12,2),degree=sp.arange(1,10)),
               dict(n_neighbors = sp.arange(1,40)), # number of neighbors for KNN
               dict(reg_param = sp.linspace(0,0.1,30)), # Regularization parameter for QDA
               dict(shrinkage = sp.linspace(0,0.5,30)), # Regularization parameter for LDA
               dict(max_depth=np.arange(1,15,1),n_estimators = np.arange(1,100,10)), # Number of depths for  RDF
]

In [17]:
# - define function to run multiple model
def run_multiple_models(X_train_trans,X_validation_trans):   
    # Run all classifiers
    global classifier,name,param_grid
    global y_train,y_validation
    for classifier, name, param_grid in zip(classifiers, names, param_grids):
        grid = GridSearchCV(classifier, param_grid=param_grid, cv= 3, n_jobs=-1)
        grid.fit(X_train_trans, y_train)

        clf = grid.best_estimator_ 
        clf.fit(X_train_trans,y_train)

        y_predict = clf.predict(X_validation_trans)
        model_accuracy = "{:1.2f}".format(accuracy_score(y_validation,y_predict))
        best_param = grid.best_params_
        print("Accuracy score for {}: {} (best parameters {})".format(name,model_accuracy,best_param))

## 4 - Feature Exraction

### 4.1 - Principal Component Analysis (PCA) Approach

In [18]:
from sklearn.decomposition import PCA
# - combine X_train and X_validation
X = np.concatenate((X_train,X_validation),axis=0)
# - specify number of components
n_components = 100
pca = PCA(n_components=n_components)
# - transform X datasets
Transform_X = pca.fit_transform(X)
# - split X into X_train and X_validation
X_train_transform = Transform_X[:xt_len,:]
X_validation_transform = Transform_X[xt_len:,:]
# - print X size
print('X_train_transform shape : ',X_train_transform.shape)
print('X_validation_transform shape : ',X_validation_transform.shape)

X_train_transform shape :  (1120, 100)
X_validation_transform shape :  (480, 100)


### 4.2 - Histogram of Oriented Gradient (HOG) Approach

In [19]:
def compute_image_hog(X): 
    '''Compute gradient of the image
    and obtain the histogram of the gradients'''
    n = 450 # shape of the matrix
    _X_hog = []
    for i in range(len(X)):#iterate over all image vectors
        X_reshape = reshape(X[i,:],[n,n])
        image_grad = np.gradient(X_reshape)
        total_grad = np.sqrt(image_grad[0]**2 + image_grad[1]**2)
        bin_count,bin_scale = np.histogram(ravel(total_grad),bins=25,range=(0,250))
        _X_hog.append(bin_count)
    X_hog = np.array(_X_hog)
    return X_hog

In [20]:
X_train_hog = compute_image_hog(X_train)
X_validation_hog = compute_image_hog(X_validation)
print('X_train_hog shape : ',X_train_hog.shape)
print('X_validation_hog shape : ',X_validation_hog.shape)

X_train_hog shape :  (1120, 25)
X_validation_hog shape :  (480, 25)


### 4.3 - Histogram of the Image pixels (HOP)

In [21]:
def compute_stats_1D(x):
    '''Compute statistics for the 1-D array'''
    mean = int(sp.mean(x))
    median = int(sp.median(x))
    skew = int(sp.stats.skew(x,bias=True))
    std = int(sp.std(x))
    x_stat = np.array([mean,median,skew,std])
    return x_stat

In [22]:
def compute_pixel_hist(X): 
    n = 450 # shape of the matrix
    _X_hist = []
    for i in range(len(X)):#iterate over all image vectors
        bin_count,bin_scale = np.histogram(X[i,:],bins=20,range=(0,255))
        x_stat = compute_stats_1D(X[i,:]) 
        X_f = np.concatenate((bin_count,x_stat),axis=0)
        _X_hist.append(X_f )
    X_hist = np.array(_X_hist)
    return X_hist

In [23]:
X_train_pxl_hist = compute_pixel_hist(X_train)
X_validation_pxl_hist = compute_pixel_hist(X_validation)
print('X_train_pxl shape : ',X_train_pxl_hist.shape)
print('X_validation_pxl shape : ',X_validation_pxl_hist.shape)

X_train_pxl shape :  (1120, 24)
X_validation_pxl shape :  (480, 24)


## 5 - Run Models

### 5.1 - PCA

In [37]:
# - Run result for PCA
run_multiple_models(X_train_transform,X_validation_transform)

Accuracy score for NC: 0.25 (best parameters {'shrink_threshold': 2})
Accuracy score for SVM: 0.10 (best parameters {'C': 1.0, 'degree': 1, 'gamma': 1.0, 'kernel': 'rbf'})
Accuracy score for KNN: 0.34 (best parameters {'n_neighbors': 1})
Accuracy score for QDA: 0.18 (best parameters {'reg_param': 0.003448275862068966})
Accuracy score for LDA: 0.36 (best parameters {'shrinkage': 0.3620689655172414})
Accuracy score for RDF: 0.48 (best parameters {'max_depth': 13, 'n_estimators': 81})


### 5.2 - HOG

In [38]:
# - Run result for HOG
run_multiple_models(X_train_hog,X_validation_hog)

Accuracy score for NC: 0.33 (best parameters {'shrink_threshold': 1})
Accuracy score for SVM: 0.10 (best parameters {'C': 1.0, 'degree': 1, 'gamma': 1.0, 'kernel': 'rbf'})
Accuracy score for KNN: 0.42 (best parameters {'n_neighbors': 13})
Accuracy score for QDA: 0.45 (best parameters {'reg_param': 0.010344827586206898})
Accuracy score for LDA: 0.49 (best parameters {'shrinkage': 0.0})
Accuracy score for RDF: 0.46 (best parameters {'max_depth': 11, 'n_estimators': 91})


### 5.3 - HOP

In [39]:
# - Run result for pixel histogram
run_multiple_models(X_train_pxl_hist,X_validation_pxl_hist)

Accuracy score for NC: 0.34 (best parameters {'shrink_threshold': 1})
Accuracy score for SVM: 0.10 (best parameters {'C': 1.0, 'degree': 1, 'gamma': 1.0, 'kernel': 'rbf'})
Accuracy score for KNN: 0.47 (best parameters {'n_neighbors': 13})
Accuracy score for QDA: 0.48 (best parameters {'reg_param': 0.01724137931034483})
Accuracy score for LDA: 0.39 (best parameters {'shrinkage': 0.017241379310344827})
Accuracy score for RDF: 0.55 (best parameters {'max_depth': 13, 'n_estimators': 51})


## 6 - Tabulate Result

In [23]:
import pandas as pd

In [30]:
data = [{'NC': 0.26, 'SVM': 0.10,'KNN':0.32,'QDA':0.19,'LDA':0.37,'RDF':0.54},
        {'NC': 0.34, 'SVM': 0.10,'KNN':0.42,'QDA':0.45,'LDA':0.49,'RDF':0.43},
        {'NC': 0.34, 'SVM': 0.10,'KNN':0.47,'QDA':0.48,'LDA':0.39,'RDF':0.51}]
index = ['PCA','HOG','HOP']
#title = 'Table of model accuracy'

In [36]:
df = pd.DataFrame(data,index)

In [37]:
df

Unnamed: 0,KNN,LDA,NC,QDA,RDF,SVM
PCA,0.32,0.37,0.26,0.19,0.54,0.1
HOG,0.42,0.49,0.34,0.45,0.43,0.1
HOP,0.47,0.39,0.34,0.48,0.51,0.1
