In [5]:
import os
import numpy as np
import cv2
import pickle
import matplotlib.pyplot as plt 
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.externals.joblib import dump, load
from skimage.feature import hog

# Accumulators filled by the image-loading cell:
#   X - one flattened image per entry, y - the matching class label string.
X, y = [], []

In [6]:
# Build the feature matrix X and the label list y from the character images on disk.
DATASET_DIR = 'dataset/English/Img/GoodImg/Bmp/'

# Collect into fresh lists so this cell can be re-run safely: X is rebound to an
# ndarray at the end of the cell, so appending to the previous X/y would either
# fail (ndarray has no .append) or duplicate every sample.
samples = []
labels = []
skipped = []

for path, subdirs, files in os.walk(DATASET_DIR):
    # os.walk yields entries in arbitrary order; sorting (in place for the
    # directories, which also steers the traversal) keeps sample order stable
    # from run to run.
    subdirs.sort()
    for filename in sorted(files):
        f = os.path.join(path, filename)  # filename: 'img001-0004.png'
        if '-' not in filename:
            # Not named like a sample; the label slice below would raise.
            skipped.append(f)
            continue
        target = filename[3:filename.index('-')]  # target: '001'

        img = cv2.imread(f)  # loads a 3-channel BGR image; returns None if it cannot be decoded
        if img is None:
            skipped.append(f)
            continue
        img = cv2.medianBlur(img, 5)  # median blur to remove salt-and-pepper noise
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # collapse to a single grey channel
        img = cv2.equalizeHist(img)  # equalize histogram
        img_resized = cv2.resize(img, (30, 30))

        samples.append(img_resized.reshape(-1, 1))  # 30x30 image -> (900, 1) column
        labels.append(target)

if skipped:
    print('Skipped %d unreadable or unexpectedly named file(s), e.g. %s'
          % (len(skipped), skipped[0]))
if not samples:
    # Fail here with a clear message instead of an obscure error further down.
    raise RuntimeError('No images could be loaded from %r' % DATASET_DIR)

# Shape (n_samples, 900, 1): one row per image, 900 (30x30) pixel values each.
X = np.array(samples)
print(X.shape)
X = X.reshape(X.shape[:2])  # drop the singleton axis -> (n_samples, 900)
y = labels

(7705, 288, 1)


In [7]:
# Hold out 10% of the samples for testing. The seed is fixed so the split, and
# therefore every score reported below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)

In [8]:
# Tune an RBF-kernel SVM by exhaustive grid search over gamma and C.
pipeline = Pipeline([
        # gamma and C are not set here: every grid candidate below overrides
        # both, so constructor values for them would never be used.
        ('clf', SVC(kernel='rbf'))
])

# Keys use the '<step name>__<parameter>' form to reach into the pipeline step.
parameters = {
        'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
        'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}

# NOTE(review): X appears to hold raw 0-255 pixel intensities (see the loading
# cell). RBF SVMs are sensitive to feature scale, so a scaling step ahead of
# 'clf' is probably worth adding - confirm against the features actually used.

# cv is pinned explicitly: the GridSearchCV default changed from 3 to 5 folds
# in scikit-learn 0.22, and 3 matches the recorded run ("Fitting 3 folds ...").
grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=3, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)
print("training ended")

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 10.6min
[Parallel(n_jobs=3)]: Done  90 out of  90 | elapsed: 19.9min finished


training ended


In [10]:
# Persist the fitted search object so the long training run need not be repeated.
# To restore it later: grid_search = load('GridSearchMedianBlurWithHist.joblib')
dump(grid_search, 'GridSearchMedianBlurWithHist.joblib')

# Accuracy of the refit best estimator on the held-out split.
test_performance = grid_search.score(X_test, y_test)

# best_score_ is the mean cross-validated accuracy of the winning candidate.
print('Best score: %0.3f' % (grid_search.best_score_,))
print('Test score: %0.3f' % (test_performance,))

Best score: 0.777
Test score: 0.799


In [11]:
# Per-class precision / recall / F1 of the tuned SVM on the held-out split.
predictions = grid_search.predict(X_test)

print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         001       0.50      0.29      0.36        14
         002       1.00      0.44      0.62         9
         003       0.50      0.67      0.57         3
         004       1.00      0.67      0.80         6
         005       1.00      0.67      0.80         3
         006       1.00      1.00      1.00         4
         007       1.00      0.83      0.91         6
         008       1.00      0.67      0.80         6
         009       0.50      1.00      0.67         1
         010       0.00      0.00      0.00         0
         011       0.91      0.96      0.94        55
         012       0.91      1.00      0.95        10
         013       0.75      0.75      0.75        20
         014       0.86      0.95      0.90        19
         015       0.80      0.95      0.87        39
         016       1.00      0.71      0.83         7
         017       0.90      0.75      0.82        12
         018       0.86    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
# Baseline for comparison: a 3-nearest-neighbours classifier on the same split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model2 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print("Training ended")

In [None]:
# Mean accuracy of the k-NN baseline on the held-out split.
test_performance = model2.score(X=X_test, y=y_test)

print('Test score: %0.3f' % (test_performance,))