In [12]:
import os
import numpy as np
import cv2
import pickle
import matplotlib.pyplot as plt 
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.externals.joblib import dump, load
from skimage.feature import hog

X = []
y = []

In [13]:
for path, subdirs, files in os.walk('dataset/English/Img/GoodImg/Bmp/'):
    for filename in files:
        f = os.path.join(path, filename)  # filename: 'img001-0004.png'
        target = filename[3:filename.index('-')]  # target: 001
        img = cv2.imread(f)  # load a color image into greyscale image
        img = cv2.medianBlur(img,5) # median blur to remove salt & and pepper noise
        img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
        img = cv2.equalizeHist(img) # Equalize histogram
        img_resized = cv2.resize(img,(30,30))
        
        X.append(img_resized.reshape(-1,1))
        y.append(target)

# Shape [7705, 900, 1] => 7705 ta sample, 900(30x30) hocche pixel values
X = np.array(X)
print(X.shape)
X = X.reshape(X.shape[:2]) # just removing the singleton dimension. Now shape = [7705,900]

(7705, 900, 1)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)

In [15]:
pipeline = Pipeline([
        ('clf', SVC(kernel='rbf', gamma=0.01, C=100))
])

parameters = {
        'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
        'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
    
grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)
print("training ended")

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 43.6min
[Parallel(n_jobs=3)]: Done  90 out of  90 | elapsed: 88.1min finished


training ended


In [16]:
dump(grid_search,'GridSearchMedianBlurWithHist.joblib') # save model

# grid_search = load('GridSearchMedianBlur.joblib') # load the model.
test_performance = grid_search.score(X_test, y_test)

print('Best score: %0.3f' % grid_search.best_score_)
print('Test score: %0.3f' % test_performance)

Best score: 0.073
Test score: 0.078


In [17]:
predictions = grid_search.predict(X_test)
    
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         001       0.00      0.00      0.00         9
         002       0.00      0.00      0.00         6
         003       0.00      0.00      0.00         9
         004       0.00      0.00      0.00         5
         005       0.00      0.00      0.00         5
         006       0.00      0.00      0.00         4
         007       0.00      0.00      0.00         7
         008       0.00      0.00      0.00         4
         009       0.00      0.00      0.00         2
         010       0.00      0.00      0.00         2
         011       0.08      1.00      0.14        59
         012       0.00      0.00      0.00        13
         013       0.00      0.00      0.00        26
         014       0.00      0.00      0.00        18
         015       0.00      0.00      0.00        52
         016       0.00      0.00      0.00         8
         017       0.00      0.00      0.00        18
         018       0.00    

  'precision', 'predicted', average, warn_for)


In [18]:
model2 = KNeighborsClassifier(n_neighbors=3)
model2.fit(X_train, y_train)
print("Training ended")

Training ended


In [19]:
test_performance = model2.score(X_test, y_test)

print('Test score: %0.3f' % test_performance)

Test score: 0.553
