In [1]:
from sklearn.model_selection import train_test_split
import cv2
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def load_images(file_name):
    images = []
    for img_file in os.listdir(file_name):
         if img_file.lower().endswith((".jpg")):
            path = os.path.join(file_name, img_file)
            images.append(cv2.imread(path))
    return np.array(images)

In [3]:
class feature_extractor(BaseEstimator, TransformerMixin):
    def __init__(self, feature_type='hog'):
        self.feature_type = feature_type

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        features = []
        window_size = (256, 256)
        block_size = (16, 16)
        block_stride = (8, 8)
        cell_size = (8, 8)
        nbins = 9
        hog = cv2.HOGDescriptor(window_size, block_size, block_stride, cell_size, nbins)

        for img in X:
                hog_features = hog.compute(img)
                features.append(hog_features.flatten())


        return np.array(features)


In [114]:
image_data = load_images('JPEGImages')
labels = pd.read_csv('labels.csv')
labels = labels.drop('Unnamed: 0', axis=1).iloc[:-1]
labels = labels[labels['Category'].isnull() == False]



multi_label_indices = labels.index[labels['Category'].str.contains(',')]

image_data_grayscale = np.array([cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) for img in image_data])

images_downscaled = [cv2.resize(img, (64, 64)) for img in image_data_grayscale]


data = pd.DataFrame({'image': list(images_downscaled), 'label': labels['Category'].values, 'image_num': labels['Image'].values})
multi_label_indices = data.index[data['label'].str.contains(',')] 
data = data.drop(multi_label_indices)


random_state = 42
random.seed(random_state)
to_remove = []
neutrophil_indices = data.index[data['label'] == "NEUTROPHIL"]
x = 1
for x in range(57):
    r1 = random.randint(0, len(neutrophil_indices) - 1)
    index = neutrophil_indices[r1]
    to_remove.append(index)
    neutrophil_indices = np.delete(neutrophil_indices, r1)

data = data.drop(to_remove)


eosinophil_indices = data.index[data['label'] == "EOSINOPHIL"]
x=1
to_add_e = []
for x in range(63):
    r2 = random.randint(0, len(eosinophil_indices) - 1)
    index = eosinophil_indices[r2]
    image = data.loc[index]['image']    
    image_inverted = cv2.flip(image, 1)
    new_row = pd.DataFrame([[image_inverted, 'EOSINOPHIL', len(data)+1]], columns=data.columns)
    to_add_e.append(new_row)
    eosinophil_indices = np.delete(eosinophil_indices, r2)
    
data = pd.concat([data] + to_add_e, ignore_index=True)


to_add_l = []
lymphocyte_indices = data.index[data['label'] == "LYMPHOCYTE"]
x=1
for x in range(29):
    r3 = random.randint(0, len(lymphocyte_indices) - 1)
    index = lymphocyte_indices[r3]
    image = data.loc[index]['image']

    image_inverted = cv2.flip(image, 1)
    new_row = pd.DataFrame([[image_inverted, 'LYMPHOCYTE', len(data)+1]], columns=data.columns)
    to_add_l.append(new_row)

    kernel_size = np.random.choice([3, 5])
    image_gaussian = cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)
    new_row2 = pd.DataFrame([[image_gaussian, 'LYMPHOCYTE', len(data)+1]], columns=data.columns)
    to_add_l.append(new_row2)

    kernel_size = np.random.choice([3, 5])
    image_gaussian = cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)
    new_row3 = pd.DataFrame([[image_gaussian, 'LYMPHOCYTE', len(data)+1]], columns=data.columns)
    to_add_l.append(new_row3)

    noise = np.random.normal(0, 5, image.shape)
    image_noisy = np.clip(image + noise, 0, 255).astype(np.uint8)
    new_row4 = pd.DataFrame([[image_noisy, 'LYMPHOCYTE', len(data)+1]], columns=data.columns)
    to_add_l.append(new_row4)

    lymphocyte_indices = np.delete(lymphocyte_indices, r3)

data = pd.concat([data] + to_add_l, ignore_index=True)



too_add_m = []
monocyte_indices = data.index[data['label'] == "MONOCYTE"]
for x in range(20):
    r4 = random.randint(0, len(monocyte_indices) - 1)
    index = monocyte_indices[r4]
    image = data.loc[index]['image']
    image_inverted = cv2.flip(image, 1)
    new_row = pd.DataFrame([[image_inverted, 'MONOCYTE', len(data)+1]], columns=data.columns)
    too_add_m.append(new_row)

    kernel_size = np.random.choice([3, 5])
    image_gaussian = cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)
    new_row2 = pd.DataFrame([[image_gaussian, 'MONOCYTE', len(data)+1]], columns=data.columns)
    too_add_m.append(new_row2)

    kernel_size = np.random.choice([3, 5])
    image_gaussian = cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)
    new_row3 = pd.DataFrame([[image_gaussian, 'MONOCYTE', len(data)+1]], columns=data.columns)
    too_add_m.append(new_row3)

    noise = np.random.normal(0, 5, image.shape)
    image_noisy = np.clip(image + noise, 0, 255).astype(np.uint8)
    new_row4 = pd.DataFrame([[image_noisy, 'MONOCYTE', len(data)+1]], columns=data.columns)
    too_add_m.append(new_row4)

    image_inverted_vert = cv2.flip(image, 0)
    new_row5 = pd.DataFrame([[image_inverted_vert, 'MONOCYTE', len(data)+1]], columns=data.columns)
    too_add_m.append(new_row5)

    image_inverted_both = cv2.flip(image, -1)
    new_row5 = pd.DataFrame([[image_inverted_both, 'MONOCYTE', len(data)+1]], columns=data.columns)
    too_add_m.append(new_row5)

    monocyte_indices = np.delete(monocyte_indices, r4)


data = pd.concat([data] + too_add_m, ignore_index=True)

   
data = data[data['label'] != 'BASOPHIL']



data['label'].value_counts()



label
NEUTROPHIL    150
EOSINOPHIL    150
LYMPHOCYTE    149
MONOCYTE      141
Name: count, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['image'], data['label'], test_size=0.2, shuffle=True, random_state=20)


pipeline = Pipeline([ ('feature_extractor', feature_extractor(feature_type='hog')),  
                     ('svm', SVC(kernel='rbf', C=10, gamma='scale', random_state=42))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
pipeline.score(X_test, y_test)

0.8342148102498219
