In [1]:
import os
import cv2
import pandas as pd
import numpy as np
from skimage import feature

#for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#for data splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#for the model prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
data_dir     = './dataset anemia/'
india_folder = os.path.join(data_dir, 'India')
italy_folder = os.path.join(data_dir, 'Italy')


In [3]:
def should_use_img(img_name, img_type):
    if img_name.split(".")[1] in ["jpg", "png"]:
        img_name = img_name.split(".")[0]
        
        if img_type == "":
            img_types   = ["forniceal", "forniceal_palpebral", "palpebral"]
            for _type in img_types:
                if img_name.endswith(_type):
                    return False
            return True
        elif img_type == "palpebral":
            img_name = img_name.split(".")[0]
            if img_name.endswith("forniceal_palpebral"):
                    return False
            return True if img_name.endswith(img_type) else False
            
        else:
            return True if img_name.endswith(img_type)  else False
    
    else:
        return False
        

img_to_use  = "forniceal_palpebral"
img_files   = {"India":[], "Italy":[]}

for folder in ['India', 'Italy']:
    img_folder = os.path.join(data_dir, folder)
    for root, dirs, files in os.walk(img_folder):
        flag = False
        for file in files:
            if should_use_img(file, img_to_use):
                cls = root.split("\\")[-1]
                img_files[folder].append( ( os.path.join(root, file), cls) )
                flag = True
                break
        if flag is False:
            print(root, " not found -> ", files)


./dataset anemia/India  not found ->  ['.DS_Store', 'IndiaRd.xlsx']
./dataset anemia/Italy  not found ->  ['.DS_Store', 'Italyrd.xlsx']
./dataset anemia/Italy\1  not found ->  ['.DS_Store', '001_palpebral.png', '1.jpg']
./dataset anemia/Italy\109  not found ->  ['.DS_Store', 'T_78_20190614_074753.jpg', 'T_78_20190614_074753_palpebral.png']
./dataset anemia/Italy\35  not found ->  ['.DS_Store', 'T_4_20190606_095326.jpg', 'T_4_20190606_095326_palpebral.png']
./dataset anemia/Italy\54  not found ->  ['.DS_Store', 'T_23_20190608_090427.jpg', 'T_23_20190608_090427_palpebral.png']
./dataset anemia/Italy\58  not found ->  ['.DS_Store', 'T_27_20190608_100451.jpg', 'T_27_20190608_100451_palpebral.png']
./dataset anemia/Italy\75  not found ->  ['.DS_Store', 'T_44_20190611_083543.jpg', 'T_44_20190611_083543_palpebral.png']


In [4]:
for key, items in img_files.items():
    print(f"items for {key} -> {len(items)}")

items for India -> 95
items for Italy -> 117


In [5]:
img_files["India"][0]

('./dataset anemia/India\\1\\20200118_164733_forniceal_palpebral.png', '1')

In [6]:
data_dir = './dataset anemia/'
file1    = data_dir+'India/IndiaRd.xlsx'
file2    = data_dir+'Italy/Italyrd.xlsx'

In [7]:
labels = {}

d1 = pd.read_excel(file1)
d1["Anemia"] = d1["Note"]
d1 = d1[["Number", "Anemia" ]]
d1.replace("No anemia", "No Anemia", inplace=True)

d2 = pd.read_excel(file2)[["Number", "Anemia" ]]
d2.replace("No anemia", "No Anemia", inplace=True)

unique_values   = list(d1['Anemia'].unique())
enum            = {val:i for i, val in enumerate(unique_values)} 
print(enum)

d1['Anemia'].replace(enum, inplace=True)
d2['Anemia'].replace(enum, inplace=True)

labels["India"] = { row["Number"]:row["Anemia"] for index, row in d1.iterrows() }
labels["Italy"] = { row["Number"]:row["Anemia"] for index, row in d2.iterrows() }


{'No Anemia': 0, 'Anemia': 1}


## Load Images

### Feature Extractors

In [8]:
class HistogramOrientedGradient:
    def __init__(self, winSize = (60,60), blockSize = (12,12), blockStride = (12,12), cellSize = (12,12), nbins = 9, 
                 derivAperture = 1, winSigma = -1., histogramNormType = 0, L2HysThreshold = 0.2, gammaCorrection = 1, 
                 nlevels = 64, signedGradient = True):
        self._descriptor = cv2.HOGDescriptor(winSize,blockSize,blockStride,cellSize,nbins,derivAperture,winSigma,histogramNormType,
                                L2HysThreshold,gammaCorrection,nlevels, signedGradient)
    
    def preprocess(self, image):
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return gray
    
    def describe(self, image):
        image = self.preprocess(image)
        hog   = self._descriptor.compute(image)
        return hog

    
#hog =  np.array(hog_descriptors)
#hog = hog.reshape(hog.shape[0], hog.shape[1]*hog.shape[2])

In [9]:
# HOG Hist
X = []
y = []

desc = HistogramOrientedGradient()

for key, items in img_files.items():
    for item in items:
        img_path, cls = item
        img           = cv2.imread(img_path)
        img           = cv2.resize(img, (256, 256))
        hist          = desc.describe(img)
        lbl           = labels[key][int(cls)]
        
        X.append(hist)
        y.append(lbl)
        

In [12]:
X[0].shape

(65025,)

In [10]:
import pickle

file = open("data_hog_forniceal_palpebral.pickle", "wb")
pickle.dump((X, y), file)
file.close()

In [18]:
import pickle

file = open("data_hog_forniceal_palpebral.pickle", "rb")
X, y = pickle.load(file)
file.close()

In [19]:
print(f"len(X) -> {len(X)}")
print(f"len(y) -> {len(y)}")

len(X) -> 212
len(y) -> 212


In [20]:
y = [0 if i == 0 else 1 for i in y]

# Training

In [21]:
X = np.array(X)
X.shape

(212, 65025)

In [34]:
X[2]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [22]:
Y = y.copy()
len(Y)

212

In [23]:
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, train_size=0.7, random_state=123)
print('Training Samples: '+str(len(X_train)) + ' \n Test Samples: ' + str(len(X_test)))

Training Samples: 148 
 Test Samples: 64


## Tf Model

In [15]:
import tensorflow as tf



In [16]:
X_train.shape

(148, 65025)

In [24]:
y_train = tf.keras.utils.to_categorical(Y_train, 2)
y_test  = tf.keras.utils.to_categorical(Y_test, 2)

In [22]:
y_train.shape

(152, 2)

In [25]:
model =   tf.keras.models.Sequential([
                            tf.keras.layers.Dense(128, input_shape=(None, X_train.shape[1]), activation='relu'),
                            tf.keras.layers.Dense(56,  activation='relu'),
                            tf.keras.layers.Dense(2)
                            ])

In [26]:
loss_fn = tf.keras.losses.CategoricalCrossentropy()
model.compile(optimizer='adam',loss=loss_fn, metrics=['accuracy'])

In [27]:
model.fit(X_train, y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x2392f092f88>

In [28]:
res = model.evaluate(X_test, y_test)



In [101]:
res

[0.6324321031570435, 0.6666666865348816]

## SVM

In [29]:
param_grid = {'C':[2**1], 'gamma':[0.1, 10], 'kernel':['poly']}
gridSVM = GridSearchCV(SVC(), param_grid=param_grid, cv = 5)

#gridSVM = SVC(C=2, gamma=0.1, kernel='poly')
gridSVM.fit(X_train, Y_train)

print('Training Score: ' + str(gridSVM.score(X_train, Y_train)))
print("Testing Score: " + str(gridSVM.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.765625


In [30]:
gridSVM.best_params_

{'C': 2, 'gamma': 0.1, 'kernel': 'poly'}

In [31]:
param_grid = {'C':[0.5, 2**1, 2**2, 2**3 ], 'gamma':[0.1, 2, 10, 100], 'kernel':['poly']}
gridSVM = GridSearchCV(SVC(), param_grid=param_grid, cv = 5)

#gridSVM = SVC(C=2, gamma=0.1, kernel='poly')
gridSVM.fit(X_train, Y_train)

print('Training Score: ' + str(gridSVM.score(X_train, Y_train)))
print("Testing Score: " + str(gridSVM.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.765625


In [32]:
gridSVM.best_params_

{'C': 0.5, 'gamma': 0.1, 'kernel': 'poly'}

In [33]:
param_grid = {'C':[0.5, 2**1, 2**2, 2**3, 2**4, 2**5 ], 'gamma':[0.01, 0.1, 2, 10, 100], 'kernel':['poly']}
gridSVM = GridSearchCV(SVC(), param_grid=param_grid, cv = 5)

#gridSVM = SVC(C=2, gamma=0.1, kernel='poly')
gridSVM.fit(X_train, Y_train)

print('Training Score: ' + str(gridSVM.score(X_train, Y_train)))
print("Testing Score: " + str(gridSVM.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.765625


In [34]:
gridSVM.best_params_

{'C': 0.5, 'gamma': 0.01, 'kernel': 'poly'}

In [35]:
param_grid = {'C':[0.5, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6, 2**8 ], 'gamma':[0.01, 0.1, 2, 10, 100], 'kernel':['poly', 'linear', 'rbf', 'sigmoid']}
gridSVM = GridSearchCV(SVC(), param_grid=param_grid, cv = 5)

#gridSVM = SVC(C=2, gamma=0.1, kernel='poly')
gridSVM.fit(X_train, Y_train)

print('Training Score: ' + str(gridSVM.score(X_train, Y_train)))
print("Testing Score: " + str(gridSVM.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.71875


In [36]:
gridSVM.best_params_

{'C': 0.5, 'gamma': 0.01, 'kernel': 'linear'}

In [37]:
param_grid = {'C':[0.5, 2**1, 2**2, 2**3 ], 'gamma':[0.1, 2, 10, 100], 'kernel':['linear']}
gridSVM = GridSearchCV(SVC(), param_grid=param_grid, cv = 5)

#gridSVM = SVC(C=2, gamma=0.1, kernel='poly')
gridSVM.fit(X_train, Y_train)

print('Training Score: ' + str(gridSVM.score(X_train, Y_train)))
print("Testing Score: " + str(gridSVM.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.71875


In [38]:
gridSVM.best_params_

{'C': 0.5, 'gamma': 0.1, 'kernel': 'linear'}

In [39]:
param_grid = {'C':[0.5, 2**1, 2**2, 2**3 ], 'gamma':[0.1, 2, 10, 100], 'kernel':['sigmoid']}
gridSVM = GridSearchCV(SVC(), param_grid=param_grid, cv = 5)

#gridSVM = SVC(C=2, gamma=0.1, kernel='poly')
gridSVM.fit(X_train, Y_train)

print('Training Score: ' + str(gridSVM.score(X_train, Y_train)))
print("Testing Score: " + str(gridSVM.score(X_test, Y_test)))

Training Score: 0.6148648648648649
Testing Score: 0.625


In [49]:
gridSVM.best_params_

{'C': 0.5, 'gamma': 0.1, 'kernel': 'sigmoid'}

## KNN

In [40]:
from sklearn.neighbors import KNeighborsClassifier


In [41]:
model = KNeighborsClassifier(n_neighbors=1, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.671875

In [42]:
knnAlgos = ['auto', 'ball_tree', 'kd_tree', 'brute']

model = KNeighborsClassifier(n_neighbors=1, weights='uniform', algorithm=knnAlgos[3], leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.671875

In [43]:
model = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='ball_tree', leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.671875

In [44]:
model = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.75

In [45]:
model = KNeighborsClassifier(n_neighbors=7, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.65625

In [46]:
model = KNeighborsClassifier(n_neighbors=21, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.640625

In [47]:
model = KNeighborsClassifier(n_neighbors=21, weights='uniform', algorithm='kd_tree', leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.640625

In [48]:
model = KNeighborsClassifier(n_neighbors=61, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=-1)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.625

In [49]:
# Grid Search
param_grid = {'n_neighbors':[1, 3, 5, 21, 61], 'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']}

gridKNN = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=5)
gridKNN.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [1, 3, 5, 21, 61]})

In [50]:
gridKNN.score(X_test, Y_test)

0.640625

In [51]:
gridKNN.best_score_

0.6213793103448275

In [52]:
gridKNN.best_params_

{'algorithm': 'auto', 'n_neighbors': 21}

# Random Forest

In [53]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=10)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 0.9797297297297297
Testing Score: 0.6875


In [54]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=20)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.640625


In [55]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=30)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 0.9932432432432432
Testing Score: 0.796875


In [56]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=50)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.703125


In [57]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=500)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.671875


In [58]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=700)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.6875


In [59]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=1000)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.703125


In [60]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=1200)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.6875


In [61]:
param_grid = {'n_estimators':[1500, 1800, 2000]}
#gridRFT = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5)
gridRFT = RandomForestClassifier(n_estimators=3500)
gridRFT.fit(X_train, Y_train)

print('Training Score: ' + str(gridRFT.score(X_train, Y_train)))
print("Testing Score: " + str(gridRFT.score(X_test, Y_test)))

Training Score: 1.0
Testing Score: 0.6875


# MLP

In [62]:
from sklearn.neural_network import MLPClassifier


In [63]:

grid_params = {'hidden_layer_sizes':[(50,), (100), (50, 100,)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'] }
gridMLP = GridSearchCV(MLPClassifier(), param_grid=grid_params, cv=5)
gridMLP.fit(X_train, Y_train)





GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'activation': ['identity', 'logistic', 'tanh', 'relu'],
                         'hidden_layer_sizes': [(50,), 100, (50, 100)],
                         'solver': ['lbfgs', 'sgd', 'adam']})

In [64]:
gridMLP.best_score_

0.7096551724137932

In [67]:
gridMLP.best_params_

{'activation': 'relu', 'hidden_layer_sizes': 100, 'solver': 'lbfgs'}

In [68]:
mlpCls = MLPClassifier(hidden_layer_sizes=(100,), activation='identity', solver='adam' )
mlpCls.fit(X_train, Y_train)
mlpCls.score(X_test, Y_test)

0.609375

In [69]:
# grid_params = {'hidden_layer_sizes':[(50,), (100), (50, 100,)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'] }
mlpCls = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', solver='adam' )
mlpCls.fit(X_train, Y_train)
mlpCls.score(X_test, Y_test)

0.625

In [70]:
# grid_params = {'hidden_layer_sizes':[(50,), (100), (50, 100,)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'] }
mlpCls = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', solver='sgd' )
mlpCls.fit(X_train, Y_train)
mlpCls.score(X_test, Y_test)



0.6875

In [71]:
# grid_params = {'hidden_layer_sizes':[(50,), (100), (50, 100,)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'] }
mlpCls = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', solver='lbfgs' )
mlpCls.fit(X_train, Y_train)
mlpCls.score(X_test, Y_test)

0.71875

In [72]:
# grid_params = {'hidden_layer_sizes':[(50,), (100), (50, 100,)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'] }
mlpCls = MLPClassifier(hidden_layer_sizes=(50,), activation='identity', solver='adam' )
mlpCls.fit(X_train, Y_train)
mlpCls.score(X_test, Y_test)

0.59375

In [73]:
# grid_params = {'hidden_layer_sizes':[(50,), (100), (50, 100,)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'] }
mlpCls = MLPClassifier(hidden_layer_sizes=(200,), activation='identity', solver='adam' )
mlpCls.fit(X_train, Y_train)
mlpCls.score(X_test, Y_test)

0.671875

In [74]:
# grid_params = {'hidden_layer_sizes':[(50,), (100), (50, 100,)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'] }
mlpCls = MLPClassifier(hidden_layer_sizes=(50, 100,), activation='identity', solver='adam' )
mlpCls.fit(X_train, Y_train)
mlpCls.score(X_test, Y_test)

0.6875