# Malaria Classification Example

In [None]:
from sklearn import metrics, neighbors

try:
    # scikit-learn >= 0.18 ships train_test_split in model_selection.
    from sklearn import model_selection
    # Keep the legacy module name bound so cells written against it still run.
    cross_validation = model_selection
except ImportError:
    # Older scikit-learn releases only provide the cross_validation module.
    from sklearn import cross_validation
    model_selection = cross_validation

%pylab inline

Load a dataset of blood smear image patches, with labels corresponding to the presence or absence of <i>Plasmodium</i>.

<b>Dataset citation:</b><br>
J.A. Quinn, A. Andama, I. Munabi, F.N. Kiwanuka. <i>Automated Blood Smear Analysis for Mobile Malaria Diagnosis</i>. Chapter in Mobile Point-of-Care Monitors and Diagnostic Device Design, eds. W. Karlen and K. Iniewski, CRC Press, 2014. <br>http://air.ug/~jquinn/papers/AutomatedMalariaDiagnosisChapter.pdf



In [None]:
# Open the bundled archive and pull out the three arrays used below:
#   X      - one row per patch (rows are reshaped to 40x40 for display further down)
#   y      - per-patch labels (compared against 1 and 0 in the sample grids)
#   images - patches indexed as images[k, :, :] for plotting
data = np.load('malaria-classification-example.npz')
X, y, images = data['X'], data['y'], data['images']


### Show sample images with positive labels

In [None]:
# Indices of every patch labelled 1 (positive).
pos = np.nonzero(y == 1)[0]

# Draw the first 36 positive patches on a 6x6 grid with the axis ticks hidden.
figsize(6,6)
n_rows = n_cols = 6
for k in range(n_rows * n_cols):
    plt.subplot(n_rows, n_cols, k + 1)
    plt.imshow(images[pos[k]], cmap=plt.cm.gray)
    plt.xticks([])
    plt.yticks([])

### Show sample images with negative labels

In [None]:
# Indices of every patch labelled 0 (negative).
neg = np.where(y==0)[0]

# Draw the first 36 negative patches on a 6x6 grid with the axis ticks hidden.
# The loop starts at 0 (subplot numbers are 1-based, hence i+1) so that neg[0]
# is shown and only 36 negatives are required, matching the positive grid.
figsize(6,6)
for i in range(36):
    plt.subplot(6,6,i+1)
    plt.imshow(images[neg[i],:,:], cmap=plt.cm.gray)
    plt.xticks([])
    plt.yticks([])

### Train and test a nearest neighbour classifier

In [None]:
# Hold out 25% of the patches for testing. train_test_split is taken from
# model_selection (the cross_validation module no longer exists in current
# scikit-learn), and the fixed random_state makes the split -- and therefore
# every figure and score below -- identical on each Restart & Run All.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0)

In [None]:
# Baseline: a 1-nearest-neighbour classifier fitted on the training split.
clf = neighbors.KNeighborsClassifier(n_neighbors=1, weights='distance').fit(X_train, y_train)

# Predicted labels for the held-out patches; later cells read y_pred as well.
y_pred = clf.predict(X_test)

# Fraction of test patches whose thresholded prediction matches the label.
accuracy = metrics.accuracy_score(y_test, y_pred > 0.5)
print('Accuracy = %.3f' % accuracy)

### Show the nearest neighbours for test images

In [None]:
# Index the training patches so the single closest one can be looked up
# for any test patch.
nn = neighbors.NearestNeighbors(n_neighbors=1)
nn.fit(X_train)

N_examples = 10
figsize(2,15)
for i in range(N_examples):
    # Left column: the test patch, titled with its label.
    plt.subplot(N_examples, 2, 2*i + 1)
    plt.imshow(np.reshape(X_test[i,:], (40,40)), cmap=plt.cm.gray)
    plt.title(y_test[i])
    plt.xticks([])
    plt.yticks([])

    # Right column: the closest training patch, titled with its label.
    plt.subplot(N_examples, 2, 2*i + 2)
    # kneighbors expects a 2-D (n_queries, n_features) array, so the single
    # row is reshaped to (1, n_features); the result is a (1, 1) index array
    # from which the scalar is taken explicitly.
    query = np.reshape(X_test[i,:], (1, -1))
    neighbour_idx = int(nn.kneighbors(query, return_distance=False)[0, 0])
    plt.imshow(np.reshape(X_train[neighbour_idx,:], (40,40)), cmap=plt.cm.gray)
    plt.title(y_train[neighbour_idx])
    plt.xticks([])
    plt.yticks([])

### Show the neighbours for test images where the classifier was wrong

In [None]:
# Test indices where the thresholded prediction disagrees with the label.
# y_pred is whatever the most recently run classifier cell produced (hard
# labels from predict, or probabilities from predict_proba); thresholding at
# 0.5 works for both when the labels are 0/1.
mistakes = np.where(np.logical_xor(y_pred>.5,y_test))[0]

nn = neighbors.NearestNeighbors(n_neighbors=1)
nn.fit(X_train)

N_examples = 10
# A good classifier may make fewer than N_examples mistakes; only plot the
# ones that exist instead of indexing past the end of `mistakes`.
n_shown = min(N_examples, len(mistakes))
figsize(2,15)
for i in range(n_shown):
    test_idx = mistakes[i]

    # Left column: the misclassified test patch, titled with its label.
    plt.subplot(N_examples, 2, 2*i + 1)
    plt.imshow(np.reshape(X_test[test_idx,:], (40,40)), cmap=plt.cm.gray)
    plt.title(y_test[test_idx])
    plt.xticks([])
    plt.yticks([])

    # Right column: the closest training patch, titled with its label.
    plt.subplot(N_examples, 2, 2*i + 2)
    # kneighbors expects a 2-D (n_queries, n_features) array, so the single
    # row is reshaped to (1, n_features); the result is a (1, 1) index array
    # from which the scalar is taken explicitly.
    query = np.reshape(X_test[test_idx,:], (1, -1))
    neighbour_idx = int(nn.kneighbors(query, return_distance=False)[0, 0])
    plt.imshow(np.reshape(X_train[neighbour_idx,:], (40,40)), cmap=plt.cm.gray)
    plt.title(y_train[neighbour_idx])
    plt.xticks([])
    plt.yticks([])

### Show Receiver Operating Characteristic (ROC) curve

**Exercise 1:** Below we can see the ROC for the classifier on the malaria example. Can you adjust the classifier to make the ROC area better?

In [None]:
from sklearn import ensemble, naive_bayes, neighbors, svm, tree

# Classifier under test. Swap in one of the commented alternatives to explore
# the exercise.
# NOTE: current scikit-learn releases require min_samples_split >= 2, so raise
# that argument before enabling either of the tree ensembles below.
clf = neighbors.KNeighborsClassifier(n_neighbors=15, weights='distance')
#clf = tree.DecisionTreeClassifier()
#clf = ensemble.RandomForestClassifier(n_estimators=50, min_samples_split=1, max_depth=None, max_features=16)
#clf = ensemble.ExtraTreesClassifier(n_estimators=100, min_samples_split=1, max_depth=None, max_features=8)
#clf = naive_bayes.GaussianNB()

clf.fit(X_train, y_train)

# Use the second column of predict_proba as the score for each test patch;
# y_pred now holds probabilities rather than hard labels.
y_pred = clf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
roc_area = metrics.roc_auc_score(y_test, y_pred)

# Plot true-positive rate against false-positive rate, with the area under
# the curve reported in the title.
figsize(6,6)
plt.plot(fpr, tpr)
plt.title('ROC (area under curve=%.3f)' % roc_area)
plt.xlabel('FPR')
plt.ylabel('TPR')

### Show classifier probabilities on test images