In [None]:
import html
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.core.display import display, HTML, Image
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import metrics

In [None]:
# Load the image dataset; the first CSV column is used as the frame's index.
f = '../data/googleimages.csv'
I = pd.read_csv(filepath_or_buffer=f, index_col=0)

In [None]:
# Preview the first rows of the loaded frame.
I.head()

## Naive implementation of Naive Bayes

$$
P(class \mid features) \propto P(class) \prod_{i} P(feature_i \mid class)
$$

In [None]:
# The first 80% of the rows (in file order) train the model; the rest are held out.
# NOTE(review): rows are not shuffled before splitting - confirm the CSV is not
# ordered by category, otherwise the test set is not representative.
size = int(len(I) * 0.8)
train = I.iloc[:size]
test = I.iloc[size:]

### Priors

In [None]:
# Distinct class labels seen in the training split.
categories = train['category'].unique()

In [None]:
# Show the class labels found in the training split.
categories

In [None]:
# Training rows grouped by class label: K[label] is the sub-frame for that label.
K = {label: train[train.category == label] for label in categories}

In [None]:
# Number of training rows per class, in the same order as `categories`.
priors = np.array([len(K[label]) for label in categories])

In [None]:
# Turn the per-class counts into relative frequencies that sum to 1.
priors = priors / np.sum(priors)

### Likelihood

In [None]:
# Per-class feature distribution: column totals over the class's training rows,
# add-one smoothed so no feature has zero probability, then normalised to sum to 1.
models = {}
for c in categories:
    class_rows = K[c]
    feature_columns = class_rows.columns.difference(['doc', 'url', 'category'])
    smoothed_counts = class_rows[feature_columns].sum() + 1
    models[c] = smoothed_counts / smoothed_counts.sum()

In [None]:
# Ground-truth labels of both splits, used later to score the predictions.
train_true, test_true = train['category'].values, test['category'].values

In [None]:
def _predict_categories(frame):
    """Return the most probable category for every row of ``frame``.

    A feature contributes to a row's score when its value in that row is
    greater than zero (the value itself is not used as a weight). Scores are
    accumulated as sums of log-probabilities rather than products of raw
    probabilities: a product of many small likelihoods underflows to 0.0 for
    every class, which makes ``argmax`` fall back to the first category.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to classify. The 'doc', 'url' and 'category' columns are ignored;
        every remaining column must be a key of each ``models[c]`` Series.

    Returns
    -------
    list
        One element of ``categories`` per row of ``frame``, in row order.
    """
    feature_cols = frame.columns.difference(['doc', 'url', 'category'])
    # Log-likelihood table: one row per feature, one column per category.
    log_likelihood = np.column_stack([
        np.log(np.asarray(models[c][feature_cols], dtype=float))
        for c in categories
    ])
    # 1.0 where the feature is present in the row, 0.0 otherwise.
    present = (frame[feature_cols].values > 0).astype(float)
    # Summing the logs of the present features replaces the original product;
    # the log prior is added once per class.
    log_posterior = present.dot(log_likelihood) + np.log(priors)
    return [categories[k] for k in log_posterior.argmax(axis=1)]


train_pred = _predict_categories(train)
test_pred = _predict_categories(test)

In [None]:
def cm_plot(ax, classes, CM, title, figure):
    """Draw an annotated confusion-matrix heat map on ``ax``.

    Parameters
    ----------
    ax : axes object to draw on.
    classes : sequence of labels, used as tick labels on both axes.
    CM : 2-D array indexed as ``CM[i, j]``; ``i`` runs along the axis labelled
        'True label' and ``j`` along the axis labelled 'Predicted label'.
    title : text shown above the plot.
    figure : figure object that receives the colour bar.
    """
    heatmap = ax.imshow(CM, interpolation='nearest', cmap=plt.cm.Blues)

    # Reserve a narrow strip to the right of the axes for the colour bar.
    colorbar_ax = make_axes_locatable(ax).append_axes(
        'right', size='5%', pad=0.05)
    figure.colorbar(heatmap, cax=colorbar_ax, orientation='vertical')

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    ax.set_xticks(positions)
    ax.set_xticklabels(classes, rotation=90, fontsize=12)
    ax.set_yticks(positions)
    ax.set_yticklabels(classes, rotation=0, fontsize=12)
    ax.set_title(title, fontsize=16)

    # Write each cell's value on top of it; cells above half of the maximum
    # get white text, the others black.
    half_max = CM.max() / 2.
    for i in range(CM.shape[0]):
        for j in range(CM.shape[1]):
            value = CM[i, j]
            if value > half_max:
                text_color = "white"
            else:
                text_color = "black"
            ax.text(j, i, value, horizontalalignment="center",
                    color=text_color, fontsize=12)

    ax.set_ylabel('True label', fontsize=16)
    ax.set_xlabel('Predicted label', fontsize=16)

In [None]:
# Confusion matrices for both splits; rows and columns follow the order of `categories`.
cm_train, cm_test = (
    metrics.confusion_matrix(y_true, y_pred, labels=categories)
    for y_true, y_pred in ((train_true, train_pred), (test_true, test_pred))
)

In [None]:
# Train and test confusion matrices side by side in one figure.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))
panels = ((cm_train, 'Train set'), (cm_test, 'Test set'))
for panel_ax, (panel_cm, panel_title) in zip(axes, panels):
    cm_plot(panel_ax, categories, panel_cm, panel_title, fig)
plt.tight_layout()
plt.show()

## Visualize results

In [None]:
# Collect, for every test row, an <img> tag, the true label, the predicted
# label and the raw URL.
# Predictions are matched to rows by position, so both must have equal length.
assert len(test_pred) == len(test), "test_pred must hold one prediction per test row"
images = {
    # URLs come from the data file, so they are escaped (quotes included)
    # before being placed inside the src attribute; an unescaped quote or
    # angle bracket would break, or inject into, the rendered HTML.
    'image': ['<img src="{}">'.format(html.escape(str(url), quote=True))
              for url in test['url']],
    'true': list(test['category']),
    'predicted': list(test_pred),
    'url': list(test['url']),
}

In [None]:
# One HTML table row per test image: image tag, true label, predicted label.
# images['image'] already holds HTML markup and is inserted as-is; the two
# labels are plain text taken from the data, so they are escaped before being
# embedded in the table cells.
rows = [
    '<tr><td>{}</td><td>{}</td><td>{}</td></tr>'.format(
        image_tag, html.escape(str(true_label)), html.escape(str(predicted_label))
    )
    for image_tag, true_label, predicted_label in zip(
        images['image'], images['true'], images['predicted'])
]

In [None]:
# Assemble the full HTML table: a header row followed by all data rows.
column_titles = ('image', 'true', 'predicted')
table_body = "".join(rows)
table = "<table><tr><th>{}</th><th>{}</th><th>{}</th></tr>{}</table>".format(
    *column_titles, table_body
)

In [None]:
# Render the assembled HTML table in the notebook output.
display(HTML(table))