In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict

In [None]:
class Classifier(object):
    """Attach one category label to every document of a model and plot them.

    The ``model`` object is expected to provide the attributes this class
    reads: ``docs`` (iterable of document ids), ``images`` (an object with a
    ``find_one(query)`` method -- presumably a MongoDB collection, confirm
    against the model class), ``url`` (the field name used to look a
    document up by id) and ``M`` (the data matrix, one row per document).
    """

    def __init__(self, model, labels_tag):
        """Collect the label stored under ``labels_tag`` for every document.

        Args:
            model: object exposing ``docs``, ``images``, ``url`` and ``M``.
            labels_tag: key of the field holding the category of a document.

        Documents that cannot be found, or that have no ``labels_tag``
        field, are labelled ``'unknown'``.
        """
        self.model = model
        self.categories = []
        for doc_id in self.model.docs:
            doc = self.model.images.find_one({self.model.url: doc_id})
            if doc is None:
                # No record matched this id: use the same fallback as a
                # missing tag instead of failing on ``None[labels_tag]``.
                label = 'unknown'
            else:
                try:
                    label = doc[labels_tag]
                except KeyError:
                    label = 'unknown'
            self.categories.append(label)
        self.pca = PCA(n_components=2)

    def show(self, axes, data=None, categories=None):
        """Scatter-plot a 2-component PCA projection, one colour per category.

        Args:
            axes: object with a matplotlib-compatible ``scatter`` method.
            data: matrix to project; defaults to ``self.model.M``.
            categories: one label per row of ``data``; defaults to
                ``self.categories``.

        Every call refits ``self.pca`` on ``data``. Each category is drawn
        with ``label=<category>``, but no legend is created here.
        """
        if data is None:
            data = self.model.M
        if categories is None:
            categories = self.categories

        # Group row indices by label in a single pass. Labels keep their
        # first-appearance order, so the colour given to each label is
        # reproducible from run to run (set ordering is not).
        rows_by_label = {}
        ordered_labels = []
        for row, label in enumerate(categories):
            if label not in rows_by_label:
                rows_by_label[label] = []
                ordered_labels.append(label)
            rows_by_label[label].append(row)

        projected = self.pca.fit_transform(data)
        # ``plt.cm.get_cmap`` no longer exists in recent Matplotlib releases;
        # ``plt.get_cmap`` takes the same (name, lut) arguments.
        cmap = plt.get_cmap('nipy_spectral', len(ordered_labels))
        for colour_index, label in enumerate(ordered_labels):
            rows = rows_by_label[label]
            # ``color=`` sets one colour for the whole group. Passing an RGBA
            # tuple through ``c=`` is ambiguous when the group happens to
            # contain as many points as the tuple has entries.
            axes.scatter(projected[rows, 0], projected[rows, 1],
                         color=cmap(colour_index), alpha=0.4, label=label)


class Supervised(Classifier):
    """Classifier with a per-category train/test split and ready estimators.

    Besides what ``Classifier`` sets up, instances hold one unfitted
    estimator of each kind: ``gaussian``, ``bernoulli``, ``multinomial``,
    ``svc`` and ``knn`` (10 neighbours).
    """

    def __init__(self, model, labels_tag, train_size=0.75):
        """Initialise the base classifier and create the estimators.

        Args:
            model: forwarded to ``Classifier.__init__``.
            labels_tag: forwarded to ``Classifier.__init__``.
            train_size: fraction (0 to 1) of each category used for training.
        """
        super(Supervised, self).__init__(model, labels_tag)
        self.train_size = train_size
        self.gaussian = GaussianNB()
        self.bernoulli = BernoulliNB()
        self.multinomial = MultinomialNB()
        self.svc = SVC()
        self.knn = KNeighborsClassifier(n_neighbors=10)

    def train_test(self):
        """
        Returns train set and categories
        and test set and categories

        The split is done category by category: the first
        ``int(n * train_size)`` documents of a category (in the order they
        appear in ``self.categories``) go to the train set, the rest to the
        test set. No shuffling is performed. Because the count is truncated,
        a small category may contribute no training sample at all.

        Returns:
            Tuple ``(train, train_categories, test, test_categories)`` where
            the sets are numpy arrays built from rows of ``self.model.M``
            and the categories are lists aligned with those rows.

        Raises:
            ValueError: if ``self.train_size`` is not between 0 and 1.
        """
        if not 0 <= self.train_size <= 1:
            # Outside this range the slices below and the label counts would
            # disagree, yielding label lists that do not match the samples.
            raise ValueError(
                'train_size must be between 0 and 1, got %r'
                % (self.train_size,))

        cat = defaultdict(list)
        for i, c in enumerate(self.categories):
            cat[c].append(i)

        train, test = [], []
        r_cat, t_cat = [], []
        for c, docs in cat.items():
            s = int(len(docs) * self.train_size)
            train_docs, test_docs = docs[:s], docs[s:]
            train += train_docs
            test += test_docs
            # Label counts come from the slices themselves so samples and
            # labels always stay aligned.
            r_cat += [c] * len(train_docs)
            t_cat += [c] * len(test_docs)

        train = np.array([self.model.M[x] for x in train])
        test = np.array([self.model.M[x] for x in test])
        return train, r_cat, test, t_cat

    def train(self, train_set, train_categories, method):
        """Fit ``method`` on ``train_set`` with labels ``train_categories``.

        ``method`` is any object with a ``fit(X, y)`` method, for instance
        one of the estimators created in ``__init__``.
        """
        method.fit(train_set, train_categories)

    def predict(self, test_set, method):
        """Return ``method.predict(test_set)`` for an already fitted method."""
        return method.predict(test_set)