# Use of different classifiers from sklearn

In [1]:
import numpy as np
import nbimporter
import pandas as pd
from A03_indexing import Index
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

Importing Jupyter notebook from A03_indexing.ipynb


In [2]:
# Build the index over the whole corpus (docids=None).
# NOTE(review): the meaning of each argument is inferred from its name;
# see A03_indexing for the actual contract of Index.
H = Index(
    'inforet',
    'epicurious_tokens',
    labels=['ingredients'],
    docids=None,
    field='lemma',
)

In [3]:
# Turn the index mapping H.U into a table with one column per token
# (presumably one row per document); absent entries are NaN at this point.
dH = pd.DataFrame.from_dict(H.U, orient='columns')

In [4]:
# Preview the first rows of the raw table (missing entries still NaN).
dH.head(n=5)

Unnamed: 0,(,),",",-,1,1/2,12-inch,2,3,3/4,...,shaping,12x9-inch,11/,skordalia,vali,oilepi,rolls,ounces)(soybean,coax,drambuie
0,1.0,1.0,9.0,3.0,7.0,2.0,1.0,2.0,1.0,1.0,...,,,,,,,,,,
1,,,10.0,,6.0,2.0,,3.0,7.0,1.0,...,,,,,,,,,,
2,2.0,2.0,4.0,,3.0,1.0,,3.0,,,...,,,,,,,,,,
3,1.0,1.0,6.0,4.0,4.0,1.0,,2.0,1.0,,...,,,,,,,,,,
4,,,2.0,1.0,3.0,1.0,,1.0,,,...,,,,,,,,,,


In [5]:
# Missing entries become 0.
dH = dH.fillna(0)
# dU is the presence/absence version of dH: every non-zero value becomes 1,
# zeros stay 0.
dU = dH.mask(dH != 0, 1)

In [6]:
# Same preview after filling: NaN entries are now 0.
dH.head(n=5)

Unnamed: 0,(,),",",-,1,1/2,12-inch,2,3,3/4,...,shaping,12x9-inch,11/,skordalia,vali,oilepi,rolls,ounces)(soybean,coax,drambuie
0,1.0,1.0,9.0,3.0,7.0,2.0,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,10.0,0.0,6.0,2.0,0.0,3.0,7.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2.0,4.0,0.0,3.0,1.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,6.0,4.0,4.0,1.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,2.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Preview of the binary (0/1) table.
dU.head(n=5)

Unnamed: 0,(,),",",-,1,1/2,12-inch,2,3,3/4,...,shaping,12x9-inch,11/,skordalia,vali,oilepi,rolls,ounces)(soybean,coax,drambuie
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature selection by variance
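Keep only the features whose variance exceeds a threshold. For a binary feature that equals 1 in a fraction $p$ of the documents, the variance is:
$$
Var(X) = p(1-p)
$$
For example, a threshold of $0.2 \cdot (1 - 0.2) = 0.16$ keeps only the features that appear in more than 20% and fewer than 80% of the documents.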

In [19]:
# Binary table: threshold written as p(1 - p) with p = 0.2.
SU = VarianceThreshold(threshold=(.2 * (1 - .2)))
# Count table: plain variance threshold of 0.4.
SH = VarianceThreshold(threshold=0.4)

# Fit each selector on its own table, then keep only the surviving columns.
KU = SU.fit(dU).transform(dU)
KH = SH.fit(dH).transform(dH)

In [20]:
# Show the binary features that passed the variance filter.
# get_support() is evaluated once and used as a boolean mask over the column
# labels, instead of being recomputed for every single column.
dU[dU.columns[SU.get_support()]].head()

Unnamed: 0,(,),-,1/2,3,3/4,4,6,and,black,...,leaf,onion,red,sugar,unsalted,finely,white,about,ounce,water
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature selection by chi2
Keep the features that score best on the $\chi^2$ test. Note that this test is supervised: it needs the classification target (the class label) of every document in the training set.

In [21]:
import pymongo

In [22]:
# Note: despite its name, `db` is the 'epicurious' collection of the 'inforet' database.
db = pymongo.MongoClient()['inforet']['epicurious']

# Filter: recipes that have a calories value between 0 and 2000 (inclusive).
q = {'calories': {'$exists': True, '$gte': 0, '$lte': 2000}}
# Projection: return only the calories field.
p = {'_id': 0, 'calories': 1}

target_field = [recipe['calories'] for recipe in db.find(q, p)]

In [63]:
# 25th, 50th and 75th percentiles of the selected calorie values.
np.percentile(target_field, q=[25, 50, 75])

array([195., 323., 562.])

In [68]:
# Pair each percentile cut point with a class label: (upper bound, label),
# labels 0, 1, 2 in increasing order of the bound.
classes = list(zip(np.percentile(target_field, [25, 50, 75]), range(3)))

In [69]:
# Display the (upper calorie bound, class label) pairs.
classes

[(195.0, 0), (323.0, 1), (562.0, 2)]

In [70]:
# Collect the recipe ids of each calorie class, keyed by the class upper bound.
# Class 0 covers [0, bound]; every later class covers (previous bound, bound].
# NOTE(review): recipes above the last bound in `classes` fall into no class;
# confirm that leaving them out is intended.
docs = {}
for upper_bound, label in classes:
    if label == 0:
        q = {'calories': {'$exists': True, '$gte': 0, '$lte': upper_bound}}
    else:
        lower_bound = classes[label - 1][0]
        q = {'calories': {'$exists': True, '$gt': lower_bound, '$lte': upper_bound}}
    # Only `_id` is read, so project it instead of fetching whole documents.
    docs[upper_bound] = [r['_id'] for r in db.find(q, {'_id': 1})]

In [71]:
# Training split: the first 75% (rounded up) of the documents of each class.
# `train` holds the document ids and `target` the class label of each one,
# position by position.
TRAIN_FRACTION = 0.75
class_of = dict(classes)  # upper bound -> class label, built once

target, train = [], []
for upper_bound, doc_ids in docs.items():
    n_train = int(np.ceil(len(doc_ids) * TRAIN_FRACTION))
    selected = doc_ids[:n_train]
    train += selected
    # One label per selected document.
    target += [class_of[upper_bound]] * len(selected)

In [72]:
# Same index as before, but restricted to the training document ids.
# NOTE(review): the meaning of each argument is inferred from its name;
# see A03_indexing for the actual contract of Index.
T = Index(
    'inforet',
    'epicurious_tokens',
    labels=['ingredients'],
    docids=train,
    field='lemma',
)

In [73]:
# Training table: missing entries become 0, every non-zero value becomes 1.
# NOTE(review): the labels in `target` follow the order of `train`; nothing
# here checks that the rows of dT come out in that same order. Confirm that
# the row order of dT matches `train` before fitting on (dT, target).
dT = pd.DataFrame.from_dict(T.U).fillna(0)
dT = dT.mask(dT != 0, 1)

In [74]:
# Preview of the binary training table.
dT.head(n=5)

Unnamed: 0,grate,2,juice,syrup,corn,lemon,yolk,sugar,cream,bread,...,opal,g/,morsel,calorie,poire,cep,7up,nothing,dress,680
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# Score every feature against the class labels with chi2 and keep the 200 best.
C = SelectKBest(score_func=chi2, k=200)
kT = C.fit(dT, target).transform(dT)

In [76]:
# Boolean mask over the columns of dT: True for the features that were kept.
t_kept_features = C.get_support(indices=False)

In [77]:
# Restrict the training table to the kept features, keeping the column names.
dKT = dT[dT.columns[t_kept_features]]

In [78]:
# Preview of the training table after chi2 selection.
dKT.head(n=5)

Unnamed: 0,corn,divide,optional,half,breast,buttermilk,cinnamon,split,cilantro,melt,...,rectangle,unpitted,completely,cubanelle,61,even,fast,standard,semidri,moderately
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Naive Bayes

In [79]:
from sklearn.naive_bayes import MultinomialNB
# sklearn.svm exposes no `SVM` name (importing it raises ImportError);
# the support-vector classifier class is `SVC`.
from sklearn.svm import SVC

In [80]:
# Multinomial Naive Bayes on the selected binary features; class priors are
# learned from the data (fit_prior=True), smoothing is left at alpha=1.0.
classifier = MultinomialNB(alpha=1.0, fit_prior=True)
classifier.fit(X=dKT, y=target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [81]:
# Predict on the same rows the classifier was fitted on, so any score computed
# from y_pred measures fit to the training data, not generalization.
y_pred = classifier.predict(X=dKT)

In [82]:
import sklearn.metrics as met

In [83]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
met.confusion_matrix(y_true=target, y_pred=y_pred)

array([[1783,  596,  565],
       [1352, 1029,  545],
       [1319,  618,  995]])