In [1]:
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation as cv
from sklearn import svm
import pandas as pd
import numpy as np
import hazm
import os

In [None]:
import nazarkav as nk

# Folder called 'data' located under the first entry of the nazarkav
# package search path; the data files used below are read from here.
data_path = os.path.join(
    nk.__path__[0],
    'data',
)

In [None]:
# Shared memo of cross-validation scores; filled in by make_models().
cache = dict()

# Tab-separated polarity data set, read from the package data folder.
hotel_pol = pd.read_csv(
    os.path.join(data_path, 'hotel-polarity.tsv'),
    sep='\t',
)

# Plain Python list holding the 'comment' column, fed to the vectorizers.
hotel_comment = hotel_pol['comment'].tolist()

In [None]:
# # B_BOW if tf was False
# # TF_BOW if tf was True
# def bow(tf,ng=(1,1)):
#     def _bow(maxf=None):
#         vectorizer = CountVectorizer(
#             ngram_range=ng,
#             binary=not tf,
#             tokenizer=nk.Preprocessor().tokenize,
#             preprocessor=nk.Preprocessor().clean,
#             max_features=maxf)
#         return vectorizer.fit_transform(hotel_comment) 
#     return _bow

# def b_bow(ng=(1,1)): return bow(False, ng)
# def tf_bow(ng=(1,1)): return bow(True, ng)
# def tfidf_bow(ng=(1,1)):
#     def _bow(maxf=None):
#         vectorizer = TfidfVectorizer(
#             ngram_range=ng,
#             tokenizer=nk.Preprocessor().tokenize,
#             preprocessor=nk.Preprocessor().clean,
#             max_features=maxf)
#         return vectorizer.fit_transform(hotel_comment) 
#     return _bow

In [63]:
def binary_bow(maxf=None, ng=(1,1)):
    """Fit a binary bag-of-words model on ``hotel_comment`` and transform it.

    Parameters
    ----------
    maxf : int or None
        Forwarded to ``CountVectorizer`` as ``max_features``.
    ng : tuple
        Forwarded to ``CountVectorizer`` as ``ngram_range``.

    Returns
    -------
    The document-term matrix produced by ``fit_transform`` on
    ``hotel_comment``.
    """
    # Tokenizing and cleaning are delegated to two separate
    # nk.Preprocessor instances, created in this order.
    options = dict(
        ngram_range=ng,
        binary=True,
        tokenizer=nk.Preprocessor().tokenize,
        preprocessor=nk.Preprocessor().clean,
        max_features=maxf,
    )
    model = CountVectorizer(**options)
    return model.fit_transform(hotel_comment)

In [65]:
def tf_bow(maxf=None, ng=(1,1)):
    """Fit a term-count bag-of-words model on ``hotel_comment`` and transform it.

    Parameters
    ----------
    maxf : int or None
        Forwarded to ``CountVectorizer`` as ``max_features``.
    ng : tuple
        Forwarded to ``CountVectorizer`` as ``ngram_range``.

    Returns
    -------
    The document-term matrix produced by ``fit_transform`` on
    ``hotel_comment``.
    """
    # Tokenizing and cleaning are delegated to two separate
    # nk.Preprocessor instances, created in this order.
    options = dict(
        ngram_range=ng,
        tokenizer=nk.Preprocessor().tokenize,
        preprocessor=nk.Preprocessor().clean,
        max_features=maxf,
    )
    model = CountVectorizer(**options)
    return model.fit_transform(hotel_comment)

In [66]:
def tfidf_bow(maxf=None, ng=(1,1)):
    """Fit a TF-IDF bag-of-words model on ``hotel_comment`` and transform it.

    Parameters
    ----------
    maxf : int or None
        Forwarded to ``TfidfVectorizer`` as ``max_features``.
    ng : tuple
        Forwarded to ``TfidfVectorizer`` as ``ngram_range``.

    Returns
    -------
    The document-term matrix produced by ``fit_transform`` on
    ``hotel_comment``.
    """
    # No ``binary=True`` here: on TfidfVectorizer that flag clamps every
    # non-zero term frequency to 1, which would turn this into a
    # binary-tf * idf weighting instead of the TF-IDF the name promises.
    vectorizer = TfidfVectorizer(
        ngram_range=ng,
        tokenizer=nk.Preprocessor().tokenize,
        preprocessor=nk.Preprocessor().clean,
        max_features=maxf)
    return vectorizer.fit_transform(hotel_comment)

In [None]:
# Vocabulary sizes (``max_features``) evaluated for every feature extractor.
max_features = [100, 300, 500, 1000, 3000, 5000, 7000, 10000, 12000, 13000, 15000, 17000, 20000, 25000, 30000]

# (builder, label) pairs. Each builder is a callable taking the vocabulary
# size and returning the vectorized corpus; make_models() invokes it as
# ``p(m)``. Lambdas are used so that nothing is vectorized when this cell
# runs and so that the extractor functions are looked up at call time.
preprocess = [
    (lambda maxf: binary_bow(maxf), 'B_BOW'),
    (lambda maxf: tf_bow(maxf), 'TF_BOW'),
    (lambda maxf: tf_bow(maxf, ng=(1, 2)), 'TF_BOW+2gram'),
    (lambda maxf: tf_bow(maxf, ng=(1, 3)), 'TF_BOW+3gram'),
    (lambda maxf: tfidf_bow(maxf), 'TFIDF_BOW'),
    (lambda maxf: tfidf_bow(maxf, ng=(1, 2)), 'TFIDF_BOW+2gram'),
    (lambda maxf: tfidf_bow(maxf, ng=(1, 3)), 'TFIDF_BOW+3gram')]

In [None]:
def make_models(clf, is_cache=True):
    """Cross-validate ``clf`` on every (feature extractor, vocabulary size) pair.

    For each ``(builder, label)`` entry of the global ``preprocess`` list and
    each value of the global ``max_features`` list, the corpus is vectorized
    with ``builder(size)`` and ``clf`` is scored with 5-fold cross-validation
    against the ``"c"`` column of ``hotel_pol``. The mean score is stored in
    the global ``cache``.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``cross_val_score``.
    is_cache : bool
        When True, scores already present in ``cache`` are reused; when
        False, every score is recomputed and the cache entry overwritten.

    Returns
    -------
    pandas.DataFrame
        One row per extractor label, one column per vocabulary size. The
        frame is also passed to ``nk.dataframe2png`` before being returned.
    """
    # NOTE(review): ``sklearn.cross_validation`` (imported as ``cv``) was
    # removed in scikit-learn 0.20; ``sklearn.model_selection`` provides
    # ``cross_val_score`` on newer versions.

    # Loop-invariant: the label column is the same for every configuration.
    labels = hotel_pol["c"].tolist()
    # The classifier is part of the cache key so that scores computed for one
    # estimator are never returned for a different one.
    clf_key = repr(clf)

    acc_matrix = []
    index = []
    for p, name in preprocess:
        index.append(name)
        row = []
        for m in max_features:
            key = (clf_key, name, m)
            if not is_cache or cache.get(key) is None:
                cache[key] = cv.cross_val_score(clf, p(m), labels, cv=5).mean()
            row.append(cache[key])
        acc_matrix.append(row)
    df = pd.DataFrame(acc_matrix, index=index, columns=max_features)
    nk.dataframe2png(df, height=200)
    return df

In [None]:
# NOTE: this rebinds the global ``max_features`` to a much shorter list;
# anything that reads ``max_features`` after this cell runs sees these values.
max_features = [100, 300]
bow = []
ngram_range = []
def make_models2(clf, is_cache=True):
    """Placeholder for a second model-building routine.

    The original cell contained only the signature, which is a syntax
    error. Until the body is written, calling this raises
    ``NotImplementedError`` so the gap is explicit.

    Parameters
    ----------
    clf : estimator
        Currently unused.
    is_cache : bool
        Currently unused.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("make_models2 has not been implemented yet")

In [None]:
#df = make_models(MultinomialNB())

In [None]:
# %matplotlib qt
# df.T.plot()

In [18]:
# Empty frame with the four named columns and no rows.
df = pd.DataFrame(
    columns=[
        'max_features',
        'bow',
        'ngram_range',
        'naive_bayes',
    ],
)
