In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from array import array
from scipy.sparse import csr_matrix
from scipy.sparse import vstack, hstack
from scipy import sparse
from sklearn.utils import shuffle

# Prep Data

In [17]:
n = 200000
df_train = pd.read_csv('reviews_tr.csv', header = 0).sample(n=n)
df_train.shape

(200000, 2)

In [18]:
review_list = df_train['text'].tolist()
labels_binary = df_train['label']
df_train[df_train['label'] == 0] = -1
labels = df_train['label']
labels = sparse.csr_matrix(labels).transpose()
labels_binary = sparse.csr_matrix(labels_binary).transpose()

In [65]:
count_vect = CountVectorizer(token_pattern='\\b\\w+\\b')
X_train_count = count_vect.fit_transform(review_list)

#create tf sparse matrix
tf_transformer = TfidfTransformer(use_idf = False).fit(X_train_count)
X_train_tf = tf_transformer.transform(X_train_count)
tf_data = hstack((labels,X_train_tf)).tocsr()
print tf_data.shape

#create tfidf sparse matrix
count_vect_tfidf = CountVectorizer(token_pattern='\\b\\w+\\b')
X_train_count_tfidf = count_vect_tfidf.fit_transform(review_list)
tfidf_transformer = TfidfTransformer(use_idf = True,smooth_idf=False, norm=None).fit(X_train_count_tfidf)
X_train_tfidf = tfidf_transformer.transform(X_train_count_tfidf)

#adjust according to definition by minus df_tf and adjusting for log_n by dividing  log(10)
X_train_tfidf.data = (X_train_tfidf.data-X_train_tf.data)/np.log(10) 
tfidf_data = hstack((labels,X_train_tfidf)).tocsr()
print tfidf_data.shape

#create naive bayes matrix
count_vect_nb = CountVectorizer(binary = True)
X_train_count_nb = count_vect_nb.fit_transform(review_list)
nb_data = hstack((labels_binary,X_train_count_nb)).tocsr()
print nb_data.shape

(200000, 93930)
(200000, 93930)
(200000, 93893)


In [39]:
#create bigram sparse matrix
bigram_vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b',ngram_range=(2,2),binary=False)
X_train_bigram_count = bigram_vectorizer.fit_transform(review_list)
tf_bigram = TfidfTransformer(use_idf = False).fit(X_train_bigram_count)
X_train_bigram = tf_bigram.transform(X_train_bigram_count)
bigram_data = hstack((labels,X_train_bigram)).tocsr()

print bigram_data.shape

dict_bigram = bigram_vectorizer.get_feature_names()

(200000, 2268278)


In [40]:
#create sparse matrix of tf-idf modified to exclude less useful words, create min frequency = 2, and weight 
#results 1+log(tf)

idf_mod_vectorizer = CountVectorizer(stop_words = ('a','i','the','to','is','ll','d','t','b','m','and','be','of','for',
                                                   'on','with','as','do','have','that'), min_df=2)
X_train_idf_mod_count = idf_mod_vectorizer.fit_transform(review_list)
idf_mod = TfidfTransformer(use_idf = False).fit(X_train_idf_mod_count)
X_train_idf_mod = idf_mod.transform(X_train_idf_mod_count)

X_train_idf_mod.data = (np.log10(X_train_idf_mod.data)) + 1.0
idf_mod_data = hstack((labels,X_train_idf_mod)).tocsr()
print idf_mod_data.shape

(200000, 49379)


# Define Model and Train

In [7]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import KFold
import time

In [8]:
def naiveBayes(train_data, test_data):
    y = train_data[:,0].toarray().ravel()
    x = train_data[:,1:].toarray()
    y_test = test_data[:,0].toarray().ravel()
    x_test = test_data[:,1:].toarray()
    
    bernNB = BernoulliNB()
    y_pred = bernNB.fit(x, y).predict(x_test)
    error_rate = (y_test != y_pred).sum() / float(x_test.shape[0])
    return error_rate

In [9]:
def avgPerceptron(train_data, test_data):
    W = np.zeros(shape = (1,train_data.shape[1]-1))
    U = np.zeros(shape = (1,train_data.shape[1]-1))
    b = 0  #bias
    beta = 0  #stored bias
    c = 1

    train_data1 = shuffle(train_data)

    #FirstPass Perceptron
    for row in train_data1:
        y = row[:,:1].toarray()
        x = row[:,1:].toarray()
        a = np.dot(W,x.transpose()) + b
        if y*a <= 0:
            W = W + y*x
            b = b + y
    
    #2ndPass AVG Perceptron
    train_data2 = shuffle(train_data)

    for row in train_data2:
        y = row[:,:1].toarray()
        x = row[:,1:].toarray()
        a = np.dot(W,x.transpose()) + b
        if y*a<= 0:
            W = W + y*x
            b = b + y
            U = U + y*c*x
            beta = beta + y*c
        c = c +1
    
    W = (W - (1/c)*U)
    b = b - (1/c)*beta
    
    errors = 0
    
    for row in test_data:
        y_test = row[:,:1].toarray()
        x_test = row[:,1:].toarray()
        if y_test != np.sign(np.dot(W,x_test.transpose())+b):
            errors +=1
    error_rate = errors / float(test_data.shape[0])
    return error_rate

In [10]:
kf = KFold(n = df_train.shape[0], n_folds = 5, shuffle = True)

error_list = []
startT = time.time()
for train_index, test_index in kf:

    #tf avgPerceptron
    error_rate = avgPerceptron(tf_data[train_index], tf_data[test_index])
    error_list.append(error_rate)
    print "tf",  np.round((time.time() - startT)/60,2) , "minutes"

    #tfidf avgPerceptron
    error_rate = avgPerceptron(tfidf_data[train_index], tfidf_data[test_index])
    error_list.append(error_rate)
    print "tfidf",  np.round((time.time() - startT)/60,2) , "minutes"

    #tfidf_mod avgPerceptron
    error_rate = avgPerceptron(idf_mod_data[train_index], idf_mod_data[test_index])
    error_list.append(error_rate)
    print "tfidf_mod",  np.round((time.time() - startT)/60,2) , "minutes"
    
    #bigram avgPerceptron
    error_rate = avgPerceptron(bigram_data[train_index], bigram_data[test_index])
    error_list.append(error_rate)
    print "bigram",  np.round((time.time() - startT)/60,2) , "minutes"
    
    #NaiveBayes
    error_rate = naiveBayes(nb_data[train_index], nb_data[test_index])
    error_list.append(error_rate)
    print "NaiveBayes",  np.round((time.time() - startT)/60,2) , "minutes"
    print error_list

In [11]:
error_list = np.array(error_list).reshape(5,5)
df_errors = pd.DataFrame(errors2, columns = ['nb'])#,'bigram_avgP', 'tfidf_mod_avgP',

print df_errors_nb
df_errors_nb.mean(axis = 0)

# Test Results

In [20]:
df_test = pd.read_csv('reviews_te.csv', header = 0)
df_test.shape

(320122, 2)

In [21]:
review_list_test = df_test['text'].tolist()
df_test[df_test['label'] == 0] = -1
labels_test = df_test['label']
labels_test = sparse.csr_matrix(labels_test).transpose()

In [50]:
dict_bigram

[u'0 0',
 u'0 00',
 u'0 00005',
 u'0 01',
 u'0 02',
 u'0 05',
 u'0 1',
 u'0 10',
 u'0 100',
 u'0 19',
 u'0 2',
 u'0 20',
 u'0 25',
 u'0 27',
 u'0 3',
 u'0 30',
 u'0 34',
 u'0 35',
 u'0 38',
 u'0 39',
 u'0 4',
 u'0 40',
 u'0 45',
 u'0 49',
 u'0 5',
 u'0 50',
 u'0 52',
 u'0 54',
 u'0 55',
 u'0 60',
 u'0 69',
 u'0 7',
 u'0 70',
 u'0 75',
 u'0 76',
 u'0 8',
 u'0 80',
 u'0 85',
 u'0 86',
 u'0 89',
 u'0 90',
 u'0 95',
 u'0 99',
 u'0 actually',
 u'0 affecting',
 u'0 again',
 u'0 ambiance',
 u'0 and',
 u'0 anyway',
 u'0 as',
 u'0 at',
 u'0 balance',
 u'0 because',
 u'0 being',
 u'0 burger',
 u'0 but',
 u'0 calories',
 u'0 cocktail',
 u'0 complains',
 u'0 complaints',
 u'0 concessions',
 u'0 cotton',
 u'0 crab',
 u'0 daisy',
 u'0 days',
 u'0 don',
 u'0 earthquake',
 u'0 excellent',
 u'0 finally',
 u'0 flavor',
 u'0 food',
 u'0 for',
 u'0 frame',
 u'0 friendly',
 u'0 friends',
 u'0 garlic',
 u'0 happy',
 u'0 health',
 u'0 heat',
 u'0 here',
 u'0 http',
 u'0 hubby',
 u'0 i',
 u'0 if',
 u'0 in',
 

In [73]:
count_vect_test = CountVectorizer(ngram_range=(2, 2), binary=False, vocabulary=dict_bigram, token_pattern='\\b\\w+\\b')
test_data_count = count_vect_test.fit_transform(review_list_test)
tf_bigram_test = TfidfTransformer(use_idf = False).fit(test_data_count)
X_test_bigram = tf_bigram_test.transform(test_data_count)

test_bigram = hstack((labels_test,X_test_bigram)).tocsr()

(320122, 2268278)


In [76]:
training_results = avgPerceptron(bigram_data, bigram_data)

In [79]:
print "training error:", training_results
print "test error:", test_results

training error: 0.116971029795


In [1]:
# test_results = avgPerceptron(bigram_data, test_bigram)


In [11]:
2*2*2*2*2*2

64