In [110]:
from collections import defaultdict
import numpy as np
import pandas as pd 
from sklearn.utils import shuffle

In [111]:
# load text

def load_train_data(bow, data_path):
    """Read one review per line from ``data_path`` and grow the vocabulary.

    Each line is split on single spaces. The line terminator is not
    stripped, so the last token of a line keeps its trailing newline
    character.

    Args:
        bow: dict mapping token -> integer index. It is mutated in place:
            every token not yet present receives the next free index
            (``len(bow)`` at the time it is first seen).
        data_path: path of the text file to read.

    Returns:
        A list of reviews, each review being a list of string tokens.
    """
    reviews = []
    # The context manager closes the file even if reading raises; the
    # previous version left the handle open.
    with open(data_path, "r") as f:
        for line in f:
            review = line.split(" ")
            reviews.append(review)
            for word in review:
                if word not in bow:
                    bow[word] = len(bow)
    return reviews
                
                
def load_validation_data(data_path):
    """Read one review per line from ``data_path`` without touching any vocabulary.

    Each line is split on single spaces. The line terminator is not
    stripped, so the last token of a line keeps its trailing newline
    character.

    Args:
        data_path: path of the text file to read.

    Returns:
        A list of reviews, each review being a list of string tokens.
    """
    reviews = []
    # The context manager closes the file even if reading raises; the
    # previous version left the handle open.
    with open(data_path, "r") as f:
        for line in f:
            review = line.split(" ")
            reviews.append(review)
    return reviews
        
def unigram_BOW(reviews, bow, label):
    """Turn tokenized reviews into unigram count vectors paired with a label.

    Args:
        reviews: iterable of reviews, each an iterable of tokens.
        bow: dict mapping token -> column index. It must contain the key
            'UNKNOWN' whenever a review holds a token missing from ``bow``.
        label: value stored next to every vector.

    Returns:
        A list of ``[vector, label]`` pairs, where ``vector`` is a NumPy
        array of length ``len(bow)`` holding per-token counts.
    """
    vocab_size = len(bow)

    def vectorize(tokens):
        # Tokens missing from the vocabulary are tallied in the 'UNKNOWN' column.
        counts = np.zeros(vocab_size)
        for token in tokens:
            column = bow[token] if token in bow else bow['UNKNOWN']
            counts[column] += 1
        return counts

    return [[vectorize(tokens), label] for tokens in reviews]
                
            
# Input files; the loaders treat every line as one review.
train_decep = "./train/deceptive.txt"
train_truth = "./train/truthful.txt"
validation_decep = "./validation/deceptive.txt"
validation_truth = "./validation/truthful.txt"

# Unigram vocabulary (token -> column index), filled from the training files only.
uni_bow = {}

deceptive_reviews_train = load_train_data(uni_bow, train_decep)
truthful_reviews_train = load_train_data(uni_bow, train_truth)
# Reserve a column for out-of-vocabulary tokens. setdefault keeps the existing
# index if the literal token 'UNKNOWN' already occurs in the training text;
# overwriting it with len(uni_bow) would point one past the end of the count
# vectors and raise IndexError in unigram_BOW.
uni_bow.setdefault('UNKNOWN', len(uni_bow))

deceptive_reviews_val = load_validation_data(validation_decep)
truthful_reviews_val = load_validation_data(validation_truth)

# generate unigram BOW (label 0 = deceptive, 1 = truthful)
uni_train = unigram_BOW(deceptive_reviews_train, uni_bow, 0) + unigram_BOW(truthful_reviews_train, uni_bow, 1)
uni_val = unigram_BOW(deceptive_reviews_val, uni_bow, 0) + unigram_BOW(truthful_reviews_val, uni_bow, 1)

# A fixed random_state makes the shuffled row order reproducible across runs.
uni_train_X_y = shuffle(pd.DataFrame(uni_train, columns=['X', 'y']), random_state=42)
uni_val_X_y = shuffle(pd.DataFrame(uni_val, columns=['X', 'y']), random_state=42)


In [141]:
# train unigram NB 
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Unpack the shuffled frames into parallel lists of count vectors and labels.
uni_X_train, uni_y_train = list(uni_train_X_y.loc[:, 'X']), list(uni_train_X_y.loc[:, 'y'])
uni_X_val, uni_y_val = list(uni_val_X_y.loc[:, 'X']), list(uni_val_X_y.loc[:, 'y'])

# Fit a multinomial naive Bayes classifier on the unigram counts.
# (The stray `MultinomialNB(alpha=1.0, ...)` expression statements that used
# to sit here only built and discarded an estimator, so they were removed.)
uni_clf = MultinomialNB()
uni_clf.fit(uni_X_train, uni_y_train)

uni_y_pred = uni_clf.predict(uni_X_val)

print("Accuracy:", metrics.accuracy_score(uni_y_val, uni_y_pred))

# load test data
test_path = "./test/test.txt"
test_reviews = load_validation_data(test_path)

# Vectorize the test reviews with the same helper used for train/validation
# instead of a hand-copied loop; the label slot is a placeholder and is dropped.
test = [vector for vector, _label in unigram_BOW(test_reviews, uni_bow, None)]

test_y_pred = uni_clf.predict(test)
print(test_y_pred)

Accuracy: 0.91796875
[1 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 0
 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1
 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1
 0 1 1 0 1 1 0 0 1 1 1 0 1 1 0 1 0 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 0 1 0
 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0
 0 1 1 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 1 0 0 0 1 0 1 1
 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1
 1 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 1 0
 1 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0]


In [113]:
# generate bigram BOW

def bigram_BOW_dict(reviews, bow):
    """Register every bigram found in ``reviews`` in the vocabulary ``bow``.

    A bigram key is ``"<previous> <current>"``; the first token of a review
    is paired with the start marker ``'<s>'``.

    Args:
        reviews: iterable of reviews, each a sequence of string tokens.
        bow: dict mapping bigram -> integer index, mutated in place. A new
            bigram receives the index ``len(bow)`` at the time it is added.
    """
    for tokens in reviews:
        previous = '<s>'
        for current in tokens:
            bigram = previous + ' ' + current
            # len(bow) is evaluated before insertion, so a new key gets the next free index.
            bow.setdefault(bigram, len(bow))
            previous = current
                
def bigram_BOW(reviews, bow, label):
    """Turn tokenized reviews into bigram count vectors paired with a label.

    A bigram key is ``"<previous> <current>"``; the first token of a review
    is paired with the start marker ``'<s>'``.

    Args:
        reviews: iterable of reviews, each a sequence of string tokens.
        bow: dict mapping bigram -> column index. It must contain the key
            'UNKNOWN' whenever a review produces a bigram missing from ``bow``.
        label: value stored next to every vector.

    Returns:
        A list of ``[vector, label]`` pairs, where ``vector`` is a NumPy
        array of length ``len(bow)`` holding per-bigram counts.
    """
    width = len(bow)
    rows = []
    for tokens in reviews:
        counts = np.zeros(width)
        previous = '<s>'
        for current in tokens:
            bigram = previous + ' ' + current
            # Bigrams missing from the vocabulary are tallied in the 'UNKNOWN' column.
            column = bow[bigram] if bigram in bow else bow['UNKNOWN']
            counts[column] += 1
            previous = current
        rows.append([counts, label])
    return rows
        
# Bigram vocabulary (bigram -> column index), built from the training reviews only.
bi_bow = {}
bigram_BOW_dict(deceptive_reviews_train, bi_bow)
bigram_BOW_dict(truthful_reviews_train, bi_bow)
# Extra column that collects bigrams not seen during training.
bi_bow['UNKNOWN'] = len(bi_bow)

# Label 0 = deceptive, 1 = truthful.
bi_train = bigram_BOW(deceptive_reviews_train, bi_bow, 0) + bigram_BOW(truthful_reviews_train, bi_bow, 1)
bi_val = bigram_BOW(deceptive_reviews_val, bi_bow, 0) + bigram_BOW(truthful_reviews_val, bi_bow, 1)

# A fixed random_state makes the shuffled row order reproducible across runs.
bi_train_X_y = shuffle(pd.DataFrame(bi_train, columns=['X', 'y']), random_state=42)
bi_val_X_y = shuffle(pd.DataFrame(bi_val, columns=['X', 'y']), random_state=42)

bi_X_train, bi_y_train = list(bi_train_X_y.loc[:, 'X']), list(bi_train_X_y.loc[:, 'y'])
bi_X_val, bi_y_val = list(bi_val_X_y.loc[:, 'X']), list(bi_val_X_y.loc[:, 'y'])

# Fit and evaluate a multinomial naive Bayes classifier on the bigram counts.
# (A stray `MultinomialNB(alpha=1.0, ...)` expression statement that only built
# and discarded an estimator was removed.)
bi_clf = MultinomialNB()
bi_clf.fit(bi_X_train, bi_y_train)

bi_y_pred = bi_clf.predict(bi_X_val)

print("Accuracy:", metrics.accuracy_score(bi_y_val, bi_y_pred))

Accuracy: 0.859375


In [114]:
from collections import Counter

def uni_bow_filter(reviews, bow):
    """Accumulate token occurrence counts from ``reviews`` into ``bow``.

    Args:
        reviews: iterable of reviews, each an iterable of tokens.
        bow: dict mapping token -> count, mutated in place. Counts already
            present are incremented; new tokens start at 1.
    """
    for tokens in reviews:
        for token in tokens:
            if token in bow:
                bow[token] += 1
            else:
                bow[token] = 1

# Token -> number of occurrences across both training files.
uni_bow_freq = {}

uni_bow_filter(deceptive_reviews_train, uni_bow_freq)
uni_bow_filter(truthful_reviews_train, uni_bow_freq)

# Keep only tokens seen more than 10 times and give them consecutive column
# indices in first-seen order.
frequent_tokens = [k for k, v in uni_bow_freq.items() if v > 10]
filtered_dict = {k: i for i, k in enumerate(frequent_tokens)}
# Reserve a column for everything else. setdefault keeps the existing index if
# the literal token 'UNKNOWN' is itself a frequent token; overwriting it with
# len(filtered_dict) would point one past the end of the count vectors.
filtered_dict.setdefault('UNKNOWN', len(filtered_dict))

# Label 0 = deceptive, 1 = truthful.
fil_uni_train = unigram_BOW(deceptive_reviews_train, filtered_dict, 0) + unigram_BOW(truthful_reviews_train, filtered_dict, 1)
fil_uni_val = unigram_BOW(deceptive_reviews_val, filtered_dict, 0) + unigram_BOW(truthful_reviews_val, filtered_dict, 1)

# A fixed random_state makes the shuffled row order reproducible across runs.
fil_uni_train_X_y = shuffle(pd.DataFrame(fil_uni_train, columns=['X', 'y']), random_state=42)
fil_uni_val_X_y = shuffle(pd.DataFrame(fil_uni_val, columns=['X', 'y']), random_state=42)

fil_uni_X_train, fil_uni_y_train = list(fil_uni_train_X_y.loc[:, 'X']), list(fil_uni_train_X_y.loc[:, 'y'])
fil_uni_X_val, fil_uni_y_val = list(fil_uni_val_X_y.loc[:, 'X']), list(fil_uni_val_X_y.loc[:, 'y'])

# Fit and evaluate a multinomial naive Bayes classifier on the filtered counts.
# (A stray `MultinomialNB(alpha=1.0, ...)` expression statement that only built
# and discarded an estimator was removed.)
fil_uni_clf = MultinomialNB()
fil_uni_clf.fit(fil_uni_X_train, fil_uni_y_train)

fil_uni_y_pred = fil_uni_clf.predict(fil_uni_X_val)

print("Accuracy:", metrics.accuracy_score(fil_uni_y_val, fil_uni_y_pred))

Accuracy: 0.8984375


In [134]:
def bi_bow_filter(reviews, bow):
    """Accumulate bigram occurrence counts from ``reviews`` into ``bow``.

    A bigram key is ``"<previous> <current>"``; the first token of a review
    is paired with the start marker ``'<s>'``.

    Args:
        reviews: iterable of reviews, each a sequence of string tokens.
        bow: dict mapping bigram -> count, mutated in place. Counts already
            present are incremented; new bigrams start at 1.
    """
    for tokens in reviews:
        previous = '<s>'
        for current in tokens:
            bigram = previous + ' ' + current
            if bigram in bow:
                bow[bigram] += 1
            else:
                bow[bigram] = 1
            previous = current
            
# Bigram -> number of occurrences across both training files.
bi_bow_freq = {}

bi_bow_filter(deceptive_reviews_train, bi_bow_freq)
bi_bow_filter(truthful_reviews_train, bi_bow_freq)

# Keep only bigrams seen more than twice and give them consecutive column
# indices in first-seen order.
frequent_bigrams = [k for k, v in bi_bow_freq.items() if v > 2]
bi_filtered_dict = {k: i for i, k in enumerate(frequent_bigrams)}
# Extra column that collects every bigram that was filtered out or never seen.
bi_filtered_dict['UNKNOWN'] = len(bi_filtered_dict)

# Label 0 = deceptive, 1 = truthful.
fil_bi_train = bigram_BOW(deceptive_reviews_train, bi_filtered_dict, 0) + bigram_BOW(truthful_reviews_train, bi_filtered_dict, 1)
fil_bi_val = bigram_BOW(deceptive_reviews_val, bi_filtered_dict, 0) + bigram_BOW(truthful_reviews_val, bi_filtered_dict, 1)

# A fixed random_state makes the shuffled row order reproducible across runs.
fil_bi_train_X_y = shuffle(pd.DataFrame(fil_bi_train, columns=['X', 'y']), random_state=42)
fil_bi_val_X_y = shuffle(pd.DataFrame(fil_bi_val, columns=['X', 'y']), random_state=42)

fil_bi_X_train, fil_bi_y_train = list(fil_bi_train_X_y.loc[:, 'X']), list(fil_bi_train_X_y.loc[:, 'y'])
fil_bi_X_val, fil_bi_y_val = list(fil_bi_val_X_y.loc[:, 'X']), list(fil_bi_val_X_y.loc[:, 'y'])

# Fit and evaluate a multinomial naive Bayes classifier on the filtered counts.
# (A stray `MultinomialNB(alpha=1.0, ...)` expression statement that only built
# and discarded an estimator was removed.)
fil_bi_clf = MultinomialNB()
fil_bi_clf.fit(fil_bi_X_train, fil_bi_y_train)

fil_bi_y_pred = fil_bi_clf.predict(fil_bi_X_val)

print("Accuracy:", metrics.accuracy_score(fil_bi_y_val, fil_bi_y_pred))

Accuracy: 0.890625


In [137]:
# Bigram strings that count_keywords tallies as the "fake" feature.
# Some entries are listed in more than one capitalization.
top_fake = {
    "Hyatt Regency", "hyatt regency", "highly recommend", "hotel stay",
    "staff friendly", "east hotel", "trip Chicago", "trip chicago",
    "ambassador east", "hilton chicago", "Hilton Chicago", "recently stayed",
    "rock hotel", "place stay", "customer service", "hard rock",
    "stay hotel", "Hard Rock", "recommend hotel", "downtown chicago",
    "chicago hotel", "Chicago hotel", "hotel chicago", "hotel Chicago",
    "room service", "front desk",
}
# Bigram strings that count_keywords tallies as the "real" feature.
# Several entries also appear in top_fake, so one bigram can raise both counts.
top_real = {
    "great hotel", "desk staff", "stayed hotel", "hotel room",
    "friendly helpful", "magnificent mile", "customer service", "navy pier",
    "room clean", "michigan ave", "hotel staff", "recommend hotel",
    "walking distance", "great location", "stay hotel", "staff friendly",
    "room service", "front desk",
}

def count_keywords(top_fake, top_real, reviews, label):
    """Count how many adjacent-token bigrams of each review hit the keyword sets.

    Args:
        top_fake: container of bigram strings counted as "fake" hits.
        top_real: container of bigram strings counted as "real" hits.
        reviews: iterable of reviews, each a list of string tokens.
        label: value stored next to every feature pair.

    Returns:
        A list of ``[[real_count, fake_count], label]`` entries, one per
        review. Note the feature order: real first, then fake.
    """
    rows = []
    for tokens in reviews:
        # Adjacent pairs only; unlike the bigram BOW helpers, no '<s>' start marker is used.
        bigrams = [left + ' ' + right for left, right in zip(tokens, tokens[1:])]
        fake_hits = sum(1 for bigram in bigrams if bigram in top_fake)
        real_hits = sum(1 for bigram in bigrams if bigram in top_real)
        rows.append([[real_hits, fake_hits], label])
    return rows

# Two features per review, [real_count, fake_count]; label 0 = deceptive, 1 = truthful.
train = count_keywords(top_fake, top_real, deceptive_reviews_train, 0) + count_keywords(top_fake, top_real, truthful_reviews_train, 1)
val = count_keywords(top_fake, top_real, deceptive_reviews_val, 0) + count_keywords(top_fake, top_real, truthful_reviews_val, 1)

# A fixed random_state makes the shuffled row order reproducible across runs.
train_X_y = shuffle(pd.DataFrame(train, columns=['X', 'y']), random_state=42)
val_X_y = shuffle(pd.DataFrame(val, columns=['X', 'y']), random_state=42)

X_train, y_train = list(train_X_y.loc[:, 'X']), list(train_X_y.loc[:, 'y'])
X_val, y_val = list(val_X_y.loc[:, 'X']), list(val_X_y.loc[:, 'y'])

# Fit and evaluate a multinomial naive Bayes classifier on the keyword counts.
# (A stray `MultinomialNB(alpha=1.0, ...)` expression statement that only built
# and discarded an estimator was removed.)
clf = MultinomialNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_val)

print("Accuracy:", metrics.accuracy_score(y_val, y_pred))


Accuracy: 0.50390625
