# Project Description

Project 1: fake review detection
Data
This dataset includes reviews for restaurants located in New York City. Each review includes a product ID and user ID, a timestamp, a rating, and the plaintext review. Yelp has a filtering algorithm in place that identifies fake/suspicious reviews and separates them into a filtered list. This Yelp dataset contains both recommended and filtered reviews; we treat them as genuine and fake, respectively. Your goal is to predict whether a review is fake or not, i.e., a binary classification task. The positive class (1) is fake reviews and the negative class (0) is genuine reviews. Note that the classes are imbalanced, with around 10% fake reviews.

Evaluation
Your model should output a score for each example; higher score indicates the example is more likely to be fake.
We will evaluate the results using auROC and AP.

# Pull in Dataset from Codalab

In [1]:
import pandas as pd
import os

# Both CSV splits are read from the current working directory.
path = os.getcwd()

train_df_pre = pd.read_csv(f"{path}/train.csv")
validation_df_pre = pd.read_csv(f"{path}/dev.csv")

# HW4 uses 1 / -1 as labels, so every 0 label is rewritten to -1 in both splits.
for frame in (train_df_pre, validation_df_pre):
    frame['label'] = frame['label'].replace(to_replace=0, value=-1)

In [2]:
from collections import Counter
# Tally how many training rows carry each label value (class balance check).
print(Counter(train_df_pre['label']))

Counter({-1: 225055, 1: 25819})


In [3]:
# Preview the first rows of the dev split; left as a bare expression so the
# notebook renders it as the cell output.
validation_df_pre.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,11,934,0,5.0,1,2014-01-20,"all around good place, cozy, I came in and did..."
1,17,940,0,4.0,-1,2014-09-16,"For lunch, my friend and I had: -Lamb sandwich..."
2,20,943,0,5.0,-1,2014-05-24,Some good Big Greek cooking!! Came to City on ...
3,30,953,0,4.0,-1,2013-10-17,So... as you may notice from some of my other ...
4,43,966,0,3.0,-1,2012-12-19,"I don't understand the whole ""You can't order ..."


- Should we normalize case?  e.g., does HELLO mean Hello mean hello?
- Should we remove stop words?
- Should we remove punctuation, special symbols?
- Should we lemmatise?  "There is currently no lemmatiser with a very high accuracy rate":
e.g., caresses -> caress, ponies -> poni, etc.
- Less common are error correction, converting words to parts of speech, and mapping synonyms to a single word.  (See the nltk library.)
from nltk import stem
# Import the corpus reader under an alias: the name `stopwords` is reused below
# for the word set, and rebinding the imported module itself would make a
# second run of this cell fail ('set' object has no attribute 'words').
from nltk.corpus import stopwords as nltk_stopwords

# Snowball stemmer for English, shared by the normalizer below.
stemmer = stem.SnowballStemmer('english')

# `stopwords` is the set of English stop words used for membership tests.
stopwords = set(nltk_stopwords.words('english'))

def sample_normalizer(msg):
    """
    Lowercase a message, drop stop words, and stem the remaining words.

    @param str msg: raw text; it is split on whitespace after lowercasing.
    @return str: the stemmed, non-stop-word tokens joined by single spaces.

    Relies on the module-level `stopwords` collection and `stemmer` object.
    """
    kept_words = (word for word in msg.lower().split() if word not in stopwords)
    return " ".join(stemmer.stem(word) for word in kept_words)

# Apply the normalizer defined above to every entry of the text column.
# (The original referenced `review_messages`, which is not defined anywhere.)
# NOTE(review): `data` and its 'text' column are not defined in the visible
# notebook -- presumably placeholders for the review DataFrame; confirm before
# running this line.
data['text'] = data['text'].apply(sample_normalizer)


In [4]:
import os
import numpy as np
import pickle
import random

In [5]:
#For training, remove special symbols, and remake list(review), label combos.
# Each entry of train_df is the review's cleaned tokens followed by its label
# (as a string) in the last position.
symbols = '${}()[].,:;+-*/&|<>=~" '
# Build the deletion table once rather than once per token.
strip_symbols = str.maketrans("", "", symbols)

train_df = []
# Walk the two columns in lockstep. Routing the pairs through dict(zip(...))
# keys on the review text, which silently drops every repeated review.
for review, label in zip(train_df_pre['review'], train_df_pre['label']):
    tokens = (word.translate(strip_symbols).strip() for word in review.split(' '))
    # Tokens that consisted only of symbols/whitespace are now empty; skip them.
    r = [token for token in tokens if token]
    r.append(str(label))
    train_df.append(r)

In [6]:
#Commented out since the below takes a while (2 minutes)
#train_df[0]

In [7]:
# For validation, remove special symbols and rebuild each row as the review's
# cleaned tokens followed by its label (as a string) in the last position.
symbols = '${}()[].,:;+-*/&|<>=~" '
# Build the deletion table once rather than once per token.
strip_symbols = str.maketrans("", "", symbols)

validation_df = []
# Walk the two columns in lockstep. Routing the pairs through dict(zip(...))
# keys on the review text, which silently drops every repeated review.
for review, label in zip(validation_df_pre['review'], validation_df_pre['label']):
    tokens = (word.translate(strip_symbols).strip() for word in review.split(' '))
    # Tokens that consisted only of symbols/whitespace are now empty; skip them.
    r = [token for token in tokens if token]
    r.append(str(label))
    validation_df.append(r)

In [8]:
#Commented out since the below takes a while (2 minutes)
#validation_df[0]

In [9]:
from collections import Counter
# Bag-of-words conversion for a whole dataset.
def BOW(dataset):
    """
    Count the tokens of every review in a dataset.

    @param list dataset: reviews, each a sequence whose final element is the
        label; that final element is left out of the counts.
    @return dict: maps the review's position in `dataset` (0-based) to a
        Counter of its tokens.
    """
    return {
        position: Counter(review[:-1])
        for position, review in enumerate(dataset)
    }

#One can slice and take one or many examples of a dataset, as the commented 
#out print() shows immediately below.
#print(BOW(train_df[2:3])) #convert example 2 to BOW

In [10]:
# Bag-of-words for the single review at index 2; slicing keeps it a
# one-element list, so the result is keyed by 0.
print(BOW(train_df[2:3]))

{0: Counter({'lunch': 2, 'the': 2, 'ordered': 1, 'for': 1, '15': 1, 'from': 1, 'Snack': 1, 'last': 1, 'Friday': 1, 'On': 1, 'time': 1, 'nothing': 1, 'missing': 1, 'and': 1, 'food': 1, 'was': 1, 'great': 1, 'I': 1, 'have': 1, 'added': 1, 'it': 1, 'to': 1, 'regular': 1, 'company': 1, 'list': 1, 'as': 1, 'everyone': 1, 'enjoyed': 1, 'their': 1, 'meal': 1})}


In [11]:
# Print each token count of the review at index 2, one count per line.
for token_counts in BOW(train_df[2:3]).values():
    for count in token_counts.values():
        print(count)

1
2
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [12]:
# Token counters for every training review, keyed by the review's index.
X_BOW = BOW(train_df)

In [13]:
# Inspect the token counter of the first training review.
print(X_BOW[0])

Counter({'is': 3, 'The': 2, 'Greek': 2, 'the': 2, 'food': 1, 'at': 1, 'snack': 1, 'a': 1, 'selection': 1, 'of': 1, 'popular': 1, 'dishes': 1, 'appetizer': 1, 'tray': 1, 'good': 1, 'as': 1, 'salad': 1, 'We': 1, 'were': 1, 'underwhelmed': 1, 'with': 1, 'main': 1, 'courses': 1, 'There': 1, 'are': 1, '45': 1, 'tables': 1, 'here': 1, 'so': 1, "it's": 1, 'sometimes': 1, 'hard': 1, 'to': 1, 'get': 1, 'seated': 1})


In [14]:
#See my 5/15/2020 6:52 iPhone photo for how this looks
# Keep only the counts of each review's bag of words, one list per review, in
# the order the Counter stores them.
# NOTE(review): the tokens themselves are discarded here, so position j in one
# row and position j in another row generally refer to different words.
X_BOW = [list(token_counts.values()) for token_counts in BOW(train_df).values()]

In [15]:
# Show the count lists of the first ten training reviews.
print(X_BOW[0:10])

[[2, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 2, 1, 1, 2, 3, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 4, 1, 1, 1, 3, 1, 7, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 3, 1, 1, 3, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [3, 2, 5, 4, 3, 1, 2, 7, 1, 3, 2, 1, 1, 1, 2, 

In [16]:
# The per-review count lists have different lengths. NumPy only builds an array
# from such ragged input when dtype=object is requested explicitly (recent
# releases raise ValueError otherwise); the result is a 1-D array of lists.
X_BOW = np.asarray(X_BOW, dtype=object)

In [17]:
# Shortest row length in X_BOW.
print(min(map(len, X_BOW)))

0


In [None]:
#pad each row with trailing zeros up to the length of the longest row
# (the original note says 659 -- TODO confirm against the data)
# A scalar pad_width adds that many zeros to BOTH ends of every row, which
# leaves the rows as unequal as before; pass an explicit (before, after) pair
# so each row ends up exactly `row_length` long.
row_length = max((len(row) for row in X_BOW), default=0)
X_BOW_post = [
    np.pad(row, pad_width=(0, row_length - len(row)), mode='constant', constant_values=0)
    for row in X_BOW
]

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training data with SMOTE (fixed seed for repeatability).
ovr = SMOTE(random_state = 42)
# The label is the last element of every review row. `train_df[-1]` is only the
# final review's token list, not a label vector, so collect the labels per row.
y_train = [int(review[-1]) for review in train_df]
# Use the zero-padded rows: the unpadded X_BOW rows have unequal lengths and
# cannot form a 2-D feature matrix.
X, y = ovr.fit_resample(X_BOW_post, y_train)

In [None]:
def dotProduct(d1, d2):
    """
    Sparse dot product of two feature vectors stored as dicts.

    @param dict d1: mapping from feature (string) to weight (float).
    @param dict d2: same kind of mapping as d1.
    @return float: sum over shared features of the product of their weights
        (0 when nothing is shared).
    """
    # Walk the dict with fewer entries and look each feature up in the other.
    # On a tie d2 is the one walked.
    if len(d1) < len(d2):
        walked, probed = d1, d2
    else:
        walked, probed = d2, d1
    return sum(probed.get(feature, 0) * weight for feature, weight in walked.items())

def increment(d1, scale, d2):
    """
    Sparse in-place update: d1 += scale * d2.

    @param dict d1: feature vector that gets modified.
    @param float scale: multiplier applied to every entry of d2.
    @param dict d2: feature vector to add; features missing from d1 are
        treated as 0 there.

    NOTE: nothing is returned. d1 is changed in place, which avoids building
    and returning a new dictionary on every call.
    """
    current = d1.get
    for feature, value in d2.items():
        d1[feature] = current(feature, 0) + value * scale

In [None]:
def pegasos_fast(review_list, max_epoch, lam):
    """
    Train sparse linear weights with Pegasos-style updates, keeping the weight
    vector factored as a scalar `s` times a sparse dict `W`.

    @param list review_list: reviews as token lists whose last element is the
        label (anything int() accepts; the margin test treats it as a signed label).
    @param int max_epoch: number of full passes over review_list.
    @param float lam: regularization strength; must be non-zero because the
        step size is 1/(t*lam).
    @return dict: the final weights, i.e. every entry of W multiplied by s.
    """
    bags = BOW(review_list)
    labels = [int(review[-1]) for review in review_list]

    W = {}
    s = 1
    # t is bumped before its first use, so the first step size is 1/(2*lam).
    t = 1
    for _ in range(max_epoch):
        for j, label in enumerate(labels):
            t += 1
            eta_t = 1/(t*lam)
            # Shrinking the scalar stands in for shrinking every weight.
            s -= eta_t*lam*s
            # Margin violated: add the example, pre-divided by s so that
            # s * W carries the intended step.
            if label*dotProduct(W, bags[j])*s < 1:
                increment(W, (eta_t*label)/s, bags[j])
    # Fold the scalar back in so callers get plain weights.
    return {feature: s*value for feature, value in W.items()}

In [None]:
# Train on the full training set: 3 passes with lam = 0.5.
w_pegasos_fast = pegasos_fast(train_df, max_epoch = 3, lam = 0.5)

In [None]:
def accuracy_percent(review_list, weight):
    """
    Fraction of reviews the linear model gets wrong.

    Despite the name, the value returned is the error rate (misclassified
    count divided by the number of reviews), not an accuracy.

    @param list review_list: reviews as token lists whose last element is the
        label, compared as an int against the predicted -1 / 1.
    @param dict weight: sparse weight vector (feature -> weight).
    @return float: errors / number of reviews.
    """
    bags = BOW(review_list)
    labels = [int(review[-1]) for review in review_list]

    mistakes = 0
    for idx, label in enumerate(labels):
        # A score of exactly 0 is predicted as the positive class.
        predicted = -1 if dotProduct(weight, bags[idx]) < 0 else 1
        if label != predicted:
            mistakes += 1
    return mistakes/len(bags)

In [None]:
# Sweep the regularization strength: train for 10 epochs at each value and
# record what accuracy_percent reports on the validation set.
lam_list = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
loss_list = []
for regularizer in lam_list:
    weight = pegasos_fast(train_df, max_epoch=10, lam=regularizer)
    loss_list.append(accuracy_percent(validation_df, weight))

print('Table of each Lambda and its Loss')
for row in zip(lam_list, loss_list):
    print(*row)

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score
def evaluate_model(review_list, weight, evaluation_metric):
    """
    Score a linear model on labelled reviews with a ranking metric.

    @param list review_list: reviews as token lists whose last element is the
        label (converted with int()).
    @param dict weight: sparse weight vector (feature -> weight).
    @param str evaluation_metric: 'auROC' or 'AP'.
    @return float: the requested metric computed from the raw model scores.
    @raises ValueError: if evaluation_metric is neither 'auROC' nor 'AP'.
    """
    if evaluation_metric == 'auROC':
        metric_fn = roc_auc_score
    elif evaluation_metric == 'AP':
        metric_fn = average_precision_score
    else:
        # Previously an unknown name fell through to an unbound `metric`.
        raise ValueError(
            "evaluation_metric must be 'auROC' or 'AP', got %r" % (evaluation_metric,)
        )

    x = BOW(review_list)
    y_true = np.array([int(review[-1]) for review in review_list])

    # Both metrics rank examples by score, so pass the raw dot product (higher
    # means more likely positive). Thresholding to -1/1 first collapses the
    # ranking to a single operating point.
    y_scores = np.array(
        [dotProduct(weight, x[i]) for i in range(len(x))], dtype=float
    )

    return metric_fn(y_true, y_scores)

In [None]:
# Sweep the regularization strength: train for 10 epochs at each value and
# record auROC and AP on the validation set.
lam_list = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
auROC_list = []
AP_list = []
for regularizer in lam_list:
    weight = pegasos_fast(train_df, max_epoch=10, lam=regularizer)
    for metric_name, collected in (('auROC', auROC_list), ('AP', AP_list)):
        collected.append(evaluate_model(validation_df, weight, metric_name))

print('Table of each Lambda and its Evaluation Metric')
for row in zip(lam_list, auROC_list, AP_list):
    print(*row)

In [None]:
#Handle Imbalanced Data too