In [1]:
import numpy as np
import scipy.stats as stats
import csv
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import gensim as gs 
from gensim import corpora, models, similarities
import logging
import multiprocessing

import pickle

import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, recall_score,precision_score,fbeta_score

np.random.seed(0)

# this line tells jupyter notebook to put the plots in the notebook rather than saving them to file.
%matplotlib inline

# this line makes plots prettier on mac retina screens. If you don't have one it shouldn't do anything.
%config InlineBackend.figure_format = 'retina'

cores = multiprocessing.cpu_count()



# Hotel Reviews: What is real?

<h1>Problem Statement</h1>
When someone goes on to plan a stay, they tend to use online reviews to base their decision; However, there are usually a range of opinions so how can we tell if a certain review is fake/real or just plain bad luck? Thus the problem statement is to detect fradulent reviews for people are surfing hotel booking sites. 

<h1>Data Source</h1>
1600 Labelled Records, 800 True, 800 Fradulent reviews, plain text review (some generated by MTurk) <br>
512k Unlabelled Records, with numerical Ratings, Positive and Negative labels

<h1>Prior Research</h1>
Using Bi-grams (2-word combinations), detection rate is at ~80% 


<h1>Proposed Method(s)</h1>
-Preprocess the data as according to the best practices (mentioned in Empath, Stanford, 2016) <br>
-train_test_split(train, test)We only train our model from the trainsplit corpus. <br> 
-Decide on which Vocabulary to use (Unlabelled, or with labelled data)[Unlabelled] <br>
-Decide on which train data to input into the doc2vec class (unlablled, labelled. [Unlabelled] <br>
-Create additional Text Features: <strike>TF-IDF</strike>, <i>unigram and bigram stopword removal (minor improvement)</i>, Probabilistic Context-Free Grammar (but tends to be bad with paras with many sentences), <strike>LIWC (Linguistic tagging-word count features)</strike>Can we replicate the individual features?, GloVE, Consistency,<i> Sentiment-Detection (decent results)</i>, <i>LDA, topic-modelling (marginal improvement)</i>, <i>Empath topic modelling </i><br>
-TFIDF good with LDA, LSA <br>
-Determine the classifiers to build the model on top of (Logreg, SVM,   GaussianNB , Decision Trees creation), Word Level Features <br>
-Consider PCA/K-best? <br>
-Determine which semisupervised learning algorithm to implement. there are inbuilt python classes for semisupervised learning (LabelPropagation, LabelSpreading) or self-developed label propagation functions. 
<br>
-Currently, use of other features such as social network topology, or timestamps, or rating behavior will not be examined. 

<h1>Risks and Assumptions</h1>
Risks: There are quite many NLP libraries available (NLTK, gensim, etc.) which means we have to try substanial number of libraries to assess their results.

Each run of the algorithm takes up a significant amount of time, thus more time is needed so a CUDA-specific libray (TensorFlow) could be used.However this would mean even more time spent on on-boarding. 

There is a assumption that there actually exists some pattern or trend that exists in both truthful and fake reviews. 

<h1>Specific Aim</h1>
To aim to obtain between 71% ~ 74% accuracy overall, with a good f1 score: As it is not very useful to customers if there are a lot of false positive, the noise to signal ratio would overwhelm customers who would then fail to accept the system's recommendation of fake review.

In [2]:
df = pd.read_csv('./input/deceptive-opinion.csv')

In [3]:
truedf = df[df.deceptive=='truthful'].loc[:,'text']
fakedf = df[df.deceptive=='deceptive'].loc[:,'text']
truedfy = df[df.deceptive=='truthful'].loc[:,'deceptive']
fakedfy = df[df.deceptive=='deceptive'].loc[:,'deceptive']
truedfy.replace({'truthful':1},inplace=True)
fakedfy.replace({'deceptive':0},inplace=True)
#Truth = 1, Fake = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [38]:
unlabelled_corpus = pickle.load(open('./input/unlabelled_corpus_clean2.p','rb'))
true_corpus = pickle.load(open('./input/true_corpus_clean2.p','rb'))
fake_corpus = pickle.load(open('./input/fake_corpus_clean2.p','rb'))

In [39]:
temp_arr = []
ctr = 0
for ind, s in enumerate(unlabelled_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
unlabelled_corpus = temp_arr.copy()
temp_arr = []
for ind, s in enumerate(true_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
true_corpus = temp_arr.copy()
temp_arr = []
for ind, s in enumerate(fake_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
fake_corpus = temp_arr.copy()
del(temp_arr)

In [40]:
unlabelled_index = list(range(len(unlabelled_corpus)))
np.random.shuffle(unlabelled_index)
kratio = 3
rand_unlabelled_corpus = [unlabelled_corpus[a] for a in unlabelled_index[:(len(true_corpus) + len(fake_corpus))*kratio]]

In [41]:
turk_model = gensim.models.doc2vec.Doc2Vec(dm=0, size=100,min_count=30, window=5,workers=cores, seed=8, negative=5)
turk_model.build_vocab(unlabelled_corpus)

In [42]:
turk_model.train(unlabelled_corpus, total_examples=turk_model.corpus_count, epochs=turk_model.iter)

27200977

In [43]:
true_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in true_corpus])
fake_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in fake_corpus])
# rand_unlabelled_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in rand_unlabelled_corpus])
unlabelled_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in unlabelled_corpus])

In [35]:
# rand_unlabelled_vec.head()

In [36]:
# pickle.dump(unlabelled_vec, open('./input/unlabelled_vec.p','wb'))
# pickle.dump(true_vec, open('./input/true_vec.p','wb'))
# pickle.dump(fake_vec, open('./input/fake_vec.p','wb'))

In [4]:
# true_vec = pickle.load(open('./input/true_vec.p','rb'))
# fake_vec = pickle.load(open('./input/fake_vec.p','rb'))
# unlabelled_vec = pickle.load(open('./input/unlabelled_vec.p','rb'))

In [34]:
# VADER VECS
true_vader_vec = pickle.load(open('./input/true_vader_raw_vec.p','rb'))
fake_vader_vec = pickle.load(open('./input/fake_vader_raw_vec.p','rb'))
vader_vec = pd.concat([true_vader_vec, fake_vader_vec], axis = 0)
# unlabelled_vader_vec = pickle.load(open('./input/unlabelled_vader_vec.p','rb'))

In [6]:
# LDA VECS
true_lda_vec = pickle.load(open('./input/true_lda_vec.p','rb'))
fake_lda_vec = pickle.load(open('./input/fake_lda_vec.p','rb'))
lda_vec = pd.concat([true_lda_vec, fake_lda_vec], axis = 0)
# unlabelled_lda_vec = pickle.load(open('./input/unlabelled_lda_vec.p','rb'))

In [35]:
# EMP VEC
true_emp_vec = pickle.load(open('./input/true_empraw_vec.p','rb'))
fake_emp_vec = pickle.load(open('./input/fake_empraw_vec.p','rb'))
emp_vec = pd.concat([true_emp_vec, fake_emp_vec], axis = 0)

In [46]:
true_fake_vec = pd.concat([true_vec, fake_vec], axis=0)
true_fake_vec2 = pd.concat([true_fake_vec, vader_vec, lda_vec, emp_vec], axis=1)
all_y = pd.concat([truedfy, fakedfy], axis= 0)
X_train, X_test, y_train, y_test = train_test_split(true_fake_vec2, all_y, train_size=0.75, random_state=8)



# Perform baseline Supervised Learning 

In [10]:
def summary_report(y, y_pred, model):
    conmat = confusion_matrix(y, y_pred, labels=model.classes_)
    # converts np.matrix format matrix to a dataframe and adds index and column names
    conmat= pd.DataFrame(conmat, columns=model.classes_, index=model.classes_)
    print(conmat)
    print(classification_report(y, y_pred))

In [11]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

def run_test(X, y, cv_val, scoring):
    gnb = GaussianNB()
    dtree = DecisionTreeClassifier()
    svm2 = svm.SVC(random_state=8)
    xg = XGBClassifier()
    # logreg_cv = linear_model.LogisticRegressionCV(Cs=100, cv=5, penalty='l1',scoring='accuracy',solver='liblinear',n_jobs=-1)
    print('Gaussian NB:')
    scorelist = cross_val_score(gnb, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('DecisionTree')
    scorelist = cross_val_score(dtree, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('SVM:')
    scorelist = cross_val_score(svm2, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('XGB Default:')
    scorelist = cross_val_score(xg, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    # print('Logistics Regression:')
    # scorelist = cross_val_score(logreg_cv, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    # print(scorelist, np.mean(scorelist))



In [47]:
run_test(X_train, y_train, 5, 'f1')

Gaussian NB:
[0.63436123 0.61111111 0.68070175 0.52307692 0.62831858] 0.6155139212249944
DecisionTree
[0.50220264 0.592      0.58823529 0.60408163 0.63529412] 0.5843627375179147
SVM:
[0.77911647 0.71489362 0.70866142 0.72268908 0.72961373] 0.7309948619486794
XGB Default:
[0.68695652 0.65271967 0.67489712 0.67521368 0.70119522] 0.6781964401379683


In [24]:
xg_clf = XGBClassifier()
xg_params = {
#     'booster'=['gbtree'],
    'colsample_bytree':[0.15,0.4,0.85],
    'max_depth':[4,8,16,20],
    'subsample':[0.7,0.95],
    'min_child_weight':[1,3,9],
    'gamma':[0,0.01,0.05,0.3,0.6,1]
}
scorer = make_scorer(fbeta_score,beta=0.5)
xg_gs = GridSearchCV(xg_clf, xg_params, cv=5, scoring=scorer, n_jobs=-1)
xg_gs.fit(X_train,y_train)
best_xg_clf = xg_gs.best_estimator_
print(best_xg_clf)
best_pred = best_xg_clf.predict(X_test)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.15,
       gamma=0.6, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=9, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.95)


  if diff:


In [25]:
performance = {'accuracy': accuracy_score(best_pred,y_test),
                'recall': recall_score(best_pred,y_test),
                'precision': precision_score(best_pred,y_test)}
print(performance)

{'accuracy': 0.685, 'recall': 0.6966292134831461, 'precision': 0.6326530612244898}


In [26]:
svm_clf = svm.SVC(random_state=8)
svm_params = {
    "kernel":['rbf','linear'],
    'C':[0.1,0.2,0.4,0.6,0.8,1,10],
    'gamma': np.logspace(-1,1,9)
}
scorer = make_scorer(fbeta_score,beta=0.5)
svm_gs = GridSearchCV(svm_clf, svm_params, cv=5, scoring=scorer, n_jobs=-1)
svm_gs.fit(X_train,y_train)
best_clf = svm_gs.best_estimator_
print(best_clf)
best_pred = best_clf.predict(X_test)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=8, shrinking=True,
  tol=0.001, verbose=False)


In [27]:
performance = {'accuracy': accuracy_score(best_pred,y_test),
                'recall': recall_score(best_pred,y_test),
                'precision': precision_score(best_pred,y_test)}
print(performance)

{'accuracy': 0.705, 'recall': 0.695, 'precision': 0.7091836734693877}


# Start of Semi-Supervised Learning

In [41]:
# we pseudo label a bunch of unlabelled data

def pseudo_label(trained_model, labelled_X, labelled_y, unlabelled, sample_rate):
    unlabelled_index = list(range(len(unlabelled)))
    np.random.shuffle(unlabelled_index)
    end_index = int(sample_rate * float(len(unlabelled)))
    rand_unlabelled = pd.DataFrame([unlabelled.iloc[a,:] for a in unlabelled_index[:end_index]]).copy()
#     rand_unlabelled.reset_index(inplace=True,drop=True)
#     print(rand_unlabelled.head(10))
    all_data = pd.concat([labelled_X, rand_unlabelled], axis=0, ignore_index=True)
    p_labels = pd.DataFrame(trained_model.predict(rand_unlabelled))
    all_labels = pd.concat([labelled_y, p_labels], axis=0, ignore_index=True)
    
    trained_model.fit(all_data, all_labels)
    return trained_model, all_data , all_labels
#SMOTE BOOST
logreg_cv.fit(X_train, y_train)
newlr, newX_train, newy_train = pseudo_label(logreg_cv, X_train, y_train, rand_unlabelled_vec, 1.0)
y_lr_pred = newlr.predict(X_test)
summary_report(y_test, y_lr_pred, newlr)
newy_train.info()

     0    1
0  137   67
1   65  131
             precision    recall  f1-score   support

          0       0.68      0.67      0.67       204
          1       0.66      0.67      0.66       196

avg / total       0.67      0.67      0.67       400

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 1 columns):
0    6000 non-null int64
dtypes: int64(1)
memory usage: 47.0 KB


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from scipy import sparse as sp

newX_train = pd.concat([X_train, rand_unlabelled_vec],axis=0)
newy_train = pd.concat([y_train, pd.Series([-1] *rand_unlabelled_vec.shape[0])],axis=0)
# newX_train_spread = newX_train.copy()
# newy_train_spread = newy_train.copy()

pseudo_params = {
    'kernel': ['rbf'],
    'gamma' : range(10,100,10)
}
label_prop_gridsearch = GridSearchCV(LabelPropagation(), pseudo_params, n_jobs=-1)
# label_spread_gridsearch = GridSearchCV(LabelSpreading(), pseudo_params)
label_prop_gridsearch.fit(newX_train, newy_train)
# label_spread_gridsearch.fit(newX_train_spread, newy_train_spread)

y_prop_grid_pred = label_prop_gridsearch.best_estimator_.predict(X_test)
# y_spread_grid_pred = label_spread_gridsearch.best_estimator_.predict(X_test)
summary_report(y_test, y_prop_grid_pred, label_prop_gridsearch.best_estimator_)
# summary_report(y_test, y_spread_grid_pred, label_spread_gridsearch.best_estimator_)

     0   1
0  118  86
1  118  78
             precision    recall  f1-score   support

          0       0.50      0.58      0.54       204
          1       0.48      0.40      0.43       196

avg / total       0.49      0.49      0.49       400



In [43]:
# pseudo_params2 = {
#     'kernel': ['knn'],
#     'alpha' : np.linspace(0.01,0.99,20)
# }
# label_prop_gridsearch = GridSearchCV(LabelPropagation(), pseudo_params2)
# label_spread_gridsearch = GridSearchCV(LabelSpreading(), pseudo_params2)
# label_prop_gridsearch.fit(newX_train, newy_train)
# label_spread_gridsearch.fit(newX_train_spread, newy_train_spread)

# y_prop_grid_pred = label_prop_gridsearch.best_estimator_.predict(X_test)
# y_spread_grid_pred = label_spread_gridsearch.best_estimator_.predict(X_test)
# summary_report(y_test, y_prop_grid_pred, label_prop_gridsearch.best_estimator_)
# summary_report(y_test, y_spread_grid_pred, label_spread_gridsearch.best_estimator_)

In [44]:
# # label_prop_model = LabelPropagation(kernel='rbf',gamma=10,n_jobs=-1)
# # label_spread_model = LabelSpreading(kernel='rbf',gamma=10,n_jobs=-1)
# # label_prop_model.fit(newX_train, newy_train)
# # label_spread_model.fit(newX_train_spread, newy_train_spread)
# y_prop_pred = label_prop_model.predict(X_test)
# y_spread_pred = label_spread_model.predict(X_test)

# summary_report(y_test, y_prop_pred, label_prop_model)
# summary_report(y_test, y_spread_pred, label_spread_model)