In [1]:
import numpy as np
import scipy.stats as stats
import csv
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import gensim as gs 
from gensim import corpora, models, similarities
import logging
import multiprocessing

import pickle

import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, recall_score,precision_score,fbeta_score

np.random.seed(0)

# this line tells jupyter notebook to put the plots in the notebook rather than saving them to file.
%matplotlib inline

# this line makes plots prettier on mac retina screens. If you don't have one it shouldn't do anything.
%config InlineBackend.figure_format = 'retina'

cores = multiprocessing.cpu_count()



# Hotel Reviews: What is real?

<h1>Problem Statement</h1>
When someone goes on to plan a stay, they tend to use online reviews to base their decision; However, there are usually a range of opinions so how can we tell if a certain review is fake/real or just plain bad luck? Thus the problem statement is to detect fradulent reviews for people are surfing hotel booking sites. 

<h1>Data Source</h1>
1600 Labelled Records, 800 True, 800 Fradulent reviews, plain text review (some generated by MTurk) <br>
512k Unlabelled Records, with numerical Ratings, Positive and Negative labels

<h1>Prior Research</h1>
Using Bi-grams (2-word combinations), detection rate is at ~80% 


<h1>Proposed Method(s)</h1>
-Preprocess the data as according to the best practices (mentioned in Empath, Stanford, 2016) <br>
-train_test_split(train, test)We only train our model from the trainsplit corpus. <br> 
-Decide on which Vocabulary to use (Unlabelled, or with labelled data)[Unlabelled] <br>
-Decide on which train data to input into the doc2vec class (unlablled, labelled. [Unlabelled] <br>
-Create additional Text Features: <strike>TF-IDF</strike>, <i>unigram and bigram stopword removal (minor improvement)</i>, Probabilistic Context-Free Grammar (but tends to be bad with paras with many sentences), <strike>LIWC (Linguistic tagging-word count features)</strike>Can we replicate the individual features?, GloVE, Consistency,<i> Sentiment-Detection (decent results)</i>, <i>LDA, topic-modelling (marginal improvement)</i>, <i>Empath topic modelling </i><br>
-TFIDF good with LDA, LSA <br>
-Determine the classifiers to build the model on top of (Logreg, SVM,   GaussianNB , Decision Trees creation), Word Level Features <br>
-Consider PCA/K-best? <br>
-Determine which semisupervised learning algorithm to implement. there are inbuilt python classes for semisupervised learning (LabelPropagation, LabelSpreading) or self-developed label propagation functions. 
<br>
-Currently, use of other features such as social network topology, or timestamps, or rating behavior will not be examined. 

<h1>Risks and Assumptions</h1>
Risks: There are quite many NLP libraries available (NLTK, gensim, etc.) which means we have to try substanial number of libraries to assess their results.

Each run of the algorithm takes up a significant amount of time, thus more time is needed so a CUDA-specific libray (TensorFlow) could be used.However this would mean even more time spent on on-boarding. 

There is a assumption that there actually exists some pattern or trend that exists in both truthful and fake reviews. 

<h1>Specific Aim</h1>
To aim to obtain between 71% ~ 74% accuracy overall, with a good f1 score: As it is not very useful to customers if there are a lot of false positive, the noise to signal ratio would overwhelm customers who would then fail to accept the system's recommendation of fake review.

In [2]:
df = pd.read_csv('./input/deceptive-opinion.csv')

In [3]:
truedf = df[df.deceptive=='truthful'].loc[:,'text']
fakedf = df[df.deceptive=='deceptive'].loc[:,'text']
truedfy = df[df.deceptive=='truthful'].loc[:,'deceptive']
fakedfy = df[df.deceptive=='deceptive'].loc[:,'deceptive']
truedfy.replace({'truthful':1},inplace=True)
fakedfy.replace({'deceptive':0},inplace=True)
#Truth = 1, Fake = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [4]:
unlabelled_corpus = pickle.load(open('./input/unlabelled_corpus_clean3.p','rb'))
true_corpus = pickle.load(open('./input/true_corpus_clean3.p','rb'))
fake_corpus = pickle.load(open('./input/fake_corpus_clean3.p','rb'))

In [5]:
temp_arr = []
ctr = 0
for ind, s in enumerate(unlabelled_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
unlabelled_corpus = temp_arr.copy()
temp_arr = []
for ind, s in enumerate(true_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
true_corpus = temp_arr.copy()
temp_arr = []
for ind, s in enumerate(fake_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
fake_corpus = temp_arr.copy()
del(temp_arr)

In [6]:
turk_model = gensim.models.doc2vec.Doc2Vec(dm=0, size=100,min_count=30, window=5,workers=cores, seed=8, negative=5)
turk_model.build_vocab(unlabelled_corpus)

In [7]:
turk_model.train(unlabelled_corpus, total_examples=turk_model.corpus_count, epochs=turk_model.iter)

42630941

In [8]:
true_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in true_corpus])
fake_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in fake_corpus])
# rand_unlabelled_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in rand_unlabelled_corpus])
unlabelled_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in unlabelled_corpus])

In [9]:
# rand_unlabelled_vec.head()

In [27]:
# pickle.dump(unlabelled_vec, open('./input/unlabelled_vec.p','wb'))
# pickle.dump(true_vec, open('./input/true_vec.p','wb'))
# pickle.dump(fake_vec, open('./input/fake_vec.p','wb'))

In [4]:
true_vec = pickle.load(open('./input/true_vec.p','rb'))
fake_vec = pickle.load(open('./input/fake_vec.p','rb'))
unlabelled_vec = pickle.load(open('./input/unlabelled_vec.p','rb'))

In [5]:
# VADER VECS
true_vader_vec = pickle.load(open('./input/true_vader_raw_vec.p','rb'))
fake_vader_vec = pickle.load(open('./input/fake_vader_raw_vec.p','rb'))
vader_vec = pd.concat([true_vader_vec, fake_vader_vec], axis = 0)
unlabelled_vader_vec = pickle.load(open('./input/unlabelled_vader_raw_vec.p','rb'))

In [6]:
# LDA VECS
true_lda_vec = pickle.load(open('./input/true_lda_vec_sm.p','rb'))
fake_lda_vec = pickle.load(open('./input/fake_lda_vec_sm.p','rb'))
lda_vec = pd.concat([true_lda_vec, fake_lda_vec], axis = 0)
unlabelled_lda_vec = pickle.load(open('./input/unlabelled_lda_vec_sm.p','rb'))

In [7]:
# EMP VEC
true_emp_vec = pickle.load(open('./input/true_empraw_vec.p','rb'))
fake_emp_vec = pickle.load(open('./input/fake_empraw_vec.p','rb'))
emp_vec = pd.concat([true_emp_vec, fake_emp_vec], axis = 0)
unlabelled_emp_vec = pickle.load(open('./input/unlabelled_empraw_vec.p','rb'))

In [8]:
true_fake_vec = pd.concat([true_vec, fake_vec], axis=0)
true_fake_vec2 = pd.concat([true_fake_vec, vader_vec, lda_vec, emp_vec], axis=1)
unlabelled_full_vec = pd.concat([unlabelled_vec, unlabelled_vader_vec, unlabelled_lda_vec, unlabelled_emp_vec], axis=1)
all_y = pd.concat([truedfy, fakedfy], axis= 0)
X_train, X_test, y_train, y_test = train_test_split(true_fake_vec2, all_y, train_size=0.75, random_state=8)



In [10]:
unlabelled_index = list(range(unlabelled_vec.shape[0]))
np.random.shuffle(unlabelled_index)
kratio = 5
rand_unlabelled_vec = pd.DataFrame([unlabelled_full_vec.iloc[a] for a in unlabelled_index[:(len(true_fake_vec2))*kratio]])

# Perform baseline Supervised Learning 

In [17]:
def summary_report(y, y_pred, model):
    conmat = confusion_matrix(y, y_pred, labels=model.classes_)
    # converts np.matrix format matrix to a dataframe and adds index and column names
    conmat= pd.DataFrame(conmat, columns=model.classes_, index=model.classes_)
    print(conmat)
    print(classification_report(y, y_pred))

In [18]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def run_test(X, y, cv_val, scoring):
    gnb = GaussianNB()
    dtree = DecisionTreeClassifier()
    svm2 = svm.SVC(random_state=8)
    xg = XGBClassifier()
    rf = RandomForestClassifier()
    # logreg_cv = linear_model.LogisticRegressionCV(Cs=100, cv=5, penalty='l1',scoring='accuracy',solver='liblinear',n_jobs=-1)
    print('Gaussian NB:')
    scorelist = cross_val_score(gnb, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('DecisionTree')
    scorelist = cross_val_score(dtree, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('Rand Forest:')
    scorelist = cross_val_score(rf, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    # print('Logistics Regression:')
    print('SVM:')
    scorelist = cross_val_score(svm2, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('XGB Default:')
    scorelist = cross_val_score(xg, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    # scorelist = cross_val_score(logreg_cv, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    # print(scorelist, np.mean(scorelist))



In [19]:
run_test(X_train, y_train, 5, 'f1')

Gaussian NB:
[0.66666667 0.64545455 0.65338645 0.63255814 0.68421053] 0.6564552664310304
DecisionTree
[0.76724138 0.74889868 0.79166667 0.7611336  0.75833333] 0.7654547321926616
Rand Forest:
[0.67567568 0.67889908 0.66666667 0.63507109 0.7027027 ] 0.6718030435322492
SVM:
[0.78884462 0.70638298 0.71146245 0.72803347 0.72881356] 0.7327074165911229
XGB Default:
[0.88034188 0.82987552 0.85714286 0.84388186 0.8907563 ] 0.860399683043606


In [20]:
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
# X_scaled = MinMaxScaler().fit_transform(X_train)
# rfe = RFE(linear_model.LogisticRegression(), 30)
# rfe_fit = rfe.fit(X_train,y_train)
pca = PCA(n_components=10)
pca_fit = pca.fit(X_train)
print(pca_fit.explained_variance_ratio_)
print(pca_fit.components_)

[0.24277994 0.0942935  0.05189576 0.04543919 0.03203522 0.03022592
 0.02552607 0.02488714 0.02113581 0.01902314]
[[ 0.0020883   0.00562228 -0.00120664 ...  0.00659096  0.04414256
   0.01040226]
 [-0.00589415 -0.00389124  0.00024527 ... -0.00146657  0.0306059
   0.0032964 ]
 [-0.00162188  0.00611202  0.00612417 ...  0.00026423  0.04422782
  -0.0008979 ]
 ...
 [ 0.00631885 -0.00339791 -0.00489315 ...  0.02088504 -0.00409579
   0.03853281]
 [-0.01836773 -0.01161891 -0.00550687 ...  0.00898552  0.21136514
   0.01966989]
 [ 0.00863393  0.0053672  -0.0020898  ...  0.00954293 -0.02983154
  -0.00296591]]


In [22]:
# pd.DataFrame([rfe_fit.ranking_, X_train.columns]).T.sort_values(by=0)

In [23]:
xg_clf = XGBClassifier()
xg_params = {
#     'booster'=['gbtree'],
    'colsample_bytree':[0.15,0.4,0.85],
    'max_depth':[4,8,16,20],
    'subsample':[0.7,0.95],
    'min_child_weight':[1,3,9],
    'gamma':[0,0.01,0.05,0.3,0.6,1]
}
scorer = make_scorer(fbeta_score,beta=0.5)
xg_gs = GridSearchCV(xg_clf, xg_params, cv=5, scoring=scorer, n_jobs=3)
xg_gs.fit(X_train,y_train)
best_xg_clf = xg_gs.best_estimator_
print(best_xg_clf)
best_pred = best_xg_clf.predict(X_test)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=1, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.7)


  if diff:


In [24]:
best_pred = xg_gs.best_estimator_.predict(X_test)
print(classification_report(y_test, best_pred))

{'accuracy': 0.845, 'recall': 0.8383838383838383, 'precision': 0.8469387755102041}


In [25]:
svm_clf = svm.SVC(random_state=8)
svm_params = {
    "kernel":['rbf','linear'],
    'C':[0.1,0.2,0.4,0.6,0.8,1,10],
    'gamma': np.logspace(-1,1,9)
}
scorer = make_scorer(fbeta_score,beta=0.5)
svm_gs = GridSearchCV(svm_clf, svm_params, cv=5, scoring=scorer, n_jobs=3)
svm_gs.fit(X_train,y_train)
best_clf = svm_gs.best_estimator_
print(best_clf)
best_pred = best_clf.predict(X_test)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=8, shrinking=True,
  tol=0.001, verbose=False)


In [59]:
performance = {'accuracy': accuracy_score(best_pred,y_test),
                'recall': recall_score(best_pred,y_test),
                'precision': precision_score(best_pred,y_test)}
print(performance)

             precision    recall  f1-score   support

          0       0.85      0.84      0.85       204
          1       0.84      0.85      0.84       196

avg / total       0.85      0.84      0.85       400



  if diff:


In [58]:
# pickle.dump(xg_gs, open('./input/best_model.p','wb'))
# xg_gs = pickle.load(open('./input/best_model.p','rb'))



# Start of Semi-Supervised Learning

In [12]:
from pomegranate import NaiveBayes, NormalDistribution
import pomegranate

In [13]:
y_unlabelled = pd.DataFrame([-1] * rand_unlabelled_vec.shape[0])

In [14]:
X_train_new = pd.concat([X_train, rand_unlabelled_vec], axis=0).as_matrix()
y_train_new = pd.concat([y_train, y_unlabelled], axis = 0).values
y_train_new = [a[0] for a in y_train_new]

In [21]:
# Naive Bayes 
semi_model = pomegranate.NaiveBayes.from_samples(NormalDistribution, X_train_new, y_train_new, verbose=True,)

[1] Improvement: 1034803.0554936281	Time (s): 0.3269
[2] Improvement: 49621.30275710113	Time (s): 0.3259
[3] Improvement: 48731.89374624984	Time (s): 0.3249
[4] Improvement: 67665.42744293204	Time (s): 0.3248
[5] Improvement: 65889.86933530355	Time (s): 0.3168
[6] Improvement: 41252.78177992953	Time (s): 0.3209
[7] Improvement: 14949.463219265454	Time (s): 0.3319
[8] Improvement: 3225.9216378661804	Time (s): 0.3188
[9] Improvement: -464.8625073530711	Time (s): 0.3168
Total Improvement: 1325674.8529049228
Total Time (s): 3.2837


In [61]:
best_pred = semi_model.predict(X_test)
print(classification_report(y_test, best_pred))

             precision    recall  f1-score   support

          0       0.49      0.40      0.44       204
          1       0.47      0.56      0.51       196

avg / total       0.48      0.48      0.48       400



In [97]:
from sklearn.semi_supervised import label_propagation
from scipy.sparse import csgraph

#Label Propagation
label_spread = label_propagation.LabelSpreading(kernel='rbf', alpha=0.1, gamma=0.2, n_jobs=-1)
label_spread.fit(X_train_new, y_train_new)
best_pred = label_spread.predict(X_test)
print(classification_report(y_test, best_pred))

LabelSpreading(alpha=0.1, gamma=0.2, kernel='rbf', max_iter=30, n_jobs=-1,
        n_neighbors=7, tol=0.001)

             precision    recall  f1-score   support

          0       0.61      0.69      0.65       204
          1       0.63      0.55      0.58       196

avg / total       0.62      0.62      0.62       400



In [82]:
label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=0.95, n_neighbors=40, n_jobs=-1)
label_spread.fit(X_train_new, y_train_new)
best_pred = label_spread.predict(X_test)
print(classification_report(y_test, best_pred))



             precision    recall  f1-score   support

          0       0.58      0.50      0.54       204
          1       0.54      0.62      0.58       196

avg / total       0.56      0.56      0.56       400



In [106]:
label_prop = label_propagation.LabelPropagation(kernel='rbf', gamma=0.2, n_jobs=-1)
label_prop.fit(X_train_new, y_train_new)
best_pred = label_prop.predict(X_test)
print(classification_report(y_test, best_pred))

             precision    recall  f1-score   support

          0       0.54      0.82      0.65       204
          1       0.60      0.28      0.38       196

avg / total       0.57      0.56      0.52       400





In [109]:
pd.Series(best_pred).value_counts()

0    309
1     91
dtype: int64

In [121]:
from frameworks.CPLELearning import CPLELearningModel
from frameworks.SelfLearning import SelfLearningModel