In [1]:
import numpy as np
import csv
import pandas as pd
import gensim,logging
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec, Phrases
from gensim.models.phrases import Phraser
import nltk
import pickle
import warnings
import re
import multiprocessing

import spacy
from spacy.tokenizer import Tokenizer

# Route log records (timestamp : level : message) to the notebook at INFO level;
# the PROGRESS lines printed under the Phrases cells come from this handler.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# NOTE(review): this suppresses *every* warning for the whole session,
# including deprecation notices from pandas/gensim.
warnings.filterwarnings(action='ignore')

# Abort early when FAST_VERSION is -1.
# NOTE(review): presumably -1 means gensim's compiled doc2vec routines are
# unavailable (slow pure-Python fallback) - confirm against the gensim docs.
assert gensim.models.doc2vec.FAST_VERSION > -1

# Seed NumPy's global RNG; the np.random.randint calls further down draw from it.
np.random.seed(0)
# CPU count reported by the OS (not used in the cells visible here).
cores = multiprocessing.cpu_count()



In [2]:
# Load the stop-word list (header-less CSV, single column named 'stop') and
# add a capitalised variant of every entry.
stop_words = pd.read_csv('./input/stopwords.csv', names=['stop'])
new_stop = stop_words['stop'].map(str.capitalize)
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# stack the two Series.
all_stop_set = set(pd.concat([stop_words['stop'], new_stop], ignore_index=True))

# Add every ordered pair "a_b" of stop words. The Phrases cells further down
# join tokens with "_", so a phrase made only of stop words would otherwise
# survive the stop-word filter.
bigram_stops = [a + "_" + b for a in all_stop_set for b in all_stop_set]
all_stop_set = all_stop_set.union(bigram_stops)

In [None]:
# Labelled reviews: later cells read its 'deceptive' and 'text' columns.
df = pd.read_csv('./input/deceptive-opinion.csv')
# Unlabelled hotel reviews: later cells read 'Negative_Review' / 'Positive_Review'.
df2 = pd.read_csv('./input/Hotel_Reviews.csv')
# Keep only the rows whose 'lat' is present and renumber them 0..n-1.
data2 = df2[df2['lat'].notnull()].reset_index(drop=True)

In [None]:
# Build one free-text document per hotel review by joining its negative and
# positive parts in a random order. One np.random.randint(0, 2) draw is made
# per row, in row order, from NumPy's global RNG (seeded in the first cell).
all_unlabelled = []
# Iterate the two columns positionally instead of doing a label-based
# data2.loc[i, ...] lookup per row: faster, and independent of the index labels.
for negative, positive in zip(data2['Negative_Review'], data2['Positive_Review']):
    if np.random.randint(0, 2) == 0:
        inp = negative + ' ' + positive
    else:
        inp = positive + ' ' + negative
    all_unlabelled.append(inp)

# Assigning the list directly stores the values positionally; wrapping it in
# pd.Series first would align on index labels instead.
data2['All_Text'] = all_unlabelled

In [None]:
# Spot-check: show the combined text beside the two parts it was built from.
data2[['All_Text','Negative_Review','Positive_Review']].head(3)

In [None]:
# generate truth and fake df
# Evaluate each class mask once and reuse it for both the texts and the labels.
is_truthful = df['deceptive'] == 'truthful'
is_deceptive = df['deceptive'] == 'deceptive'

# Review texts of each class (original row labels are kept as the index).
truedf = df.loc[is_truthful, 'text']
fakedf = df.loc[is_deceptive, 'text']

# Numeric targets aligned with the texts above: 1 = truthful, 0 = deceptive.
# Built explicitly rather than via an in-place replace() on a filtered slice,
# which depends on pandas' implicit object -> int downcasting.
truedfy = pd.Series(1, index=truedf.index, name='deceptive')
fakedfy = pd.Series(0, index=fakedf.index, name='deceptive')

In [None]:
def read_corpus(sentdf, tokens_only=False):
    """Lazily tokenise every text in ``sentdf`` with ``gensim.utils.tokenize``.

    Args:
        sentdf: iterable of strings (for example a pandas Series of reviews).
        tokens_only: when true, yield plain token lists; otherwise yield
            ``TaggedDocument`` objects.

    Yields:
        ``list`` of tokens per text if ``tokens_only`` is true, else a
        ``gensim.models.doc2vec.TaggedDocument`` whose single tag is the
        0-based position of the text within ``sentdf``.
    """
    for position, text in enumerate(sentdf):
        tokens = list(gensim.utils.tokenize(text))
        # The tag is the enumeration position, not any index label of sentdf.
        yield tokens if tokens_only else gensim.models.doc2vec.TaggedDocument(tokens, [position])

In [None]:
# Load spaCy's small English pipeline.
# NOTE(review): in the cells visible here, `nlp` is only referenced by the
# commented-out spaCy tokenisation further down in this cell.
nlp = spacy.load('en_core_web_sm')
def to_list(doc):
    """Return the ``.text`` of every item in ``doc`` as a list, in order."""
    texts = []
    for token in doc:
        texts.append(token.text)
    return texts

# Disabled alternative: tokenise with spaCy, dropping each document's last token.
# true_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(truedf)]
# fake_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(fakedf)]

# temp_arr = []
# for ind,s in enumerate(data2['All_Text']):
#     if ind % 100 == 0:
#         print (ind)
#     temp_arr.append(to_list(nlp(s))[:-1])
# unlabelled_corpus = temp_arr.copy()

# Active path: tokenise each text collection into plain token lists with
# read_corpus(..., tokens_only=True), in the order true, fake, unlabelled.
true_corpus, fake_corpus, unlabelled_corpus = (
    list(read_corpus(texts, tokens_only=True))
    for texts in (truedf, fakedf, data2['All_Text'])
)

In [None]:
# Persist the raw tokenised corpora. Each file is opened in a `with` block so
# the handle is flushed and closed even if pickling raises.
with open('./input/unlabelled_corpus_raw.p', 'wb') as fh:
    pickle.dump(unlabelled_corpus, fh)
with open('./input/fake_corpus_raw.p', 'wb') as fh:
    pickle.dump(fake_corpus, fh)
with open('./input/true_corpus_raw.p', 'wb') as fh:
    pickle.dump(true_corpus, fh)

In [3]:
# Reload the raw tokenised corpora from the '*_raw.p' pickles written by the
# dump cell above. `with` closes each handle as soon as the load finishes.
# NOTE: pickle.load can execute arbitrary code - only load files this
# notebook produced itself.
with open('./input/unlabelled_corpus_raw.p', 'rb') as fh:
    unlabelled_corpus = pickle.load(fh)
with open('./input/true_corpus_raw.p', 'rb') as fh:
    true_corpus = pickle.load(fh)
with open('./input/fake_corpus_raw.p', 'rb') as fh:
    fake_corpus = pickle.load(fh)

In [4]:
# Fit one phrase model on all three corpora together (list concatenation
# already yields a fresh list, so no extra copy is needed).
sentence_stream = unlabelled_corpus + true_corpus + fake_corpus
# NOTE(review): `common_terms` is the gensim 3.x keyword; presumably it lets
# stop words sit inside a phrase (e.g. 'glasses_of_wine' in the output
# below) - confirm against the installed gensim version.
phrases = Phrases(sentence_stream, threshold=2, min_count=2, common_terms=list(all_stop_set))
bigram = Phraser(phrases)

# Rewrite every sentence with detected phrases merged into single
# "_"-joined tokens; each corpus name is rebound to its transformed version.
sentence_stream = [bigram[sent] for sent in sentence_stream]
true_corpus = [bigram[sent] for sent in true_corpus]
fake_corpus = [bigram[sent] for sent in fake_corpus]
unlabelled_corpus = [bigram[sent] for sent in unlabelled_corpus]

2018-01-28 23:41:07,606 : INFO : collecting all words and their counts
2018-01-28 23:41:07,607 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-01-28 23:41:08,040 : INFO : PROGRESS: at sentence #10000, processed 381532 words and 151085 word types
2018-01-28 23:41:08,417 : INFO : PROGRESS: at sentence #20000, processed 691920 words and 249660 word types
2018-01-28 23:41:08,890 : INFO : PROGRESS: at sentence #30000, processed 1023549 words and 348766 word types
2018-01-28 23:41:09,336 : INFO : PROGRESS: at sentence #40000, processed 1357244 words and 442359 word types
2018-01-28 23:41:09,764 : INFO : PROGRESS: at sentence #50000, processed 1696278 words and 532372 word types
2018-01-28 23:41:10,201 : INFO : PROGRESS: at sentence #60000, processed 2018082 words and 617030 word types
2018-01-28 23:41:10,701 : INFO : PROGRESS: at sentence #70000, processed 2359057 words and 707817 word types
2018-01-28 23:41:11,150 : INFO : PROGRESS: at sentence #80000, processed 2

In [14]:
# Inspect the first three tokenised fake reviews.
fake_corpus[:3]

[['My',
  'husband_and_I_visited',
  'the',
  'Fairmont_Chicago',
  'Millennium_Park',
  'for',
  'our',
  'honeymoon',
  'The',
  'customer_service',
  'was',
  'amazing',
  'From',
  'the',
  'time',
  'we',
  'booked',
  'our',
  'packege',
  'to',
  'the',
  'time',
  'we',
  'checked',
  'in',
  'everything',
  'was',
  'absolutely_amazing',
  'These',
  'people',
  'were',
  'proficient',
  'respectful',
  'and',
  'very',
  'thoughtful',
  'The',
  'Fairmont',
  'had',
  'a',
  'lounge',
  'a',
  'wine',
  'room',
  'a',
  'bar',
  'and',
  'a',
  'restaurant',
  'I',
  'couldn_t',
  'decide',
  'where',
  'I',
  'wanted_to_go',
  'first',
  'After',
  'we',
  'put_our_bags',
  'up',
  'we',
  'headed',
  'down',
  'to',
  'the',
  'wine',
  'room',
  'It',
  'was',
  'totally',
  'delicious',
  'We',
  'also',
  'got',
  'free_wine',
  'just',
  'because',
  'it',
  'was',
  'our',
  'honeymoon',
  'Then',
  'after',
  'a',
  'few',
  'glasses_of_wine',
  'we',
  'hit',
  'the'

In [5]:
# Second phrase-detection pass, fitted on the sentences already rewritten by
# the bigram cell: merging two existing tokens can now produce phrases longer
# than two words (e.g. 'spaciousness_of_the_rooms' in the output below).
# Note that `phrases` is rebound here; the bigram model object is replaced.
phrases = Phrases(sentence_stream, threshold=2, min_count=2,common_terms = list(all_stop_set))
trigram = Phraser(phrases)
# Apply the second-pass model to the combined stream and to each corpus,
# rebinding every name to its transformed version.
sentence_stream = [trigram[s] for s in sentence_stream]
true_corpus = [trigram[s] for s in true_corpus]
fake_corpus = [trigram[s] for s in fake_corpus]
unlabelled_corpus = [trigram[s] for s in unlabelled_corpus]

2018-01-28 23:43:24,266 : INFO : collecting all words and their counts
2018-01-28 23:43:24,268 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-01-28 23:43:24,628 : INFO : PROGRESS: at sentence #10000, processed 326563 words and 162988 word types
2018-01-28 23:43:24,968 : INFO : PROGRESS: at sentence #20000, processed 591911 words and 275196 word types
2018-01-28 23:43:25,345 : INFO : PROGRESS: at sentence #30000, processed 875085 words and 390240 word types
2018-01-28 23:43:25,696 : INFO : PROGRESS: at sentence #40000, processed 1159890 words and 500717 word types
2018-01-28 23:43:26,040 : INFO : PROGRESS: at sentence #50000, processed 1448203 words and 608822 word types
2018-01-28 23:43:26,408 : INFO : PROGRESS: at sentence #60000, processed 1722727 words and 710410 word types
2018-01-28 23:43:26,756 : INFO : PROGRESS: at sentence #70000, processed 2015693 words and 818026 word types
2018-01-28 23:43:27,105 : INFO : PROGRESS: at sentence #80000, processed 23

In [22]:
# Inspect tokenised truthful reviews at positions 3, 4 and 5.
true_corpus[3:6]

[['The',
  'Omni_Chicago',
  'really',
  'delivers',
  'on',
  'all',
  'fronts',
  'from',
  'the',
  'spaciousness_of_the_rooms',
  'to',
  'the',
  'helpful_staff',
  'to',
  'the',
  'prized',
  'location',
  'on',
  'Michigan_Avenue',
  'While',
  'this',
  'address',
  'in',
  'Chicago',
  'requires',
  'a',
  'high_level_of_quality',
  'the',
  'Omni',
  'delivers',
  'Check',
  'in',
  'for',
  'myself',
  'and',
  'a',
  'whole_group',
  'of',
  'people',
  'with',
  'me',
  'was',
  'under',
  'minutes',
  'the',
  'staff',
  'had',
  'plentiful',
  'recommendations_for_dining',
  'and',
  'events',
  'and',
  'the',
  'rooms',
  'are',
  'some',
  'of',
  'the',
  'largest',
  'you',
  'll_find',
  'at',
  'this',
  'price_range',
  'in',
  'Chicago',
  'Even',
  'the',
  'standard_room',
  'has',
  'a',
  'separate_living_area',
  'and',
  'work_desk',
  'The',
  'fitness_center',
  'has',
  'free_weights',
  'weight_machines',
  'and',
  'two',
  'rows',
  'of',
  'cardio_

In [23]:
# Checkpoint the corpora to the '*_clean1.p' files. Each file is opened in a
# `with` block so the handle is flushed and closed even if pickling raises.
with open('./input/unlabelled_corpus_clean1.p', 'wb') as fh:
    pickle.dump(unlabelled_corpus, fh)
with open('./input/fake_corpus_clean1.p', 'wb') as fh:
    pickle.dump(fake_corpus, fh)
with open('./input/true_corpus_clean1.p', 'wb') as fh:
    pickle.dump(true_corpus, fh)

In [6]:
def filter_stream(df, stopword_set):
    """Drop stop words from every sentence of a tokenised corpus.

    Args:
        df: iterable of sentences, each an iterable of tokens.
        stopword_set: container supporting ``in``; tokens found in it are removed.

    Returns:
        A new list with one list per input sentence, holding the surviving
        tokens in their original order. The input is not modified.
    """
    return [
        [token for token in sentence if token not in stopword_set]
        for sentence in df
    ]

# Strip stop words (single words and their "a_b" combinations) from all three
# corpora; each name is rebound to its filtered version.
unlabelled_corpus, true_corpus, fake_corpus = (
    filter_stream(corpus, all_stop_set)
    for corpus in (unlabelled_corpus, true_corpus, fake_corpus)
)

In [7]:
# Persist the stop-word-filtered corpora to the '*_clean2.p' files. Each file
# is opened in a `with` block so the handle is flushed and closed even if
# pickling raises.
with open('./input/unlabelled_corpus_clean2.p', 'wb') as fh:
    pickle.dump(unlabelled_corpus, fh)
with open('./input/fake_corpus_clean2.p', 'wb') as fh:
    pickle.dump(fake_corpus, fh)
with open('./input/true_corpus_clean2.p', 'wb') as fh:
    pickle.dump(true_corpus, fh)