In [1]:
import numpy as np
import csv
import pandas as pd
import gensim,logging
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec, Phrases
from gensim.models.phrases import Phraser
import nltk
import pickle
import warnings
import re
import multiprocessing

import spacy
from spacy.tokenizer import Tokenizer

# Route INFO-level log records (gensim reports training progress through
# `logging`) to the notebook output with a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# NOTE: this silences *every* Python warning for the rest of the session,
# including deprecation warnings from pandas/gensim.
warnings.filterwarnings(action='ignore')

# Fail early if gensim's doc2vec module reports FAST_VERSION == -1
# (presumably meaning its optimized compiled routines are unavailable -- confirm
# against the installed gensim version).
assert gensim.models.doc2vec.FAST_VERSION > -1

# Seed NumPy's global RNG so the np.random calls further down are repeatable.
np.random.seed(0)
# Number of CPUs reported by the OS (not used in the cells visible here).
cores = multiprocessing.cpu_count()



In [2]:
# Labelled reviews; the later cells read its `text` and `deceptive` columns.
df = pd.read_csv('./input/deceptive-opinion.csv')

# One stop word per row; the file has no header, so name the single column.
stop_words = pd.read_csv('./input/stopwords.csv', names=['stop'])
# Capitalized variant of every stop word, so filtering also catches
# sentence-initial forms such as "The".
new_stop = stop_words.stop.map(lambda x: str.capitalize(x))
# `Series.append` was removed in pandas 2.0; `pd.concat` is the supported
# replacement and yields the same combined Series.
all_stop_set = set(pd.concat([stop_words.stop, new_stop], ignore_index=True))

In [3]:
# Unlabelled hotel reviews.
df2 = pd.read_csv('./input/Hotel_Reviews.csv')
# Keep only rows that have a latitude, then renumber the index 0..n-1.
has_latitude = df2['lat'].notnull()
data2 = df2[has_latitude].reset_index(drop=True)

In [4]:
# Merge each row's negative and positive review into one text. The order of
# the two halves is chosen by a coin flip per row; exactly one
# np.random.randint call is made per row, in row order, so the seeded
# sequence of draws is preserved.
all_unlabelled = []
review_pairs = zip(data2['Negative_Review'], data2['Positive_Review'])
for negative_text, positive_text in review_pairs:
    negative_first = np.random.randint(0, 2) == 0
    if negative_first:
        merged = negative_text + ' ' + positive_text
    else:
        merged = positive_text + ' ' + negative_text
    all_unlabelled.append(merged)

data2['All_Text'] = pd.Series(all_unlabelled)

In [5]:
# Show the first 3 rows of the All_Text, Negative_Review and Positive_Review columns.
data2[['All_Text','Negative_Review','Positive_Review']].head(3)

Unnamed: 0,All_Text,Negative_Review,Positive_Review
0,I am so angry that i made this post available...,I am so angry that i made this post available...,Only the park outside of the hotel was beauti...
1,No real complaints the hotel was great great ...,No Negative,No real complaints the hotel was great great ...
2,Location was good and staff were ok It is cut...,Rooms are nice but for elderly a bit difficul...,Location was good and staff were ok It is cut...


In [6]:
# Extend the stop-word set with every ordered pair "<a>_<b>" of stop words,
# i.e. the underscore-joined form a two-word phrase takes.
# NOTE: the result grows quadratically with the number of stop words, and
# because `all_stop_set` is rebound to the enlarged set, re-running this cell
# would pair up the already-paired entries as well.
bigram_stops = [
    first + "_" + second
    for first in all_stop_set
    for second in all_stop_set
]
all_stop_set = all_stop_set.union(bigram_stops)

In [7]:
# Split the labelled reviews into truthful / deceptive texts and build the
# matching numeric label Series (truthful -> 1, deceptive -> 0).
is_truthful = df['deceptive'] == 'truthful'
is_deceptive = df['deceptive'] == 'deceptive'

truedf = df.loc[is_truthful, 'text']
fakedf = df.loc[is_deceptive, 'text']

# Map labels explicitly instead of `replace(..., inplace=True)` on a filtered
# copy: this avoids mutating a slice in place and does not rely on `replace`
# silently downcasting an object Series to integers (deprecated in pandas 2.2).
truedfy = df.loc[is_truthful, 'deceptive'].map({'truthful': 1})
fakedfy = df.loc[is_deceptive, 'deceptive'].map({'deceptive': 0})

In [8]:
def read_corpus(sentdf, tokens_only=False):
    """Lazily tokenize each text in ``sentdf`` with ``gensim.utils.tokenize``.

    Args:
        sentdf: Iterable of texts (the notebook passes pandas Series).
        tokens_only: When true, yield plain token lists. Otherwise yield
            ``TaggedDocument`` objects tagged with the text's position in
            the iteration (0, 1, 2, ...).

    Yields:
        One token list or one ``TaggedDocument`` per input text, in order.
    """
    for doc_id, text in enumerate(sentdf):
        tokens = list(gensim.utils.tokenize(text))
        if not tokens_only:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
        else:
            yield tokens

In [32]:
# nlp = spacy.load('en_core_web_sm')
# def to_list(doc):
#     return [t.text for t in doc]

# true_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(truedf)]
# fake_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(fakedf)]

# temp_arr = []
# for ind,s in enumerate(data2['All_Text']):
#     if ind % 100 == 0:
#         print (ind)
#     temp_arr.append(to_list(nlp(s))[:-1])
# unlabelled_corpus = temp_arr.copy()
# Tokenize the three text sources into lists of token lists.
true_corpus, fake_corpus, unlabelled_corpus = (
    list(read_corpus(texts, tokens_only=True))
    for texts in (truedf, fakedf, data2['All_Text'])
)

In [33]:
# Learn phrases over all three corpora together, then rewrite every
# document so that detected word pairs become single "a_b" tokens.
sentence_stream = unlabelled_corpus + true_corpus + fake_corpus
phrases = Phrases(sentence_stream)
bigram = Phraser(phrases)

sentence_stream = [bigram[doc] for doc in sentence_stream]
unlabelled_corpus = [bigram[doc] for doc in unlabelled_corpus]
true_corpus = [bigram[doc] for doc in true_corpus]
fake_corpus = [bigram[doc] for doc in fake_corpus]

2018-01-25 10:12:58,563 : INFO : collecting all words and their counts
2018-01-25 10:12:58,564 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-01-25 10:12:59,111 : INFO : PROGRESS: at sentence #10000, processed 381532 words and 134268 word types
2018-01-25 10:12:59,568 : INFO : PROGRESS: at sentence #20000, processed 691920 words and 203976 word types
2018-01-25 10:13:00,047 : INFO : PROGRESS: at sentence #30000, processed 1023549 words and 269582 word types
2018-01-25 10:13:00,523 : INFO : PROGRESS: at sentence #40000, processed 1357244 words and 328018 word types
2018-01-25 10:13:01,098 : INFO : PROGRESS: at sentence #50000, processed 1696278 words and 381982 word types
2018-01-25 10:13:01,588 : INFO : PROGRESS: at sentence #60000, processed 2018082 words and 431703 word types
2018-01-25 10:13:02,079 : INFO : PROGRESS: at sentence #70000, processed 2359057 words and 483652 word types
2018-01-25 10:13:02,561 : INFO : PROGRESS: at sentence #80000, processed 2

In [38]:
# Second phrase pass over the current sentence_stream: fitting Phrases again
# lets phrases found earlier combine with a neighbouring token.
phrases = Phrases(sentence_stream)
trigram = Phraser(phrases)

sentence_stream = [trigram[doc] for doc in sentence_stream]
unlabelled_corpus = [trigram[doc] for doc in unlabelled_corpus]
true_corpus = [trigram[doc] for doc in true_corpus]
fake_corpus = [trigram[doc] for doc in fake_corpus]

2018-01-25 10:17:00,138 : INFO : collecting all words and their counts
2018-01-25 10:17:00,138 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-01-25 10:17:00,632 : INFO : PROGRESS: at sentence #10000, processed 361516 words and 143367 word types
2018-01-25 10:17:01,035 : INFO : PROGRESS: at sentence #20000, processed 655926 words and 220404 word types
2018-01-25 10:17:01,496 : INFO : PROGRESS: at sentence #30000, processed 969993 words and 294000 word types
2018-01-25 10:17:01,936 : INFO : PROGRESS: at sentence #40000, processed 1285848 words and 360255 word types
2018-01-25 10:17:02,388 : INFO : PROGRESS: at sentence #50000, processed 1606653 words and 421960 word types
2018-01-25 10:17:02,797 : INFO : PROGRESS: at sentence #60000, processed 1911086 words and 478690 word types
2018-01-25 10:17:03,253 : INFO : PROGRESS: at sentence #70000, processed 2234079 words and 537783 word types
2018-01-25 10:17:03,693 : INFO : PROGRESS: at sentence #80000, processed 25

In [40]:
# Peek at the first three documents of true_corpus.
true_corpus[:3]

[['We',
  'stayed',
  'for',
  'a',
  'one',
  'night',
  'getaway',
  'with',
  'family',
  'on',
  'a',
  'thursday',
  'Triple',
  'AAA',
  'rate',
  'of',
  'was',
  'a',
  'steal',
  'th_floor',
  'room',
  'complete',
  'with',
  'in',
  'plasma_TV',
  'bose',
  'stereo',
  'voss',
  'and',
  'evian',
  'water',
  'and',
  'gorgeous',
  'bathroom',
  'no',
  'tub',
  'but',
  'was',
  'fine',
  'for',
  'us',
  'Concierge',
  'was',
  'very',
  'helpful',
  'You_cannot_beat',
  'this',
  'location',
  'Only',
  'flaw',
  'was',
  'breakfast',
  'was',
  'pricey',
  'and',
  'service',
  'was',
  'very',
  'very',
  'slow',
  'hours',
  'for',
  'four',
  'kids',
  'and',
  'four_adults',
  'on',
  'a',
  'friday',
  'morning',
  'even_though',
  'there',
  'were',
  'only',
  'two',
  'other',
  'tables',
  'in',
  'the',
  'restaurant',
  'Food',
  'was',
  'very',
  'good',
  'so',
  'it',
  'was',
  'worth',
  'the',
  'wait',
  'I',
  'would',
  'return',
  'in',
  'a',
  'he

In [41]:
# Persist the corpora as they stand at this point. Each file is opened in a
# `with` block so the handle is flushed and closed as soon as the dump
# finishes, rather than being left open for the garbage collector.
with open('./input/unlabelled_corpus_raw.p', 'wb') as raw_file:
    pickle.dump(unlabelled_corpus, raw_file)
with open('./input/fake_corpus_raw.p', 'wb') as raw_file:
    pickle.dump(fake_corpus, raw_file)
with open('./input/true_corpus_raw.p', 'wb') as raw_file:
    pickle.dump(true_corpus, raw_file)

In [28]:
# unlabelled_corpus = pickle.load(open('./input/unlabelled_corpus_raw.p','rb'))
# true_corpus = pickle.load(open('./input/true_corpus_raw.p','rb'))
# fake_corpus = pickle.load(open('./input/fake_corpus_raw.p','rb'))

[['stayed',
  'one_night',
  'getaway',
  'family',
  'thursday',
  'Triple',
  'AAA',
  'rate',
  'steal',
  'th_floor',
  'room',
  'complete',
  'plasma_TV',
  'bose',
  'stereo',
  'voss',
  'evian',
  'water',
  'gorgeous',
  'bathroom',
  'tub',
  'fine',
  'us',
  'Concierge',
  'helpful',
  'beat',
  'location',
  'flaw',
  'breakfast',
  'pricey',
  'service',
  'slow',
  'hours',
  'four',
  'kids',
  'four_adults',
  'friday',
  'morning',
  'even_though',
  'two',
  'tables',
  'restaurant',
  'Food',
  'good',
  'worth',
  'wait_return',
  'heartbeat',
  'gem',
  'chicago'],
 ['Triple',
  'rate',
  'upgrade',
  'view',
  'room',
  'less_than',
  'also',
  'included',
  'breakfast',
  'vouchers',
  'great',
  'view_river',
  'lake',
  'Wrigley',
  'Bldg',
  'Tribune',
  'Bldg',
  'major',
  'restaurants',
  'Shopping',
  'Sightseeing',
  'attractions',
  'within_walking_distance',
  'Large',
  'room',
  'comfortable',
  'bed'],
 ['comes',
  'little',
  'late',
  'm',
  'fin

In [42]:
def filter_stream(df, stopword_set):
    """Drop stop words from every document of a tokenized corpus.

    Args:
        df: Iterable of documents, each an iterable of tokens.
        stopword_set: Container of tokens to remove (membership-tested).

    Returns:
        A new list with one new token list per input document, in the same
        order, containing only the tokens not found in ``stopword_set``.
    """
    return [
        [token for token in document if token not in stopword_set]
        for document in df
    ]

# Strip stop words (single words and their "a_b" combinations) from all
# three corpora.
unlabelled_corpus, true_corpus, fake_corpus = (
    filter_stream(corpus, all_stop_set)
    for corpus in (unlabelled_corpus, true_corpus, fake_corpus)
)

In [49]:
# Peek at the first three documents of true_corpus.
true_corpus[:3]

[['stayed',
  'one',
  'night',
  'getaway',
  'family',
  'thursday',
  'Triple',
  'AAA',
  'rate',
  'steal',
  'th_floor',
  'room',
  'complete',
  'plasma_TV',
  'bose',
  'stereo',
  'voss',
  'evian',
  'water',
  'gorgeous',
  'bathroom',
  'tub',
  'fine',
  'us',
  'Concierge',
  'helpful',
  'You_cannot_beat',
  'location',
  'flaw',
  'breakfast',
  'pricey',
  'service',
  'slow',
  'hours',
  'four',
  'kids',
  'four_adults',
  'friday',
  'morning',
  'even_though',
  'two',
  'tables',
  'restaurant',
  'Food',
  'good',
  'worth',
  'wait',
  'return',
  'heartbeat',
  'gem',
  'chicago'],
 ['Triple',
  'rate',
  'upgrade',
  'view',
  'room',
  'less_than',
  'also',
  'included',
  'breakfast',
  'vouchers',
  'great',
  'view',
  'river',
  'lake',
  'Wrigley',
  'Bldg',
  'Tribune',
  'Bldg',
  'major',
  'restaurants',
  'Shopping',
  'Sightseeing',
  'attractions',
  'within_walking_distance',
  'Large',
  'room',
  'comfortable',
  'bed'],
 ['comes',
  'little

In [47]:
# Persist the corpora as they stand at this point. Each file is opened in a
# `with` block so the handle is flushed and closed as soon as the dump
# finishes, rather than being left open for the garbage collector.
with open('./input/unlabelled_corpus_clean.p', 'wb') as clean_file:
    pickle.dump(unlabelled_corpus, clean_file)
with open('./input/fake_corpus_clean.p', 'wb') as clean_file:
    pickle.dump(fake_corpus, clean_file)
with open('./input/true_corpus_clean.p', 'wb') as clean_file:
    pickle.dump(true_corpus, clean_file)