In [1]:
import numpy as np
import csv
import pandas as pd
import gensim,logging
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec, Phrases
from gensim.models.phrases import Phraser
import nltk
import pickle
import warnings
import re
import multiprocessing

import spacy
from spacy.tokenizer import Tokenizer

# Emit INFO-level log records (gensim reports training/phrase-detection progress this way).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# NOTE(review): this silences *every* Python warning for the whole notebook,
# including deprecation warnings from pandas/gensim.
warnings.filterwarnings(action='ignore')

# Abort early unless gensim reports a non-negative FAST_VERSION
# (presumably meaning its compiled doc2vec routines are available — confirm against the gensim docs).
assert gensim.models.doc2vec.FAST_VERSION > -1

# Seed NumPy's global RNG so the random choices made later in the notebook are repeatable.
np.random.seed(0)
# Number of CPU cores on this machine; not used by any cell visible in this part of the notebook.
cores = multiprocessing.cpu_count()



In [6]:
# Load the stop-word list (one word per row, no header) into a single 'stop' column.
stop_words = pd.read_csv('./input/stopwords.csv',names=['stop'])
# Capitalised variants, so a stop word that starts a sentence ("The") is matched as well.
new_stop = stop_words.stop.map(str.capitalize)
# Union of the original and capitalised forms.
# (Series.append was removed in pandas 2.0, so the two columns are combined as sets instead.)
all_stop_set = set(stop_words.stop) | set(new_stop)

# Phrase detection joins tokens with "_", so also treat every ordered pair of
# stop words ("of_the", "In_a", ...) as a stop token.
# This is quadratic in the number of stop words.
bigram_stops = [a + "_" + b for a in all_stop_set for b in all_stop_set]
all_stop_set = all_stop_set.union(bigram_stops)

In [5]:
# df: reviews with a 'deceptive' label column; df2: the larger hotel-review dump.
df = pd.read_csv('./input/deceptive-opinion.csv')
df2 = pd.read_csv('./input/Hotel_Reviews.csv')

# Keep only the rows that have a latitude, then renumber them 0..n-1 so that
# later positional/label lookups on data2 line up.
data2 = df2[df2['lat'].notnull()].reset_index(drop=True)

In [6]:
# Build one free-text document per review by joining its negative and positive
# parts in a random order (coin flip per row).
all_unlabelled = []
# Iterate the two columns directly instead of doing two `.loc[i, ...]` scalar
# lookups per row, which is very slow on a large frame.
for negative, positive in zip(data2['Negative_Review'], data2['Positive_Review']):
    # Exactly one RNG draw per row, in row order, so the seeded sequence of
    # coin flips is the same as with the original index-based loop.
    if np.random.randint(0, 2) == 0:
        inp = negative + ' ' + positive
    else:
        inp = positive + ' ' + negative
    all_unlabelled.append(inp)

# The list is in row order, so a positional assignment is equivalent to the
# index-aligned Series assignment (data2 has a fresh 0..n-1 index).
data2['All_Text'] = all_unlabelled

In [7]:
# Spot-check that All_Text is the two review parts joined in one of the two orders.
data2[['All_Text','Negative_Review','Positive_Review']].head(3)

Unnamed: 0,All_Text,Negative_Review,Positive_Review
0,I am so angry that i made this post available...,I am so angry that i made this post available...,Only the park outside of the hotel was beauti...
1,No real complaints the hotel was great great ...,No Negative,No real complaints the hotel was great great ...
2,Location was good and staff were ok It is cut...,Rooms are nice but for elderly a bit difficul...,Location was good and staff were ok It is cut...


In [8]:
# generate truth and fake df
# Compute each row mask once and reuse it for both the text and the label.
is_truthful = df['deceptive'] == 'truthful'
is_deceptive = df['deceptive'] == 'deceptive'

# Review texts per class.
truedf = df.loc[is_truthful, 'text']
fakedf = df.loc[is_deceptive, 'text']

# Numeric labels per class: truthful -> 1, deceptive -> 0.
# `replace` returns a new Series here instead of mutating a chained-indexing
# slice in place (which triggers SettingWithCopyWarning, hidden by the global
# warnings filter).
truedfy = df.loc[is_truthful, 'deceptive'].replace({'truthful': 1})
fakedfy = df.loc[is_deceptive, 'deceptive'].replace({'deceptive': 0})

In [None]:
def read_corpus(sentdf, tokens_only=False):
    """Lazily tokenise every document in ``sentdf``.

    Args:
        sentdf: Iterable of raw text documents.
        tokens_only: When true, yield plain token lists; otherwise yield
            ``TaggedDocument`` objects.

    Yields:
        For each document, in order: either the list of tokens produced by
        ``gensim.utils.tokenize``, or a ``TaggedDocument`` holding that list
        and a single-element tag list ``[position]``, where ``position`` is
        the document's 0-based position in ``sentdf``.
    """
    for doc_id, text in enumerate(sentdf):
        tokens = list(gensim.utils.tokenize(text))
        if not tokens_only:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
        else:
            yield tokens

In [None]:
# spaCy pipeline for the alternative tokenisation path; in this cell only the
# commented-out code below calls `nlp`.
nlp = spacy.load('en_core_web_sm')
def to_list(doc):
    """Collect the ``text`` attribute of every item in ``doc`` into a list, in order."""
    texts = []
    for token in doc:
        texts.append(token.text)
    return texts

# true_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(truedf)]
# fake_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(fakedf)]

# temp_arr = []
# for ind,s in enumerate(data2['All_Text']):
#     if ind % 100 == 0:
#         print (ind)
#     temp_arr.append(to_list(nlp(s))[:-1])
# unlabelled_corpus = temp_arr.copy()
# Tokenise all three text collections with read_corpus in tokens-only mode
# (plain token lists, no TaggedDocument wrappers) and materialise them as lists.
true_corpus = list(read_corpus(truedf,tokens_only=True))
fake_corpus = list(read_corpus(fakedf,tokens_only=True))
unlabelled_corpus = list(read_corpus(data2['All_Text'], tokens_only=True))

In [None]:
# Cache the raw tokenised corpora so the tokenisation step can be skipped on later runs.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('./input/unlabelled_corpus_raw.p','wb') as unlabelled_file:
    pickle.dump(unlabelled_corpus, unlabelled_file)
with open('./input/fake_corpus_raw.p','wb') as fake_file:
    pickle.dump(fake_corpus, fake_file)
with open('./input/true_corpus_raw.p','wb') as true_file:
    pickle.dump(true_corpus, true_file)

In [10]:
# Reload the cached raw tokenised corpora.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
# `with` closes each file handle as soon as it has been read.
with open('./input/unlabelled_corpus_raw.p','rb') as unlabelled_file:
    unlabelled_corpus = pickle.load(unlabelled_file)
with open('./input/true_corpus_raw.p','rb') as true_file:
    true_corpus = pickle.load(true_file)
with open('./input/fake_corpus_raw.p','rb') as fake_file:
    fake_corpus = pickle.load(fake_file)

In [11]:
# Learn bigram collocations from the labelled reviews only (truthful + deceptive),
# using the Phrases defaults.
sentence_stream = true_corpus + fake_corpus
phrases = Phrases(sentence_stream)
bigram = Phraser(phrases)

# Re-tokenise every corpus with the frozen phrase model, merging detected pairs
# into single "word_word" tokens. The unlabelled corpus is transformed as well,
# although it took no part in fitting the model.
sentence_stream = [bigram[tokens] for tokens in sentence_stream]
true_corpus, fake_corpus, unlabelled_corpus = (
    [bigram[tokens] for tokens in corpus]
    for corpus in (true_corpus, fake_corpus, unlabelled_corpus)
)

2018-02-05 14:21:05,177 : INFO : collecting all words and their counts
2018-02-05 14:21:05,180 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-02-05 14:21:05,514 : INFO : collected 96727 word types from a corpus of 239406 words (unigram + bigrams) and 1600 sentences
2018-02-05 14:21:05,515 : INFO : using 96727 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2018-02-05 14:21:05,520 : INFO : source_vocab length 96727
2018-02-05 14:21:06,282 : INFO : Phraser built with 828 828 phrasegrams


In [5]:
# Preview the first three deceptive reviews as token lists.
fake_corpus[:3]

[['My_husband',
  'and',
  'I',
  'visited',
  'the',
  'Fairmont_Chicago',
  'Millennium_Park',
  'for',
  'our_honeymoon',
  'The',
  'customer_service',
  'was',
  'amazing',
  'From',
  'the',
  'time',
  'we',
  'booked',
  'our',
  'packege',
  'to',
  'the',
  'time',
  'we_checked',
  'in',
  'everything',
  'was',
  'absolutely',
  'amazing',
  'These',
  'people',
  'were',
  'proficient',
  'respectful',
  'and',
  'very',
  'thoughtful',
  'The',
  'Fairmont',
  'had',
  'a',
  'lounge',
  'a',
  'wine',
  'room',
  'a',
  'bar',
  'and',
  'a',
  'restaurant',
  'I',
  'couldn_t',
  'decide',
  'where',
  'I',
  'wanted',
  'to',
  'go',
  'first',
  'After',
  'we',
  'put',
  'our_bags',
  'up',
  'we',
  'headed',
  'down',
  'to',
  'the',
  'wine',
  'room',
  'It',
  'was',
  'totally',
  'delicious',
  'We',
  'also',
  'got',
  'free',
  'wine',
  'just',
  'because',
  'it',
  'was',
  'our_honeymoon',
  'Then',
  'after',
  'a_few',
  'glasses',
  'of',
  'wine',

In [6]:
# phrases = Phrases(sentence_stream, threshold = 2, min_count=2,common_terms=all_stop_set)
# trigram = Phraser(phrases)
# sentence_stream = [trigram[s] for s in sentence_stream]
# true_corpus = [trigram[s] for s in true_corpus]
# fake_corpus = [trigram[s] for s in fake_corpus]
# unlabelled_corpus = [trigram[s] for s in unlabelled_corpus]

2018-02-05 11:42:49,445 : INFO : collecting all words and their counts
2018-02-05 11:42:49,447 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-02-05 11:42:49,709 : INFO : collected 106413 word types from a corpus of 217927 words (unigram + bigrams) and 1600 sentences
2018-02-05 11:42:49,711 : INFO : using 106413 counts as vocab in Phrases<0 vocab, min_count=2, threshold=2, max_vocab_size=40000000>
2018-02-05 11:42:49,715 : INFO : source_vocab length 106413
2018-02-05 11:42:51,170 : INFO : Phraser built with 3046 3046 phrasegrams


In [7]:
# Preview three truthful reviews (positions 3-5) as token lists.
true_corpus[3:6]

[['The',
  'Omni_Chicago',
  'really',
  'delivers',
  'on',
  'all',
  'fronts',
  'from',
  'the',
  'spaciousness',
  'of',
  'the',
  'rooms',
  'to',
  'the',
  'helpful',
  'staff',
  'to',
  'the',
  'prized',
  'location',
  'on',
  'Michigan_Avenue',
  'While',
  'this',
  'address',
  'in_Chicago',
  'requires',
  'a',
  'high',
  'level',
  'of',
  'quality',
  'the',
  'Omni',
  'delivers',
  'Check_in',
  'for',
  'myself',
  'and',
  'a',
  'whole',
  'group',
  'of',
  'people',
  'with',
  'me',
  'was',
  'under',
  'minutes',
  'the',
  'staff',
  'had',
  'plentiful',
  'recommendations',
  'for',
  'dining',
  'and',
  'events',
  'and',
  'the',
  'rooms_are',
  'some',
  'of',
  'the',
  'largest',
  'you',
  'll',
  'find',
  'at',
  'this',
  'price',
  'range',
  'in_Chicago',
  'Even',
  'the',
  'standard',
  'room',
  'has',
  'a',
  'separate',
  'living',
  'area',
  'and',
  'work',
  'desk',
  'The',
  'fitness_center',
  'has',
  'free',
  'weights',
  

In [8]:
# Persist the fitted bigram phrase model.
# NOTE(review): the file name says "t2_m2", but the live bigram cell builds Phrases
# with its defaults (the logged run shows min_count=5, threshold=10.0) — confirm the name.
with open('./input/t2_m2_bigram.p','wb') as bigram_file:
    pickle.dump(bigram, bigram_file)

# `trigram` is only created by the commented-out trigram cell, so dumping it
# raises NameError on a fresh kernel. Re-enable these lines together with that cell.
# with open('./input/t2_m2_trigram.p','wb') as trigram_file:
#     pickle.dump(trigram, trigram_file)

In [12]:
def filter_stream(df, stopword_set):
    """Drop stop tokens from every sentence.

    Args:
        df: Iterable of sentences, each an iterable of tokens.
        stopword_set: Container of tokens to remove (membership-tested with ``in``).

    Returns:
        A new list with one list per input sentence, holding that sentence's
        tokens that are not in ``stopword_set``, in their original order.
        Sentences that end up empty are kept as empty lists.
    """
    return [
        [token for token in sentence if token not in stopword_set]
        for sentence in df
    ]

# Strip stop words (and stop-word bigrams) from all three corpora, in the
# order unlabelled -> true -> fake.
unlabelled_corpus, true_corpus, fake_corpus = (
    filter_stream(corpus, all_stop_set)
    for corpus in (unlabelled_corpus, true_corpus, fake_corpus)
)

In [13]:
# Persist the phrase-merged, stop-word-filtered corpora.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('./input/unlabelled_corpus_clean3.p','wb') as unlabelled_file:
    pickle.dump(unlabelled_corpus, unlabelled_file)
with open('./input/fake_corpus_clean3.p','wb') as fake_file:
    pickle.dump(fake_corpus, fake_file)
with open('./input/true_corpus_clean3.p','wb') as true_file:
    pickle.dump(true_corpus, true_file)