In [1]:
import numpy as np
import scipy.stats as stats
import csv
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import gensim as gs 
from gensim import corpora, models, similarities
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,make_scorer, recall_score,precision_score,fbeta_score

import gensim,logging
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec, Phrases
from gensim.models.phrases import Phraser
from wikipedia import search,page
import multiprocessing
import collections
import re
import warnings
import spacy
import nltk
import pickle

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings(action='ignore')

assert gensim.models.doc2vec.FAST_VERSION > -1

np.random.seed(0)
cores = multiprocessing.cpu_count()



In [2]:
df = pd.read_csv('../input/deceptive-opinion.csv')

stop_words = pd.read_csv('../input/stopwords.csv',names=['stop'])
new_stop = stop_words.stop.map(lambda x: str.capitalize(x))
all_stop_set = set(stop_words.stop.append(new_stop,ignore_index=True))


In [3]:
#generate combinations of all_stop_words, bigrams
#generate bigram
bigram_stops=[]
for a in all_stop_set:
    for b in all_stop_set:
        bigram_stop = a+"_"+b
        bigram_stops.append(bigram_stop)

In [4]:
all_stop_set = all_stop_set.union(bigram_stops)


In [5]:
def read_corpus(sentdf, tokens_only=False):
    for i, line in enumerate(sentdf):
        if tokens_only:
            yield list(gensim.utils.tokenize(line))
        else:
            yield gensim.models.doc2vec.TaggedDocument(list(gensim.utils.tokenize(line)),[i])

In [6]:
truedf = df[df.deceptive=='truthful'].loc[:,'text']
fakedf = df[df.deceptive=='deceptive'].loc[:,'text']
truedfy = df[df.deceptive=='truthful'].loc[:,'deceptive']
fakedfy = df[df.deceptive=='deceptive'].loc[:,'deceptive']
truedfy.replace({'truthful':1},inplace=True)
fakedfy.replace({'deceptive':0},inplace=True)

In [7]:
truedf.iloc[189]

'I stayed at The Talbott for 3 nights on business and was very pleased. The staff was friendly as can be immediately confirming the lore of the midwest. I was upgraded to a suite which was bigger than my apartment and certainly more luxurious. The free wi-fi came in handy as I needed to work remotely while there. Everything from the comfort of the bed to the staff and location made this a great stay. Oh, and I got to workout at the huge Equinox right next door for free. \n'

In [8]:
from spacy.tokenizer import Tokenizer
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

# prefix_re = re.compile(r'''^[\[\("']''')
# suffix_re = re.compile(r'''[\]\)"']$''')
# infix_re = re.compile(r'''[-~]''')
# simple_url_re = re.compile(r'''^https?://''')

# nlp.tokenizer = Tokenizer(nlp.vocab, 
#                           prefix_search = prefix_re.search,
#                           suffix_search = suffix_re.search,
#                           infix_finditer = infix_re.finditer,
#                           token_match = simple_url_re.match)

In [9]:
true_corpus = list(read_corpus(truedf,tokens_only=True))
fake_corpus = list(read_corpus(fakedf,tokens_only=True))
def to_list(doc):
    return [t.text for t in doc]
# true_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(truedf)]
# fake_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(fakedf)]

In [10]:
phrase_list = []

sentence_stream = [s for s in true_corpus + fake_corpus]
phrases = Phrases(sentence_stream, common_terms=all_stop_set)
bigram = Phraser(phrases)
sentence_stream = [bigram[s] for s in sentence_stream]
true_corpus = [bigram[s] for s in true_corpus]
fake_corpus = [bigram[s] for s in fake_corpus]

phrases = Phrases(sentence_stream, common_terms=all_stop_set)
trigram = Phraser(phrases)
sentence_stream = [trigram[s] for s in sentence_stream]
true_corpus = [trigram[s] for s in true_corpus]
fake_corpus = [trigram[s] for s in fake_corpus]

def filter_stream(df, stopword_set):
    sentence_stream = []
    for s in df:
        s2 = [c  for c in s if c not in stopword_set] 
        sentence_stream.append(s2)
    return sentence_stream

# sentence_stream2 = []
# for s in sentence_stream:
#     s2 = [c  for c in s if c not in all_stop_set] 
#     sentence_stream2.append(s2)

sentence_stream2 = filter_stream(sentence_stream, all_stop_set)
true_corpus = filter_stream(true_corpus, all_stop_set)
fake_corpus = filter_stream(fake_corpus, all_stop_set)

2018-02-01 16:39:02,652 : INFO : collecting all words and their counts
2018-02-01 16:39:02,654 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-02-01 16:39:02,936 : INFO : collected 102558 word types from a corpus of 239406 words (unigram + bigrams) and 1600 sentences
2018-02-01 16:39:02,937 : INFO : using 102558 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2018-02-01 16:39:02,938 : INFO : source_vocab length 102558
2018-02-01 16:39:03,997 : INFO : Phraser built with 532 532 phrasegrams
2018-02-01 16:39:04,991 : INFO : collecting all words and their counts
2018-02-01 16:39:04,992 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-02-01 16:39:05,253 : INFO : collected 104362 word types from a corpus of 229300 words (unigram + bigrams) and 1600 sentences
2018-02-01 16:39:05,254 : INFO : using 104362 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2018-0

In [11]:
true_corpus[3:6]

[['Omni_Chicago',
  'really',
  'delivers',
  'fronts',
  'spaciousness',
  'rooms',
  'helpful',
  'staff',
  'prized',
  'location',
  'Michigan_Avenue',
  'address',
  'Chicago',
  'requires',
  'high',
  'level',
  'quality',
  'Omni',
  'delivers',
  'Check',
  'whole',
  'group',
  'people',
  'minutes',
  'staff',
  'plentiful',
  'recommendations',
  'dining',
  'events',
  'rooms',
  'largest',
  'll',
  'find',
  'price',
  'range',
  'Chicago',
  'Even',
  'standard',
  'room',
  'separate',
  'living',
  'area',
  'work',
  'desk',
  'fitness_center',
  'free',
  'weights',
  'weight',
  'machines',
  'two',
  'rows',
  'cardio',
  'equipment',
  'shared',
  'room',
  'others',
  'feel',
  'cramped',
  'way',
  'great',
  'property'],
 ['asked',
  'high',
  'floor',
  'away',
  'elevator',
  'got',
  'room',
  'pleasantly',
  'decorated',
  'functional',
  'clean',
  'didn_t',
  'need',
  'whole',
  'lot',
  'service',
  'pleasant',
  'prompt',
  'used',
  'fitness_center',

In [12]:
true_corpus_raw = pickle.load(open('../input/true_corpus_raw.p','rb'))
fake_corpus_raw = pickle.load(open('../input/fake_corpus_raw.p','rb'))
sentence_stream_raw = true_corpus_raw + fake_corpus_raw

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

compound = []
neg = []
neu = []
pos = []
# raw_corpus = true_corpus + fake_corpus
for s in sentence_stream_raw:
    sent = sia.polarity_scores(' '.join(s))
    compound.append(sent['compound'])
    neg.append(sent['neg'])
    neu.append(sent['neu'])
    pos.append(sent['pos'])

vader_sent = pd.DataFrame({'compound':compound, 'neg':neg, 'neu':neu, 'pos':pos})

In [14]:
from empath import Empath
lexicon = Empath()

lexicon_results = pd.DataFrame(columns=lexicon.cats)
for ind, s in enumerate(sentence_stream_raw):
    lexicon_results = lexicon_results.append(pd.Series([np.nan]), ignore_index=True)
    results = (lexicon.analyze(s))
    if (ind % 100 == 0):
        print(ind)
    for k in results.keys():
        lexicon_results[k].iloc[ind] = results[k]

lexicon_results.drop(columns=[0],inplace=True)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [15]:
lexicon_results.head(7)

Unnamed: 0,help,office,dance,money,wedding,domestic_work,sleep,medical_emergency,cold,hate,...,weapon,children,monster,ocean,giving,contentment,writing,rural,positive_emotion,musical
0,1.0,1.0,0.0,2.0,1.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0
1,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,0.0,0.0,1.0,1.0,2.0,4.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
3,1.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,1.0,0.0
5,1.0,5.0,0.0,0.0,0.0,2.0,3.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0
6,2.0,4.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0


In [16]:
true_fake_corpus = []
for ind,s in enumerate(sentence_stream2):
    true_fake_corpus.append(gensim.models.doc2vec.TaggedDocument(s,[ind]))


In [17]:
from gensim import models
# true_raw = pickle.load(open('../input/true_corpus_raw.p','rb'))
# fake_raw = pickle.load(open('../input/fake_corpus_raw.p','rb'))

dictionary = corpora.Dictionary(sentence_stream2)

corpus_bow = [dictionary.doc2bow(s) for s in sentence_stream2]
tfidf = models.TfidfModel(corpus_bow)

2018-02-01 16:40:39,575 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-02-01 16:40:39,708 : INFO : built Dictionary(11248 unique tokens: ['stayed', 'one_night', 'getaway', 'family', 'thursday']...) from 1600 documents (total 108880 corpus positions)
2018-02-01 16:40:39,811 : INFO : collecting document frequencies
2018-02-01 16:40:39,812 : INFO : PROGRESS: processing document #0
2018-02-01 16:40:39,842 : INFO : calculating IDF weights for 1600 documents and 11247 features (96014 matrix non-zeros)


In [58]:
num_topics = 200
chunksize = 400
passes = 5

model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=num_topics,alpha = 'auto',eta='auto',random_state=0, chunksize=chunksize, passes=passes)
# model = models.LdaModel(tfidf[corpus_bow], id2word=dictionary, num_topics=num_topics)
# model.update(corpus_bow[500:len(corpus_bow)])

2018-02-01 17:09:21,666 : INFO : using autotuned alpha, starting with [0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005

2018-02-01 17:09:35,305 : INFO : optimized alpha [0.00527095020334451, 0.004959244095375119, 0.005215892194659711, 0.005083688896740689, 0.004991033992934928, 0.005356331812112217, 0.005014783956877158, 0.005127114536731649, 0.005363461664473786, 0.005227112411766237, 0.005389489838086051, 0.005022764105175165, 0.005233628365361045, 0.005283018483720381, 0.0048180078818361675, 0.0051229675911916945, 0.0048355812431981815, 0.005561166915709085, 0.005205906021670386, 0.005175522041687184, 0.005366697468575304, 0.005180371415617881, 0.005358919408208738, 0.005269575260459907, 0.005250670588870427, 0.005308620105329579, 0.005201165234243254, 0.005022095052759483, 0.004888109959901652, 0.004877750252962914, 0.0056507751843947905, 0.005075287109803838, 0.005296079757546449, 0.004882168035433085, 0.004854080006337792, 0.005388010745226896, 0.005092386767764484, 0.004966339623749995, 0.0049689984299392306, 0.0050875691917964, 0.0051227211083959175, 0.005331029630565932, 0.005050580014183715, 0

2018-02-01 17:09:37,289 : INFO : optimized alpha [0.005470458472540526, 0.0049924253587764695, 0.005315402572970904, 0.005182322467648663, 0.005096359500037315, 0.005636039723135151, 0.005069026325830735, 0.005255891033823811, 0.0056278402441726685, 0.005349997725952707, 0.005576139550009039, 0.005135080366764809, 0.005646431106685336, 0.005483373203817877, 0.00476082423110218, 0.00517728861705581, 0.004798973644039584, 0.005711789077745737, 0.005327753224338526, 0.005350063564277073, 0.005389382870904782, 0.00532510623033924, 0.005835913378237706, 0.0055082858250708605, 0.005251363206489525, 0.005508761400105343, 0.005376351561284213, 0.0052732436475231665, 0.004843386071501203, 0.0049392311770049095, 0.0059339883895199955, 0.005247275680868352, 0.005473731809796969, 0.004880079410372103, 0.004852954727513598, 0.005770991341112435, 0.005139149177609087, 0.005251882221036354, 0.005023412210599566, 0.005297586393795436, 0.00525900516323497, 0.006085383231040178, 0.0051343361230945525, 0

2018-02-01 17:09:43,294 : INFO : optimized alpha [0.005643131855671078, 0.005061500993948923, 0.005712641362206645, 0.0054535990405418676, 0.005223182593303658, 0.005996761158990687, 0.005175791866157882, 0.0054508149910008725, 0.00581846138272869, 0.005507651479734619, 0.005835723512938506, 0.005597719285792453, 0.00594457059557896, 0.005781877870112787, 0.004715463845822986, 0.005337873945644912, 0.004873214109960138, 0.0064235181401211434, 0.005457446873021427, 0.005554037268019258, 0.00583233362260697, 0.005690093130988138, 0.006038117814666218, 0.005884142273812978, 0.005466004782877472, 0.005910790385253667, 0.005574493569843309, 0.005448518601721765, 0.004802360228576812, 0.005001607489165526, 0.007008503865523617, 0.005481413013967576, 0.005999702389683275, 0.004930062154408886, 0.004939318685981566, 0.006335799937791216, 0.0054461717161452886, 0.005426677546433455, 0.005180212545980218, 0.005453522036767494, 0.005414125920290742, 0.006516788353784578, 0.0053193227107188575, 0.

2018-02-01 17:09:44,835 : INFO : optimized alpha [0.005691481429187879, 0.0050528708497384, 0.005684801225988819, 0.005412602139534045, 0.005218141285720985, 0.006032058464571385, 0.005202661277329164, 0.005471182674956992, 0.005842590502169417, 0.005545068368921819, 0.005883777590489639, 0.005594108738339615, 0.00601725863280048, 0.005842189859630582, 0.004676322171374264, 0.0053261082010624, 0.004836383147820861, 0.006410579551631691, 0.005483354073624925, 0.005556897821707629, 0.0058145237246665215, 0.005703283368190314, 0.006204907062616995, 0.005932299394448094, 0.005475230706394454, 0.005940066410954207, 0.005583021777199573, 0.005507990944051487, 0.004761762714193076, 0.004978059612929017, 0.007124117098947006, 0.0055687238542324125, 0.0059983980758962505, 0.004892367513903403, 0.004896372645449355, 0.006363150927889725, 0.005410736798262633, 0.005630969225802925, 0.005148910865109815, 0.005496519404443841, 0.005412499002251306, 0.006711144264350726, 0.005318527358708396, 0.0055

2018-02-01 17:09:46,956 : INFO : optimized alpha [0.005853013026969174, 0.005067802846469748, 0.005893945777558015, 0.005597116371764744, 0.005253325432164802, 0.0063800070061979805, 0.005248155008470475, 0.005645698079267512, 0.0060916243998350795, 0.0056762548359586724, 0.006219765137050615, 0.005811295229068411, 0.0061609141768507346, 0.006080022382012664, 0.004635686555597997, 0.005448613238560813, 0.0048572270640973355, 0.006948974643666452, 0.005630041177124951, 0.005670871466792772, 0.006158694853562971, 0.005947749891246657, 0.00626781100638347, 0.0061853406161661234, 0.005734472269292005, 0.0062832668255251144, 0.005685722861439318, 0.005626983025566185, 0.004739068037753878, 0.004998180939006627, 0.007839660666957538, 0.005688210271901947, 0.006296923021348579, 0.004927905242673986, 0.004851822878715515, 0.0067664842335848395, 0.005572412217733096, 0.005716630795607692, 0.005220892980166766, 0.005536593525445473, 0.0055579075541864, 0.006898255146195387, 0.005396715636815032,

2018-02-01 17:09:48,697 : INFO : optimized alpha [0.005988395422841232, 0.005134451245908235, 0.0059692611286711805, 0.005705712621573157, 0.005320448192910413, 0.006574794706137966, 0.0053150129208849855, 0.005737479848908827, 0.006255252750681635, 0.005780189111103653, 0.006366127905996237, 0.005909754008131052, 0.006517373482534123, 0.00635011052989468, 0.004599970844786224, 0.0054998345243034855, 0.0048328878262944835, 0.0070908235682602344, 0.00572720528154198, 0.005849899778070495, 0.006177986698173182, 0.006060134525366476, 0.006676270438684996, 0.0063704171460066735, 0.005744489519348046, 0.006462438067655625, 0.005830333082221017, 0.005880193988246481, 0.0047065782283357786, 0.005063916944633626, 0.008166470258070699, 0.005815045972955342, 0.006419347033235753, 0.004922927522399654, 0.005071040357362298, 0.007115647978799058, 0.005623420020237217, 0.00606199636423714, 0.005271720253135059, 0.005696281697165415, 0.005671880835829916, 0.0076259189590196655, 0.005470122153110362,

2018-02-01 17:09:53,923 : INFO : optimized alpha [0.006139580750334275, 0.005193149224409551, 0.006329001469022945, 0.005999144442322583, 0.005455382413075745, 0.006903851511925297, 0.005411326357041452, 0.005937646037499747, 0.0064365960932262775, 0.005935011495657474, 0.006601876976791351, 0.006405293037803199, 0.006744008305589346, 0.006631953621600353, 0.004562363108661138, 0.0056773129558291115, 0.004944855267671926, 0.00795313683593657, 0.005845095600385485, 0.006028946624820429, 0.006618399617921072, 0.006437709801468359, 0.006809925080095235, 0.00671213247049815, 0.005968541987883879, 0.006872141175196389, 0.006003057668298082, 0.00605375766298363, 0.004676890371837725, 0.0051323856546129165, 0.009493943018781877, 0.006052958378274287, 0.006949154937880176, 0.004970352221550593, 0.005144492201764659, 0.007678913048943995, 0.0058832081517759295, 0.006214085053481512, 0.005421765470431557, 0.005825414840623532, 0.00583532808348754, 0.007948433258478922, 0.005640896138077015, 0.00

2018-02-01 17:09:55,392 : INFO : optimized alpha [0.006185816645844954, 0.0051953153862038986, 0.006296655184007276, 0.005959985717057658, 0.005455442654806686, 0.006934458932853134, 0.005442249127626859, 0.005972590380729181, 0.006463586192843675, 0.005976246956434902, 0.006640644323193513, 0.006383527382158097, 0.006847087578254705, 0.006696216782234976, 0.004529731274836001, 0.00566995264477333, 0.00491119509713014, 0.007922689300162836, 0.005869263487177226, 0.0060407547798412235, 0.006581452723469914, 0.00644639264223806, 0.00701837842662776, 0.006750682577608514, 0.005975255580411445, 0.006902462151166207, 0.006009589147895388, 0.006099851449796187, 0.004647013387336575, 0.0051299296254410124, 0.009625395292822785, 0.006133239335147182, 0.006940247257453665, 0.00493637667291794, 0.005103002260552767, 0.00768953532949312, 0.005845741876957547, 0.006472790965917816, 0.0053962468962931495, 0.005883208998288011, 0.005826358106000937, 0.008162250854995183, 0.005639160088319366, 0.0060

2018-02-01 17:09:57,440 : INFO : optimized alpha [0.006351329611050939, 0.005219825475146903, 0.0065192224500719735, 0.006173281074708559, 0.005514904519767403, 0.007297597406408705, 0.005506511293025587, 0.006151781606496025, 0.006720539171906168, 0.00611076345802093, 0.007005346422167111, 0.0066131648210212375, 0.007004860000309344, 0.006973875738694699, 0.004495714669864639, 0.005805487654682206, 0.0049317028979446054, 0.008554118530128907, 0.0060129847981386835, 0.0061642001637084885, 0.0069678181210844296, 0.00670936782991962, 0.007050901055864178, 0.007023838630934653, 0.0062399845601285794, 0.007270156516438607, 0.006121214007682514, 0.006241089577505671, 0.004628775995817261, 0.00514464882512255, 0.010563674793452818, 0.006263243268092765, 0.007312179575413182, 0.0049800730973749135, 0.005059831101578191, 0.008150155102139463, 0.006016268887907675, 0.0065697106969300366, 0.0054860840981397, 0.0059379912959565285, 0.005991246161272078, 0.008318200159602513, 0.005730848464415484,

2018-02-01 17:09:59,182 : INFO : optimized alpha [0.006484922109287582, 0.005318482499459684, 0.006592706694177081, 0.006286924667991806, 0.00558484411926626, 0.007476118501459737, 0.005571074027267131, 0.006236062233740087, 0.006883855651149238, 0.006218443512281935, 0.0071458146338709005, 0.006717659048207902, 0.007387991520091763, 0.007267783550686174, 0.004465818289008051, 0.005865794670869037, 0.004909706476129564, 0.008699549833432779, 0.006136094830299545, 0.006365380740708667, 0.006988959108878071, 0.006808916972057598, 0.0075327444167438634, 0.007192446978446988, 0.006253479274047, 0.007454175835606199, 0.006263370476065289, 0.006503540539724506, 0.004601457408930037, 0.005207728288786725, 0.010956093653813457, 0.006395273511236825, 0.007436029304398522, 0.004976494271560614, 0.005271446597093973, 0.008542681349688967, 0.006065321315955058, 0.006925347253713218, 0.005540269233597939, 0.006111983809364055, 0.006103090664317753, 0.009143543717797641, 0.0058128817075680786, 0.006

2018-02-01 17:10:04,390 : INFO : optimized alpha [0.006633779905073918, 0.005395008510470806, 0.006956786998816493, 0.006600818144903656, 0.0057417516629613, 0.00782934175205116, 0.005664011159103202, 0.00644048671938858, 0.007070919106289046, 0.006376562418480453, 0.007371049761291102, 0.007252440278186505, 0.007611867052685037, 0.007593607714761457, 0.004434180569022922, 0.006028527176850071, 0.005020474737435436, 0.009646316400975411, 0.006245379240436048, 0.006566868269444769, 0.007424417723161647, 0.007222439540995203, 0.007658193800979027, 0.00753654940407076, 0.006482437151474809, 0.0078831190701405, 0.006426984974713685, 0.006664817639837331, 0.004576631860841442, 0.0052733128883180385, 0.012634249920066813, 0.006633922283743797, 0.008117689167225503, 0.005021913143006879, 0.005342043390848681, 0.009236245216708445, 0.006322451450126308, 0.007079357954075672, 0.005690928202008023, 0.006261536823313024, 0.006281129290778525, 0.009396845847114726, 0.005985172940196829, 0.00654957

2018-02-01 17:10:05,836 : INFO : optimized alpha [0.006684013874871109, 0.005397098257072335, 0.0069258800362772465, 0.006557025776553639, 0.0057411085577614915, 0.00784767534223712, 0.005703875459839559, 0.0064792771431624535, 0.0071072624448533656, 0.006427428702109207, 0.007399688129750911, 0.0072103761444244135, 0.00776876419888926, 0.007655053180788961, 0.0044063880835360026, 0.006025385994255548, 0.004989284495322365, 0.00959306468249408, 0.006267939600631376, 0.006587761473348549, 0.007386416847192053, 0.007226132135137045, 0.007953974757945738, 0.007564380064990415, 0.006486696004739377, 0.007914285574372015, 0.006443060484448035, 0.006702810772514971, 0.004551070110669808, 0.005275994660049699, 0.012795050073595698, 0.006701259914346604, 0.008119528620365997, 0.004990758495630662, 0.005301705871350788, 0.009256528119816114, 0.006282836822703609, 0.007361928180143539, 0.00567542445679944, 0.0063290351380108495, 0.0062756545744646845, 0.009661328957247493, 0.005982360545376315, 

2018-02-01 17:10:07,768 : INFO : optimized alpha [0.006848692402362879, 0.005435174486840015, 0.007163339101703754, 0.006784675383961827, 0.005793754155747283, 0.008231643681478655, 0.005776387031312899, 0.006669815510639695, 0.0073798611817239845, 0.00656626059750046, 0.007767885309142258, 0.007465835477637996, 0.007928492114815831, 0.0079580148560704, 0.004377306639303115, 0.006153509061515417, 0.005009491189513403, 0.010273297120294487, 0.006398668866232849, 0.006704720918587505, 0.007786196081510359, 0.00749638990138098, 0.007987588963616784, 0.007851433445680478, 0.006741145888526976, 0.008320902564079379, 0.006558536335176731, 0.006844261985340607, 0.004536137670255262, 0.00529044201546654, 0.013996133943130273, 0.0068424980475323225, 0.008574478294202612, 0.005032991306631918, 0.005259606501104217, 0.009775535028399035, 0.006446622044203339, 0.007464478732310055, 0.005763095680869473, 0.006387601976873885, 0.006450736409585192, 0.009782123685341561, 0.006098673976805379, 0.00676

2018-02-01 17:10:09,528 : INFO : optimized alpha [0.0069811854745148475, 0.00554084695727999, 0.0072415511146667905, 0.006916293875649978, 0.005867038842906984, 0.008442318318971206, 0.0058443731060595855, 0.006746877284585649, 0.007550268806393401, 0.006666823265081792, 0.00791541061573358, 0.007576845432694999, 0.008328091187233785, 0.008278769276959358, 0.0043517629125486585, 0.006212257457412823, 0.004989319703995746, 0.01043670791547745, 0.00653117611367514, 0.006906763566218705, 0.007809475043861326, 0.0075885769260497045, 0.00857881094659696, 0.008022169056279312, 0.006746454717410521, 0.008509555085473715, 0.006704989273383816, 0.007104214941377788, 0.004512715944375612, 0.005351470161637562, 0.014429575366858844, 0.0069862979281252536, 0.008692039058188243, 0.00503052141471271, 0.005465346841403656, 0.010182884763203436, 0.006494059968755388, 0.007830022476562124, 0.005815813019407493, 0.006560910689784713, 0.006549964754289264, 0.010669908457819979, 0.00620612687548277, 0.006

2018-02-01 17:10:14,726 : INFO : optimized alpha [0.0071284775287784475, 0.005620185675103298, 0.00762450686432025, 0.007247774120062915, 0.006041342474971305, 0.008799946630483588, 0.005940067032660047, 0.006951045962412401, 0.0077310785794272915, 0.006799705195301871, 0.008114872315971624, 0.00818830916839851, 0.008548849645856225, 0.00862506961084239, 0.004324612516768668, 0.006377748723273969, 0.005099706801076972, 0.01146216339602819, 0.006638353411512073, 0.00712035846372295, 0.008279540320357168, 0.008014394245410667, 0.008695602484725038, 0.008367877181113061, 0.006980812271565079, 0.00895062862614015, 0.006878513437863567, 0.007265274767587301, 0.004491558175260404, 0.005414825966876093, 0.01646175441940899, 0.007251296721948272, 0.009521044517737852, 0.005074338108354457, 0.005533744834629155, 0.010966721698760735, 0.006743868072896473, 0.007999576547548739, 0.005957972088796956, 0.006703105148416999, 0.0067208112822181075, 0.010871342497739819, 0.006420335391937668, 0.007053

2018-02-01 17:10:16,158 : INFO : optimized alpha [0.007183021588632035, 0.005622005992369916, 0.007594885609628817, 0.007210704814160051, 0.00603995675624819, 0.00881173898400202, 0.00598388346790877, 0.00699372511073134, 0.007777543417276176, 0.006849184061327131, 0.008132897538948297, 0.008136415116597701, 0.008737152958420548, 0.008689988136613506, 0.0043005383924326875, 0.006378838776741851, 0.005070479820079894, 0.011381073607479359, 0.006659507563133557, 0.007144793050263515, 0.008247123808694827, 0.008026232296062677, 0.009027118212880006, 0.008391107150399318, 0.0069827613246084275, 0.008989929659945396, 0.006898519045332272, 0.007300962621028295, 0.00446933244958754, 0.0054132562121003746, 0.016604924556477458, 0.00731706238011901, 0.009501881528485344, 0.005045475037969079, 0.0054943276199239865, 0.010970138661605653, 0.006702315849115494, 0.008301228937203763, 0.00593723420916243, 0.006758270920686163, 0.006719070805061841, 0.011155070123608111, 0.006420945355380736, 0.00709

2018-02-01 17:10:18,094 : INFO : optimized alpha [0.007347246073086539, 0.005673160376214797, 0.007836138292945933, 0.007454837520745658, 0.006096321134501662, 0.009210278197564467, 0.0060598591290272705, 0.007190673701103174, 0.008054343234179848, 0.006969636220626375, 0.008523122650142791, 0.008400854076181425, 0.008891108124079741, 0.009012973679341531, 0.004275263210979374, 0.00651078207928601, 0.005098867441845166, 0.012111873232003231, 0.006788883220498906, 0.00727313851258027, 0.00867541995721484, 0.008330553604640373, 0.00905336885541906, 0.008671134232169126, 0.007232562233988567, 0.009408316933678896, 0.007012861323275718, 0.007442591373858289, 0.0044569323992936505, 0.005427523314568328, 0.018052766858296113, 0.007470880214624708, 0.010007577030823108, 0.005086504088431756, 0.0054530732968995944, 0.01154952928728138, 0.006870766966776542, 0.0084095562723569, 0.006023129024774913, 0.006815260891249342, 0.00689931527400846, 0.011259899431969785, 0.0065793871890629605, 0.007276

2018-02-01 17:10:19,786 : INFO : optimized alpha [0.007472945834729908, 0.005772181865239315, 0.007919514594098889, 0.007587436665989058, 0.006168318378634316, 0.009401994380003964, 0.006121522239772343, 0.007272547020522066, 0.008205509299606416, 0.007074747878494158, 0.008670941886369243, 0.008518846135914557, 0.009322122524678211, 0.009347609381964073, 0.004253089164072719, 0.006568364306924532, 0.005080074325999464, 0.012272494514714113, 0.006931450439386149, 0.007501344597723385, 0.008686330922335521, 0.008435553544291787, 0.009755106127955222, 0.00883611757757041, 0.007235683905994921, 0.00960061591536156, 0.007158592575050775, 0.0077123038657728946, 0.0044365478775688135, 0.0054868796950360175, 0.018583967570141247, 0.0076093068263909805, 0.010123607749272347, 0.005084942300480117, 0.005654190956448699, 0.012023530007567406, 0.006916847444432492, 0.008812720415612945, 0.006069675631786247, 0.0069880341963011175, 0.007014908551003587, 0.01227856657819699, 0.00671953270243622, 0.0

2018-02-01 17:10:24,902 : INFO : optimized alpha [0.007625409749178255, 0.005864270882354912, 0.008323470456369733, 0.007932577986875924, 0.006350544206909615, 0.009777397976008542, 0.006215385238417585, 0.007489342661063074, 0.0083864824617555, 0.007200110379707361, 0.008856981081343532, 0.00916001616289588, 0.0095624669791988, 0.00970731608994982, 0.004229422682451907, 0.0067321030082912086, 0.005190770091451438, 0.013379023696464732, 0.007042850680014074, 0.007716600506556318, 0.009180119293167453, 0.008883049864822646, 0.009877600723933622, 0.009196647867503899, 0.007470210152253439, 0.010059875454313428, 0.00733114127626059, 0.007867022110518235, 0.004418245532239009, 0.0055484416212799965, 0.02103144810374256, 0.007839342252222166, 0.011065862352409108, 0.005127448897016812, 0.00572085215413097, 0.012926514928535934, 0.007165820154712551, 0.008984529256779035, 0.006214145470796008, 0.007128944228408844, 0.007185223547530461, 0.012467035423255151, 0.006963642321515016, 0.007571801

In [59]:
topic_dists = np.zeros([len(sentence_stream2),num_topics])
    
for i,item in enumerate(corpus_bow):       
    dists = model.get_document_topics(item)        
    indices = list(dict(dists).keys())        
    vals = list(dict(dists).values())        
    topic_dists[i,indices] = vals

topic_dists = pd.DataFrame(topic_dists, columns = ['topic'+str(a) for a in range(num_topics)])

In [61]:
unlabelled_raw = pickle.load(open('../input/unlabelled_corpus_clean2.p','rb'))
raw_corpus_bow = [dictionary.doc2bow(s) for s in unlabelled_raw]

In [63]:
raw_tp_dists = np.zeros([len(raw_corpus_bow),num_topics])
for i,item in enumerate(raw_corpus_bow):
    if (i% 1000)== 0:
        print(i)
    dists = model.get_document_topics(item)        
    indices = list(dict(dists).keys())        
    vals = list(dict(dists).values())        
    raw_tp_dists[i,indices] = vals
raw_tp_dists = pd.DataFrame(raw_tp_dists, columns = ['topic'+str(a) for a in range(num_topics)])

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000


KeyboardInterrupt: 

In [20]:
# topic_dists = list(model[tfidf[corpus_bow]])
# topic_dists = [ [c for (b,c) in a  ]for a in topic_dists]
# topic_dists = pd.DataFrame(topic_dists,columns = ['topic'+str(a) for a in range(num_topics)] )

In [21]:
topic_dists.head(4)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,...,topic90,topic91,topic92,topic93,topic94,topic95,topic96,topic97,topic98,topic99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018424,...,0.0,0.132309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.049134,0.701142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.126772,0.0,0.0,0.0,0.026488,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.058193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
turk_model = gensim.models.doc2vec.Doc2Vec(dm=0, size=100,min_count=1, window=5,workers=cores, seed=8, negative=5)
turk_model.build_vocab(true_fake_corpus)

2018-02-01 16:41:01,212 : INFO : collecting all words and their counts
2018-02-01 16:41:01,214 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-02-01 16:41:01,237 : INFO : collected 11248 word types and 1600 unique tags from a corpus of 1600 examples and 108880 words
2018-02-01 16:41:01,239 : INFO : Loading a fresh vocabulary
2018-02-01 16:41:01,262 : INFO : min_count=1 retains 11248 unique words (100% of original 11248, drops 0)
2018-02-01 16:41:01,264 : INFO : min_count=1 leaves 108880 word corpus (100% of original 108880, drops 0)
2018-02-01 16:41:01,302 : INFO : deleting the raw counts dictionary of 11248 items
2018-02-01 16:41:01,304 : INFO : sample=0.001 downsamples 29 most-common words
2018-02-01 16:41:01,307 : INFO : downsampling leaves estimated 101418 word corpus (93.1% of prior 108880)
2018-02-01 16:41:01,309 : INFO : estimated required memory for 11248 words and 100 dimensions: 15262400 bytes
2018-02-01 16:41:01,344 : INFO : resetting lay

In [23]:
turk_model.train(true_fake_corpus, total_examples=turk_model.corpus_count, epochs=turk_model.iter)

2018-02-01 16:41:01,594 : INFO : training model with 8 workers on 11248 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2018-02-01 16:41:02,102 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-02-01 16:41:02,114 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-02-01 16:41:02,140 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-02-01 16:41:02,148 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-02-01 16:41:02,153 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-02-01 16:41:02,161 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-02-01 16:41:02,164 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-02-01 16:41:02,166 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-02-01 16:41:02,167 : INFO : training on 544400 raw words (515024 effective words) took 0.6s, 927014 effective words/s

515024

In [55]:
from sklearn.preprocessing import StandardScaler
true_fake_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in true_fake_corpus])

# vader_sent = vader_sent.apply(np.square)
true_fake_vec2 = pd.concat([true_fake_vec, vader_sent, lexicon_results, topic_dists], axis=1)

# ss = StandardScaler()
# true_fake_vec_all = ss.fit_transform(true_fake_vec)

In [56]:
all_y = pd.concat([truedfy, fakedfy], axis= 0)
X_train, X_test, y_train, y_test = train_test_split(true_fake_vec, all_y, train_size=0.75, random_state=8)
Xv_train, Xv_test, yv_train, yv_test = train_test_split(vader_sent, all_y,train_size=0.75, random_state=8)
Xe_train, Xe_test, ye_train, ye_test = train_test_split(lexicon_results, all_y, train_size=0.75, random_state=8)
Xlda_train, Xlda_test, ylda_train, ylda_test = train_test_split(topic_dists, all_y, train_size=0.75, random_state=8)
Xa_train, Xa_test, ya_train, ya_test = train_test_split(true_fake_vec2, all_y,train_size=0.75, random_state=8)

In [26]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def run_test(X, y, cv_val, scoring):
    gnb = GaussianNB()
    dtree = DecisionTreeClassifier()
    svm2 = svm.SVC(random_state=8)
    xg = XGBClassifier()
    rf = RandomForestClassifier()
    # logreg_cv = linear_model.LogisticRegressionCV(Cs=100, cv=5, penalty='l1',scoring='accuracy',solver='liblinear',n_jobs=-1)
    print('Gaussian NB:')
    scorelist = cross_val_score(gnb, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('DecisionTree')
    scorelist = cross_val_score(dtree, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('SVM:')
    scorelist = cross_val_score(svm2, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('XGB Default:')
    scorelist = cross_val_score(xg, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('Rand Forest:')
    scorelist = cross_val_score(rf, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    # print('Logistics Regression:')
    # scorelist = cross_val_score(logreg_cv, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    # print(scorelist, np.mean(scorelist))

In [27]:
# run_test(Xe_train, ye_train, 5, 'f1')

In [28]:
# run_test(Xv_train, yv_train, 5, 'f1')

In [29]:
run_test(X_train, y_train, 5, 'f1')

Gaussian NB:
[0.63799283 0.55849057 0.70175439 0.57761733 0.65551839] 0.6262747013425103
DecisionTree
[0.50806452 0.57740586 0.56170213 0.50806452 0.5770751 ] 0.5464624232944908
SVM:
[0.66850829 0.67036011 0.67036011 0.67036011 0.66852368] 0.6696224593166027
XGB Default:
[0.575      0.54545455 0.55793991 0.62096774 0.60392157] 0.580656754036114
Rand Forest:
[0.55111111 0.44019139 0.54298643 0.56108597 0.54385965] 0.5278469091967544


In [30]:
# run_test(Xlda_train, ylda_train, 5, 'f1')

In [57]:
run_test(Xa_train, ya_train, 5, 'f1')

Gaussian NB:
[0.70192308 0.67961165 0.736      0.72146119 0.69406393] 0.706611968312753
DecisionTree
[0.68503937 0.68644068 0.7219917  0.71311475 0.69105691] 0.6995286827914244
SVM:
[0.76923077 0.7008547  0.70866142 0.7394958  0.72413793] 0.7284761233524231
XGB Default:
[0.84297521 0.8173913  0.824      0.78813559 0.80314961] 0.8151303420957896
Rand Forest:
[0.69406393 0.58215962 0.64150943 0.63849765 0.73362445] 0.657971018409336


In [32]:
# svm_clf = svm.SVC(random_state=8)
# svm_params = {
#     "kernel":['rbf','linear'],
#     'C':[0.1,0.2,0.4,0.6,0.8,1,10],
#     'gamma': np.logspace(-1,1,9)
# }
# scorer = make_scorer(fbeta_score,beta=0.5)
# svm_gs = GridSearchCV(svm_clf, svm_params, cv=5, scoring=scorer, n_jobs=-1)
# svm_gs.fit(Xa_train,ya_train)
# best_clf = svm_gs.best_estimator_
# print(best_clf)
# best_pred = best_clf.predict(Xa_test)

In [33]:
# performance = {'accuracy': accuracy_score(best_pred,y_test),
#                 'recall': recall_score(best_pred,y_test),
#                 'precision': precision_score(best_pred,y_test)}
# print(performance)

In [41]:
xg_clf = XGBClassifier()
xg_params = {
#     'booster'=['gbtree'],
    'colsample_bytree':[0.15,0.4,0.85],
    'max_depth':[4,8,16,20],
    'subsample':[0.7,0.95],
    'min_child_weight':[1,3,9],
    'gamma':[0,0.01,0.05,0.3,0.6,1]
}
scorer = make_scorer(fbeta_score,beta=0.5)
xg_gs = GridSearchCV(xg_clf, xg_params, cv=5, scoring=scorer, n_jobs=-1)
xg_gs.fit(Xa_train,ya_train)
best_xg_clf = xg_gs.best_estimator_
print(best_xg_clf)
best_pred = best_xg_clf.predict(Xa_test)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.15,
       gamma=0.3, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.7)


In [42]:
performance = {'accuracy': accuracy_score(best_pred,y_test),
                'recall': recall_score(best_pred,y_test),
                'precision': precision_score(best_pred,y_test)}
print(performance)

{'accuracy': 0.83, 'recall': 0.8404255319148937, 'precision': 0.8061224489795918}
