In [1]:
import numpy as np
import scipy.stats as stats
import csv
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import gensim as gs 
from gensim import corpora, models, similarities
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,make_scorer, recall_score,precision_score,fbeta_score

import gensim,logging
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec, Phrases
from gensim.models.phrases import Phraser
from wikipedia import search,page
import multiprocessing
import collections
import re
import warnings
import spacy
import nltk
import pickle

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings(action='ignore')

assert gensim.models.doc2vec.FAST_VERSION > -1

np.random.seed(0)
cores = multiprocessing.cpu_count()



In [2]:
df = pd.read_csv('../input/deceptive-opinion.csv')

stop_words = pd.read_csv('../input/stopwords.csv',names=['stop'])
new_stop = stop_words.stop.map(lambda x: str.capitalize(x))
all_stop_set = set(stop_words.stop.append(new_stop,ignore_index=True))


In [3]:
#generate combinations of all_stop_words, bigrams
#generate bigram
bigram_stops=[]
for a in all_stop_set:
    for b in all_stop_set:
        bigram_stop = a+"_"+b
        bigram_stops.append(bigram_stop)

In [4]:
all_stop_set = all_stop_set.union(bigram_stops)


In [5]:
def read_corpus(sentdf, tokens_only=False):
    for i, line in enumerate(sentdf):
        if tokens_only:
            yield list(gensim.utils.tokenize(line))
        else:
            yield gensim.models.doc2vec.TaggedDocument(list(gensim.utils.tokenize(line)),[i])

In [6]:
truedf = df[df.deceptive=='truthful'].loc[:,'text']
fakedf = df[df.deceptive=='deceptive'].loc[:,'text']
truedfy = df[df.deceptive=='truthful'].loc[:,'deceptive']
fakedfy = df[df.deceptive=='deceptive'].loc[:,'deceptive']
truedfy.replace({'truthful':1},inplace=True)
fakedfy.replace({'deceptive':0},inplace=True)

In [7]:
truedf.iloc[189]

'I stayed at The Talbott for 3 nights on business and was very pleased. The staff was friendly as can be immediately confirming the lore of the midwest. I was upgraded to a suite which was bigger than my apartment and certainly more luxurious. The free wi-fi came in handy as I needed to work remotely while there. Everything from the comfort of the bed to the staff and location made this a great stay. Oh, and I got to workout at the huge Equinox right next door for free. \n'

In [8]:
from spacy.tokenizer import Tokenizer
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

# prefix_re = re.compile(r'''^[\[\("']''')
# suffix_re = re.compile(r'''[\]\)"']$''')
# infix_re = re.compile(r'''[-~]''')
# simple_url_re = re.compile(r'''^https?://''')

# nlp.tokenizer = Tokenizer(nlp.vocab, 
#                           prefix_search = prefix_re.search,
#                           suffix_search = suffix_re.search,
#                           infix_finditer = infix_re.finditer,
#                           token_match = simple_url_re.match)

In [9]:
true_corpus = list(read_corpus(truedf,tokens_only=True))
fake_corpus = list(read_corpus(fakedf,tokens_only=True))
def to_list(doc):
    return [t.text for t in doc]
# true_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(truedf)]
# fake_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(fakedf)]

In [10]:
phrase_list = []

sentence_stream = [s for s in true_corpus + fake_corpus]
phrases = Phrases(sentence_stream)
bigram = Phraser(phrases)
sentence_stream = [bigram[s] for s in sentence_stream]
true_corpus = [bigram[s] for s in true_corpus]
fake_corpus = [bigram[s] for s in fake_corpus]

# phrases = Phrases(sentence_stream)
# trigram = Phraser(phrases)
# sentence_stream = [trigram[s] for s in sentence_stream]
# true_corpus = [trigram[s] for s in true_corpus]
# fake_corpus = [trigram[s] for s in fake_corpus]

def filter_stream(df, stopword_set):
    sentence_stream = []
    for s in df:
        s2 = [c  for c in s if c not in stopword_set] 
        sentence_stream.append(s2)
    return sentence_stream

# sentence_stream2 = []
# for s in sentence_stream:
#     s2 = [c  for c in s if c not in all_stop_set] 
#     sentence_stream2.append(s2)

sentence_stream2 = filter_stream(sentence_stream, all_stop_set)
true_corpus = filter_stream(true_corpus, all_stop_set)
fake_corpus = filter_stream(fake_corpus, all_stop_set)

2018-02-05 13:40:58,204 : INFO : collecting all words and their counts
2018-02-05 13:40:58,206 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-02-05 13:40:58,567 : INFO : collected 96727 word types from a corpus of 239406 words (unigram + bigrams) and 1600 sentences
2018-02-05 13:40:58,568 : INFO : using 96727 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2018-02-05 13:40:58,570 : INFO : source_vocab length 96727
2018-02-05 13:40:59,415 : INFO : Phraser built with 828 828 phrasegrams


In [11]:
# not_seen = set()
# for a in bigram.phrasegrams.keys():
#     if a not in trigram.phrasegrams.keys():
#         not_seen.add(a)
# for a in trigram.phrasegrams.keys():
#     if a not in bigram.phrasegrams.keys():
#         not_seen.add(a)

NameError: name 'trigram' is not defined

In [None]:
# pickle.dump(bigram, open('../input/plain_bigram.p','wb'))
# pickle.dump(trigram, open('../input/plain_trigram.p','wb'))

In [12]:
true_corpus_raw = pickle.load(open('../input/true_corpus_raw.p','rb'))
fake_corpus_raw = pickle.load(open('../input/fake_corpus_raw.p','rb'))
sentence_stream_raw = true_corpus_raw + fake_corpus_raw

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

compound = []
neg = []
neu = []
pos = []
# raw_corpus = true_corpus + fake_corpus
for s in sentence_stream_raw:
    sent = sia.polarity_scores(' '.join(s))
    compound.append(sent['compound'])
    neg.append(sent['neg'])
    neu.append(sent['neu'])
    pos.append(sent['pos'])

vader_sent = pd.DataFrame({'compound':compound, 'neg':neg, 'neu':neu, 'pos':pos})

In [14]:
from empath import Empath
lexicon = Empath()

lexicon_results = pd.DataFrame(columns=lexicon.cats)
for ind, s in enumerate(sentence_stream_raw):
    lexicon_results = lexicon_results.append(pd.Series([np.nan]), ignore_index=True)
    results = (lexicon.analyze(s))
    if (ind % 100 == 0):
        print(ind)
    for k in results.keys():
        lexicon_results[k].iloc[ind] = results[k]

lexicon_results.drop(columns=[0],inplace=True)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [15]:
lexicon_results.head(7)

Unnamed: 0,help,office,dance,money,wedding,domestic_work,sleep,medical_emergency,cold,hate,...,weapon,children,monster,ocean,giving,contentment,writing,rural,positive_emotion,musical
0,1.0,1.0,0.0,2.0,1.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0
1,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,0.0,0.0,1.0,1.0,2.0,4.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
3,1.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,1.0,0.0
5,1.0,5.0,0.0,0.0,0.0,2.0,3.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0
6,2.0,4.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0


In [16]:
true_fake_corpus = []
for ind,s in enumerate(sentence_stream2):
    true_fake_corpus.append(gensim.models.doc2vec.TaggedDocument(s,[ind]))


In [17]:
from gensim import models
# true_raw = pickle.load(open('../input/true_corpus_raw.p','rb'))
# fake_raw = pickle.load(open('../input/fake_corpus_raw.p','rb'))

dictionary = corpora.Dictionary(sentence_stream2)

corpus_bow = [dictionary.doc2bow(s) for s in sentence_stream2]
tfidf = models.TfidfModel(corpus_bow)

2018-02-05 13:44:59,217 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-02-05 13:44:59,337 : INFO : built Dictionary(11389 unique tokens: ['We_stayed', 'one_night', 'getaway', 'family', 'thursday']...) from 1600 documents (total 111597 corpus positions)
2018-02-05 13:44:59,422 : INFO : collecting document frequencies
2018-02-05 13:44:59,423 : INFO : PROGRESS: processing document #0
2018-02-05 13:44:59,442 : INFO : calculating IDF weights for 1600 documents and 11388 features (98942 matrix non-zeros)


In [18]:
num_topics = 100
chunksize = 400
passes = 5

model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=num_topics,alpha = 'auto',eta='auto',random_state=0, chunksize=chunksize, passes=passes)
# model = models.LdaModel(tfidf[corpus_bow], id2word=dictionary, num_topics=num_topics)
# model.update(corpus_bow[500:len(corpus_bow)])

2018-02-05 13:44:59,456 : INFO : using autotuned alpha, starting with [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
2018-02-05 13:44:59,460 : INFO : using serial LDA version on this node
2018-02-05 13:45:03,855 : INFO : running online (multi-pass) LDA training, 100 topics, 5 passes over the supplied corpus of 1600 documents, updating model once every 400 documents, evaluating perplexity every 1600 documents, iterating 50x with a convergence 

2018-02-05 13:45:08,292 : INFO : optimized alpha [0.01104410040985386, 0.010952229960695038, 0.011176264640096545, 0.010082435529413734, 0.011697526725979722, 0.012220447617177659, 0.010430507526481203, 0.011382527371687857, 0.011500930675323632, 0.011098972043608907, 0.012975755850271149, 0.011547993000990282, 0.011494777079479887, 0.012235249277467462, 0.010676377970724916, 0.010484673595984208, 0.010657090975561686, 0.010101866114515022, 0.010606970850205838, 0.011917136412570634, 0.012113512534759768, 0.009469694966596331, 0.01177208571868786, 0.010489275070019715, 0.010457208719901037, 0.010205089315042134, 0.012623136465648572, 0.010950558945589996, 0.011491822831224427, 0.010878543646326106, 0.011342070141697042, 0.01061002945761004, 0.011069560335278745, 0.010168554305984307, 0.009849375368215434, 0.010801055757773712, 0.012682809715985044, 0.009516617354292004, 0.009785036524876634, 0.01201925325992097, 0.01088233820199688, 0.009507491767516762, 0.010702742131316981, 0.0102336

2018-02-05 13:45:13,220 : INFO : optimized alpha [0.012180908225816493, 0.011327081763108348, 0.011530756073933371, 0.010894368034897356, 0.013031289815925893, 0.013863213031323044, 0.011078352458434956, 0.012450634645286207, 0.012601532606468497, 0.011797124602306345, 0.015333558848613139, 0.012249978361402979, 0.012565108159247806, 0.014013563598595104, 0.01114218834424768, 0.011058318861588172, 0.011537416399948222, 0.010319506495099282, 0.011165355057470983, 0.0137239086976971, 0.0134231824091446, 0.009396248840722926, 0.013726053324830035, 0.01074440936924483, 0.01098771957744048, 0.010410267527072088, 0.01425731459479849, 0.011522525931722108, 0.012602306626189767, 0.011663001187524176, 0.01236812595715727, 0.011209717150522222, 0.011718004441767207, 0.010563469281996479, 0.010103804908499266, 0.011273768953935816, 0.014528839554130526, 0.009459849196825574, 0.010139635577256136, 0.013480378813915412, 0.011700950359203597, 0.009421907698984417, 0.011268298273406118, 0.01064845674

2018-02-05 13:45:15,596 : INFO : optimized alpha [0.013230603350969901, 0.012124902944291948, 0.012164935622717, 0.01142916690081868, 0.01448678762302779, 0.015957059642434337, 0.011888990879051705, 0.013575291726206062, 0.013878009954716653, 0.012721724317237188, 0.01836762634187239, 0.01373069666980113, 0.013808705147497605, 0.01610174710548694, 0.011656540480608856, 0.011610901916804434, 0.012397804745822987, 0.010583117493615408, 0.011735516962666828, 0.015391306087064613, 0.014931088934798397, 0.009381843180889318, 0.015484874517986044, 0.01111151027339417, 0.011465657131003929, 0.01090789816606782, 0.016310301511032178, 0.012398056029124523, 0.014070151435803306, 0.012424399205397707, 0.013556142751263307, 0.0117800345525742, 0.012558057785586338, 0.01087971471989967, 0.010278410435853586, 0.012308392092289499, 0.016922740363454113, 0.009520614102598277, 0.01052808521880946, 0.01508850746772431, 0.012727266326080847, 0.009299962241977978, 0.01193936708379252, 0.011189322743469251

2018-02-05 13:45:20,033 : INFO : optimized alpha [0.014389362917820907, 0.012462763603102958, 0.01236594034991592, 0.012090089360214816, 0.01579216340070866, 0.017410945361234293, 0.012431298247929974, 0.014614243783442747, 0.014981685088570286, 0.013294745923439738, 0.020919047679327517, 0.014289262139154156, 0.014806741823438887, 0.01785686633285285, 0.01199472618437857, 0.012133520141109676, 0.013183799517573827, 0.010768380885698998, 0.012242355129488165, 0.017454803225362678, 0.016091343711581316, 0.009322892644276765, 0.017509868891550366, 0.011330629853673158, 0.011886868736008312, 0.011079218602461647, 0.01803657982298182, 0.01284690391711491, 0.015129891690091232, 0.013114650525974124, 0.014447897327325834, 0.012343892573582765, 0.013038430590719605, 0.011288219135902087, 0.01048821477806659, 0.012605153566598835, 0.01903845089966137, 0.009426557093688824, 0.010831524155187231, 0.016551396504732096, 0.013448091759465534, 0.00931604514099028, 0.012467745743735053, 0.01156799394

2018-02-05 13:45:22,364 : INFO : optimized alpha [0.015583004103332857, 0.013302139715730759, 0.012984835215339007, 0.012615572865194782, 0.01746946838185449, 0.01969734933776764, 0.013267047584487926, 0.015797444504034183, 0.016321292469890288, 0.014299144279312153, 0.02469395260263599, 0.015852675287669114, 0.016110724937575337, 0.020329558121646888, 0.01247177684797985, 0.012698811001156938, 0.014014296329891906, 0.011004371089724775, 0.01281400849863048, 0.019422329674287434, 0.01758796336701695, 0.009324802421775904, 0.019668495201561255, 0.011686614692244572, 0.012386907064345351, 0.011596042193088487, 0.020403495660640134, 0.013796953796355687, 0.016647019400402103, 0.013898121863892282, 0.01573941371664092, 0.012891782182627471, 0.013873033077627004, 0.011626867387843574, 0.010689257706151112, 0.01358367405961431, 0.02204485320467628, 0.009488350714459347, 0.011230564559233274, 0.018331623825244163, 0.014481496261983052, 0.009211684117372047, 0.013120692063421124, 0.01213236665

2018-02-05 13:45:26,756 : INFO : optimized alpha [0.016818497829989912, 0.013659897449616513, 0.01323355341722947, 0.01316216616072646, 0.018834025817722498, 0.02127680097992665, 0.013759004007796734, 0.0169388222390593, 0.017548634161115344, 0.014831074855192168, 0.027517775515688438, 0.016411389360789685, 0.017164709233909226, 0.02222459921696668, 0.012796266120253802, 0.013221256505416498, 0.01482082953141414, 0.011184474519798298, 0.013272847506052347, 0.021769064247734025, 0.018759335189167402, 0.009282721974431873, 0.022032728356698356, 0.011887799463958761, 0.012818044971156423, 0.011758241736047725, 0.022398222636885238, 0.014228361186905354, 0.017705202956105396, 0.014527215624030865, 0.016613540901808593, 0.013431630195933162, 0.014387644399535982, 0.012076323817733992, 0.010883735080348936, 0.01382133704246196, 0.024537227484357414, 0.009422178731880353, 0.011532988452627803, 0.02002458182118346, 0.015208113005543228, 0.009231506767472485, 0.013610452133925883, 0.01248702369

2018-02-05 13:45:29,030 : INFO : optimized alpha [0.018016984660258573, 0.014522221537709568, 0.013901854105925423, 0.013679483152625448, 0.020574322945707657, 0.023666409907369953, 0.014608303980286723, 0.018212507461432912, 0.018976684460533636, 0.015840858707551544, 0.03181178160381683, 0.018040175304375256, 0.018553226946734964, 0.024919431870344795, 0.013217485188142503, 0.013766500459144125, 0.015667255288690073, 0.011386457001459287, 0.013848624372585789, 0.023972907852185674, 0.02034938266156639, 0.009312569863661644, 0.024501640216373032, 0.012225107604843254, 0.013331878273491445, 0.012255797297722203, 0.025157398141449294, 0.015300383944372054, 0.01924692004006326, 0.015294108084603396, 0.017968183896548976, 0.013983008575897031, 0.015233208159148479, 0.012428083668333978, 0.011110797634142355, 0.014838500969995406, 0.028138916269972378, 0.009482805757996532, 0.011914005080535141, 0.022090541469429653, 0.016270521353351945, 0.009139225012524104, 0.014256207784316707, 0.01303

2018-02-05 13:45:33,404 : INFO : optimized alpha [0.019276660805409903, 0.014889296507708621, 0.014222769265271938, 0.01418023645976772, 0.021959646007208712, 0.025159139800429355, 0.015035152404204888, 0.01946146638444408, 0.02021820454205628, 0.016341545428185354, 0.03468189835892042, 0.018581025451894113, 0.019662313282884883, 0.026907430797206866, 0.013486291259793946, 0.014230395529930644, 0.016429051171562476, 0.011563055927580628, 0.01430841691936882, 0.026731357730142808, 0.02155445029626577, 0.009290585249805776, 0.02715338465409178, 0.01242144330947606, 0.01375209178721932, 0.012389504307661987, 0.027496439083455924, 0.015752408021226422, 0.020235110308691687, 0.01592410328131971, 0.018877535142539332, 0.014513284336682017, 0.015746610442519137, 0.012878530954464925, 0.01130105789623863, 0.015029704150609433, 0.031006085598907784, 0.009430447322560235, 0.01220776788881947, 0.024011395019822315, 0.016959262674618662, 0.009160504172021482, 0.014757100393009207, 0.01338945233108

2018-02-05 13:45:35,700 : INFO : optimized alpha [0.020527929196006836, 0.01579056689599803, 0.014896940900132138, 0.014677768863645583, 0.023772899161096032, 0.027696430218322875, 0.01589728198864839, 0.020842561868305524, 0.02172856419842077, 0.01735513314150122, 0.039529012966781116, 0.020252139230566478, 0.021152548628998326, 0.02975375559483774, 0.013888450845630698, 0.014755939418557747, 0.017248014838272728, 0.01174313323880583, 0.014890074776416446, 0.029301860576416865, 0.023155188385106835, 0.009335141361257205, 0.029928846450546342, 0.012741961153034847, 0.014234819121644163, 0.012858718266653779, 0.030556795006521253, 0.01689133310001505, 0.02184852418834378, 0.01668718224496792, 0.02021009269339613, 0.01503259526478672, 0.016630364727375693, 0.013223607259019986, 0.01151544747239362, 0.01603639122642508, 0.035071219812049505, 0.009488867629596467, 0.012582534592723243, 0.02626286271703061, 0.018018362935132877, 0.009061867977337972, 0.015422188979536197, 0.0139424414714317

In [19]:
topic_dists = np.zeros([len(sentence_stream2),num_topics])
    
for i,item in enumerate(corpus_bow):       
    dists = model.get_document_topics(item)        
    indices = list(dict(dists).keys())        
    vals = list(dict(dists).values())        
    topic_dists[i,indices] = vals

topic_dists = pd.DataFrame(topic_dists, columns = ['topic'+str(a) for a in range(num_topics)])

In [20]:
# raw_tp_dists = np.zeros([len(raw_corpus_bow),num_topics])
# for i,item in enumerate(raw_corpus_bow):
#     if (i% 1000)== 0:
#         print(i)
#     dists = model.get_document_topics(item)        
#     indices = list(dict(dists).keys())        
#     vals = list(dict(dists).values())        
#     raw_tp_dists[i,indices] = vals
# raw_tp_dists = pd.DataFrame(raw_tp_dists, columns = ['topic'+str(a) for a in range(num_topics)])

In [21]:
# topic_dists = list(model[tfidf[corpus_bow]])
# topic_dists = [ [c for (b,c) in a  ]for a in topic_dists]
# topic_dists = pd.DataFrame(topic_dists,columns = ['topic'+str(a) for a in range(num_topics)] )

In [22]:
topic_dists.head(4)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,...,topic90,topic91,topic92,topic93,topic94,topic95,topic96,topic97,topic98,topic99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.042672,0.0,0.0,0.0,0.020658,0.0,0.0,0.059842,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
turk_model = gensim.models.doc2vec.Doc2Vec(dm=0, size=100,min_count=1, window=5,workers=cores, seed=8, negative=5)
turk_model.build_vocab(true_fake_corpus)

2018-02-05 13:45:42,985 : INFO : collecting all words and their counts
2018-02-05 13:45:42,989 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-02-05 13:45:43,018 : INFO : collected 11389 word types and 1600 unique tags from a corpus of 1600 examples and 111597 words
2018-02-05 13:45:43,020 : INFO : Loading a fresh vocabulary
2018-02-05 13:45:43,039 : INFO : min_count=1 retains 11389 unique words (100% of original 11389, drops 0)
2018-02-05 13:45:43,040 : INFO : min_count=1 leaves 111597 word corpus (100% of original 111597, drops 0)
2018-02-05 13:45:43,094 : INFO : deleting the raw counts dictionary of 11389 items
2018-02-05 13:45:43,095 : INFO : sample=0.001 downsamples 33 most-common words
2018-02-05 13:45:43,097 : INFO : downsampling leaves estimated 104666 word corpus (93.8% of prior 111597)
2018-02-05 13:45:43,098 : INFO : estimated required memory for 11389 words and 100 dimensions: 15445700 bytes
2018-02-05 13:45:43,125 : INFO : resetting lay

In [24]:
turk_model.train(true_fake_corpus, total_examples=turk_model.corpus_count, epochs=turk_model.iter)

2018-02-05 13:45:43,283 : INFO : training model with 8 workers on 11389 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2018-02-05 13:45:43,905 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-02-05 13:45:43,913 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-02-05 13:45:43,924 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-02-05 13:45:43,938 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-02-05 13:45:43,943 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-02-05 13:45:43,947 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-02-05 13:45:43,948 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-02-05 13:45:43,950 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-02-05 13:45:43,951 : INFO : training on 557985 raw words (531317 effective words) took 0.7s, 808427 effective words/s

531317

In [25]:
from sklearn.preprocessing import StandardScaler
true_fake_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in true_fake_corpus])

# vader_sent = vader_sent.apply(np.square)
true_fake_vec2 = pd.concat([true_fake_vec, vader_sent, lexicon_results, topic_dists], axis=1)

# ss = StandardScaler()
# true_fake_vec_all = ss.fit_transform(true_fake_vec)

In [26]:
all_y = pd.concat([truedfy, fakedfy], axis= 0)
X_train, X_test, y_train, y_test = train_test_split(true_fake_vec, all_y, train_size=0.75, random_state=8)
Xv_train, Xv_test, yv_train, yv_test = train_test_split(vader_sent, all_y,train_size=0.75, random_state=8)
Xe_train, Xe_test, ye_train, ye_test = train_test_split(lexicon_results, all_y, train_size=0.75, random_state=8)
Xlda_train, Xlda_test, ylda_train, ylda_test = train_test_split(topic_dists, all_y, train_size=0.75, random_state=8)
Xa_train, Xa_test, ya_train, ya_test = train_test_split(true_fake_vec2, all_y,train_size=0.75, random_state=8)

In [27]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def run_test(X, y, cv_val, scoring):
    gnb = GaussianNB()
    dtree = DecisionTreeClassifier()
    svm2 = svm.SVC(random_state=8)
    xg = XGBClassifier()
    rf = RandomForestClassifier()
    # logreg_cv = linear_model.LogisticRegressionCV(Cs=100, cv=5, penalty='l1',scoring='accuracy',solver='liblinear',n_jobs=-1)
    print('Gaussian NB:')
    scorelist = cross_val_score(gnb, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('DecisionTree')
    scorelist = cross_val_score(dtree, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('Rand Forest:')
    scorelist = cross_val_score(rf, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('SVM:')
    scorelist = cross_val_score(svm2, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('XGB Default:')
    scorelist = cross_val_score(xg, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    # print('Logistics Regression:')
    # scorelist = cross_val_score(logreg_cv, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    # print(scorelist, np.mean(scorelist))

In [28]:
# run_test(Xe_train, ye_train, 5, 'f1')

In [29]:
# run_test(Xv_train, yv_train, 5, 'f1')

In [30]:
# run_test(X_train, y_train, 5, 'f1')

In [31]:
# run_test(Xlda_train, ylda_train, 5, 'f1')

In [32]:
run_test(Xa_train, ya_train, 5, 'f1')

Gaussian NB:
[0.75576037 0.66981132 0.6695279  0.74774775 0.70422535] 0.7094145372548887
DecisionTree
[0.69527897 0.64253394 0.69019608 0.67226891 0.68595041] 0.6772456611652407
Rand Forest:
[0.69124424 0.63207547 0.67256637 0.60952381 0.68907563] 0.6588971045573553
SVM:
[0.77911647 0.70638298 0.71372549 0.72803347 0.72649573] 0.7307508268164021
XGB Default:
[0.87179487 0.77732794 0.82786885 0.78333333 0.82553191] 0.817171381540702


In [33]:
# svm_clf = svm.SVC(random_state=8)
# svm_params = {
#     "kernel":['rbf','linear'],
#     'C':[0.1,0.2,0.4,0.6,0.8,1,10],
#     'gamma': np.logspace(-1,1,9)
# }
# scorer = make_scorer(fbeta_score,beta=0.5)
# svm_gs = GridSearchCV(svm_clf, svm_params, cv=5, scoring=scorer, n_jobs=-1)
# svm_gs.fit(Xa_train,ya_train)
# best_clf = svm_gs.best_estimator_
# print(best_clf)
# best_pred = best_clf.predict(Xa_test)

In [34]:
# performance = {'accuracy': accuracy_score(best_pred,y_test),
#                 'recall': recall_score(best_pred,y_test),
#                 'precision': precision_score(best_pred,y_test)}
# print(performance)

In [35]:
xg_clf = XGBClassifier()
xg_params = {
#     'booster'=['gbtree'],
    'colsample_bytree':[0.15,0.4,0.85],
    'max_depth':[4,8,16,20],
    'subsample':[0.7,0.95],
    'min_child_weight':[1,3,9],
    'gamma':[0,0.01,0.05,0.3,0.6,1]
}
scorer = make_scorer(fbeta_score,beta=0.5)
xg_gs = GridSearchCV(xg_clf, xg_params, cv=5, scoring=scorer, n_jobs=-1)
xg_gs.fit(Xa_train,ya_train)
best_xg_clf = xg_gs.best_estimator_
print(best_xg_clf)
best_pred = best_xg_clf.predict(Xa_test)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4,
       gamma=0.05, learning_rate=0.1, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.7)


In [36]:
performance = {'accuracy': accuracy_score(best_pred,y_test),
                'recall': recall_score(best_pred,y_test),
                'precision': precision_score(best_pred,y_test)}
print(performance)

{'accuracy': 0.88, 'recall': 0.9065934065934066, 'precision': 0.8418367346938775}
