# Applying Naive Bayes To Differentiate Real from Fake

In [1]:
#Importing all the libs I think I need
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from six.moves import range

In [15]:
#This file is scaled-down version of the one I created in the data wrangling module. That file was 2.5GB+ and contained
#account information as well. We'll only be working with tweets for this part and this file is only ~250MB.
tweets = pd.read_csv('Downloads/ML_Data.csv', encoding='Latin-1', low_memory=False)

tweets= tweets[~tweets.text.isnull()]
tweets.head()

Unnamed: 0.1,Unnamed: 0,user_id,timestamp,favorite_count,retweet_count,text,tweet_id,source,retweet_status_id,in_reply_to_status_id,Real_or_Fake
0,0,1868981000.0,3/22/2016 18:31,,,#IslamKills Are you trying to say that there w...,7.12e+17,,,,Fake
1,1,2571870000.0,10/10/2016 20:57,0.0,0.0,"Clinton: Trump should?ve apologized more, atta...",7.86e+17,"<a href=""http://twitterfeed.com"" rel=""nofollow...",,,Fake
2,2,1710805000.0,2/22/2017 12:43,,,RT @ltapoll: Who was/is the best president of ...,8.34e+17,,,,Fake
3,3,2584153000.0,12/26/2016 15:06,,,RT @jww372: I don't have to guess your religio...,8.13e+17,,,,Fake
4,4,1768260000.0,8/6/2017 2:36,,,RT @Shareblue: Pence and his lawyers decided w...,8.94e+17,,,,Fake


In [22]:
df1 = pd.DataFrame(tweets)

In [23]:
print(tweets['Real_or_Fake'].count())
print(tweets['Real_or_Fake'].value_counts())

1248047
Real    1044586
Fake     203461
Name: Real_or_Fake, dtype: int64


In [44]:
#In the next cell, CountVectorizer and HashingVectorizer are both returning memory errors so for the purpose of this exercise
#I'm going to size it down some by slicing the data and stick with CountVectorizer. I'll split the data to get as close to 50/50 as I can
#MEMErrors: 200000, 100000, 45000, 20000, 10000, 7000,
tweets_slice = tweets[200000:206500]

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
#Build vocabulary
text = tweets_slice['text']
vectorizer = CountVectorizer()
vectorizer.fit(text)

x = vectorizer.transform(text)
x = x.toarray()

In [52]:
#We want to use the tweet content to determine if it was made by either a bot or a real person, so X should be the tweet and
#Y is the indicator
def make_xy(tweets, vectorizer=None):   
    if vectorizer is None:
        vectorizer = CountVectorizer(min_df=0)
    X = vectorizer.fit_transform(tweets.text)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (tweets.Real_or_Fake == 'Real').values.astype(np.int)
    return X, y
X, y = make_xy(tweets)

In [53]:
#Now we split the data into train and test sets
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
clf = MultinomialNB().fit(xtrain, ytrain)
print("MN Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)))

MN Accuracy: 95.81%


In [54]:
#This cell is important because our if the accuracy on the training data and test data are significantly different, it can
#indicate overifitting or underfitting
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print("Accuracy on training data: %0.2f" % (training_accuracy))
print("Accuracy on test data:     %0.2f" % (test_accuracy))

Accuracy on training data: 0.97
Accuracy on test data:     0.96


In [23]:
from sklearn.cross_validation import KFold
def cv_score(clf, X, y, scorefunc):
    result = 0.
    nfold = 10 #lets try 10 and see what happens
    for train, test in KFold(y.size, nfold): # split data into train/test groups, n times
        clf.fit(X[train], y[train]) # fit
        result += scorefunc(clf, X[test], y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [24]:
def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    Fake = y == 0
    Real = ~Fake
    return prob[Fake, 0].sum() + prob[Real, 1].sum()

In [25]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(range(tweets.shape[0]), train_size=0.7)
mask=np.ones(tweets.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

In [26]:
#the grid of parameters to search over
alphas = [0, .1, 1, 5, 10, 50]
min_dfs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

#Find the best value for alpha and min_df, and the best classifier
best_alpha = None
best_min_df = None
maxscore=-np.inf
for alpha in alphas:
    for min_df in min_dfs:         
        vectorizer = CountVectorizer(min_df = min_df)       
        Xthis, ythis = make_xy(tweets, vectorizer)
        Xtrainthis=Xthis[mask]
        ytrainthis=ythis[mask]
        #your code here
        clf = MultinomialNB(alpha=alpha)
        cvscore = cv_score(clf, Xtrainthis, ytrainthis, log_likelihood)

        if cvscore > maxscore:
            maxscore = cvscore
            best_alpha, best_min_df = alpha, min_df
            
print("alpha: %f" % best_alpha)
print("min_df: %f" % best_min_df)

  'setting alpha = %.1e' % _ALPHA_MIN)


alpha: 1.000000
min_df: 0.001000
