Dataset: https://www.kaggle.com/hassanamin/textdb3Dataset: 

In [1]:
import pandas as pd


In [2]:
df=pd.read_csv('data/fake_or_real_news.csv')

In [4]:
df.head(10)
df.columns=['id','title','test','label']

In [5]:
df.head(10)

Unnamed: 0,id,title,test,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [6]:
# Get the Independent Features

X=df.drop('label',axis=1)

In [7]:
X.head()


Unnamed: 0,id,title,test
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello..."
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T..."
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...


In [8]:
# Get the Dependent features
y=df['label']

In [9]:
y.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [10]:
df.shape

(6335, 4)

In [11]:
y=pd.get_dummies(y,drop_first=True)

In [12]:
y.head()

Unnamed: 0,REAL
0,0
1,0
2,1
3,0
4,1


In [13]:
# finding if there is some null value or not
df.isnull().sum()

id       0
title    0
test     0
label    0
dtype: int64

In [14]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [15]:
news=df.copy()

In [16]:
news.head()

Unnamed: 0,id,title,test,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [17]:
# Just taking a look
news['title'][10]

'With all three Clintons in Iowa, a glimpse at the fire that has eluded Hillary Clinton’s campaign'

In [18]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(news)):
    review = re.sub('[^a-zA-Z]', ' ', news['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [19]:
# have a look how our corpus looks like after cleaning our data
news['title'][10],corpus[10]

('With all three Clintons in Iowa, a glimpse at the fire that has eluded Hillary Clinton’s campaign',
 'three clinton iowa glimps fire elud hillari clinton campaign')

In [20]:
#Applying Countvectorizer
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=4500,ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()

In [21]:
X.shape

(6335, 4500)

In [22]:
# Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
cv.get_feature_names()[:20]

['abandon',
 'abc',
 'abc news',
 'abdullah',
 'abedin',
 'abedin weiner',
 'abedin weiner laptop',
 'abil',
 'aboard',
 'abolish',
 'abort',
 'abort answer',
 'absolut',
 'abstain',
 'abstain un',
 'abstain un vote',
 'absurd',
 'abus',
 'abus power',
 'accept']

In [24]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 4500,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [25]:
count_df = pd.DataFrame(X_train, columns=cv.get_feature_names())

In [26]:
count_df.head()


Unnamed: 0,abandon,abc,abc news,abdullah,abedin,abedin weiner,abedin weiner laptop,abil,aboard,abolish,...,young women,youth,youtub,zakharova,zero,zika,zika scaremong,zika viru,zionist,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Building Model

In [27]:
from sklearn import metrics
import numpy as np
# MultinomialNB Algorithm¶
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [28]:
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)
cm

  y = column_or_1d(y, warn=True)


accuracy:   0.819


array([[507, 121],
       [108, 531]])

In [29]:
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
score

  y = column_or_1d(y, warn=True)


0.819258089976322

As the Accuracy is not impressive , we are now going to do some Hyperparameter Optimisations

In [30]:
classifier=MultinomialNB(alpha=0.1)

In [31]:

previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))

  y = column_or_1d(y, warn=True)
  'setting alpha = %.1e' % _ALPHA_MIN)
  y = column_or_1d(y, warn=True)


Alpha: 0.0, Score : 0.7995264404104183
Alpha: 0.1, Score : 0.8145224940805051


  y = column_or_1d(y, warn=True)


Alpha: 0.2, Score : 0.8161010260457774


  y = column_or_1d(y, warn=True)


Alpha: 0.30000000000000004, Score : 0.8145224940805051


  y = column_or_1d(y, warn=True)


Alpha: 0.4, Score : 0.8168902920284136


  y = column_or_1d(y, warn=True)


Alpha: 0.5, Score : 0.8145224940805051


  y = column_or_1d(y, warn=True)


Alpha: 0.6000000000000001, Score : 0.8161010260457774


  y = column_or_1d(y, warn=True)


Alpha: 0.7000000000000001, Score : 0.8153117600631413


  y = column_or_1d(y, warn=True)


Alpha: 0.8, Score : 0.8176795580110497


  y = column_or_1d(y, warn=True)


Alpha: 0.9, Score : 0.8200473559589582


In [32]:
## Get Features names
feature_names = cv.get_feature_names()

In [33]:
classifier.coef_[0]

array([ -8.70258695, -10.16892402, -10.16892402, ..., -10.16892402,
       -10.16892402,  -9.42170961])

In [34]:
# Some Frequent Real News
sorted(zip(classifier.coef_[0], feature_names), reverse=True)[:20]

[(-3.7833549448766224, 'trump'),
 (-4.301826632466006, 'clinton'),
 (-4.562713536424641, 'obama'),
 (-4.796220938134157, 'gop'),
 (-5.0272604595379295, 'hillari'),
 (-5.040341090781036, 'donald'),
 (-5.053595095274211, 'donald trump'),
 (-5.094444886489441, 'debat'),
 (-5.181519440659494, 'republican'),
 (-5.293726692839438, 'new'),
 (-5.391669682351764, 'say'),
 (-5.391669682351764, 'hillari clinton'),
 (-5.50025751849337, 'hous'),
 (-5.510739794766881, 'sander'),
 (-5.53203985456297, 'state'),
 (-5.55380349919933, 'democrat'),
 (-5.564865558905188, 'cruz'),
 (-5.658064509523739, 'campaign'),
 (-5.682787647610534, 'iran'),
 (-5.788287235655762, 'win')]

In [35]:
# Some Frequent Fake News
sorted(zip(classifier.coef_[0], feature_names))[:50]

[(-10.16892401604059, 'abc'),
 (-10.16892401604059, 'abc news'),
 (-10.16892401604059, 'abedin weiner'),
 (-10.16892401604059, 'abedin weiner laptop'),
 (-10.16892401604059, 'abil'),
 (-10.16892401604059, 'aboard'),
 (-10.16892401604059, 'abstain'),
 (-10.16892401604059, 'abstain un'),
 (-10.16892401604059, 'abstain un vote'),
 (-10.16892401604059, 'access pipelin'),
 (-10.16892401604059, 'accid'),
 (-10.16892401604059, 'achiev'),
 (-10.16892401604059, 'acquit'),
 (-10.16892401604059, 'acquit malheur'),
 (-10.16892401604059, 'acquit malheur wildlif'),
 (-10.16892401604059, 'acquitt'),
 (-10.16892401604059, 'across countri'),
 (-10.16892401604059, 'addict'),
 (-10.16892401604059, 'adhd'),
 (-10.16892401604059, 'admit zika'),
 (-10.16892401604059, 'advanc mosul'),
 (-10.16892401604059, 'advert'),
 (-10.16892401604059, 'advertis'),
 (-10.16892401604059, 'ag lynch'),
 (-10.16892401604059, 'ag lynch told'),
 (-10.16892401604059, 'agent'),
 (-10.16892401604059, 'airstrik yemen'),
 (-10.16892