In [1]:
import pandas as pd


In [2]:
df=pd.read_csv('data/fake_or_real_news.csv')

In [3]:
df.head(10)
df.columns=['id','title','test','label']

In [4]:
df.head(10)

Unnamed: 0,id,title,test,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [5]:
# Get the Independent Features

X=df.drop('label',axis=1)

In [6]:
X.head()


Unnamed: 0,id,title,test
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello..."
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T..."
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...


In [7]:
# Get the Dependent features
y=df['label']

In [8]:
y.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [10]:
df.shape

(6335, 4)

In [11]:
y=pd.get_dummies(y,drop_first=True)

In [12]:
y.head()

Unnamed: 0,REAL
0,0
1,0
2,1
3,0
4,1


In [13]:
# finding if there is some null value or not
df.isnull().sum()

id       0
title    0
test     0
label    0
dtype: int64

In [14]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [16]:
news=df.copy()
news.head()

Unnamed: 0,id,title,test,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [17]:
# Just taking a look
news['title'][10]

'With all three Clintons in Iowa, a glimpse at the fire that has eluded Hillary Clinton’s campaign'

In [18]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()
corpus = []
for i in range(0, len(news)):
    review = re.sub('[^a-zA-Z]', ' ', news['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [19]:
# have a look how our corpus looks like after cleaning our data
news['title'][10],corpus[10]

('With all three Clintons in Iowa, a glimpse at the fire that has eluded Hillary Clinton’s campaign',
 'three clinton iowa glimpse fire eluded hillary clinton campaign')

In [20]:
#Applying Tf-Idf
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features=4500,ngram_range=(1,3))
X = tf.fit_transform(corpus).toarray()

In [21]:
X.shape

(6335, 4500)

In [22]:
# Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
tf.get_feature_names()[:20]

['abandon',
 'abc',
 'abdullah',
 'abedin',
 'abedin weiner',
 'abortion',
 'absolutely',
 'abstains',
 'abstains un',
 'abstains un vote',
 'absurd',
 'abuse',
 'accept',
 'accept election',
 'accept election result',
 'accepts',
 'access',
 'access pipeline',
 'accident',
 'accidentally']

In [26]:
tf.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 4500,
 'min_df': 1,
 'ngram_range': (1, 3),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [27]:
count_df = pd.DataFrame(X_train, columns=tf.get_feature_names())

In [28]:
count_df.head()


Unnamed: 0,abandon,abc,abdullah,abedin,abedin weiner,abortion,absolutely,abstains,abstains un,abstains un vote,...,young,young woman,youth,youtube,zero,zika,zika scaremongering,zika virus,zionist,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Building Model

In [29]:
from sklearn import metrics
import numpy as np
# MultinomialNB Algorithm¶
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [30]:
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)
cm

accuracy:   0.804


  y = column_or_1d(y, warn=True)


array([[495, 133],
       [115, 524]])

In [31]:
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
score

  y = column_or_1d(y, warn=True)


0.8042620363062352

In [32]:
classifier=MultinomialNB(alpha=0.1)

In [33]:

previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))

  y = column_or_1d(y, warn=True)
  'setting alpha = %.1e' % _ALPHA_MIN)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Alpha: 0.0, Score : 0.7900552486187845
Alpha: 0.1, Score : 0.8066298342541437
Alpha: 0.2, Score : 0.8066298342541437


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Alpha: 0.30000000000000004, Score : 0.8089976322020521
Alpha: 0.4, Score : 0.8145224940805051
Alpha: 0.5, Score : 0.813733228097869
Alpha: 0.6000000000000001, Score : 0.8082083662194159
Alpha: 0.7000000000000001, Score : 0.8074191002367798
Alpha: 0.8, Score : 0.8074191002367798
Alpha: 0.9, Score : 0.8050513022888713


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [35]:
## Get Features names
feature_names = tf.get_feature_names()

In [36]:
classifier.coef_[0]

array([-8.92565723, -9.3729333 , -8.71535109, ..., -9.3729333 ,
       -9.3729333 , -8.96392092])

In [37]:
# Some Frequent Real News
sorted(zip(classifier.coef_[0], feature_names), reverse=True)[:20]

[(-4.757005797857291, 'trump'),
 (-5.160715122579867, 'clinton'),
 (-5.233326961488936, 'obama'),
 (-5.414523355034957, 'gop'),
 (-5.605557794345454, 'debate'),
 (-5.612438912945993, 'donald'),
 (-5.6178192990218765, 'donald trump'),
 (-5.669917851007034, 'republican'),
 (-5.750571890793384, 'hillary'),
 (-5.891734598371253, 'new'),
 (-5.921382759652154, 'sander'),
 (-5.951274835518564, 'hillary clinton'),
 (-5.991477704709746, 'house'),
 (-6.017332623508087, 'cruz'),
 (-6.039075483475198, 'say'),
 (-6.073752228479853, 'state'),
 (-6.085657327566709, 'campaign'),
 (-6.111178620189949, 'iran'),
 (-6.252169249601238, 'deal'),
 (-6.275355622093241, 'bush')]

In [38]:
# Some Frequent Fake News
sorted(zip(classifier.coef_[0], feature_names))[:50]

[(-9.372933303035607, 'abc'),
 (-9.372933303035607, 'abedin weiner'),
 (-9.372933303035607, 'abstains'),
 (-9.372933303035607, 'abstains un'),
 (-9.372933303035607, 'abstains un vote'),
 (-9.372933303035607, 'access pipeline'),
 (-9.372933303035607, 'accident'),
 (-9.372933303035607, 'accidentally'),
 (-9.372933303035607, 'achievement'),
 (-9.372933303035607, 'acquitted'),
 (-9.372933303035607, 'across country'),
 (-9.372933303035607, 'active'),
 (-9.372933303035607, 'adhd'),
 (-9.372933303035607, 'admits zika'),
 (-9.372933303035607, 'admitted'),
 (-9.372933303035607, 'advance mosul'),
 (-9.372933303035607, 'advert'),
 (-9.372933303035607, 'ag lynch'),
 (-9.372933303035607, 'ag lynch told'),
 (-9.372933303035607, 'agent'),
 (-9.372933303035607, 'aggressive'),
 (-9.372933303035607, 'airstrikes yemen'),
 (-9.372933303035607, 'akbar'),
 (-9.372933303035607, 'al nusra'),
 (-9.372933303035607, 'aleppo'),
 (-9.372933303035607, 'alien'),
 (-9.372933303035607, 'alien muslim'),
 (-9.3729333030

#### Passive Aggressive Classifier Algorithm¶


In [43]:
import numpy as np

from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0)


In [46]:
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)
cm

  y = column_or_1d(y, warn=True)


accuracy:   0.787


array([[500, 128],
       [142, 497]])

#### Using Hashing Vectorizer

In [49]:
hv=HashingVectorizer(n_features=4500)
X=hv.fit_transform(corpus).toarray()

In [57]:
X.shape


(6335, 4500)

In [58]:

## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
classifier=PassiveAggressiveClassifier()
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)
cm

  y = column_or_1d(y, warn=True)


accuracy:   0.750


array([[457, 171],
       [146, 493]])

IN THIS DATASET WE WERE ABLE TO ATTAIN ACCURACY USING THE NAIVEBAYES CLASSIFIER WHICH GAVE US 82%. NOT THE PASSIVE AGGRESSIVE CLASSIFIER