# 📰 News Article Classification (Fake/Real)
A beginner-friendly machine learning project to classify news as Fake or Real using NLP techniques.

In [11]:
# Step 1: Import libraries
import pandas as pd
import numpy as np

import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

In [12]:
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Atharva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Atharva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
fake = pd.read_csv('Fake.xs')

In [14]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [15]:
geniune = pd.read_csv('True.csv')

In [16]:
geniune.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [17]:
fake['label'] = 1
geniune['label'] = 0

In [18]:
fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [19]:
geniune.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [20]:
fake.shape

(23481, 5)

In [21]:
geniune.shape

(21417, 5)

In [22]:
articles_data = pd.concat([geniune,fake], ignore_index=True)

In [23]:
articles_data.shape

(44898, 5)

In [24]:
articles_data.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [25]:
articles_data.tail()

Unnamed: 0,title,text,subject,date,label
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",1
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",1
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",1
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",1
44897,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",1


In [26]:
articles_data

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",1
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",1
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",1
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",1


In [27]:
articles_data.drop(columns=['text','subject','date'], inplace=True)

In [28]:
articles_data

Unnamed: 0,title,label
0,"As U.S. budget fight looms, Republicans flip t...",0
1,U.S. military to accept transgender recruits o...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,0
3,FBI Russia probe helped by Australian diplomat...,0
4,Trump wants Postal Service to charge 'much mor...,0
...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,1
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,1
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,1
44896,How to Blow $700 Million: Al Jazeera America F...,1


In [29]:
#1. Removing Stopwords
#2. Tokenize sentences into words
#3. Lemmatize

In [30]:
word_lem = WordNetLemmatizer()

In [31]:
def clean_data(text):
    words = []
    word_tokens = word_tokenize(text)
    for word in word_tokens:
        if word not in stopwords.words('english'):
            result = re.sub('[^a-zA-Z]', ' ' , word)
            result = result.lower()
            result = word_lem.lemmatize(result)
            words.append(result)
    return ' '.join(words)

In [32]:
clean_data('hello,123 Good Evening')

'hello     good evening'

In [33]:
articles_data['title'] = articles_data['title'].apply(clean_data)

In [34]:
input = articles_data['title']
output = articles_data['label']

In [37]:
x_train, x_test, y_train, y_test = train_test_split(input, output, test_size=0.2)

In [38]:
#TfidfVectorizer
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(35918,) (8980,) (35918,) (8980,)


In [39]:
x_train

8248      bridgegate   name remain secret   u s  appeal...
16134    catalan pro independence party pdecat say take...
5179     russia shrug session uproar   say daily contac...
20698    eu  s barnier worried uk  s post brexit plan i...
2059     rolling back taliban grip afghan territory key...
                               ...                        
14493    zimbabwe  s mugabe dig heel ruling party move ...
3714     ex cia chief brennan testify house intelligenc...
40620    poverty pimp al sharpton us the bible to say d...
6076     trump sign executive order manufacturing   inf...
29293    former attorney general   congress need to tak...
Name: title, Length: 35918, dtype: object

In [40]:
x_test

22646    read former fbi director james comey   full pr...
19536    brazil supreme court sends new temer graft cha...
11845    vietnam prosecute another former oil executive...
38331      unvetted   illegals turn germany into a thir...
32467    laura ingraham   gop senior senator   laughed ...
                               ...                        
25728    trump tried to fire the woman he worked so har...
25844    trump come unglued   continues miss universe f...
14385    south korea fear missile advance north year th...
21697    nba kowtow to racist   order player to stand f...
557       fcc chief plan ditch u s   net neutrality   rule
Name: title, Length: 8980, dtype: object

In [41]:
y_train

8248     0
16134    0
5179     0
20698    0
2059     0
        ..
14493    0
3714     0
40620    1
6076     0
29293    1
Name: label, Length: 35918, dtype: int64

In [42]:
y_test

22646    1
19536    0
11845    0
38331    1
32467    1
        ..
25728    1
25844    1
14385    0
21697    1
557      0
Name: label, Length: 8980, dtype: int64

In [43]:
vector = TfidfVectorizer()

In [44]:
x_train_vector = vector.fit_transform(x_train)
x_test_vector = vector.transform(x_test)

In [45]:
x_test_vector.shape

(8980, 17111)

In [46]:
x_train_vector.shape

(35918, 17111)

In [47]:
#model creation

In [48]:
model = PassiveAggressiveClassifier(max_iter=50, random_state=7)

In [49]:
#train model with training dataset

In [50]:
model.fit(x_train_vector, y_train)

In [51]:
#evaluate model with test dataset

In [52]:
pred = model.predict(x_test_vector)

In [53]:
pred

array([1, 0, 0, ..., 0, 1, 0])

In [54]:
from sklearn.metrics import accuracy_score

In [55]:
accuracy_score(pred, y_test)

0.9841870824053452

In [56]:
def predict(model_name, text):
    input_dataset = pd.DataFrame([text], columns=['title'])
    input_dataset['title'] = input_dataset['title'].apply(clean_data)
    text_vector = vector.transform(input_dataset['title'])
    result = model_name.predict(text_vector)
    if result[0] == 0 :
        return 'Its a Geniune Article'
    if result[0] == 1 :
        return 'Its a Fake Article'

In [57]:
predict(model, 'Trump says Russia probe will be fair, but timeline unclear: NYT')

'Its a Geniune Article'