In [1]:
import itertools
import pandas as pd
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Load the merged news dataset (path is relative to the working directory)
DATA_PATH = 'merged_data.csv'
df = pd.read_csv(DATA_PATH, encoding='utf-8')

# Show (rows, columns) of the loaded frame
df.shape

(22273, 4)

In [3]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,ID,headline,text,label
0,0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,0
1,1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",0
2,2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,0
3,3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,0
4,4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",0


In [4]:
# Change the labels: 1 -> 'FAKE', 0 -> 'REAL'.
# `replace` builds a new column of strings instead of writing strings into the
# existing numeric column through `.loc` (an incompatible-dtype assignment that
# newer pandas warns about and later versions reject). Values other than 0/1
# are left untouched, so re-running this cell is a no-op.
df['label'] = df['label'].replace({1: 'FAKE', 0: 'REAL'})

In [5]:
# Preview the first five rows again to check the label column
df.head(n=5)

Unnamed: 0,ID,headline,text,label
0,0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,REAL
1,1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",REAL
2,2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,REAL
3,3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,REAL
4,4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",REAL


In [6]:
# Pull the target column out as its own Series and preview it
labels = df['label']
labels.head()

0    REAL
1    REAL
2    REAL
3    REAL
4    REAL
Name: label, dtype: object

In [7]:
# Split the dataset (20% held out for testing, fixed seed for a repeatable split).
# The texts are gathered into an object array of Python strings rather than via
# `.values.astype('str')`: that call produces a fixed-width unicode array in
# which every row is padded to the longest article, so memory grows as
# n_docs * longest_doc. `str()` applies the same coercion as before
# (e.g. a missing text still becomes the literal 'nan').
texts = np.array([str(doc) for doc in df['text']], dtype=object)
x_train, x_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=7
)

In [8]:
# TF-IDF vectorizer: drop English stop words and ignore terms that
# appear in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

In [9]:
# Fit the vectorizer on the training texts only, then reuse the
# fitted vectorizer to encode the test texts
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [10]:
# Initialize the PassiveAggressiveClassifier and fit it on the training set.
# The classifier shuffles the training data between epochs by default, so a
# fixed random_state is needed for the fitted model (and the accuracy and
# confusion matrix computed from it) to be repeatable across runs.
pa_classifier = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pa_classifier.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [11]:
# Classify the held-out documents and report the share predicted correctly
y_pred = pa_classifier.predict(tfidf_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 96.16%


In [12]:
# Confusion matrix with the class order pinned to FAKE, REAL
# (rows are the true labels, columns the predicted ones)
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['FAKE', 'REAL'])

array([[1905,   91],
       [  80, 2379]])

## References

* Code

  https://medium.com/swlh/detecting-fake-news-with-python-and-machine-learning-f78421d29a06

* Datasets

  https://www.kaggle.com/c/fake-news/data

  https://www.kaggle.com/jruvika/fake-news-detection