In [56]:
# Importing libreries
import pandas as pd 
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [57]:
pip install stop_words

Note: you may need to restart the kernel to use updated packages.


In [58]:
from stop_words import get_stop_words

In [59]:
# Reading data
df = pd.read_csv('news.csv')
df.head(4) 

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE


In [60]:
# Checking shape
df.shape

(6335, 4)

In [61]:
#Checking duplicates and shape afterwards
df.drop_duplicates(inplace=True)
df.shape

(6335, 4)

In [62]:
# Checking missing values
df.isnull().sum()
# no null values

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [63]:
# Counting values on target
df.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [64]:
### Preprocessing data
# cleaning data

In [65]:

stopwords = stopwords.words('english')

In [66]:
def clean_data(text):
    
    # converting text into lower case
    text = text.lower() 
    
    #taking only words from text without number nor special characters using Reg Ex 
    text = re.sub('[^a-zA-Z]' , ' ' , text)
    
    # split the data and make token
    token = text.split() 
    
    # Lematize the word and remove stopwords 
    news = [word for word in token if not word in stopwords]  
    
    # join all the token with space
    cleaned_text = ' '.join(news) 
    
    return cleaned_text


    

In [67]:
df['text'] = df.text.apply(lambda x : clean_data(x))

In [68]:
corpus = df.text.values
corpus[0]



In [69]:
labels = df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [70]:
x_train,x_test,y_train,y_test=train_test_split(df['text'].values.astype('str'), labels, test_size=0.2, random_state=7)


In [71]:
#Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

In [72]:
# Fit & transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [73]:
# Initialize the PassiveAggressiveClassifier and fit training sets
pa_classifier=PassiveAggressiveClassifier(max_iter=50)

pa_classifier.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(max_iter=50)

In [74]:
# Predict and calculate accuracy
y_pred=pa_classifier.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.98%


In [75]:
# Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[589,  49],
       [ 40, 589]], dtype=int64)

In [None]:
# From the confusion matrix we can make the following conclusions:

In [None]:
"""
Our model successfully predicted 589 positives
Our model successfully predicted 589 negatives.
Our model predicted 49 false positives
Our model predicted 40 false negatives
"""