## Import Libraries and Load Data

In [1]:
#Libraries
import numpy as np
import pandas as pd
import spacy


In [2]:
# Read the fake/real news dataset from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer="Fake_Real_Data.csv")

In [3]:
# Let's explore the data: start with the (rows, columns) shape of the frame
df.shape

(9900, 2)

In [4]:
# Peek at the first five rows
df.head(n=5)

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [5]:
# How many rows fall into each class?
df["label"].value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

The two classes are roughly balanced (5,000 Fake vs. 4,900 Real). Let's add a new column that encodes each label as a binary number (Fake → 0, Real → 1).

In [6]:
# Encode the string label as an integer column: Fake -> 0, Real -> 1
df["label_no"] = df["label"].map({"Fake": 0, "Real": 1})

# Preview the frame with the new column attached
df.head()

Unnamed: 0,Text,label,label_no
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [7]:
# Let's check out some random examples
import random

# Pick a random start position and show a window of 5 consecutive rows.
# random.randint is inclusive on both ends; max(..., 0) keeps the range
# valid when the frame has fewer than 5 rows (the slice then shows them all).
random_index = random.randint(0, max(len(df) - 5, 0))
for row in df[["Text", "label_no"]][random_index:random_index + 5].itertuples():
  # itertuples yields (index, Text, label_no); the index is not needed here
  _, text, label_no = row
  # label_no is 1 for 'Real' and 0 for 'Fake'
  print(f"Label: {label_no}", "(real news)" if label_no > 0 else "(not real news)")
  print(f"Text:\n {text} \n")
  print("-----\n")


Label: 0 (not real news
Text:
  Fox News FINALLY Tells Truth About Itself, Then Immediately Falls Back Into Old Pattern Of Lies Fox News, purveyor of lies branded as  news,  has finally decided to drop the act and tell the world who they really are. In a fairly surprising move, they re dropping their  fair and balanced  slogan. The slogan was Roger Ailes  from the time the network was launched, but it s come under increasing fire for sexual harassment and assault allegations. The slogan is too closely tied to the worst harasser there was there Ailes himself so the network s management decided to drop it.And just when you thought they might actually be doing something right, they twisted reality again to fit their own warped view of themselves. The new slogan will be,  Most Watched. Most Trusted. Sure most trusted by people who don t know what facts are. When Roger Ailes came up with  fair and balanced,  it was because he felt doing right-wing news branded as such was a recipe for failu

# Get spaCy Word Vectors

In [8]:
# Quick look at the frame again before splitting it into train and test sets
df.head()

Unnamed: 0,Text,label,label_no
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# The string labels ('Fake' / 'Real') are used as the target.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"], df["label"], test_size=0.2, random_state=2022
)

# Modelling

In [10]:
from sklearn.naive_bayes import MultinomialNB  # popular for NLP problems
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer


In [11]:
from sklearn.pipeline import Pipeline

# Multinomial Naive Bayes pipeline: token counts feed the classifier
pipe_mn = Pipeline(
    steps=[
        ("cv", CountVectorizer()),
        ("mn", MultinomialNB()),
    ]
)

# Fit both steps on the training split
pipe_mn.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()), ('mn', MultinomialNB())])

In [12]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print per-class precision / recall / F1
y_pred = pipe_mn.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

        Fake       0.98      0.97      0.97      1024
        Real       0.97      0.98      0.97       956

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980



In [13]:
#Make a prediction
ex1="Trumep and Obama went to isreal and iran"
pipe_mn.predict([ex1])

array(['Fake'], dtype='<U4')

In [17]:
# Predicted class probabilities for the same example
# (one column per class; compare with pipe_mn.classes_ for the column order)
pipe_mn.predict_proba([ex1])

array([[0.64257325, 0.35742675]])

In [18]:
# Class labels known to the fitted pipeline
pipe_mn.classes_

array(['Fake', 'Real'], dtype='<U4')

# Saving and Loading the Model

In [19]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) to disk.
# The context manager closes the file even if joblib.dump raises.
# NOTE: the file is pickle-based; only load it back from a trusted source.
with open("fake_news_classifier.pkl", "wb") as pipeline_file:
    joblib.dump(pipe_mn, pipeline_file)