# FAKE NEWS DETECTION USING NLP

## Importing libraries

In [112]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

## Loading dataset

In [113]:
# Read both halves of the corpus from the working directory: one CSV holding
# the fake articles and one holding the genuine ones.
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Preview the first rows of the fake-news frame.
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


## Understanding the dataset

In [114]:
# DataFrame.info() prints its report and returns None, so the two calls are
# made as separate statements. Wrapping them in a tuple made the cell echo a
# meaningless `(None, None)` after the reports.
true.info()
fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


(None, None)

In [115]:
# Per-column count of missing values: genuine articles first, then fake ones.
true.isna().sum(), fake.isna().sum()

(title      0
 text       0
 subject    0
 date       0
 dtype: int64,
 title      0
 text       0
 subject    0
 date       0
 dtype: int64)

In [116]:
# (rows, columns) of each frame: genuine articles first, then fake ones.
tuple(frame.shape for frame in (true, fake))

((21417, 4), (23481, 4))

### Adding the output (label) column, since this is a supervised problem

In [117]:
# Target column for the supervised task: genuine articles get 1, fake ones 0.
true['label'] = 1
fake['label'] = 0

# Only the last expression of a cell is echoed, so the shapes are printed
# explicitly; the bare `true.shape, fake.shape` expression that stood here was
# evaluated and silently discarded.
print(true.shape, fake.shape)
true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


### Merging the two datasets into a single dataset

In [118]:
# Stack the genuine and fake articles into one frame.
df = pd.concat([true, fake], axis=0)

# Shuffle the rows so the two classes are interleaved. The shuffle is seeded:
# without `random_state` the row order changes on every run, which in turn
# changes which rows the (seeded) train/test split selects and makes the
# reported metrics non-reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()


Unnamed: 0,title,text,subject,date,label
0,Hezbollah says Israel pushing region to war,BEIRUT (Reuters) - Lebanon s Hezbollah accused...,worldnews,"October 1, 2017",1
1,Thai hotels booked up ahead of funeral of reve...,BANGKOK (Reuters) - Hotels in Bangkok s bustli...,worldnews,"October 16, 2017",1
2,NRA’s Ted Nugent Unleashes Disgusting Attack ...,"NRA board member, draft-dodger, pedophile, and...",News,"September 19, 2016",0
3,New Iran sanctions 'in pipeline' before Trump ...,WASHINGTON (Reuters) - The Iran-related sancti...,politicsNews,"February 3, 2017",1
4,Calm Down! Democrats Are Going To Stomp Trump...,After Tuesday s semi-coronation of Donald Trum...,News,"May 4, 2016",0


In [119]:
# Keep only the article body and its target; the other columns are dropped.
df = df.loc[:, ["text", "label"]]
df.head()

Unnamed: 0,text,label
0,BEIRUT (Reuters) - Lebanon s Hezbollah accused...,1
1,BANGKOK (Reuters) - Hotels in Bangkok s bustli...,1
2,"NRA board member, draft-dodger, pedophile, and...",0
3,WASHINGTON (Reuters) - The Iran-related sancti...,1
4,After Tuesday s semi-coronation of Donald Trum...,0


In [120]:
def clean_text(text):
    """Normalise raw article text into lowercase, space-separated word tokens.

    The input is lowercased, every run of non-word characters (punctuation,
    whitespace) and digits is replaced by a single space, and the ends are
    trimmed. Underscores count as word characters for ``re`` and are kept.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None``/``NaN``) are treated as an empty
        string; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.

    NOTE(review): the genuine-article previews in this notebook start with a
    "CITY (Reuters) -" dateline, which survives cleaning as the token
    "reuters". That token may act as a label leak for the classifier; confirm
    and consider stripping the dateline before vectorising.
    """
    if not isinstance(text, str):
        # pandas represents missing text as None/NaN (a float), which has no
        # .lower(); map those to an empty document instead of crashing.
        text = "" if pd.isna(text) else str(text)

    # Whitespace is itself a non-word character, so a single character class
    # covers punctuation, whitespace and digits, and `+` collapses each run.
    return re.sub(r'[\W\d]+', ' ', text.lower()).strip()

In [121]:
# Run the cleaner over every article and store the result next to the raw text.
df['clean_text'] = df['text'].map(clean_text)
df.head()

Unnamed: 0,text,label,clean_text
0,BEIRUT (Reuters) - Lebanon s Hezbollah accused...,1,beirut reuters lebanon s hezbollah accused isr...
1,BANGKOK (Reuters) - Hotels in Bangkok s bustli...,1,bangkok reuters hotels in bangkok s bustling o...
2,"NRA board member, draft-dodger, pedophile, and...",0,nra board member draft dodger pedophile and al...
3,WASHINGTON (Reuters) - The Iran-related sancti...,1,washington reuters the iran related sanctions ...
4,After Tuesday s semi-coronation of Donald Trum...,0,after tuesday s semi coronation of donald trum...


In [122]:
# Features are the cleaned article text; the target is the 0/1 label.
X = df["clean_text"]
y = df["label"]

# Hold out 20% of the rows for evaluation; the seed fixes which rows are chosen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [123]:
# TF-IDF features: English stop words are removed, and so are terms that occur
# in more than 70% of the training documents.
tfidf = TfidfVectorizer(max_df=0.7, stop_words="english")

# The vocabulary and IDF weights are learned from the training split only and
# then reused unchanged to encode the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [124]:
# Logistic-regression classifier on the TF-IDF features, using the liblinear
# solver with class weights balanced against class frequency.
lr = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
    class_weight="balanced",
)
lr.fit(X_train_tfidf, y_train)

In [125]:
# Predicted 0/1 labels for the held-out test documents.
y_pred = lr.predict(X_test_tfidf)

In [126]:
# Fraction of held-out documents whose predicted label matches the true one.
print("accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

accuracy: 0.9880846325167038


In [127]:
# Rows are true labels, columns are predicted labels (label 0 first, then 1).
print("confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")

confusion matrix:
[[4629   61]
 [  46 4244]]


In [128]:
# Per-class precision, recall and F1 on the held-out test set.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4690
           1       0.99      0.99      0.99      4290

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [129]:
def predict_news(news_text):
    """Classify a single piece of news text as fake or real.

    The text is passed through ``clean_text``, encoded with the already-fitted
    ``tfidf`` vectoriser and scored by the trained ``lr`` model.

    Returns ``"Fake News"`` when the predicted label is 0, otherwise
    ``"Real News"``.
    """
    # The vectoriser expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([clean_text(news_text)])
    predicted_label = lr.predict(features)[0]

    if predicted_label == 0:
        return "Fake News"
    return "Real News"

In [136]:
# Smoke-test the classifier on a single short example; the surrounding blank
# lines in the literal are removed by clean_text's whitespace handling before
# vectorisation matters.
sample_news = """

Local authorities confirmed that a road maintenance project will begin next week to improve traffic safety in the area.

"""
print(predict_news(sample_news))

Real News
