# Fake and Real News Prediction

#### Perform Natural Language Processing 

#### Import Relevant Libraries / Dependancies

In [192]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import autoreload
import missingno as msno
import os
import sys
import csv
import re
import string

from collections import Counter, defaultdict

#from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)

sb.set_style("darkgrid")
sb.set()

%matplotlib inline

In [193]:
df = pd.read_csv(r"D:\Open Classroom\Datasets\Indian Fake News\news_dataset.csv")
df.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [194]:
df.shape

(3729, 2)

In [195]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3729 entries, 0 to 3728
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3729 non-null   object
 1   text    3721 non-null   object
dtypes: object(2)
memory usage: 58.4+ KB


In [196]:
df.isnull().sum()

label    0
text     8
dtype: int64

In [197]:
df.dropna(axis = 'index', how = 'any', inplace = True)
df.shape

(3721, 2)

In [198]:
df['get_label'] = df['label'].replace({'REAL':0, 'FAKE':1})
df.head()

Unnamed: 0,label,text,get_label
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...,0
1,FAKE,A four-minute-long video of a woman criticisin...,1
2,FAKE,"Republic Poll, a fake Twitter account imitatin...",1
3,REAL,"Delhi teen finds place on UN green list, turns...",0
4,REAL,Delhi: A high-level meeting underway at reside...,0


In [199]:
df.drop(columns = ['label'], inplace = True)
df.sample(frac = 1).head(3)

Unnamed: 0,text,get_label
1893,Many things will have to work out to end the c...,0
1673,Don’t need Twitter to explain what @anuragkash...,0
2093,Guneet Monga comes out in support of Anurag Ka...,0


#### Exploratory Data Analysis

In [200]:
def word_drop(text):
    text = text.lower()
    text = re.sub("\[.*?\]", " ", text)
    text = re.sub("\\W", " ", text)
    text = re.sub("http?://\S+|www.\S+", " ", text)
    text = re.sub("<.*?>+", " ", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), " ", text)
    text = re.sub("\n", " ", text)
    text = re.sub("\w*\d\w*", " ", text)
    return text


In [201]:
df['text'] = df['text'].apply(word_drop)
df.sample(4)

Unnamed: 0,text,get_label
3080,read also be it winning hearts or winning the...,0
2546,now that i am shooting again i feel like a ki...,0
58,a video of a mock drill by chinese security of...,1
977,a screenshot of a tweet seemingly by congress...,1


#### Train the Dataset

In [218]:
x = df['text']
y = df['get_label']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)


#### Vectorization

In [203]:
vect = TfidfVectorizer()

x_train_vect = vect.fit_transform(x_train)
x_train_vect

<2976x31645 sparse matrix of type '<class 'numpy.float64'>'
	with 584761 stored elements in Compressed Sparse Row format>

In [204]:
x_test_vect = vect.fit_transform(x_test.astype('U'))
x_test_vect

<745x16112 sparse matrix of type '<class 'numpy.float64'>'
	with 149172 stored elements in Compressed Sparse Row format>

#### Build and Evalute NLP Model

##### 1. Decision Tree Classifier Model

##### 2. Logistic Regression

In [211]:
LR = LogisticRegression()

LR.fit(x_train_vect, y_train)

LR_pred = LR.predict(x_train_vect)
%time print(LR_pred)

print('Accuracy Score: ', accuracy_score(LR_pred, y_train))
print('Classification Report: ', classification_report(LR_pred, y_train))

[1 0 0 ... 0 0 0]
Wall time: 994 µs
Accuracy Score:  0.9993279569892473
Classification Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1488
           1       1.00      1.00      1.00      1488

    accuracy                           1.00      2976
   macro avg       1.00      1.00      1.00      2976
weighted avg       1.00      1.00      1.00      2976



##### 3. Support Vector Machine Classifier

In [212]:
svc_model = SVC()

svc_model.fit(x_train_vect, y_train)

svc_pred = svc_model.predict(x_train_vect)
%time print(svc_pred)

%time print('Accuracy Score: ', round(accuracy_score(svc_pred, y_train)*100))
print('Classification Report: ', classification_report(svc_pred, y_train))

[1 0 0 ... 0 0 0]
Wall time: 0 ns
Accuracy Score:  100.0
Wall time: 5 ms
Classification Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1488
           1       1.00      1.00      1.00      1488

    accuracy                           1.00      2976
   macro avg       1.00      1.00      1.00      2976
weighted avg       1.00      1.00      1.00      2976



##### 4. Random Forest Classifier

In [213]:
rfc = RandomForestClassifier()

rfc.fit(x_train_vect, y_train)

rfc_pred = rfc.predict(x_train_vect)
%time print(rfc_pred)

%time print('Accuracy Score: ', round(accuracy_score(rfc_pred, y_train)*100))
print('Classification Report: ', classification_report(rfc_pred, y_train))

[1 0 0 ... 0 0 0]
Wall time: 1e+03 µs
Accuracy Score:  100.0
Wall time: 2 ms
Classification Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1488
           1       1.00      1.00      1.00      1488

    accuracy                           1.00      2976
   macro avg       1.00      1.00      1.00      2976
weighted avg       1.00      1.00      1.00      2976



#### Manual Testing

In [214]:
df.sample(frac = 1).head(3)

Unnamed: 0,text,get_label
2126,an image of a girl photographed while tending ...,1
3379,times now s twitter handle on friday fell for ...,1
2708,a black and white photo of two women sitting t...,1


In [217]:
def output_label(get_label):
    if get_label == 0:
        return "REAL"
    elif get_label == 1:
        return "FAKE"

def manual_testing(bulletin):
    testing_news = {"text": [bulletin]}
    new_df_test = pd.DataFrame(testing_news)
    new_df_test["text"] = new_df_test["text"].apply(word_drop)
    new_x_test = new_df_test["text"]
    new_xv_test = vect.transform(new_x_test)
    rfc_mt = rfc.predict(new_xv_test)
    svc_mt = svc_model.predict(new_xv_test)
    lr_mt = LR.predict(new_xv_test)
    
    
    #print("\n\n\n", new_df_test.loc[0, "Headlines"])
    return (print(f'\n\n\nFake News Detection Application \nPredictive Model Outcomes\n\nLogistic Regression: {output_label(lr_mt)} \nRandom Forest Classifier: {output_label(rfc_mt)} \nSupport Vector Classifier: {output_label(svc_mt)}'))
        

In [216]:
headline = str(input()).title()
manual_testing(headline)

 Pope Francis used his annual Christmas Day message to rebuke Donald Trump without even mentioning his name. The Pope delivered his message just days after members of the United Nations condemned Trump s move to recognize Jerusalem as the capital of Israel. The Pontiff prayed on Monday for the  peaceful coexistence of two states within mutually agreed and internationally recognized borders. We see Jesus in the children of the Middle East who continue to suffer because of growing tensions between Israelis and Palestinians,  Francis said.  On this festive day, let us ask the Lord for peace for Jerusalem and for all the Holy Land. Let us pray that the will to resume dialogue may prevail between the parties and that a negotiated solution can finally be reached. The Pope went on to plead for acceptance of refugees who have been forced from their homes, and that is an issue Trump continues to fight against. Francis used Jesus for which there was  no place in the inn  as an analogy. Today, as

ValueError: X has 6 features, but DecisionTreeClassifier is expecting 31645 features as input.

In [None]:
df['text'].loc[152]