# 1.) Project's each line of code contains an explanation.

# 2.) The explanation for each line of the code will ease the task of working on the project and understanding it.

Author: Abhyas Mall (Research Intern - CAIR - DRDO)

Data set features->

1.)id: Unique id for a news article.

2.)title: Title of news article.

3.)author: Author of the news article.

4.)text: Text of the article; might be incomplete.

5.)label: A label that marks whether the news article is fake or genuine: 1 for a fake article and 0 for a genuine article.

Training and testing dataset: https://www.kaggle.com/c/fake-news/data?select=train.csv

# Importing the Dependencies

In [38]:
import numpy as np
#numpy is very useful for making numpy-arrays
import pandas as pd
#helpful in creating data frame and storing data into it
import re #importing regular expression-useful for searching text in a paragraph.

#USING OF THE NATURAL LANGUAGE TOOLKIT
#STEPS INVOLVING IN NATURAL LANGUAGE PROCESSING-

#STEP-1: Tokenizing is not required-
#tokenization will be done using ".split()" during the stemming process

#STEP2: for stopwords elimination we import stopwords
from nltk.corpus import stopwords 
#nltk helps with analyzing and pulling apart text, removing stopwords, tagging things such as parts of speech and named entities, etc.
#stopwords are of no use to the computing system, eg: a, the, etc.
#using of stopwords will save the processing time.
#corpus- is basically the important content of the text
#nltk- natural language toolkit

#STEP3: For stemming:
from nltk.stem.porter import PorterStemmer 
#stemming takes a word, strips its suffix (word ending), and returns the root word of it.
#NLTK- is a toolkit 

#STEP4: To identify significant/important words in a document
from sklearn.feature_extraction.text import TfidfVectorizer
#they are imported to convert text into feature vectors.
#TfidfVectorizer's main role is to find the most important/significant words from the document
#tf-term(words) frequency
#idf- inverse document frequency
#TF(t)=(freq. of the term t in a doc)/(total no. of words in that doc)
#IDF(t)=log((total no. of documents)/(no. of documents with term t in it))
#Tfidf=tf*idf
#if Tfidf is higher it means the word is more significant.

#STEP5: To split the data-set.
from sklearn.model_selection import train_test_split
#they are imported to split data-set into training and testing data

#STEP6:To import the ML model
from sklearn.linear_model import LogisticRegression
#they are imported to use the Logistic Regression model

#STEP 7:For evaluation of the model.
from sklearn.metrics import accuracy_score
#to evaluate the performance of the model
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

In [40]:
#first we need to download the stopwords corpus from the nltk library,
#otherwise stopwords.words('english') has nothing to load
import nltk
nltk.download('stopwords')
#the call reports when the package is already up-to-date, so re-running this cell is harmless

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mallabhyas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
#print the English stopword list to check whether it is sufficient
#or whether we need to add more words to it
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data Pre-processing:

In [44]:
#loading the dataset into a pandas dataframe
#'train.csv' is a relative path, so the file must be in the current working directory
news_dataset=pd.read_csv('train.csv')

In [46]:
news_dataset.head()
#shows the first rows of the dataframe to get an overview of its columns
#label 1 signifies fake news
#label 0 signifies genuine news

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [48]:
news_dataset.shape
#(no. of rows, no. of columns), i.e. the no. of articles and the no. of columns stored for each

(20800, 5)

In [50]:
#count the missing values in the dataset
news_dataset.isnull().sum()
#this counts the number of missing values in EACH COLUMN.
#while the dataset was prepared, the author/title may not have been available for some articles,
#but label has no null value, therefore that is not a problem.
#the missing values are few in number compared to the size of the data set.
#imputation (replacing the missing values with appropriate estimated values)
#would be used if many more values were missing; it isn't required in this case.

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [52]:
#replacing the null values with an empty string
#fillna('') is applied to the whole dataframe, so a null in any column becomes ''
news_dataset=news_dataset.fillna('')

In [54]:
#re-check after fillna: every column should now report 0 missing values
news_dataset.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [56]:
#the prediction is based on the author and the title only, so the two
#columns are joined into one new column called content.
#the text column is left out: the articles are mostly long paragraphs that
#would need a lot of computation and are not expected to help the model much.

#if the accuracy turns out to be unsatisfactory, text can be added later.

#content = author, one blank space, title
news_dataset['content'] = news_dataset['author'].str.cat(news_dataset['title'], sep=' ')

In [58]:
print(news_dataset['content'])
#this column (author followed by title) will be used to make predictions.

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


# Stemming:
Stemming is the NLP process in which we reduce each word to its root word (stem).
eg: acting, acted, acts --> root word is "act"
so basically the suffixes (word endings) will be removed; the Porter stemmer used here does not strip prefixes.

We gotta reduce words for performance enhancement->
after stemming we'll perform #vectorizing. 
#in vectorizing we will convert the words into feature vectors-or, numeric data.

text data->numeric data->model

In [60]:
port_stem=PorterStemmer()
#a single PorterStemmer object is stored in port_stem and reused for every word;
#the stemming itself is done by calling port_stem.stem(word)

In [62]:
#we are creating a function stemming
_ENGLISH_STOPWORDS=None
#cache for the English stopword set; it is filled by the first call to stemming()


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them from nltk only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        #stopwords.words('english') gives a list; a set makes every "is this a stopword?"
        #check a constant-time lookup instead of a scan through the whole list
        _ENGLISH_STOPWORDS=frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(content):
    """Clean one piece of text and return it as a string of stemmed words.

    content: the raw text (a string), e.g. one entry of the content column.
    returns: one string holding the stemmed, lower-case, non-stopword words
             separated by single blank spaces ('' if no word is left).
    """
    stop_words=_english_stopwords()
    #previously stopwords.words('english') was called again for every single word;
    #now the list is fetched once and reused for all words and all rows

    letters_only=re.sub(r'[^a-zA-Z]',' ',content)
    #'^' inside [] means exclusion: everything that is NOT a-z or A-Z
    #(numbers, commas, full stops, quotation marks etc) is replaced by a blank space

    words=letters_only.lower().split()
    #lower case, because the model would treat upper case and lower case
    #spellings of the same word as different words;
    #split() is the tokenizing step - the words are stored in a list

    stemmed_words=[port_stem.stem(word) for word in words if word not in stop_words]
    #every word except the stopwords is reduced to its root word by port_stem

    return ' '.join(stemmed_words)
    #the root words are joined back into one string

In [64]:
#checking out the stemming function on a single row (index label 1) of the content column
stemming(news_dataset.loc[1,'content'])

'daniel j flynn flynn hillari clinton big woman campu breitbart'

In [None]:
news_dataset['content']=news_dataset['content'].apply(stemming)
#stemming() is called on each row of the content column
#and the cleaned text overwrites the original content

In [None]:
print(news_dataset['content'])
#after stemming the column should contain:
#no upper case letters
#no stopwords
#only root words

In [None]:
#features and target are taken out of the dataframe as arrays:
#X holds the stemmed content, Y holds the matching labels
X, Y = news_dataset['content'].values, news_dataset['label'].values

In [None]:
#the content strings that will be turned into feature vectors
print(X)

In [None]:
#the labels taken from the label column (1 = fake, 0 = genuine)
print(Y)

In [None]:
#no. of labels; it has to match the no. of rows in X
Y.shape

In [None]:
#no. of content strings; it has to match the no. of labels in Y
X.shape

# Converting the textual data into numeric data (feature vector)

In [None]:
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(X)
#fit_transform learns the vocabulary and the idf weights from X and converts X
#into the tf-idf feature matrix in one step (same result as fit followed by transform)

#NOTE(review): the vectorizer is fitted on the whole dataset BEFORE the train/test split,
#so the vocabulary and idf statistics of the test articles are already known at fit time.
#for a stricter evaluation, split first, fit on the training part and only transform the test part.

#tf-term frequency
#counts the no. of times a word is repeated in a doc/paragraph
#and assigns a numeric value; the greater the value, the more significant the word

#idf-inverse document frequency
#sometimes a word is repeated many times without having much significance
#eg: building a system that predicts if a review is positive or negative
#eg: we are analysing all reviews for the movie avengers
#so the word avengers will obviously be repeated again and again
#and it does not have much significance
#so the idf basically reduces its numeric/importance value

#Finally the feature vector is created

In [None]:
#X now holds the numeric tf-idf features instead of the text
print(X)

# SPLITTING DATASET TO TRAINING AND TEST DATA

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
#test_size=0.2: 20% of the data is held back for testing, the other 80% is used for training
#stratify=Y: the genuine and the fake articles keep the same proportion
#in both parts as they have in the original dataset
#random_state=2: fixes the shuffling so that the same train/test split is reproduced on every run;
#without random_state each run might give a different set of training and testing data.
#random_state can be any integer value

# Training the Logistic Regression Model

In [None]:
#logistic regression classifier created with scikit-learn's default settings (no arguments are passed)
model = LogisticRegression()

In [None]:
model.fit(X_train, Y_train)
#the model is trained on the training features and their labels.
#nothing is plotted here: logistic regression uses the sigmoid function internally
#to turn a weighted sum of the features into a probability for the label

# Evaluation 

In [None]:
#Accuracy score on the training data
X_train_prediction = model.predict(X_train)
#the model predicts a label for every row of X_train; the predicted labels are stored in the above variable.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
#accuracy_score takes the original (true) labels first and the predicted labels second,
#the same order that is used for classification_report.
#the two are compared and the fraction of matching labels is the accuracy score

In [None]:
#accuracy on the data the model was trained on
print(f"TRAINING DATA ACCURACY SCORE {training_data_accuracy}")

In [None]:
#classification report on the training data: true labels first, predicted labels second
print(classification_report(Y_train,X_train_prediction))

In [None]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
#the model predicts a label for every row of X_test, data that was not used in model.fit
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
#true labels first, predicted labels second - the same order that is used for classification_report

In [None]:
#accuracy on the held-back test data
print(f'TEST DATA ACCURACY {test_data_accuracy}')

In [None]:
#classification report on the test data: true labels first, predicted labels second
print(classification_report(Y_test,X_test_prediction))

# Making a predictive system

In [None]:
#one row of the test features (index 1) plays the role of a new, unseen article
X_new = X_test[1]

prediction = model.predict(X_new)
#predicted label 0 is reported as real news, any other label as fake news
print("The news is Real" if prediction[0] == 0 else "The news is Fake")