<a href="https://colab.research.google.com/github/Yahnavi/FakeNewsDetectionSystem/blob/main/FakeNews_Detection_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd # data manipulation
import numpy as np #numerical python
import re
from sklearn.model_selection import train_test_split #to split data into train and test sets
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords #corpus=body nltk - natural language toolkit
from nltk.stem.porter import PorterStemmer # stemming -  removes prefix, suffix and return root word

In [None]:
# Download the NLTK stop-word corpus; stopwords.words('english') is used below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Show the English stop-word list that the stemming step filters out.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Data preprocessing: load the news CSV into a pandas DataFrame.
# NOTE(review): the path looks like a Colab Google Drive mount, but no cell
# shown here mounts Drive -- presumably done by hand before running; confirm.
news_dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/News.csv')

In [None]:
# Preview the first five rows to check which columns were loaded.
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,class
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [None]:
# (row count, column count) of the loaded frame.
news_dataset.shape

(44919, 6)

In [None]:
# Preview the last rows as well; head() only shows the start of the file.
news_dataset.tail()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,class
44914,21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44915,21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44916,21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44917,21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
44918,21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [None]:
# Count missing values in each column.
news_dataset.isnull().sum()

Unnamed: 0     0
title          0
text           0
subject       21
date          21
class          0
dtype: int64

In [None]:
# Replace every missing value, in every column, with an empty string.
news_dataset = news_dataset.fillna('')

In [None]:
# Re-count missing values to confirm the fill left none.
news_dataset.isnull().sum()

Unnamed: 0    0
title         0
text          0
subject       0
date          0
class         0
dtype: int64

In [None]:
# Build one text field per article: the title, a single space, then the body text.
news_dataset['content'] = news_dataset['title']+' '+news_dataset['text']
print(news_dataset['content'])

0         Donald Trump Sends Out Embarrassing New Year’...
1         Drunk Bragging Trump Staffer Started Russian ...
2         Sheriff David Clarke Becomes An Internet Joke...
3         Trump Is So Obsessed He Even Has Obama’s Name...
4         Pope Francis Just Called Out Donald Trump Dur...
                               ...                        
44914    'Fully committed' NATO backs new U.S. approach...
44915    LexisNexis withdrew two products from Chinese ...
44916    Minsk cultural hub becomes haven from authorit...
44917    Vatican upbeat on possibility of Pope Francis ...
44918    Indonesia to buy $1.14 billion worth of Russia...
Name: content, Length: 44919, dtype: object


In [None]:
# Features: every column except the label. `columns=` already names the axis,
# so no separate `axis` argument is needed.
X = news_dataset.drop(columns='class')
# Labels: the 'class' column on its own.
Y = news_dataset['class']

In [None]:
# Inspect the feature frame (pandas truncates the printout).
print(X)

       Unnamed: 0                                              title  \
0               0   Donald Trump Sends Out Embarrassing New Year’...   
1               1   Drunk Bragging Trump Staffer Started Russian ...   
2               2   Sheriff David Clarke Becomes An Internet Joke...   
3               3   Trump Is So Obsessed He Even Has Obama’s Name...   
4               4   Pope Francis Just Called Out Donald Trump Dur...   
...           ...                                                ...   
44914       21412  'Fully committed' NATO backs new U.S. approach...   
44915       21413  LexisNexis withdrew two products from Chinese ...   
44916       21414  Minsk cultural hub becomes haven from authorities   
44917       21415  Vatican upbeat on possibility of Pope Francis ...   
44918       21416  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text    subject  \
0      Donald Trump just couldn t wish all Americans ...       N

In [None]:
# Inspect the label series.
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
44914    1
44915    1
44916    1
44917    1
44918    1
Name: class, Length: 44919, dtype: int64


In [None]:
# Stemming reduces a word to a root form by stripping suffixes.
# Examples from this dataset (after lowercasing): "Sends" -> "send",
# "Embarrassing" -> "embarrass". Stems are not always dictionary words,
# e.g. "Becomes" -> "becom".
port_stem = PorterStemmer()


In [None]:
def stemming(content, stemmer=None, stop_words=None):
  """Normalize one document into a space-separated string of stemmed tokens.

  Steps: replace every character that is not an ASCII letter with a space,
  lowercase, split on whitespace, drop stop words, stem what remains and
  re-join the stems with single spaces.

  Parameters
  ----------
  content : str
      Raw text of one document.
  stemmer : object with a ``stem(word)`` method, optional
      Defaults to the notebook-level ``port_stem``.
  stop_words : collection of str, optional
      Lowercase words to discard. Defaults to NLTK's English stop-word list,
      which is loaded on first use and cached on the function.

  Returns
  -------
  str
      The stemmed tokens joined by single spaces ('' when nothing survives).
  """
  if stemmer is None:
    stemmer = port_stem
  if stop_words is None:
    # The previous version called stopwords.words('english') inside the filter,
    # i.e. once per token, and searched the resulting list linearly. Load it
    # once and keep it as a frozenset so each membership test is a hash lookup.
    stop_words = getattr(stemming, '_english_stop_words', None)
    if stop_words is None:
      stop_words = frozenset(stopwords.words('english'))
      stemming._english_stop_words = stop_words

  # Digits, punctuation and non-ASCII characters all become separators.
  letters_only = re.sub('[^a-zA-Z]', ' ', content).lower()
  kept_stems = [stemmer.stem(word) for word in letters_only.split() if word not in stop_words]
  return ' '.join(kept_stems)

In [None]:
# Run stemming() over every article. The column is overwritten in place, so
# re-running this cell would stem the already-stemmed text again.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
# Spot-check the cleaned text.
print(news_dataset['content'])

0        donald trump send embarrass new year eve messa...
1        drunk brag trump staffer start russian collus ...
2        sheriff david clark becom internet joke threat...
3        trump obsess even obama name code websit imag ...
4        pope franci call donald trump christma speech ...
                               ...                        
44914    fulli commit nato back new u approach afghanis...
44915    lexisnexi withdrew two product chines market l...
44916    minsk cultur hub becom author minsk reuter sha...
44917    vatican upbeat possibl pope franci visit russi...
44918    indonesia buy billion worth russian jet jakart...
Name: content, Length: 44919, dtype: object


In [None]:
# Separate the model input (the cleaned text) from the labels.
X = news_dataset['content'].values
Y = news_dataset['class'].values

In [None]:
# Inspect the cleaned text values.
print(X)

['donald trump send embarrass new year eve messag disturb donald trump wish american happi new year leav instead give shout enemi hater dishonest fake news media former realiti show star one job countri rapidli grow stronger smarter want wish friend support enemi hater even dishonest fake news media happi healthi new year presid angri pant tweet great year america countri rapidli grow stronger smarter want wish friend support enemi hater even dishonest fake news media happi healthi new year great year america donald j trump realdonaldtrump decemb trump tweet went welll expect kind presid send new year greet like despic petti infantil gibberish trump lack decenc even allow rise gutter long enough wish american citizen happi new year bishop talbert swan talbertswan decemb one like calvin calvinstowel decemb impeach would make great year america also accept regain control congress miranda yaver mirandayav decemb hear talk includ mani peopl hate wonder hate alan sandov alansandov decemb us

In [None]:
# Inspect the label values.
print(Y)

[0 0 0 ... 1 1 1]


In [None]:
# Shape of the label array.
Y.shape

(44919,)

In [None]:
# Turn each document into a TF-IDF weighted sparse vector.
# NOTE(review): the vocabulary and IDF weights are learned from every row,
# before the train/test split further down, so the test documents influence
# the features -- consider fitting on the training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Inspect the sparse matrix: (row, column) index pairs with their weights.
print(X)

  (0, 87885)	0.27953576139612984
  (0, 87808)	0.08560284638903479
  (0, 86738)	0.03134243059697376
  (0, 86582)	0.026991417291258653
  (0, 86500)	0.018629188081064226
  (0, 86486)	0.026950456105654805
  (0, 86402)	0.034252209076917166
  (0, 86079)	0.18427919118003872
  (0, 85205)	0.026377611747159076
  (0, 85194)	0.08887295020944012
  (0, 85183)	0.060314871149415694
  (0, 85155)	0.08887295020944012
  (0, 84510)	0.036714112292355584
  (0, 83947)	0.021123776977742027
  (0, 82290)	0.03720391157616558
  (0, 80231)	0.022650064414802874
  (0, 80171)	0.10874136332805802
  (0, 79494)	0.13560747021532726
  (0, 77498)	0.020340027195433256
  (0, 77123)	0.08887295020944012
  (0, 77069)	0.055919271871731666
  (0, 76803)	0.026255084547357036
  (0, 76559)	0.04448848404428745
  (0, 75916)	0.021895633444443983
  (0, 75894)	0.08560284638903479
  :	:
  (44918, 16760)	0.04487536891009754
  (44918, 16704)	0.0404179600525818
  (44918, 15819)	0.055775981763551634
  (44918, 15754)	0.027434086017826695
  (4491

In [None]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class proportions
# the same in both parts; random_state=1 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=1,
)


In [None]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()


In [None]:
# Fit the classifier on the training split.
model.fit(x_train,y_train)

In [None]:
#evaluation
# Accuracy on the rows the model was fitted on. accuracy_score is documented
# as accuracy_score(y_true, y_pred), so the true labels are passed first.
x_train_prediction  = model.predict(x_train)
training_data_accuracy  = accuracy_score(y_train, x_train_prediction)

In [None]:
# Report accuracy on the training split.
print('accuracy score of the training data: ', training_data_accuracy)

accuracy score of the training data:  0.9918463893140392


In [None]:
# Accuracy on the held-out rows. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the true labels are passed first.
x_test_prediction  = model.predict(x_test)
test_data_accuracy  = accuracy_score(y_test, x_test_prediction)

In [None]:
# Report accuracy on the held-out test split (the label used to say "training").
print('accuracy score of the test data: ', test_data_accuracy)

accuracy score of the training data:  0.9851959038290294


In [None]:
#building predictive system
# Classify a single held-out article (row 4 of the test matrix).
x_new = x_test[4]

prediction = model.predict(x_new)
print(prediction)

# Label convention in this dataset: the class-1 rows shown by tail() are
# Reuters wire stories, while the class-0 rows shown by head() are the
# sensational "News" items, so 0 means fake and 1 means true. The branches
# were previously the other way round.
# NOTE(review): confirm the 0/1 meaning against the dataset's documentation.
if(prediction[0]==0):
  print('fake')
else:
  print('true')

[0]
true


In [None]:
# Ground-truth label of the same test row, for comparison with the prediction.
print(y_test[4])

0
