Import dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Fetch NLTK's 'stopwords' corpus so that stopwords.words() has data to read.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Show the English stop-word list provided by NLTK.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data preprocessing

In [25]:
# Read the WELFake CSV and relabel its first column as 'id'.
news_dataset = pd.read_csv('/content/WELFake_Dataset.csv')
news_dataset = news_dataset.rename(columns={news_dataset.columns[0]: 'id'})

In [26]:
# Preview the first five rows.
news_dataset.head(5)

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [27]:
# (rows, columns) of the loaded frame.
news_dataset.shape

(72134, 4)

In [28]:
# Count the missing values in each column.
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,558
text,39
label,0


In [29]:
# Replace missing values with empty strings so the title/text concatenation
# that follows never yields NaN.
news_dataset = news_dataset.fillna('')

In [30]:
# Re-check: every column should now report zero missing values.
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,0
text,0
label,0


In [31]:
# Merge the title and text columns into one 'content' column, separated by a space.

news_dataset['content'] = news_dataset['title']+' '+news_dataset['text']

In [32]:
# Inspect the merged 'content' column.
print(news_dataset['content'])

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1           Did they post their votes for Hillary already?
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: content, Length: 72134, dtype: object


In [33]:
# Separate the target ('label') from the remaining columns.
Y = news_dataset['label']
X = news_dataset.drop(columns='label')

In [34]:
# Inspect the feature frame and the label series.
print(X)
print(Y)

          id  ...                                            content
0          0  ...  LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1          1  ...     Did they post their votes for Hillary already?
2          2  ...  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3          3  ...  Bobby Jindal, raised Hindu, uses story of Chri...
4          4  ...  SATAN 2: Russia unvelis an image of its terrif...
...      ...  ...                                                ...
72129  72129  ...  Russians steal research on Trump in hack of U....
72130  72130  ...   WATCH: Giuliani Demands That Democrats Apolog...
72131  72131  ...  Migrants Refuse To Leave Train At Refugee Camp...
72132  72132  ...  Trump tussle gives unpopular Mexican leader mu...
72133  72133  ...  Goldman Sachs Endorses Hillary Clinton For Pre...

[72134 rows x 4 columns]
0        1
1        1
2        1
3        0
4        1
        ..
72129    0
72130    1
72131    0
72132    0
72133    1
Name: label, Length: 7213

Stemming procedure

Stemming is the process of reducing a word to its root form (stem).

Example:
acting, acted, acts → act

In [35]:
# Single shared Porter stemmer instance, used by stemming().
port_stem = PorterStemmer()

In [36]:
_STOP_WORDS = None  # filled in on the first stemming() call


def _english_stop_words():
  """Return the English stop words as a frozenset, loading them only once."""
  global _STOP_WORDS
  if _STOP_WORDS is None:
    _STOP_WORDS = frozenset(stopwords.words('english'))
  return _STOP_WORDS


def stemming(content):
  """Normalise one document and return its stemmed, stop-word-free form.

  Every character that is not an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, English stop words are
  dropped, and each remaining word is stemmed with ``port_stem``.

  Args:
    content: The raw text of one document.

  Returns:
    The stemmed tokens joined by single spaces (an empty string when no
    token survives the filtering).
  """
  # Look the stop words up once per call instead of once per word; a
  # frozenset also makes each membership test constant-time.
  stop_words = _english_stop_words()
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()
  stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in stop_words]
  return ' '.join(stemmed_tokens)


In [None]:
# Overwrite 'content' with its stemmed form. This happens in place, so
# re-running the cell would stem text that has already been stemmed.
news_dataset['content'] = news_dataset.content.apply(stemming)

In [None]:
# Separate the data and the labels: pull the 'content' and 'label' columns out via .values.

X = news_dataset['content'].values
Y = news_dataset['label'].values

In [None]:
# Convert the text to numerical TF-IDF features: learn the vocabulary and
# IDF weights and transform the corpus in a single step.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split below, so the IDF weights are computed with the test
# documents included. Consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

Splitting the dataset into training and test data

In [None]:
# Hold out 20% of the samples for testing; stratify=Y keeps the class
# proportions the same in both splits and random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

Training the model: logistic regression

In [None]:
# Logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

evaluation

In [None]:
# Accuracy score on the training data.

x_train_prediction = model.predict(x_train)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. The score is the same either way, but the documented
# order stays correct if the metric is later swapped for an asymmetric one.
traing_data_accuracy = accuracy_score(y_train, x_train_prediction)

print("accuracy score of the training data :", traing_data_accuracy)

In [None]:
# Accuracy score on the held-out test data.

x_test_prediction = model.predict(x_test)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. The score is the same either way, but the documented
# order stays correct if the metric is later swapped for an asymmetric one.
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

print("accuracy score of the test data :", test_data_accuracy)

In [None]:
# Demo of a predictive system: classify the first sample of the test split.

x_news = x_test[0]
prediction = model.predict(x_news)
print(prediction)

# Label 0 is reported as real news, any other label as fake.
print("News is real" if prediction[0] == 0 else "News is fake")