## Reading and understanding the data

In [97]:
import pandas as pd 
import numpy as np


In [98]:
fake = pd.read_csv('../../Dataset/Fake.csv')
true = pd.read_csv('../../Dataset/True.csv')
fake.head()



Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [99]:
# dataset is balanced . 
fake.shape
true.shape

(21417, 4)

In [100]:
# defining labels for the dataset
fake['label'] = 0
true['label'] = 1
true.head()
# combining the two datasets
data = pd.concat([fake, true], axis=0)


In [101]:
data.shape

(44898, 5)

In [102]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [103]:
data.nunique()


title      38729
text       38646
subject        8
date        2397
label          2
dtype: int64

## Preprocessing 

In [104]:
# finding missing values in the dataset
data.isnull().sum()
# there is no missing value in the dataset . 


title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [105]:
# finding duplicate values in the dataset
data[data.duplicated(keep=False)]
# there are 209 duplicate rows in the dataset .
# droping duplicate values from the dataset
data.drop_duplicates(inplace=True)


In [106]:
# checking the shape of the dataset after droping duplicate values  
data.shape

(44689, 5)

In [107]:
# lookgng at what subject columns contains . 
data['subject'].value_counts()
data.groupby("subject")['label'].value_counts().reset_index()

Unnamed: 0,subject,label,count
0,Government News,0,1570
1,Middle-east,0,778
2,News,0,9050
3,US_News,0,783
4,left-news,0,4459
5,politics,0,6838
6,politicsNews,1,11220
7,worldnews,1,9991


In [108]:
# removing the subject column from the dataset as it is not useful for our model .
data.drop('subject', axis=1, inplace=True)

In [109]:
data.head()

Unnamed: 0,title,text,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...","December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...","December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,"December 25, 2017",0


In [110]:
# average length of the text column or article in the dataset
data['text'].str.len().mean()



np.float64(2467.0986820022827)

## NLP Section : 

In [111]:
# combinng the title and text column into a single column for better performance of the model .
data['content'] = data['title'] + ' ' + data['text']
# we will lower case the content column for better performance of the model .
data['content'] = data['content'].str.lower() 



In [112]:
# checking the average length of the content column before cleaning the text
data['content'].str.len().mean()

np.float64(2548.2740047886505)

In [113]:
# before we do any preprocessing we will split the dataset into train and test set .
# because we want to do preprocessing only on the train set and not on the test set to avoid data leakage .
# for testing we will have complete unseen data for the model to evaluate its performance on it .

from sklearn.model_selection import train_test_split
X = data[['content']]
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [114]:
# removing URLs and HTML tags from the content column 
# we will use regex (regular expression ) 
import re

def clean_text(text):
    text = re.sub(r"http\S+|www\S+", "", text)   # remove URLs
    text = re.sub(r"<.*?>", "", text)            # remove HTML tags
    return text

X_train["content"] = X_train["content"].apply(clean_text)



In [115]:
# checking the average length of the content column after cleaning the text
data['content'].str.len().mean()

np.float64(2548.2740047886505)

In [116]:
# we will normalize white spaces in the content column
# Remove extra spaces
# Convert multiple spaces → single space
# Remove leading & trailing spaces

# \s+ → matches multiple spaces, tabs, newlines
# " " → replaces them with single space
# .strip() → removes spaces at start & end



def normalize_space(text):
    return re.sub(r"\s+", " ", text).strip()

X_train["content"] = X_train["content"].apply(normalize_space)



In [117]:
# removing special characters from the content column
X_train["content"] = X_train["content"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
# checking the average length of the content column after removing special characters
X_train['content'].str.len().mean()


np.float64(2470.414477916702)

In [118]:
print(len(X_train))
print(len(X_test))

35751
8938


### TF-IDF Configuration 

In [119]:
# now we will do stopwords removal and lemmatization in the content column . 
# TF-IDF has a built-in option to remove stopwords, so we will use that instead of doing it manually.

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
corpus = X_train['content'].tolist()
# as for ngram range we will use unigrams and bigrams for containtng the context as well as not pushing the memory .
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english') , max_features=16000 , ngram_range=(1,2) , min_df = 3 , max_df = 0.90) 
X_train_tfidf = vectorizer.fit_transform(corpus)
# now we will transform the test set using the same vectorizer to avoid data leakage and to have the same feature space for both train and test set .
X_test_tfidf = vectorizer.transform(X_test['content']) 
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)



(35751, 16000)
(8938, 16000)


In [120]:
# transfering the matrix to model development notebook for training the model and evaluating its performance on the test set .
import pickle

pickle.dump(X_train_tfidf, open("X_train.pkl", "wb"))
pickle.dump(X_test_tfidf, open("X_test.pkl", "wb"))
pickle.dump(y_train, open("y_train.pkl", "wb"))
pickle.dump(y_test, open("y_test.pkl", "wb"))


In [121]:
import joblib
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']