<a href="https://colab.research.google.com/github/amaluvincent/Fake-News-Detection/blob/main/data_pre_processing_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.Setting up the environment

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn nltk
!pip install tensorflow keras




# 2.Importing libraraies

In [None]:
# import necessary libabaries

import pandas as pd # For data manipulation and handling structured datasets.
import matplotlib.pyplot as plt  # For creating visualizations like plots and charts.
import seaborn as sns  # For advanced and aesthetically pleasing visualizations.
import numpy as np  # For numerical computations and handling arrays/matrices.
from sklearn.pipeline import Pipeline  # For creating a pipeline of machine learning models.
from sklearn.feature_extraction.text import TfidfVectorizer # For combining tokenization, TF-IDF transformation, and vectorization .
from sklearn import feature_extraction, linear_model, model_selection # For feature engineering,linear model and model selection.
from sklearn.model_selection import train_test_split #For splitting data into test and train sets.
from sklearn import metrics # For evaluating the performance of machine learning models.
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix, classification_report  # For evaluating the performance.
from sklearn.model_selection import cross_val_score # For handling cross validation.
from sklearn.model_selection import GridSearchCV # For evaluating models
from sklearn.linear_model import LogisticRegression # For importing Logistic Regression model.
from sklearn.ensemble import RandomForestClassifier # FOr importing Random forest model.
from sklearn.tree import DecisionTreeClassifier # For importing Decision tree model.
from sklearn.naive_bayes import MultinomialNB  # FOr importing Naive Bayes model.
import string # For handling string operations relevant to text preprocessing.

import nltk  # For working with human language data(text).
nltk.download('stopwords') # Downloads a list of comomn words ("like","the","a","is") called stopwords.
nltk.download('punkt_tab') # Downloads the 'punkt' resource, which is used for tokenization .
nltk.download('wordnet') #For tasks like finding synonyms, antonyms, and understanding the relationships between words.
from nltk.corpus import stopwords  # For accessing the list of stopwords.
from nltk.stem import WordNetLemmatizer  # For reducing words to their base form.
from nltk.tokenize import word_tokenize # FOr splitting text into individual words.
from wordcloud import WordCloud  # FOr creating visual representation of word frequencies.
from collections import Counter # For counting the frequency of items .
import warnings # Importing warning module.
warnings.filterwarnings("ignore") #FOr ignoring warning messages.

from tensorflow.keras.models import Sequential  # For creating a linear stack of layers for LSTM
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional # help the model learn patterns in the text.
from tensorflow.keras.preprocessing.text import Tokenizer # breaking text intoo words
from tensorflow.keras.callbacks import EarlyStopping # tool for preventing overfitting
from tensorflow.keras.preprocessing.sequence import pad_sequences #To maintain uniform length
from sklearn.model_selection import KFold #  For splitting data into training and validation sets for k-fold cross-validation.
from tensorflow.keras.regularizers import l2 # To prevent overfitting by adding penalties to the model's complexity.
from sklearn.utils.class_weight import compute_class_weight # handle  imbalanced dataset


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# 3.Load dataset

In [None]:
# Load the ISOT dataset
true_news = pd.read_csv('/content/True.csv')
fake_news =  pd.read_csv('/content/Fake.csv')

# checking the size of each files
print('true:',true_news.shape)
print('fake:', fake_news.shape)


true: (21417, 4)
fake: (23481, 4)


In [None]:
# Display information about the true news dataset
print("True News Dataset Info:")
true_news.info()

# Display information about the fake news dataset
print("\nFake News Dataset Info:")
fake_news.info()


True News Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB

Fake News Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


# 4. Data cleaning and preparation

In [None]:
# Check for null values in each column of true_news and fake_news
print("Null values in True News dataset:")
print(true_news.isnull().sum())

print("\nNull values in Fake News dataset:")
print(fake_news.isnull().sum())


Null values in True News dataset:
title      0
text       0
subject    0
date       0
dtype: int64

Null values in Fake News dataset:
title      0
text       0
subject    0
date       0
dtype: int64


In [None]:
# Add a label to each dataframe
true_news['label'] = 0   # 0 for legitimate news
fake_news['label'] = 1   # 1 for fake news

# Combine both datasets
df = pd.concat([true_news, fake_news],ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)  # shuffle the data

# Print the combined dataset (first few rows)
print(df.head(5))
print(df.shape)



                                               title  \
0  FLASHBACK: ‘The Syrian War: What You’re Not Be...   
1  Trump administration defends travel ban in Sup...   
2   Lindsey Graham Is Getting All War Tingly Agai...   
3  Trump pulls nearly even with Clinton after Rep...   
4   The Internet Can’t Stop Laughing After Sean S...   

                                                text       subject  \
0  21st Century Wire says Back in August 2013, Un...   Middle-east   
1  NEW YORK (Reuters) - President Donald Trump’s ...  politicsNews   
2  Every once in a while, it seems like Sen. Lind...          News   
3   NEW YORK (Reuters) - Republican presidential ...  politicsNews   
4  White House Press Secretary Sean Spicer told r...          News   

                date  label  
0  December 18, 2016      1  
1   August 11, 2017       0  
2     August 1, 2017      1  
3     July 22, 2016       0  
4  February 14, 2017      1  
(44898, 5)


In [None]:
# Check for missing values after combined
print("Missing values in dataset:\n", df.isnull().sum())

Missing values in dataset:
 title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [None]:
# print dataset info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB
None


In [None]:
# Removing unnecessary column(date)and (title)
df = df.drop(columns=["date", "title"])
print(df.head())

                                                text       subject  label
0  21st Century Wire says Back in August 2013, Un...   Middle-east      1
1  NEW YORK (Reuters) - President Donald Trump’s ...  politicsNews      0
2  Every once in a while, it seems like Sen. Lind...          News      1
3   NEW YORK (Reuters) - Republican presidential ...  politicsNews      0
4  White House Press Secretary Sean Spicer told r...          News      1


In [None]:
# Removing the word "Reuters"(which is the site were true news collected) in the text
df['text'] = df['text'].str.replace('Reuters', '', case=False)
print(df.head(5))


                                                text       subject  label
0  21st Century Wire says Back in August 2013, Un...   Middle-east      1
1  NEW YORK () - President Donald Trump’s adminis...  politicsNews      0
2  Every once in a while, it seems like Sen. Lind...          News      1
3   NEW YORK () - Republican presidential nominee...  politicsNews      0
4  White House Press Secretary Sean Spicer told r...          News      1


In [None]:
# Removing the word "Getty images" (which is a visual media company)& "Featured image"in the text
# eg:Print the text before cleaning for the 28th row
print("Before cleaning:")
print(df['text'].iloc[28])

# Perform the cleaning
df['text'] = df['text'].str.replace('Getty Images', '', case=False)
df['text'] = df['text'].str.replace('Featured image', '', case=False)

# Print the text after cleaning for the 28th row
print("\nAfter cleaning:")
print(df['text'].iloc[28])


Before cleaning:
Can you believe it? The clown that can t keep his hands off of women and the faux Indian socialist could be pairing up for a White House run. We really are in the age of the low information voter when these two are seriously considering running and people are seriously considering voting for them. NUTS! Joe Biden has been making his 2016 deliberations all about his late son since August.Aug. 1, to be exact   the day renowned Hillary Clinton-critic Maureen Dowd published a column that marked a turning point in the presidential speculation.BIDEN GAFFES: Read more: POLITICO

After cleaning:
Can you believe it? The clown that can t keep his hands off of women and the faux Indian socialist could be pairing up for a White House run. We really are in the age of the low information voter when these two are seriously considering running and people are seriously considering voting for them. NUTS! Joe Biden has been making his 2016 deliberations all about his late son since Augus

In [None]:
# Define stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to calculate changes per preprocessing step
def analyze_preprocessing(text):
    if pd.isnull(text) or text.strip() == "":  # Handle NaN or empty string input
        return {
            'Lowercasing Change (%)': 0,
            'Non-Alpha Removed (%)': 0,
            'Stopwords Removed (%)': 0,
            'Lemmatization Changes (%)': 0,
            'Cleaned Text': ''
        }
    # Initial count of tokens
    original_tokens = word_tokenize(text)   # Tokenization
    total_tokens = len(original_tokens)

     # Check if there are no tokens, return 0 for all changes
    if total_tokens == 0:
        return {
            'Lowercasing Change (%)': 0,
            'Non-Alpha Removed (%)': 0,
            'Stopwords Removed (%)': 0,
            'Lemmatization Changes (%)': 0,
        }

    # Step-by-step preprocessing
    lowercased_text = text.lower()    # Lowercasing
    lowercased_tokens = word_tokenize(lowercased_text)    #Tokenization
    alphabetic_tokens = [word for word in lowercased_tokens if word.isalpha()]  #Remove non-alphabetic tokens
    non_stopword_tokens = [word for word in alphabetic_tokens if word not in stop_words]  # Stopword removal
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in non_stopword_tokens]    # Lemmatization

    # Calculate changes as percentages
    #total_tokens = len(original_tokens)
    lowercasing_change = 100 if any(word.isupper() for word in original_tokens) else 0
    non_alpha_removed = (len(lowercased_tokens) - len(alphabetic_tokens)) / total_tokens * 100 if total_tokens > 0 else 0
    stopword_removed = (len(alphabetic_tokens) - len(non_stopword_tokens)) / total_tokens * 100 if total_tokens > 0 else 0
    lemmatization_changes = sum(1 for orig, lemma in zip(non_stopword_tokens, lemmatized_tokens) if orig != lemma) / total_tokens * 100 if non_stopword_tokens else 0

     # Join the lemmatized tokens to form the cleaned text
    cleaned_text = ' '.join(lemmatized_tokens)

    return {
        'Lowercasing Change (%)': lowercasing_change,
        'Non-Alpha Removed (%)': non_alpha_removed,
        'Stopwords Removed (%)': stopword_removed,
        'Lemmatization Changes (%)': lemmatization_changes,
        'Cleaned Text': cleaned_text
    }

# Apply the analysis function to each text in the dataframe and create a new dataframe to store results
df[['Lowercasing Change (%)', 'Non-Alpha Removed (%)', 'Stopwords Removed (%)',
    'Lemmatization Changes (%)','cleaned_text']] = df['text'].apply(lambda x: pd.Series(analyze_preprocessing(x)))

# Display average percentages across all text entries
average_changes = df[['Lowercasing Change (%)', 'Non-Alpha Removed (%)',
                      'Stopwords Removed (%)', 'Lemmatization Changes (%)']].mean()
print("Average Changes across all texts:\n", average_changes)

Average Changes across all texts:
 Lowercasing Change (%)       94.690187
Non-Alpha Removed (%)        13.402350
Stopwords Removed (%)        37.077629
Lemmatization Changes (%)     5.621640
dtype: float64


In [None]:
# Print the original and cleaned text
print(df[['text','cleaned_text','Lowercasing Change (%)', 'Non-Alpha Removed (%)',
          'Stopwords Removed (%)', 'Lemmatization Changes (%)']].head(10))  # Display the first 10 rows

                                                text  \
0  21st Century Wire says Back in August 2013, Un...   
1  NEW YORK () - President Donald Trump’s adminis...   
2  Every once in a while, it seems like Sen. Lind...   
3   NEW YORK () - Republican presidential nominee...   
4  White House Press Secretary Sean Spicer told r...   
5  Former Vice President Joe Biden was asked on M...   
6  SEOUL () - South Korean President Moon Jae-in ...   
7  CAIRO () - An Egyptian court sentenced a Briti...   
8  Listen:After creating a video that included a ...   
9  What a role model for women and young girls, a...   

                                        cleaned_text  Lowercasing Change (%)  \
0  century wire say back august united nation inv...                     100   
1  new york president donald trump administration...                     100   
2  every seems like lindsey graham might growing ...                     100   
3  new york republican presidential nominee donal...           

In [None]:
X = df['cleaned_text']
y = df['label']

# Check for duplicates in the input features
print(f"Number of duplicate rows in X: {X.duplicated().sum()}")

# Check for duplicates in the combination of X and y (to consider label alignment)
data = pd.DataFrame({'X': X, 'y': y})
print(f"Number of duplicate rows in combined X and y: {data.duplicated().sum()}")


Number of duplicate rows in X: 6468
Number of duplicate rows in combined X and y: 6467


In [None]:
# Combine X and y into a single DataFrame
data = pd.DataFrame({'X': X, 'y': y})

# Drop duplicates based on both features and labels
data = data.drop_duplicates()

# Separate the cleaned data back into X and y
X = data['X']
y = data['y']

# Print the number of duplicates after cleaning
print(f"Number of duplicate rows in X after cleaning: {X.duplicated().sum()}")
print(f"Number of duplicate rows in combined X and y after cleaning: {data.duplicated().sum()}")


Number of duplicate rows in X after cleaning: 1
Number of duplicate rows in combined X and y after cleaning: 0


In [None]:
# Check for NaN values in 'cleaned_text' after preprocessing
missing_values = df['cleaned_text'].isna().sum()
print(f"Number of NaN values in 'cleaned_text': {missing_values}")


Number of NaN values in 'cleaned_text': 0


# 5.Exploratory Data Analysis