### PYTHON LIBRARY SETUP

In [None]:
# Import the ***TextBlob*** class for sentiment analysis and define a function to extract polarity and subjectivity from text, as specified in the instructions.
!pip install textblob
!pip install kagglehub
!pip install tensorflow

In [None]:
import kagglehub
import numpy as np
import pandas as pd
import seaborn as sns
import os
import re
import pickle
import matplotlib.pyplot as plt
# NLTK supplies the tokenizer and the stop-word list used for text preprocessing.
# The corpora it needs ('stopwords', 'punkt', 'punkt_tab') are downloaded later, in the word-frequency section.
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from textblob import TextBlob


### DOWNLOAD AND LOAD DATASET

In [None]:
# Download the three Kaggle datasets. Each returned path is used by the loading
# cell as the directory that contains the CSV files.
FAKE_REAL_HANDLE = "clmentbisaillon/fake-and-real-news-dataset"
AI_ISOT_HANDLE = "walidbenaouda/ai-isot-dataset"
AI_GEN_HANDLE = "atharvasoundankar/gen-ai-misinformation-detection-datase-20242025"

path_fake_real = kagglehub.dataset_download(FAKE_REAL_HANDLE)
print("fake/real path:", path_fake_real)

path_ai1 = kagglehub.dataset_download(AI_ISOT_HANDLE)
path_ai2 = kagglehub.dataset_download(AI_GEN_HANDLE)

In [None]:
# Read the fake and true news CSV files from the fake/real download directory
fake_csv_path = os.path.join(path_fake_real, "Fake.csv")
true_csv_path = os.path.join(path_fake_real, "True.csv")
fake = pd.read_csv(fake_csv_path)
true = pd.read_csv(true_csv_path)

# Read the two AI-related CSV files from their own download directories
ai_isot_csv_path = os.path.join(path_ai1, "AI-ISOT dataset.csv")
ai_gen_csv_path = os.path.join(path_ai2, "generative_ai_misinformation_dataset.csv")
ai_isot = pd.read_csv(ai_isot_csv_path)
ai_gen = pd.read_csv(ai_gen_csv_path)

In [None]:
# Preview the first rows of each raw dataset. The banner is printed on its own
# line so it does not push the DataFrame's column header out of alignment.
dataset_previews = [
    ("-----------------------FAKE.CSV---------------------", fake),
    ("-----------------------TRUE.CSV---------------------", true),
    ("-----------------------AI_1.CSV----------------------", ai_isot),
    ("-----------------------AI_2.CSV----------------------", ai_gen),
]
for banner, frame in dataset_previews:
    print(f"{banner}\n{frame.head()}\n")

### TEXT CLEANING

#### Data Cleaning Function

In [None]:
# Cleaning text in the dataset
def clean_text(text):
    """Normalize raw article text.

    Lowercases the text, strips URLs and HTML tags, replaces every character
    other than letters, digits, whitespace and basic punctuation
    (. , ! ? ' and the typographic apostrophe) with a space, and collapses
    runs of whitespace into single spaces.

    Args:
        text: The raw text. Any non-string value (for example NaN) is treated
            as missing.

    Returns:
        The cleaned string, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # The dot after "www" is escaped so only genuine "www." prefixes are removed
    # (an unescaped dot would also delete ordinary words that start with "www").
    text = re.sub(r"http\S+|www\.\S+", "", text) # Gets rid of urls
    text = re.sub(r"<.*?>", "", text) # Gets rid of html
    # \u2019 is the right single quotation mark (typographic apostrophe)
    text = re.sub(r"[^a-zA-Z0-9.,!?'\u2019\s]", " ", text) # Makes sure punctuation still there
    text = re.sub(r"\s+", " ", text).strip() # Fixes spaces
    return text

#### Clean Fake vs True Dataset

In [None]:
# Clean real vs fake dataset
fake["is_real"] = 0 # Gives fake news a label
true["is_real"] = 1 # Gives true news a label

df_rf = pd.concat([fake, true], ignore_index=True) # Combines datasets

# Makes text in one place (title followed by the article body)
df_rf["text"] = (
    df_rf["title"].fillna("") + " " +
    df_rf["text"].fillna("")
).str.strip()

# Cleans the text first, so duplicates and empties are judged on the cleaned form
df_rf["text"] = df_rf["text"].apply(clean_text)

# Gets rid of duplicates and empty text
df_rf = df_rf.drop_duplicates(subset=["text"])
df_rf = df_rf[df_rf["text"].str.strip() != ""]

# Picks the necessary columns and saves the cleaned file.
# .copy() makes an independent frame, so adding "source" does not write to a slice of df_rf.
df_rf_clean = df_rf[["text", "is_real"]].copy()
df_rf_clean["source"] = "FAKE-REAL"
df_rf_clean.to_csv("clean_real_fake.csv", index=False)

# Show it was saved and shows portion of cleaned data set
print("saved: clean_real_fake.csv")
print(df_rf_clean.head())

#### Clean AI vs Human Dataset

##### AI-Dataset 1 - AI-ISOT Dataset

In [None]:
# List the AI-ISOT column names; the next cell selects columns by name
print(ai_isot.columns)

In [None]:
# Each source column maps to one (is_ai, is_real) label pair:
#   "Real News"              -> human-written, real
#   "Fake News"              -> human-written, fake
#   "AI-generated Fake News" -> AI-generated, fake
column_labels = [
    ("Real News", 0, 1),
    ("Fake News", 0, 0),
    ("AI-generated Fake News", 1, 0),
]

# Reshape the wide table into one record per article, skipping missing cells
ai_isot_long = [
    {"text": article, "is_ai": ai_flag, "is_real": real_flag}
    for column_name, ai_flag, real_flag in column_labels
    for article in ai_isot[column_name].dropna()
]

ai_isot_df = pd.DataFrame(ai_isot_long)

# Apply the shared cleaning function and tag where the rows came from
ai_isot_df["text"] = ai_isot_df["text"].apply(clean_text)
ai_isot_df["source"] = "AI-ISOT"

ai_isot_df.to_csv("clean_ai_isot.csv", index=False)
print("Saved: clean_ai_isot.csv")

##### AI-Dataset 2 - AI_Gen Dataset

In [None]:
# List the AI-Gen column names; the next cell selects a subset of them by name
print(ai_gen.columns)

In [None]:
# Keep only needed columns
ai_gen = ai_gen[[
    "text", "is_misinformation", "model_signature",
    "date", "month", "country", "platform"
]]

# Drop rows with missing or blank text
ai_gen = ai_gen.dropna(subset=["text"])
ai_gen = ai_gen[ai_gen["text"].str.strip() != ""]

# Keep only rows whose model_signature is one of the two labels handled below.
# .copy() makes an independent frame, so the column assignments that follow
# do not write to a slice of the filtered data.
ai_gen = ai_gen[ai_gen["model_signature"].isin(["GPT-like", "human"])].copy()

# Clean text
ai_gen["text"] = ai_gen["text"].apply(clean_text)

# Convert model_signature to AI/Human label
# 1 = AI-generated, 0 = Human-written
ai_gen["is_ai"] = (ai_gen["model_signature"] == "GPT-like").astype(int)

# Convert misinformation column to binary label
# 0 = misinformation (is_misinformation == 1), 1 = everything else
ai_gen["is_real"] = (ai_gen["is_misinformation"] != 1).astype(int)

# Final cleaned AI-gen dataset
ai_gen_clean = ai_gen[[
    "text", "is_real", "is_ai", "date", "month", "country", "platform"
]].copy()

ai_gen_clean.to_csv("clean_ai_gen.csv", index=False)
print("Saved: clean_ai_gen.csv")

#### Combined Master Datasets

In [None]:
# Build the master dataset from the three *cleaned* frames. The AI-Gen part uses
# ai_gen_clean (the frame saved as clean_ai_gen.csv) rather than the intermediate
# ai_gen, which still carries the raw is_misinformation / model_signature columns.
combined = pd.concat([df_rf_clean, ai_isot_df, ai_gen_clean], ignore_index=True)

# Remove repeated and blank articles across the merged sources
combined = combined.drop_duplicates(subset=["text"])
combined = combined[combined["text"].str.strip() != ""]

combined.to_csv("combined_master_dataset.csv", index=False)
print("Saved: combined_master_dataset.csv")

print("Rows in final dataset:", len(combined))

#### Combine News Datasets Only for EDA:
* df_rf_clean
* ai_isot_df

In [None]:
# News-only subset for EDA: the fake/real corpus plus AI-ISOT (AI-Gen is left out)
combined_news = pd.concat([df_rf_clean, ai_isot_df], ignore_index=True)

# De-duplicate and drop blank rows of combined_news itself. Filtering `combined`
# here would discard the concat above and pull the AI-Gen rows back in.
combined_news = combined_news.drop_duplicates(subset=["text"])
# .copy() makes an independent frame, so later cells can add columns to it safely
combined_news = combined_news[combined_news["text"].str.strip() != ""].copy()

combined_news.to_csv("combined_news_dataset.csv", index=False)
print("Saved: combined_news_dataset.csv")

print("Rows in Combined News Dataset:", len(combined_news))

In [None]:
# Display the first rows of the news-only dataset
combined_news.head()

## EDA
Perform an initial data overview of the ***combined_news*** DataFrame by displaying its first few rows, shape, and general information, then analyze and visualize the distribution of the ***is_real*** and ***source*** columns using count plots. Afterwards, handle ***NaN*** values in the ***is_ai*** column by replacing them with -1, and then analyze and visualize its distribution using a count plot. Finally, summarize the key findings from these initial checks.

### 1. Initial Data Overview








In [None]:
# Quick look at the frame: sample rows and dimensions, each under its own heading
overview_sections = (
    ("First 5 rows of combined_news DataFrame:", combined_news.head()),
    ("\nShape of combined_news DataFrame:", combined_news.shape),
)
for heading, value in overview_sections:
    print(heading)
    print(value)

# .info() prints its report itself (dtypes and non-null counts)
print("\nGeneral information about combined_news DataFrame:")
combined_news.info()

### 2. Analyze and visualize the distribution of the ***is_real*** and ***source*** columns using count plots.

In [None]:
# Two-colour palette; also used by the other count plots in this section
color = ['mediumpurple','gold']

# Count of fake (0) versus real (1) articles
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=combined_news, x='is_real', hue='is_real', palette=color, legend=False, ax=ax)
ax.set_title('Distribution of is_real (0=Fake, 1=Real)')
ax.set_xlabel('Is Real')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of articles contributed by each source dataset
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=combined_news, x='source', hue='source', palette=color, legend=False, ax=ax)
ax.set_title('Distribution of Source')
ax.set_xlabel('Source')
ax.set_ylabel('Count')
plt.show()

### 3. Handle NaN values in the ***is_ai*** column:
Handle `NaN` values in the `is_ai` column by replacing them with -1, and then analyze and visualize its distribution using a count plot.


In [None]:
# Rows without an AI/human label get the sentinel -1
is_ai_filled = combined_news['is_ai'].fillna(-1)
combined_news['is_ai'] = is_ai_filled

print("Combined_news DataFrame after handling NaN values in 'is_ai':")
print(combined_news.head())

In [None]:
# is_ai has three categories after the fill (-1, 0, 1), so the palette needs
# three colours; with only two, seaborn cycles and two bars share a colour.
is_ai_palette = ['lightgray', 'mediumpurple', 'gold']  # -1, 0, 1 in sorted order

plt.figure(figsize=(7, 5))
sns.countplot(data=combined_news, x='is_ai', hue='is_ai', palette=is_ai_palette, legend = False)
plt.title('Distribution of is_ai (0=Human, 1=AI, -1=Not applicable)')
plt.xlabel('Is AI')
plt.ylabel('Count')
plt.show()

### 4. Summary of Key Findings

**4.1. `combined_news` DataFrame Structure:**
*   The DataFrame `combined_news` contains 39,972 entries and 4 columns: `text`, `is_real`, `source`, and `is_ai`.
*   The `text`, `is_real`, and `source` columns are fully populated, with no missing values.
*   The `is_ai` column initially had a large number of missing (NaN) values, as indicated by only 895 non-null entries before handling.

**4.2. Distribution of `is_real`:**
*   The count plot for `is_real` shows a fairly balanced distribution between real (1) and fake (0) news entries, which is good for training classification models.

**4.3. Distribution of `source`:**
*   The `source` column is predominantly composed of entries from "FAKE-REAL", with a much smaller proportion from "AI-ISOT". This indicates that the dataset is heavily skewed towards traditional fake/real news rather than AI-specific news sources.

**4.4. Distribution of `is_ai` after handling NaNs:**
*   After replacing `NaN` values with -1, the count plot for `is_ai` clearly shows three categories:
    *   `-1` (Not applicable): This is the largest category, representing entries where AI attribution was not originally provided (mostly from the FAKE-REAL dataset).
    *   `0` (Human-written): A small number of entries are identified as human-written.
    *   `1` (AI-generated): An even smaller number of entries are identified as AI-generated.
*   This highlights that the `is_ai` column is largely sparse and mainly applicable to a specific subset of the `combined_news` data (i.e., from the AI-ISOT dataset).

### 5. Text Length and Word Count Analysis

Calculate the number of characters and words for each text entry in the `combined_news` DataFrame. Visualize the distributions of these metrics using histograms or density plots, and compare the average/median text lengths between real (is_real=1) and fake (is_real=0) news.


First, we will calculate the number of characters for each text entry and store it in a new column ***char_count*** in the ***combined_news*** DataFrame.



In [None]:
# Length of each article in characters
text_lengths = combined_news['text'].str.len()
combined_news['char_count'] = text_lengths

print("Added 'char_count' column to combined_news DataFrame.")
print(combined_news.head())

Calculate the number of words for each text entry and store it in a new column ***word_count***.



In [None]:
# Number of whitespace-separated words in each article
words_per_article = combined_news['text'].apply(lambda article: len(str(article).split()))
combined_news['word_count'] = words_per_article

print("Added 'word_count' column to combined_news DataFrame.")
print(combined_news.head())

Visualize the distribution of character count:



In [None]:
# Histogram (with density curve) of article lengths in characters
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(combined_news['char_count'], bins=50, kde=True, color='mediumpurple', ax=ax)
ax.set_title('Distribution of Character Count')
ax.set_xlabel('Character Count')
ax.set_ylabel('Frequency')
plt.show()


Visualize the distribution of the ***word_count***.



In [None]:
# Histogram (with density curve) of article lengths in words
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(combined_news['word_count'], bins=50, kde=True, color='gold', ax=ax)
ax.set_title('Distribution of Word Count')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
plt.show()

Calculate the average and median character count for real and fake news articles.



In [None]:
# Mean and median character count per label (0 = fake, 1 = real)
char_stats_by_label = combined_news.groupby('is_real')['char_count'].agg(['mean', 'median'])
print("\nAverage and Median Character Count for Real vs. Fake News:")
print(char_stats_by_label)

Calculate the average and median word count for real and fake news articles.


In [None]:
# Mean and median word count per label (0 = fake, 1 = real)
word_stats_by_label = combined_news.groupby('is_real')['word_count'].agg(['mean', 'median'])
print("Average and Median Word Count for Real vs. Fake News:")
print(word_stats_by_label)

Visually compare the ***char_count*** distribution between real and fake news using a box plot.



In [None]:
# Box plot of character counts, one box per label
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=combined_news,
    x='is_real',
    y='char_count',
    hue='is_real',
    palette=['mediumpurple', 'gold'],
    legend=False,
    ax=ax,
)
ax.set_title('Character Count Distribution by News Type (0=Fake, 1=Real)')
ax.set_xlabel('News Type')
ax.set_ylabel('Character Count')
# Replace the 0/1 tick labels with readable names
plt.xticks([0, 1], ['Fake News', 'Real News'])
plt.show()

Visually compare the ***word_count*** distribution between real and fake news by creating a box plot.



In [None]:
# Box plot of word counts, one box per label
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=combined_news,
    x='is_real',
    y='word_count',
    hue='is_real',
    palette=['mediumpurple', 'gold'],
    legend=False,
    ax=ax,
)
ax.set_title('Word Count Distribution by News Type (0=Fake, 1=Real)')
ax.set_xlabel('News Type')
ax.set_ylabel('Word Count')
# Replace the 0/1 tick labels with readable names
plt.xticks([0, 1], ['Fake News', 'Real News'])
plt.show()

### Summary of Text Length and Word Count Analysis

**5.1. Character and Word Counts:**
*   New columns ***char_count*** and ***word_count*** were successfully added to the ***combined_news*** DataFrame, providing quantitative metrics for text length.

**5.2. Distribution of Text Lengths (Character and Word Counts):**
*   Both character and word count distributions show a wide range, indicating variability in article lengths within the dataset. Most articles tend to be shorter, with a long tail extending towards very long articles.
*   The distributions appear to be right-skewed, meaning there are many shorter articles and fewer very long ones.

**5.3. Comparison of Real vs. Fake News (Character Count):**
*   **Average Character Count:** Fake news has a slightly higher average character count (approximately 2551) compared to real news (approximately 2424).
*   **Median Character Count:** The medians are very close (2281 for fake, 2265 for real), suggesting that for the majority of articles, the character lengths are quite similar.
*   The box plots for character count indicate that fake news tends to have a slightly wider spread and potentially more outliers on the higher end, suggesting some fake news articles can be considerably longer.

**5.4. Comparison of Real vs. Fake News (Word Count):**
*   **Average Word Count:** Similar to character count, fake news articles have a higher average word count (approximately 434) than real news (approximately 398).
*   **Median Word Count:** The medians are also close (390 for fake, 372 for real).
*   The box plots for word count also show a similar pattern to character count, with fake news potentially having a slightly larger range and more long-form outliers.

**Overall Conclusion:**
While there isn't a drastic difference, fake news articles in this combined dataset tend to be slightly longer on average in terms of both character and word counts compared to real news articles. This could be a subtle distinguishing feature, although the distributions largely overlap.

### 6. Word Frequency and Vocabulary Insights

Extract the most frequent words from the 'text' column. Visualize the top 20 most common words across the entire dataset, and then separately for real and fake news using bar charts or word clouds to identify distinctive vocabulary patterns.


In [None]:
# Fetch the NLTK resources used for stop-word removal and tokenization
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name, quiet=True)
print("NLTK stopwords and punkt corpora downloaded.")

Create a text preprocessing function that converts text to lowercase, tokenizes it, removes stopwords, and filters out non-alphabetic tokens, as described in the instructions.



In [None]:
# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize `text` and keep only alphabetic, non-stop-word tokens.

    The input is converted to a string and lowercased before tokenization.
    Returns the surviving tokens as a list, in their original order.
    """
    lowered = str(text).lower()
    kept_tokens = []
    for token in word_tokenize(lowered):
        # Drop punctuation/number tokens and common stop words
        if token.isalpha() and token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

print("Text preprocessing function 'preprocess_text' created.")

Now that the preprocessing function is defined, let's apply it to the 'text' column of the ***combined_news*** DataFrame to tokenize and clean the text, storing the result in a new column called 'processed_text'.



In [None]:
# Download the 'punkt_tab' resource before tokenizing
# NOTE(review): presumably required by word_tokenize on newer NLTK releases - confirm
nltk.download('punkt_tab', quiet=True)
print("NLTK 'punkt_tab' corpus downloaded.")

In [None]:
# Tokenize every article; each cell of 'processed_text' holds a list of tokens
tokenized_articles = combined_news['text'].apply(preprocess_text)
combined_news['processed_text'] = tokenized_articles

print("Applied 'preprocess_text' to the 'text' column.")
preview_columns = ['text', 'processed_text']
print(combined_news[preview_columns].head())

Now that the text is preprocessed and stored as lists of words in the 'processed_text' column, let's combine all these lists into a single flat list to prepare for counting the most frequent words across the entire dataset.



In [None]:
# Flatten the per-article token lists into one list
all_words = []
for article_tokens in combined_news['processed_text']:
    all_words.extend(article_tokens)
print(f"Total words in the dataset after preprocessing: {len(all_words)}")

Use the ***collections.Counter*** to find the 20 most frequent words, which is a required step for visualizing the top common words across the entire dataset.



In [None]:
# Count every token, then keep the 20 most frequent as (word, count) pairs
word_freq = Counter()
word_freq.update(all_words)
top_20_words = word_freq.most_common(20)

print("Top 20 most common words across the entire dataset:")
print(top_20_words)

Extract the words and their corresponding frequencies from the ***top_20_words*** list and then use ***matplotlib.pyplot*** and ***seaborn*** to create a bar chart.



In [None]:
# Split the (word, count) pairs into parallel lists for plotting
words = [pair[0] for pair in top_20_words]
counts = [pair[1] for pair in top_20_words]

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(x=words, y=counts, hue=words, palette='viridis', legend=False, ax=ax)
ax.set_title('Top 20 Most Common Words Across All News')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
# Tilt the word labels so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
print("Bar plot of top 20 most common words across the entire dataset displayed.")

Filter the ***combined_news*** DataFrame to create a separate DataFrame for real news (where ***is_real*** is 1) to analyze its unique vocabulary patterns.



In [None]:
# Select the rows labelled real (is_real == 1)
is_real_mask = combined_news['is_real'] == 1
real_news_df = combined_news[is_real_mask]

print("Filtered real news DataFrame created.")
print(f"Number of real news entries: {len(real_news_df)}")
print(real_news_df.head())

Combine all the preprocessed words from the ***processed_text*** column into a single list, which is a necessary step before counting the most frequent words.



In [None]:
# Flatten the token lists of the real-news rows into one list
real_news_words = []
for article_tokens in real_news_df['processed_text']:
    real_news_words.extend(article_tokens)
print(f"Total words in real news dataset: {len(real_news_words)}")

Use ***collections.Counter*** to find the 20 most frequent words specifically for real news.



In [None]:
# Count the real-news tokens and keep the 20 most frequent as (word, count) pairs
real_word_freq = Counter()
real_word_freq.update(real_news_words)
top_20_real_words = real_word_freq.most_common(20)

print("Top 20 most common words in real news:")
print(top_20_real_words)

To visualize the top 20 most common words in real news, I need to extract the words and their corresponding frequencies from the ***top_20_real_words*** list and then create a bar chart.



In [None]:
# Split the (word, count) pairs into parallel lists for plotting
words_real = [pair[0] for pair in top_20_real_words]
counts_real = [pair[1] for pair in top_20_real_words]

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(x=words_real, y=counts_real, hue=words_real, palette='viridis', legend=False, ax=ax)
ax.set_title('Top 20 Most Common Words in Real News')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
# Tilt the word labels so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
print("Bar plot of top 20 most common words in real news displayed.")

Filter the ***combined_news*** DataFrame to create a separate DataFrame for fake news (where ***is_real*** is 0), as specified in the instructions, to analyze its unique vocabulary patterns.



In [None]:
# Select the rows labelled fake (is_real == 0)
is_fake_mask = combined_news['is_real'] == 0
fake_news_df = combined_news[is_fake_mask]

print("Filtered fake news DataFrame created.")
print(f"Number of fake news entries: {len(fake_news_df)}")
print(fake_news_df.head())

In [None]:
# Flatten the token lists of the fake-news rows into one list
fake_news_words = []
for article_tokens in fake_news_df['processed_text']:
    fake_news_words.extend(article_tokens)
print(f"Total words in fake news dataset: {len(fake_news_words)}")

Now that all processed words from fake news are combined into a single list, let's find the 20 most frequent words specifically for fake news, as specified in the instructions.



In [None]:
# Count the fake-news tokens and keep the 20 most frequent as (word, count) pairs
fake_word_freq = Counter()
fake_word_freq.update(fake_news_words)
top_20_fake_words = fake_word_freq.most_common(20)

print("Top 20 most common words in fake news:")
print(top_20_fake_words)

### 7. Sentiment Analysis

Perform sentiment analysis on the 'text' column to derive polarity (emotional tone, e.g., positive, negative, neutral) and subjectivity scores. Add these as new columns to the DataFrame and visualize their distributions. Compare the sentiment scores between real and fake news to identify potential differences in emotional content.


Define the ***get_sentiment*** function as specified in the instructions to extract polarity and subjectivity scores.



In [None]:
def get_sentiment(text):
    """Return the TextBlob (polarity, subjectivity) pair for `text`.

    The input is converted to a string before it is analysed.
    """
    sentiment = TextBlob(str(text)).sentiment
    polarity, subjectivity = sentiment.polarity, sentiment.subjectivity
    return polarity, subjectivity

print("Imported TextBlob and defined 'get_sentiment' function.")

Now that the ***get_sentiment*** function is defined, let's apply it to the 'text' column of the ***combined_news*** DataFrame to calculate the polarity and subjectivity scores for each text entry. The results will be stored in new columns named 'polarity' and 'subjectivity' respectively.



In [None]:
# Score every article once, then split the (polarity, subjectivity) pairs into
# two columns. Plain lists are used so that no pandas Series has to be built
# for each row, which is slow on a large frame.
sentiment_scores = [get_sentiment(article) for article in combined_news['text']]
combined_news['polarity'] = [polarity for polarity, _ in sentiment_scores]
combined_news['subjectivity'] = [subjectivity for _, subjectivity in sentiment_scores]

print("Added 'polarity' and 'subjectivity' columns to combined_news DataFrame.")
print(combined_news.head())

Visualize its distribution using a histogram or density plot to understand the overall emotional tone of the news articles.



In [None]:
# Histogram (with density curve) of polarity scores
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(combined_news['polarity'], bins=50, kde=True, color='mediumpurple', ax=ax)
ax.set_title('Distribution of Polarity Scores')
ax.set_xlabel('Polarity Score')
ax.set_ylabel('Frequency')
plt.show()

Now that the 'subjectivity' column has been added to the DataFrame, let's visualize its distribution using a histogram or density plot to understand the degree of factual information versus personal opinions in the news articles.



In [None]:
# Histogram (with density curve) of subjectivity scores
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(combined_news['subjectivity'], bins=50, kde=True, color='gold', ax=ax)
ax.set_title('Distribution of Subjectivity Scores')
ax.set_xlabel('Subjectivity Score')
ax.set_ylabel('Frequency')
plt.show()

Compare the average and median polarity scores between real and fake news articles to identify potential differences in emotional tone.



In [None]:
# Mean and median polarity per label (0 = fake, 1 = real)
polarity_by_label = combined_news.groupby('is_real')['polarity'].agg(['mean', 'median'])
print("\nAverage and Median Polarity Scores for Real vs. Fake News:")
print(polarity_by_label)

Compare the average and median subjectivity scores between real and fake news articles to identify potential differences in the degree of factual information versus personal opinions.



In [None]:
# Mean and median subjectivity per label (0 = fake, 1 = real)
subjectivity_by_label = combined_news.groupby('is_real')['subjectivity'].agg(['mean', 'median'])
print("\nAverage and Median Subjectivity Scores for Real vs. Fake News:")
print(subjectivity_by_label)

### Summary of Sentiment Analysis

**1. Polarity Scores:**
*   The distribution of polarity scores across all news articles is centered around zero, indicating a generally neutral emotional tone, with a slight tendency towards positive sentiment.
*   There is a relatively wide spread, suggesting a variety of emotional tones present in the dataset.
*   **Comparison (Real vs. Fake News):** Both real and fake news articles exhibit very similar average and median polarity scores, indicating that, on average, their emotional tones are quite close. Fake news has a slightly higher average polarity (approx. 0.0598) than real news (approx. 0.0573), but the difference is marginal.

**2. Subjectivity Scores:**
*   The distribution of subjectivity scores shows a considerable number of articles leaning towards factual (lower subjectivity) content, but also a significant portion with higher subjectivity, indicating opinion-based writing.
*   **Comparison (Real vs. Fake News):** A more notable difference is observed in subjectivity scores. Fake news articles have a significantly higher average subjectivity (approx. 0.4549) and median subjectivity (approx. 0.4552) compared to real news articles (average approx. 0.3624, median approx. 0.3676). This suggests that fake news tends to contain more personal opinions and less factual reporting than real news.

**Overall Conclusion:**
Sentiment analysis reveals a clearer distinction in subjectivity than in polarity. While both real and fake news exhibit similar emotional tones, fake news articles are generally more subjective, implying a higher presence of opinions, beliefs, and personal feelings compared to the more objective nature of real news. This difference in subjectivity could be a valuable feature for distinguishing between the two news types.

### Text Analysis Summary:

### Key Findings:

*   **Text Length Comparison:** Fake news articles tend to be slightly longer than real news articles on average. The average character count for fake news was approximately 2551, while for real news it was around 2424. Similarly, the average word count for fake news was about 434, compared to approximately 398 for real news. However, median lengths were very similar, and distributions largely overlapped, suggesting that while there's a slight tendency, length isn't a strong discriminator on its own.
*   **Word Frequency Patterns:**
    *   "Trump" is a highly frequent word across all news types, appearing as the most common word in fake and overall news, and the second most common in real news.
    *   Real news frequently features journalistic terms like "said" and source attributions like "reuters" (28,861 occurrences in real news).
    *   Fake news frequently uses words like "people," "president," "one," "donald," "like," "obama," "clinton," "video," and "hillary," suggesting a focus on specific figures, personal opinions, and potentially sensational content.
*   **Sentiment Polarity:** Both real and fake news exhibit very similar emotional tones, with average polarity scores being approximately 0.0573 for real news and 0.0598 for fake news. The distributions for polarity are centered around zero, indicating a generally neutral to slightly positive emotional tone for both categories.
*   **Sentiment Subjectivity:** A significant difference was observed in subjectivity. Fake news articles are notably more subjective, with an average subjectivity score of approximately 0.4549 and a median of 0.4552. In contrast, real news articles are more objective, with an average subjectivity score of approximately 0.3624 and a median of 0.3676. This indicates that fake news often contains more opinions and beliefs.




## LSTM

### Retrieve data for LSTM training

In [None]:
# Reload the news-only dataset from the CSV written in the cleaning section
lstm_df = pd.read_csv("combined_news_dataset.csv")

# Show the available columns and a few sample rows
print(lstm_df.columns)
lstm_df.head()

### Train the Tokenizer on Text and Add Padding

In [None]:
# The tokenizer maps text to integer tokens based on vocabulary frequency.
# Models take input sequences of the same length, so short texts are padded
# and long texts are truncated to MAX_LEN tokens in the padding cells.
MAX_VOCAB = 30000  # passed to the tokenizer as num_words
MAX_LEN = 300      # sequence length used when padding

tokenizer = Tokenizer(
    num_words=MAX_VOCAB,
    oov_token="<OOV>",
)
training_texts = lstm_df["text"]
tokenizer.fit_on_texts(training_texts)

### Function to build the LSTM Model

In [None]:
def make_lstm_model():
    """Build and compile a binary text classifier.

    Architecture: Embedding(MAX_VOCAB, 128) -> LSTM(128) -> Dropout(0.3)
    -> Dense(64, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric). Every call returns a new, untrained model.
    """
    model = Sequential([
        # `input_length` is no longer passed: Keras 3 deprecates the argument
        # and warns about it, and the sequence length is inferred from the data.
        # NOTE(review): inputs are post-padded with zeros; consider
        # mask_zero=True so the LSTM skips the padding - confirm before changing.
        Embedding(MAX_VOCAB, 128),
        LSTM(128, return_sequences=False),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    return model

### Real vs Fake

#### Determine the Train-Test Split

In [None]:
# Encode every article as a list of integer token ids
rf_texts = lstm_df["text"]
sequences_rf = tokenizer.texts_to_sequences(rf_texts)

# Bring all sequences to MAX_LEN tokens; zeros are appended to short ones
padded_rf = pad_sequences(
    sequences_rf,
    maxlen=MAX_LEN,
    padding="post",
)

In [None]:
# Features are the padded sequences; the target is the real/fake label
X_rf, y_rf = padded_rf, lstm_df["is_real"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
rf_split = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)
X_train_rf, X_test_rf, y_train_rf, y_test_rf = rf_split

#### Build the LSTM model

In [None]:
# New, untrained model for the real-vs-fake task
lstm_rf = make_lstm_model()

#### Fit the model

In [None]:
# Train for 5 epochs; the held-out split is passed as validation data so
# per-epoch validation metrics are recorded in the history
rf_fit_options = dict(
    validation_data=(X_test_rf, y_test_rf),
    epochs=5,
    batch_size=64,
)
history_rf = lstm_rf.fit(X_train_rf, y_train_rf, **rf_fit_options)

#### Model Evaluation

In [None]:
# Threshold the predicted probabilities at 0.5 to get class labels
rf_probabilities = lstm_rf.predict(X_test_rf)
y_pred = (rf_probabilities > 0.5).astype(int)

rf_accuracy = accuracy_score(y_test_rf, y_pred)
print("Accuracy:", rf_accuracy)
print(classification_report(y_test_rf, y_pred))

# Persist the trained model together with the tokenizer it was trained with
lstm_rf.save("lstm_real_fake_model.keras")

with open("lstm_real_fake_tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("Model and tokenizer saved.")

##### Plot Learning Curves (Accuracy and Loss)

In [None]:
# Learning curves for the real-vs-fake model: accuracy on the left, loss on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Each panel shows the training and validation series of one metric
rf_panels = [(ax1, 'accuracy', 'Accuracy'), (ax2, 'loss', 'Loss')]
for axis, metric, label in rf_panels:
    axis.plot(history_rf.history[metric], label=f'Training {label}')
    axis.plot(history_rf.history[f'val_{metric}'], label=f'Validation {label}')
    axis.set_title(f'Model {label}')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(label)
    axis.legend()
    axis.grid(True)

# Display plots
plt.tight_layout()
plt.show()

##### Confusion Matrices

In [None]:
# Predicted probabilities for the held-out rows
y_pred_probs = lstm_rf.predict(X_test_rf)

# Threshold at 0.5 and flatten to a 1-D array of 0/1 labels
above_threshold = y_pred_probs > 0.5
y_pred = above_threshold.astype(int).flatten()


In [None]:
# Confusion matrix of true labels (first argument) against predicted labels
cm = confusion_matrix(y_test_rf, y_pred)
print(cm)


In [None]:
# confusion_matrix orders classes by sorted label value, so row/column 0 is
# is_real == 0 (fake) and row/column 1 is is_real == 1 (real). The tick labels
# must follow that order, otherwise every cell is read as the opposite class.
class_names = ["Fake", "Real"]

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix of Real vs Fake News")
plt.show()

### Human vs AI

#### Determine Train-Test Split

In [None]:
# Keep only the rows that carry an AI/human label (rows with NaN in is_ai are dropped)
has_ai_label = lstm_df["is_ai"].notna()
df_ai = lstm_df[has_ai_label].copy()

# Inspect the label values and a few sample rows
print(df_ai["is_ai"].unique())
df_ai.head()

In [None]:
# Encode every labelled article as a list of integer token ids
ai_texts = df_ai["text"]
sequences_ai = tokenizer.texts_to_sequences(ai_texts)

# Bring all sequences to MAX_LEN tokens; zeros are appended to short ones
padded_ai = pad_sequences(
    sequences_ai,
    maxlen=MAX_LEN,
    padding="post",
)

In [None]:
# Features are the padded sequences; labels are integers (0 = human, 1 = AI)
X_ai, y_ai = padded_ai, df_ai["is_ai"].astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
ai_split = train_test_split(X_ai, y_ai, test_size=0.2, random_state=42)
X_train_ai, X_test_ai, y_train_ai, y_test_ai = ai_split

#### Build the LSTM model

In [None]:
# New, untrained model for the human-vs-AI task
lstm_ai = make_lstm_model()

#### Fit the model

In [None]:
# Train for 15 epochs; the held-out split is passed as validation data so
# per-epoch validation metrics are recorded in the history
ai_fit_options = dict(
    validation_data=(X_test_ai, y_test_ai),
    epochs=15,
    batch_size=64,
)
history_ai = lstm_ai.fit(X_train_ai, y_train_ai, **ai_fit_options)

#### Model Evaluation

In [None]:
# Threshold the predicted probabilities at 0.5 to get class labels
y_pred = (lstm_ai.predict(X_test_ai) > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test_ai, y_pred))
print(classification_report(y_test_ai, y_pred))

lstm_ai.save("lstm_human_ai_model.keras")

# Save the tokenizer under a name that matches this model. The previous name,
# "lstm_real_fake_tokenizer.pkl", was a copy-paste of the real/fake cell and
# only rewrote that file. Both models use the same tokenizer object.
with open("lstm_human_ai_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Model and tokenizer saved.")

##### Plot Learning Curves (Accuracy and Loss)

In [None]:
# Learning curves for the human-vs-AI model: accuracy on the left, loss on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Each panel shows the training and validation series of one metric
ai_panels = [(ax1, 'accuracy', 'Accuracy'), (ax2, 'loss', 'Loss')]
for axis, metric, label in ai_panels:
    axis.plot(history_ai.history[metric], label=f'Training {label}')
    axis.plot(history_ai.history[f'val_{metric}'], label=f'Validation {label}')
    axis.set_title(f'Model {label}')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(label)
    axis.legend()
    axis.grid(True)

# Display plots
plt.tight_layout()
plt.show()

##### Confusion Matrices

In [None]:
# Predicted probabilities for the held-out rows
y_pred_probs = lstm_ai.predict(X_test_ai)

# Threshold at 0.5 and flatten to a 1-D array of 0/1 labels
above_threshold = y_pred_probs > 0.5
y_pred_ai = above_threshold.astype(int).flatten()


In [None]:
# Confusion matrix of true labels (first argument) against predicted labels
cm = confusion_matrix(y_test_ai, y_pred_ai)
print(cm)


In [None]:
# confusion_matrix orders classes by sorted label value, so row/column 0 is
# is_ai == 0 (human) and row/column 1 is is_ai == 1 (AI). The tick labels
# must follow that order, otherwise every cell is read as the opposite class.
class_names = ["HUM", "AI"]

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix of AI vs Human Written News")
plt.show()


### Predict Example Texts

In [None]:
# Short sample inputs that are run through both trained models in the cells below
example_texts = [
    "The government has announced new economic policies today...",
    "BREAKING: Scientists confirm aliens landed in Nevada last night!!!",
    "This article was generated by a large language model to simulate political commentary.",
    "A recent medical study found that exercise reduces stress levels.",
    "Admiral told lawmakers everyone on alleged drug boat was on a list of military targets",
    "Florida may be Trump's last chance to gain GOP seats through redistricting",
    "20 million under winter weather alerts as heavy snow blankets parts of U.S."
]

In [None]:
def predict_text(text):
    """Classify one text with both trained models.

    The text is cleaned, tokenized and padded the same way as the training
    data, then scored by the real/fake model and the human/AI model.

    Returns:
        A dict holding the predicted label and the raw sigmoid output of each
        model. A score of 0.5 or more means REAL / AI-GENERATED respectively.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(text)])
    model_input = pad_sequences(token_ids, maxlen=MAX_LEN, padding="post")

    # Each model returns a (1, 1) array for the single input row
    pred_realfake = lstm_rf.predict(model_input)[0][0]
    pred_humanai = lstm_ai.predict(model_input)[0][0]

    if pred_realfake >= 0.5:
        realfake_label = "REAL"
    else:
        realfake_label = "FAKE"

    if pred_humanai >= 0.5:
        humanai_label = "AI-GENERATED"
    else:
        humanai_label = "HUMAN"

    return {
        "Predicted Real vs Fake": realfake_label,
        "Confidence (Real)": float(pred_realfake),
        "Predicted Human vs AI": humanai_label,
        "Confidence (AI)": float(pred_humanai),
    }

In [None]:
# Print a short preview of each example followed by both models' predictions
for text in example_texts:
    preview = text[:60]
    print("\nTEXT:", preview, "...")
    result = predict_text(text)
    # end="\n\n" leaves one blank line after each result
    print(result, end="\n\n")