<a href="https://colab.research.google.com/github/carlosprr29/ai-progetto-spagnoli/blob/main/01_Exploratory_Data_Analysis_WELFake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =================================================================
# PROJECT: Detecting Fake News with BERT
# PHASE: Exploratory Data Analysis (EDA) - WELFake Dataset
# =================================================================

# 1. INSTALLATION AND LOADING OF LIBRARIES
# -----------------------------------------------------------------
print(" Installing and loading libraries...")
!pip install -q datasets pandas matplotlib seaborn wordcloud

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from datasets import load_dataset
import warnings

warnings.filterwarnings('ignore') # To clean up the output of unnecessary warnings

In [None]:
# 2. LOADING THE DATASET FROM HUGGING FACE
# -----------------------------------------------------------------
print(" Loading WELFake dataset (this may take a minute)...")
dataset_raw = load_dataset("davanstrien/WELFake")
# Convert the 'train' split with the dataset's own exporter instead of letting
# pandas iterate over the split one row at a time.
df = dataset_raw['train'].to_pandas()

# Initial cleaning: remove rows with nulls and duplicates.
# Both steps are restricted to the three content columns, so that any extra
# column present in the split (e.g. an exported row index - TODO confirm
# whether this dataset has one) cannot make otherwise identical news items
# look distinct.
content_columns = ['title', 'text', 'label']
df = df.dropna(subset=content_columns)
df = df.drop_duplicates(subset=content_columns)
print(f" Dataset successfully loaded with {len(df)} rows.")

In [None]:
# 3. CLASS BALANCE ANALYSIS (Bar chart)
# -----------------------------------------------------------------
print("\n Generating class balance chart...")
# Explicit figure/axes pair; the count plot is drawn onto `ax`.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=df, x='label', palette='viridis', ax=ax)
ax.set_title('News Distribution (0: Real, 1: Fake)')
# Replace the numeric tick labels at positions 0 and 1 with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Real', 'Fake'])
plt.show()

In [None]:
# 4. CONTENT EXPLORATION (Sampling)
# -----------------------------------------------------------------
print("\n Displaying 5 random news items from the dataset:")
# Fixed seed: the same five rows are shown on every "Restart & Run All",
# so any commentary written about them stays valid.
display(df[['title', 'label']].sample(5, random_state=42))

In [None]:
# 5. TITLE LENGTH ANALYSIS
# ----------------------------------------------------------------
print("\n Analysing title length...")
# Character count of every title (titles are cast to str first).
df['title_len'] = df['title'].astype(str).str.len()

# Overlaid histograms (one per label) with a KDE curve on a dedicated axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='title_len',
    hue='label',
    kde=True,
    bins=100,
    palette='magma',
    ax=ax,
)
ax.set_title('Comparison of Title Length: Real vs Fake')
ax.set_xlim(0, 200)  # Only the 0-200 character range is shown.
ax.set_xlabel('Number of characters')
plt.show()

In [None]:
# 6. WORD CLOUD GENERATION (Term visualisation)
# -----------------------------------------------------------------
print("\n Generating word clouds (WordClouds)...")

def generate_cloud(news_class, graph_title, colour_map, data=None):
    """Draw a word cloud from the titles of a single news class.

    Parameters
    ----------
    news_class
        Value compared against the 'label' column to select the rows.
    graph_title : str
        Title displayed above the figure.
    colour_map : str
        Colormap name forwarded to ``WordCloud(colormap=...)``.
    data : pandas.DataFrame, optional
        Frame with 'title' and 'label' columns. Defaults to the
        notebook-level ``df`` when omitted.

    Returns
    -------
    None
        The figure is shown as a side effect. If the selected class has no
        title text, a message is printed and nothing is drawn.
    """
    frame = df if data is None else data
    titles = frame.loc[frame['label'] == news_class, 'title'].astype(str)
    text = " ".join(titles)

    # A word cloud cannot be built from zero words; report the situation
    # instead of letting the cell fail on an absent/empty class.
    if not text.strip():
        print(f"\n No titles found for label {news_class!r}; word cloud skipped.")
        return

    wc = WordCloud(width=800, height=400, background_color='white',
                   max_words=100, colormap=colour_map).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title(graph_title, fontsize=16)
    plt.axis('off')
    plt.show()

# One cloud per class: Real news (label 0) first, then Fake news (label 1).
cloud_specs = [
    (0, "Most common words in REAL headlines", "ocean"),
    (1, "Most common words in FAKE headlines", "Reds"),
]
for label_value, heading, cmap_name in cloud_specs:
    generate_cloud(label_value, heading, cmap_name)

In [None]:
# 7. SPECIFIC KEYWORD SEARCH ENGINE
# -----------------------------------------------------------------
def analyse_term(term, data=None):
    """Print how many titles contain a keyword and how the matches split by label.

    The match is a case-insensitive, literal substring search, so terms
    containing regex metacharacters (e.g. "C++", "U.S.") are handled as
    plain text. Missing titles count as non-matching.

    Parameters
    ----------
    term : str
        Keyword to look for inside the 'title' column.
    data : pandas.DataFrame, optional
        Frame with 'title' and 'label' columns. Defaults to the
        notebook-level ``df`` when omitted.

    Returns
    -------
    None
        Results are printed: the number of matching titles and the
        percentage of matches per label, or a "not found" message.
    """
    frame = df if data is None else data
    # regex=False: treat the term literally instead of as a regular expression.
    mask = frame['title'].str.contains(term, case=False, na=False, regex=False)
    matches = frame[mask]

    if matches.empty:
        print(f"\n The term '{term}' was not found.")
        return

    # Share of each label among the matching titles, in percent.
    count = matches['label'].value_counts(normalize=True) * 100
    print(f"\n The term '{term}' appears in {len(matches)} titles.")
    print(f"   Distribution: {count.to_dict()}")

# Run the keyword report for each term of interest, in this order.
for keyword in ("Trump", "VIDEO", "Breaking", "Nasa"):
    analyse_term(keyword)