In [None]:
# CETM47 AS2: Exploratory Data Analysis (EDA)
# Project: Real-Time Twitter Topic Classification  
# Author: <Your Name & Student ID>  
# Date: <today's date>

## 1. Introduction
#This notebook explores the structure, quality, and content of the raw tweet dataset for the NewsPulse Analytics project. The goal is to assess readiness for modeling, identify risks, and document cleaning/preprocessing choices.

In [None]:
## 2. Import Libraries and Set Up Paths
# Import all Python libraries needed for data manipulation and plotting, and make sure the output folder for saved figures exists.
import pandas as pd             # For data manipulation
import numpy as np              # For numeric operations
import matplotlib.pyplot as plt # For plotting
import seaborn as sns           # For advanced plots
from pathlib import Path        # For cross-platform file paths
import re                       # For regex
import html                     # For HTML unescaping
import os
#from pathlib import Path


# Folder that receives every figure saved by this notebook. The path is
# relative, so it is resolved against the kernel's working directory.
# NOTE(review): this path climbs one level ("../") while the data paths used in
# this notebook ("data/...") do not — confirm the intended working directory.
fig_dir = Path("../reports/figures")
os.makedirs(fig_dir, exist_ok=True)  # builds missing parents; no error if it already exists


In [None]:
## 3. Load Raw Data
# Load the raw tweet data from JSON. The file is expected in `data/raw/`.
# Location of the raw export, relative to the kernel's working directory.
# (Path is already imported in the imports cell at the top of the notebook.)
data_path = Path("data/raw/CETM47_24_5-AS2-Data.json")

# Fail early with an explicit message instead of relying on whatever error the
# JSON reader produces for a path that does not exist.
if not data_path.is_file():
    raise FileNotFoundError(f"Raw data file not found: {data_path.resolve()}")

# The file is read as a single JSON document. If the export is ever delivered
# as JSON Lines (one record per line), use pd.read_json(data_path, lines=True).
df = pd.read_json(data_path)
print(f"Loaded {len(df)} tweets.")
df.head()  # last expression of the cell, so the first rows are displayed


In [None]:
## 4. Data Audit: Columns, Nulls, and Duplicates
#This cell checks for null values, duplicate IDs and texts to identify quality issues.
# Column names present in the loaded frame.
print("Columns:", list(df.columns))

# DataFrame.info() writes its report (dtypes, non-null counts) directly to
# stdout and returns None, so it is called on its own; wrapping it in print()
# would append a stray "None" line to the output.
df.info()

# Missing values per column.
print("Nulls per column:\n", df.isnull().sum())

# Rows whose 'id' / 'text' value already appeared earlier in the frame.
print("Duplicates by 'id':", df['id'].duplicated().sum())
print("Duplicates by 'text':", df['text'].duplicated().sum())


In [None]:
## 5. Remove Duplicates and Short Tweets
# Remove tweets with duplicate texts and tweets with fewer than 3 tokens (which are likely noise).
# Tweets with fewer whitespace-separated tokens than this are treated as noise.
MIN_TOKENS = 3

# Keep the first occurrence of every distinct text; .copy() gives an independent
# frame so the column assignment below never targets a view.
df = df.drop_duplicates(subset='text').copy()
print(f"Rows after dropping duplicate texts: {len(df)}")

# Token count per tweet. The .str accessor propagates missing values instead of
# raising (len() on NaN would fail); a missing text is counted as 0 tokens and
# is therefore removed together with the other short tweets.
df['token_len'] = df['text'].str.split().str.len().fillna(0).astype(int)
short_tweets = df[df['token_len'] < MIN_TOKENS]   # kept for inspection
print(f"Tweets with fewer than {MIN_TOKENS} tokens: {len(short_tweets)}")

# Remove short tweets
df = df[df['token_len'] >= MIN_TOKENS].copy()
print(f"Rows after removing short tweets: {len(df)}")

In [None]:
## 6. Class Distribution
# Visualize the class imbalance in the dataset with a barplot.
# Bars are drawn from the most frequent class to the least frequent one.
class_order = df['label_name'].value_counts().index

plt.figure(figsize=(8, 4))
# hue repeats the x variable and the legend is switched off, so each bar gets
# its own 'Set2' colour without a redundant legend.
# NOTE(review): presumably done to avoid seaborn's warning about passing
# `palette` without `hue` — confirm against the installed seaborn version.
sns.countplot(
    data=df,
    x='label_name',
    hue='label_name',
    order=class_order,
    palette='Set2',
    legend=False,
)
plt.title("Tweet class distribution")
plt.ylabel("Count")
plt.xticks(rotation=30)   # tilt class names so they do not overlap
plt.tight_layout()
plt.savefig(fig_dir / "class_bar.png")
plt.show()


In [None]:
## 7. Tweet Length Statistics
# Explore distribution of tweet length (in tokens and characters).
# Character count per tweet (token_len was added in the filtering cell).
df['char_len'] = df['text'].str.len()

# One histogram panel per length measure: (column, panel title, bar colour).
length_panels = [
    ('token_len', "Token count per tweet", 'steelblue'),
    ('char_len', "Character count per tweet", 'orange'),
]

fig, axs = plt.subplots(1, 2, figsize=(10, 4))
for ax, (column, title, colour) in zip(axs, length_panels):
    sns.histplot(df[column], bins=30, kde=True, ax=ax, color=colour)
    ax.set_title(title)
plt.tight_layout()
plt.savefig(fig_dir / "length_hist.png")
plt.show()

# Print descriptive statistics.
# describe() always reports the maximum, so asking for the 100th percentile
# (percentile 1) only repeated the `max` row; the 99th percentile is reported
# instead to show the upper tail.
# NOTE(review): assumes the 99th percentile was the intent — confirm.
length_percentiles = [.5, .95, .99]
for heading, column in (("Token stats:", 'token_len'), ("Char stats:", 'char_len')):
    print(heading)
    print(df[column].describe(percentiles=length_percentiles))


In [None]:
## 8. Tweet Volume Over Time
# Check if there are spikes or gaps in tweet collection (e.g. due to events like COVID).
# Parse the 'date' column into datetimes so it can be used as a time index.
df['date'] = pd.to_datetime(df['date'])

# Number of tweets in each calendar-day bin.
timeline = (
    df.set_index('date')
      .resample('D')
      .size()
)

timeline_fig, timeline_ax = plt.subplots(figsize=(12, 3))
timeline.plot(ax=timeline_ax)
timeline_ax.set_title("Tweets per day (time series)")
timeline_ax.set_ylabel("Count")
timeline_fig.tight_layout()
timeline_fig.savefig(fig_dir / "timeline.png")
plt.show()


In [None]:
## 9. Top Hashtags by Class
# Extract hashtags from tweets and show the most frequent per topic, to understand topic vocabulary.
import collections

def extract_hashtags(text):
    """Return every hashtag found in ``text``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word characters
    (``\\w``); the leading ``#`` is kept in each returned string. Duplicates
    are preserved, and an empty list is returned when nothing matches.
    """
    return [found.group(0) for found in re.finditer(r"#\w+", text)]

# Five most frequent hashtags for every class, keyed by class name.
hashtag_counts = {}
for topic in df['label_name'].unique():
    topic_texts = df.loc[df['label_name'] == topic, 'text']
    # Feed the counter tweet by tweet: concatenating all per-tweet lists with
    # sum(..., []) copies the growing list on every step (quadratic time).
    tag_counter = collections.Counter()
    for tweet_text in topic_texts:
        tag_counter.update(extract_hashtags(tweet_text))
    hashtag_counts[topic] = tag_counter.most_common(5)

print("Top hashtags per class:")
for topic_name, top_tags in hashtag_counts.items():
    print(f"{topic_name}: {top_tags}")


In [None]:
## 10. Emoji Presence Check
# Estimate the percentage of tweets containing at least one emoji.
import emoji

def has_emoji(s):
    """Tell whether ``s`` contains at least one emoji character.

    Each character of ``s`` is looked up individually in ``emoji.EMOJI_DATA``;
    the scan stops at the first hit. An empty string yields ``False``.
    """
    for char in s:
        if char in emoji.EMOJI_DATA:
            return True
    return False

# Boolean flag per tweet: True when the text contains at least one emoji.
df['has_emoji'] = df['text'].map(has_emoji)

# The mean of a boolean column is the fraction of True values.
percent_emoji = 100 * df['has_emoji'].mean()
print(f"Percent of tweets with emoji: {percent_emoji:.2f}%")


In [None]:
## 11. Save Cleaned Data for Downstream Tasks
# Save the cleaned DataFrame to a binary Feather file for efficient loading in later scripts.
# Destination of the cleaned frame, relative to the kernel's working directory.
out_path = Path("data/tweets.feather")

# Make sure the target folder exists before writing, in the same way the
# figure folder is created at the top of the notebook.
out_path.parent.mkdir(parents=True, exist_ok=True)

# The filtering steps above left gaps in the row index; write a fresh
# 0..n-1 index rather than carrying the old labels into the file.
df.reset_index(drop=True).to_feather(out_path)
print(f"Saved cleaned DataFrame to {out_path}")
