In [1]:
#imports
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import warnings
import string
warnings.filterwarnings('ignore')

In [2]:
# Text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download required NLTK data
nltk.download('punkt', quiet=True)
# Newer NLTK releases load the Punkt model used by word_tokenize/sent_tokenize
# from the 'punkt_tab' package rather than the pickled 'punkt' one; fetching
# both keeps the notebook runnable from a fresh environment on either version.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
# Kept as the final expression so the value displayed by the cell is unchanged.
nltk.download('averaged_perceptron_tagger', quiet=True)


True

In [3]:
# Raw per-platform comment exports; both relative paths point into ../data
reddit_df = pd.read_csv(filepath_or_buffer='../data/reddit_ai.csv')
youtube_df = pd.read_csv(filepath_or_buffer='../data/youtube_ai.csv')

In [4]:
def harmonize_reddit_data(df):
    """Standardize Reddit data to the unified comment schema.

    Args:
        df: DataFrame of Reddit comments. The columns ``text``,
            ``comment_length``, ``sentiment_polarity``,
            ``sentiment_subjectivity``, ``likes``, ``num_replies``,
            ``created_utc``, ``source_id``, ``post_id``, ``post_title``,
            ``subreddit``, ``post_score``, ``post_num_comments``,
            ``contains_ai``, ``contains_opinion`` and ``contains_societal``
            are required. ``sentiment_label`` is optional.

    Returns:
        A new DataFrame on ``df``'s index holding the unified columns in a
        fixed order. ``df`` itself is not modified.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """

    def _label_from_polarity(score):
        # Polarity above 0.1 is positive, below -0.1 negative, else neutral
        # (NaN fails both comparisons and therefore lands on 'neutral').
        if score > 0.1:
            return 'positive'
        if score < -0.1:
            return 'negative'
        return 'neutral'

    # Start from the source index so the scalar 'source' value below is
    # broadcast to every row.
    harmonized = pd.DataFrame(index=df.index)

    # Core text fields
    harmonized['text'] = df['text']
    harmonized['text_length'] = df['comment_length']

    # Sentiment
    harmonized['sentiment_polarity'] = df['sentiment_polarity']
    harmonized['sentiment_subjectivity'] = df['sentiment_subjectivity']
    # Derive the label only when the input has none; an existing label column
    # is passed through without touching the polarity values.
    if 'sentiment_label' in df.columns:
        harmonized['sentiment_label'] = df['sentiment_label']
    else:
        harmonized['sentiment_label'] = df['sentiment_polarity'].apply(_label_from_polarity)

    # Engagement metrics
    harmonized['likes'] = df['likes']
    harmonized['replies'] = df['num_replies']

    # Temporal: unparseable values become NaT instead of raising.
    # NOTE(review): if created_utc holds numeric epoch seconds rather than
    # date strings, pd.to_datetime needs unit='s' - confirm against the CSV.
    harmonized['created_at'] = pd.to_datetime(df['created_utc'], errors='coerce')

    # Source identification
    harmonized['source'] = 'reddit'
    harmonized['source_id'] = df['source_id']

    # Platform-specific metadata
    harmonized['platform_post_id'] = df['post_id']
    harmonized['platform_post_title'] = df['post_title']
    harmonized['platform_community'] = df['subreddit']
    harmonized['platform_post_score'] = df['post_score']
    harmonized['platform_post_engagement'] = df['post_num_comments']

    # Content flags
    harmonized['contains_ai'] = df['contains_ai']
    harmonized['contains_opinion'] = df['contains_opinion']
    harmonized['contains_societal'] = df['contains_societal']

    return harmonized

def harmonize_youtube_data(df):
    """Standardize YouTube data to the unified comment schema.

    Args:
        df: DataFrame of YouTube comments. The columns ``text``,
            ``comment_length``, ``sentiment_polarity``,
            ``sentiment_subjectivity``, ``likes``, ``num_replies``,
            ``created_utc``, ``source_id``, ``video_id``, ``video_title``,
            ``video_channel``, ``video_like_count``, ``video_comment_count``,
            ``contains_ai``, ``contains_opinion`` and ``contains_societal``
            are required. ``sentiment_label`` is optional.

    Returns:
        A new DataFrame on ``df``'s index holding the unified columns in a
        fixed order. ``df`` itself is not modified.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """

    def _label_from_polarity(score):
        # Polarity above 0.1 is positive, below -0.1 negative, else neutral
        # (NaN fails both comparisons and therefore lands on 'neutral').
        if score > 0.1:
            return 'positive'
        if score < -0.1:
            return 'negative'
        return 'neutral'

    # Start from the source index so the scalar 'source' value below is
    # broadcast to every row.
    harmonized = pd.DataFrame(index=df.index)

    # Core text fields
    harmonized['text'] = df['text']
    harmonized['text_length'] = df['comment_length']

    # Sentiment
    harmonized['sentiment_polarity'] = df['sentiment_polarity']
    harmonized['sentiment_subjectivity'] = df['sentiment_subjectivity']
    # Derive the label only when the input has none; an existing label column
    # is passed through without touching the polarity values.
    if 'sentiment_label' in df.columns:
        harmonized['sentiment_label'] = df['sentiment_label']
    else:
        harmonized['sentiment_label'] = df['sentiment_polarity'].apply(_label_from_polarity)

    # Engagement metrics
    harmonized['likes'] = df['likes']
    harmonized['replies'] = df['num_replies']

    # Temporal: unparseable values become NaT instead of raising.
    # NOTE(review): if created_utc holds numeric epoch seconds rather than
    # date strings, pd.to_datetime needs unit='s' - confirm against the CSV.
    harmonized['created_at'] = pd.to_datetime(df['created_utc'], errors='coerce')

    # Source identification
    harmonized['source'] = 'youtube'
    harmonized['source_id'] = df['source_id']

    # Platform-specific metadata (video fields mapped onto the shared post_* names)
    harmonized['platform_post_id'] = df['video_id']
    harmonized['platform_post_title'] = df['video_title']
    harmonized['platform_community'] = df['video_channel']
    harmonized['platform_post_score'] = df['video_like_count']
    harmonized['platform_post_engagement'] = df['video_comment_count']

    # Content flags
    harmonized['contains_ai'] = df['contains_ai']
    harmonized['contains_opinion'] = df['contains_opinion']
    harmonized['contains_societal'] = df['contains_societal']

    return harmonized

# Map each platform's raw frame onto the shared schema
reddit_harmonized = harmonize_reddit_data(df=reddit_df)
youtube_harmonized = harmonize_youtube_data(df=youtube_df)

# Stack Reddit rows above YouTube rows under a fresh 0..n-1 index,
# then persist the combined table without the index column
merged_df = pd.concat(
    objs=[reddit_harmonized, youtube_harmonized],
    ignore_index=True,
)
merged_df.to_csv('../data/ai_opinions_combined.csv', index=False)


In [5]:
# Per-column count of missing entries in the merged frame
merged_df.isnull().sum()

text                        0
text_length                 0
sentiment_polarity          0
sentiment_subjectivity      0
sentiment_label             0
likes                       0
replies                     0
created_at                  0
source                      0
source_id                   0
platform_post_id            0
platform_post_title         0
platform_community          0
platform_post_score         0
platform_post_engagement    0
contains_ai                 0
contains_opinion            0
contains_societal           0
dtype: int64

In [6]:
## 6. Text Cleaning Functions

class TextCleaner:
    """Comprehensive text cleaning and preprocessing.

    Offers single-purpose steps (``remove_urls``, ``lowercase``, ...) plus
    three pipelines of increasing strength: ``clean_basic``,
    ``clean_standard`` and ``clean_aggressive``.

    The single-purpose steps expect a ``str``. Only ``clean_basic`` (and the
    pipelines built on it) accept missing or non-string input, which they
    turn into ``""``.
    """

    # Deletion tables for str.translate, built once rather than on every
    # remove_punctuation call.
    _DROP_ALL_PUNCT = str.maketrans('', '', string.punctuation)
    # Same table, but leaving . ! ? in place for sentence structure.
    _DROP_INNER_PUNCT = str.maketrans(
        '', '', string.punctuation.replace('.', '').replace('!', '').replace('?', ''))

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

        # Custom patterns
        # 'http\S+' already covers every https URL, so no separate https
        # alternative is needed.
        self.url_pattern = re.compile(r'http\S+|www\.\S+')
        self.email_pattern = re.compile(r'\S+@\S+')
        self.mention_pattern = re.compile(r'@\w+')
        self.hashtag_pattern = re.compile(r'#\w+')
        self.number_pattern = re.compile(r'\d+')

    def remove_urls(self, text):
        """Remove URLs (tokens starting with ``http`` or ``www.``)."""
        return self.url_pattern.sub('', text)

    def remove_emails(self, text):
        """Remove email addresses (any whitespace-free token containing ``@`` between other characters)."""
        return self.email_pattern.sub('', text)

    def remove_mentions(self, text):
        """Remove social media mentions (``@`` followed by word characters)."""
        return self.mention_pattern.sub('', text)

    def remove_hashtags(self, text):
        """Remove the ``#`` from hashtags but keep the tag text."""
        return self.hashtag_pattern.sub(lambda m: m.group(0)[1:], text)

    def remove_extra_whitespace(self, text):
        """Collapse every whitespace run to one space and trim both ends."""
        return ' '.join(text.split())

    def lowercase(self, text):
        """Convert to lowercase."""
        return text.lower()

    def remove_punctuation(self, text, keep_sentence_end=True):
        """Remove ASCII punctuation, optionally keeping ``.``, ``!`` and ``?``.

        Characters are deleted, not replaced by a space, so ``"a-b"`` becomes
        ``"ab"`` and ``"don't"`` becomes ``"dont"``.

        Args:
            text: String to process.
            keep_sentence_end: When True (default) ``. ! ?`` survive; when
                False every character in ``string.punctuation`` is removed.

        Returns:
            ``text`` without the selected punctuation characters.
        """
        table = self._DROP_INNER_PUNCT if keep_sentence_end else self._DROP_ALL_PUNCT
        return text.translate(table)

    def remove_numbers(self, text):
        """Remove every run of digits."""
        return self.number_pattern.sub('', text)

    def remove_stopwords(self, text):
        """Tokenize, drop tokens whose lowercase form is a stopword, re-join with spaces."""
        words = word_tokenize(text)
        filtered = [w for w in words if w.lower() not in self.stop_words]
        return ' '.join(filtered)

    def lemmatize(self, text):
        """Tokenize, lemmatize each token, re-join with spaces.

        NOTE(review): the lemmatizer is called without a part-of-speech
        argument; presumably that means noun lemmas only - confirm whether
        POS tagging is wanted here.
        """
        words = word_tokenize(text)
        lemmatized = [self.lemmatizer.lemmatize(w) for w in words]
        return ' '.join(lemmatized)

    def stem(self, text):
        """Tokenize, stem each token, re-join with spaces."""
        words = word_tokenize(text)
        stemmed = [self.stemmer.stem(w) for w in words]
        return ' '.join(stemmed)

    def clean_basic(self, text):
        """Basic cleaning: URLs, emails, @mentions, hashtag symbols, lowercase, whitespace.

        Missing values and non-string input yield ``""``.
        """
        if pd.isna(text) or not isinstance(text, str):
            return ""

        # Emails go before mentions so the '@domain' part of an address is
        # not treated as a mention.
        text = self.remove_urls(text)
        text = self.remove_emails(text)
        text = self.remove_mentions(text)
        text = self.remove_hashtags(text)
        text = self.lowercase(text)
        text = self.remove_extra_whitespace(text)

        return text.strip()

    def clean_standard(self, text):
        """Standard cleaning: basic + all punctuation removed + numbers removed."""
        text = self.clean_basic(text)
        text = self.remove_punctuation(text, keep_sentence_end=False)
        text = self.remove_numbers(text)
        # Deleting digits can leave double spaces behind; collapse them.
        text = self.remove_extra_whitespace(text)

        return text.strip()

    def clean_aggressive(self, text):
        """Aggressive cleaning: standard + stopword removal + lemmatization."""
        text = self.clean_standard(text)
        text = self.remove_stopwords(text)
        text = self.lemmatize(text)
        text = self.remove_extra_whitespace(text)

        return text.strip()

# Initialize cleaner: one shared TextCleaner instance that the cleaning
# helpers receive as their `cleaner` argument
cleaner = TextCleaner()

## 7. Apply Text Cleaning (TF-IDF vs BERT Optimized)

def clean_for_tfidf(text, cleaner):
    """
    Produce a bag-of-words friendly string for TF-IDF vectorization.

    The text is pushed through the cleaner's steps in this order: URLs,
    emails, mentions, hashtag symbols, lowercasing, full punctuation
    removal, digit removal, stopword removal, lemmatization, whitespace
    collapse. Punctuation is stripped before stopwords are filtered.

    Args:
        text: Raw comment text; missing or non-string values are accepted.
        cleaner: Object exposing the TextCleaner step methods used below.

    Returns:
        The cleaned string, or "" for missing / non-string input.
    """
    if pd.isna(text) or not isinstance(text, str):
        return ""

    def strip_all_punctuation(value):
        # Sentence-ending marks are dropped as well
        return cleaner.remove_punctuation(value, keep_sentence_end=False)

    steps = (
        # Noise removal and case folding
        cleaner.remove_urls,
        cleaner.remove_emails,
        cleaner.remove_mentions,
        cleaner.remove_hashtags,
        cleaner.lowercase,
        # Symbols and digits
        strip_all_punctuation,
        cleaner.remove_numbers,
        # Vocabulary reduction
        cleaner.remove_stopwords,
        cleaner.lemmatize,
        # Final tidy-up
        cleaner.remove_extra_whitespace,
    )
    for step in steps:
        text = step(text)

    return text.strip()

def clean_for_bert(text, cleaner):
    """
    Lightly clean text for a BERT-style model.

    Only URLs and email addresses are removed (via the cleaner); every '@'
    and '#' character is then deleted while the word that followed it
    stays, and whitespace runs are collapsed. Case, punctuation, numbers
    and stopwords are left as they are.

    Args:
        text: Raw comment text; missing or non-string values are accepted.
        cleaner: Object exposing remove_urls, remove_emails and
            remove_extra_whitespace.

    Returns:
        The lightly cleaned string, or "" for missing / non-string input.
    """
    if pd.isna(text) or not isinstance(text, str):
        return ""

    # Strip link and address noise first
    without_links = cleaner.remove_emails(cleaner.remove_urls(text))

    # Delete the marker characters themselves; mention and tag words remain
    without_markers = without_links.translate({ord('@'): None, ord('#'): None})

    # Normalise spacing; nothing else is altered
    return cleaner.remove_extra_whitespace(without_markers).strip()


# Variant 1 - heavily normalised text for TF-IDF
print("\n1. Cleaning for TF-IDF (aggressive preprocessing)...")
merged_df['text_tfidf'] = merged_df['text'].apply(clean_for_tfidf, args=(cleaner,))

# Variant 2 - lightly cleaned text for BERT
print("2. Cleaning for BERT (minimal preprocessing)...")
merged_df['text_bert'] = merged_df['text'].apply(clean_for_bert, args=(cleaner,))

# Variant 3 - basic cleaned text, used for general analysis / EDA
print("3. Creating basic cleaned version for EDA...")
merged_df['text_clean'] = merged_df['text'].map(cleaner.clean_basic)

print("\n✓ Text cleaning complete!")


1. Cleaning for TF-IDF (aggressive preprocessing)...
2. Cleaning for BERT (minimal preprocessing)...
3. Creating basic cleaned version for EDA...

✓ Text cleaning complete!


In [7]:
def engineer_features(df):
    """Return a copy of ``df`` extended with derived analysis features.

    Reads ``created_at`` (datetime), ``text_clean``, ``text``, ``likes``,
    ``replies`` and ``sentiment_polarity``. The new columns are appended in
    this order: year, month, day_of_week, hour, is_weekend, word_count,
    exclamation_count, question_count, engagement_score, likes_log,
    replies_log, engagement_log, is_positive, is_negative, is_neutral,
    sentiment_magnitude. The input frame is left untouched.
    """

    def count_words(value):
        # Whitespace-separated token count of the stringified value
        return len(str(value).split())

    def char_counter(char):
        # Builds a per-value counter for one literal character
        return lambda value: str(value).count(char)

    return (
        df
        # Calendar parts of the timestamp; days 5 and 6 count as weekend
        .assign(
            year=lambda d: d['created_at'].dt.year,
            month=lambda d: d['created_at'].dt.month,
            day_of_week=lambda d: d['created_at'].dt.dayofweek,
            hour=lambda d: d['created_at'].dt.hour,
            is_weekend=lambda d: d['day_of_week'].isin([5, 6]).astype(int),
        )
        # Word count uses the basic-cleaned text; '!' / '?' counts use the raw text
        .assign(
            word_count=lambda d: d['text_clean'].apply(count_words),
            exclamation_count=lambda d: d['text'].apply(char_counter('!')),
            question_count=lambda d: d['text'].apply(char_counter('?')),
        )
        # Engagement: a reply is weighted twice as much as a like, and the
        # skewed counts get a log1p-transformed companion column.
        # NOTE(review): np.log1p is not finite for inputs <= -1; confirm that
        # likes/replies can never be negative in the source data.
        .assign(
            engagement_score=lambda d: d['likes'] + (d['replies'] * 2),
            likes_log=lambda d: np.log1p(d['likes']),
            replies_log=lambda d: np.log1p(d['replies']),
            engagement_log=lambda d: np.log1p(d['engagement_score']),
        )
        # Sentiment indicator flags around the +/-0.1 polarity band
        .assign(
            is_positive=lambda d: (d['sentiment_polarity'] > 0.1).astype(int),
            is_negative=lambda d: (d['sentiment_polarity'] < -0.1).astype(int),
            is_neutral=lambda d: (
                (d['sentiment_polarity'] >= -0.1) & (d['sentiment_polarity'] <= 0.1)
            ).astype(int),
            sentiment_magnitude=lambda d: np.abs(d['sentiment_polarity']),
        )
    )


In [8]:
# Build the feature-engineered frame; engineer_features returns a new
# DataFrame, so merged_df itself is left unchanged by this call
clean_df = engineer_features(merged_df)

In [9]:
# Persist the feature-engineered frame as CSV (row index is not written)
clean_df.to_csv('../data/df_cleaned.csv', index=False)