# Data analysis
- Step 1: Relevance Classification (Relevant vs Irrelevant)
- Step 2: Agreement Classification (Agree/Neutral/Disagree)
### Prerequisites

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from wordcloud import WordCloud

In [None]:
# pd.set_option("display.max_colwidth", None) # turn ON full text
# pd.reset_option("display.max_colwidth") # turn OFF full text

### Load the dataset

In [None]:
# Read the cleaned YouTube comments. Later cells rely on the `comment`,
# `clean_comment` and `comment_likes` columns of this file.
df_comments = pd.read_csv("youtube_comments_clean.csv")

In [None]:
# Add placeholder columns that later cells fill in
df_comments['relevance_label'] = np.nan  # Step 1: relevant = 1 / irrelevant = 0
df_comments['agree_label'] = np.nan      # Step 2: agree = 1 / neutral = 0 / disagree = -1

# The split column later receives the strings 'train' / 'val' / 'test'.
# Start it as an object column: writing strings into a float NaN column is a
# deprecated, dtype-incompatible assignment in recent pandas.
df_comments['dataset_split'] = pd.Series(index=df_comments.index, dtype="object")

### Split the dataset into Train-Val-Test (60-20-20)

In [None]:
# Shuffle the rows reproducibly and renumber the index from 0
df_comments = df_comments.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 20% as the test set ...
train_val, test = train_test_split(df_comments, test_size=0.2, random_state=42)

# ... then take 25% of the remaining 80% as validation (60/20/20 overall)
train, val = train_test_split(train_val, test_size=0.25, random_state=42)

# Record each row's split membership on the main frame
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    df_comments.loc[split_frame.index, 'dataset_split'] = split_name

print("Train:", len(train), "Val:", len(val), "Test:", len(test))

## Step 1: Relevance Classification (Relevant vs Irrelevant)
### Sampling

In [None]:
# Sample 1000 unlabeled comments from train
is_unlabeled_train = (df_comments['dataset_split'] == 'train') & df_comments['relevance_label'].isna()
sample_to_label = df_comments[is_unlabeled_train].sample(1000, random_state=42)

# Export the original comment (easier to read while coding) together with
# `clean_comment`: the labeled sheet is merged back on `clean_comment`, so that
# column has to be present in the file.
sample_to_label_export = sample_to_label[['comment', 'clean_comment']].copy()
sample_to_label_export['relevance_label'] = ""  # empty column to fill manually

# Export to Excel for manual labeling. Never overwrite an existing sheet:
# re-running the notebook would otherwise wipe the manual labels.
if os.path.exists("relevance_label_sample.xlsx"):
    print("relevance_label_sample.xlsx already exists - keeping it (delete the file to export a fresh sample).")
else:
    sample_to_label_export.to_excel("relevance_label_sample.xlsx", index=False)
    print("Exported 1000 comments for manual relevance labeling.")

### Run the code below after completing manual labelling

In [None]:
# Load the manually labeled Excel file (the `relevance_label` column must be filled in first)
labeled_relevance = pd.read_excel("relevance_label_sample.xlsx")

In [None]:
# Merge the labels back into df_comments.
# Keep only the merge key and the label, drop unlabeled rows, and keep one label
# per distinct comment text so the left merge cannot multiply comment rows.
relevance_labels = (
    labeled_relevance[['clean_comment', 'relevance_label']]
    .dropna(subset=['relevance_label'])
    .drop_duplicates(subset='clean_comment')
)

# df_comments already carries an all-NaN `relevance_label` placeholder. Drop it
# before merging; otherwise pandas emits relevance_label_x / relevance_label_y
# and the later `relevance_label` lookups fail.
df_comments_merged1 = df_comments.drop(columns=['relevance_label']).merge(
    relevance_labels,
    on="clean_comment",
    how="left",
    validate="many_to_one"   # fail loudly if the label sheet still has duplicate keys
)

In [None]:
# Save the comments together with the merged manual relevance labels
df_comments_merged1.to_csv("youtube_comments_relevance_labled.csv", index=False)

**Note:** Although I manually labeled 1000 comments for the first step of SML, after merging the labels back into the main dataset, only 973 labeled comments remained. This is because some comments in the dataset are duplicated, and duplicates were removed during the merge to ensure each comment has a unique label for training.

### Train BERT for step 1 (relevance) classification

In [None]:
# Restrict to the rows that received a manual relevance label
has_relevance_label = df_comments_merged1['relevance_label'].notna()
df_labeled = df_comments_merged1.loc[has_relevance_label].copy()
print(f"Labeled comments available for training: {len(df_labeled)}")

In [None]:
# Text + label only, without missing values, with the label cast to integer
df = (
    df_labeled[['clean_comment', 'relevance_label']]
    .dropna()
    .astype({'relevance_label': int})
)

In [None]:
# 70/15/15 split, stratified on the label:
# 15% is held out for test, then 0.1765 of the remaining 85% (~15% of the total)
# becomes the validation set.
texts = df['clean_comment'].tolist()
labels = df['relevance_label'].tolist()

X_train_val, X_test, y_train_val, y_test = train_test_split(
    texts,
    labels,
    stratify=labels,
    test_size=0.15,
    random_state=42,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    stratify=y_train_val,
    test_size=0.1765,
    random_state=42,
)

In [None]:
# Wrap each split as a Hugging Face Dataset with `text` and `label` columns
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict({'text': split_texts, 'label': split_labels})
    for split_texts, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
# Tokenization
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the `text` entries of a batch with the global `tokenizer`.

    Sequences are truncated to at most 128 tokens and padded within the batch.
    """
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)


train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors
torch_columns = ['input_ids', 'attention_mask', 'label']
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format('torch', columns=torch_columns)

In [None]:
# Initialize BERT model with a 2-way classification head (1 = relevant, 0 = irrelevant)
num_labels = 2

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

Source of code: [transformers_bert_classification_collab.ipynb](https://github.com/uvacw/teaching-bdaca/blob/main/modules/machinelearning-text-exercises/transformers_bert_classification_collab.ipynb)

In [None]:
# Training
# Keep Weights & Biases logging switched off for this run
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

# Metric used to pick the best checkpoint; must be a key returned by compute_metrics
metric_name = "accuracy"

training_args = TrainingArguments(
    # Where to save model + checkpoints
    output_dir="./results",

    # Training setup
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_steps=0,
    weight_decay=0.01,

    # Logging
    logging_dir="./logs",
    logging_steps=20,

    # Evaluation & saving (same cadence, so the best checkpoint can be reloaded)
    eval_strategy="steps",   # evaluate every eval_steps
    eval_steps=50,
    save_strategy="steps",   # save checkpoint every save_steps
    save_steps=50,

    # Best-model loading
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    greater_is_better=True,

    # Run on CPU or GPU automatically (Trainer + accelerate handle this)
    # You don't need to set device manually here
)


# Define metrics
def compute_metrics(p):
    """Compute accuracy and binary precision/recall/F1 for one evaluation pass.

    `p` is the prediction object handed over by the Trainer; this function reads
    `p.predictions` (class scores, arg-maxed over axis 1) and `p.label_ids`.
    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    # zero_division=0 matches the Step 2 metric function: undefined scores are
    # reported as 0 without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # our custom evaluation function
)

trainer.train()

In [None]:
# Lock a decision threshold: a comment is predicted relevant when its
# predicted probability for class 1 is at least this value
THRESHOLD = 0.35

Wrap the full dataframe into a HuggingFace Dataset object:

In [None]:
# Select text for classification: every comment, as a plain Python string
df_full = df_comments.copy()

clean_text_column = df_full["clean_comment"].astype(str)
texts = clean_text_column.tolist()

In [None]:
# Tokenize the full dataset with the same tokenizer.
# `tokenize` is already defined in the tokenization cell above and reads the
# global `tokenizer` at call time, so it is reused here instead of redefined.
full_dataset = Dataset.from_dict({"text": texts})
full_dataset = full_dataset.map(tokenize, batched=True)

In [None]:
# Expose only the model inputs as PyTorch tensors (there are no labels here)
model_input_columns = ["input_ids", "attention_mask"]
full_dataset.set_format(type="torch", columns=model_input_columns)

In [None]:
# Apply the model to the FULL dataset
preds = trainer.predict(full_dataset)

# Softmax over the two class scores; column 1 is the probability of "relevant"
relevance_scores = torch.tensor(preds.predictions)
probs = torch.softmax(relevance_scores, dim=1)[:, 1]

# Apply the locked threshold and store both the decision and the probability
meets_threshold = probs >= THRESHOLD
df_comments["relevant_pred"] = meets_threshold.cpu().numpy().astype(int)
df_comments["relevant_prob"] = probs.cpu().numpy()

In [None]:
# Final check: share of comments predicted irrelevant (0) vs relevant (1)
df_comments["relevant_pred"].value_counts(normalize=True)

In [None]:
# Keep only the comments the model predicted as relevant
predicted_relevant = df_comments["relevant_pred"].eq(1)
df_relevant = df_comments.loc[predicted_relevant]
print("Relevant comments:", len(df_relevant))

In [None]:
# Save full dataset with the relevance predictions (`relevant_pred`, `relevant_prob`)
df_comments.to_csv("youtube_comments_relevance_trained.csv", index=False)

## Step 2: Agreement Classification (Agree/Neutral/Disagree)
### Prepare EXCEL file for step 2 labeling

In [None]:
# Load the manually labeled relevance Excel (the same file used for Step 1 training)
labeled_relevance = pd.read_excel("relevance_label_sample.xlsx")

In [None]:
# Keep the comments that were manually coded as relevant (label 1)
coded_relevant = labeled_relevance['relevance_label'].eq(1)
relevant_comments = labeled_relevance.loc[coded_relevant]

In [None]:
# Keep only the comment text and create an empty column for the agreement label.
# `.copy()` makes this an independent frame, so adding the column does not write
# into a slice of `relevant_comments` (avoids pandas' SettingWithCopyWarning).
step2_comments = relevant_comments[['clean_comment']].copy()
step2_comments['agree_label'] = ""  # empty for manual labeling

In [None]:
# Export to Excel for manual labeling. Never overwrite an existing sheet:
# re-running the notebook would otherwise wipe the manual agreement labels.
if os.path.exists("agreement_label.xlsx"):
    print("agreement_label.xlsx already exists - keeping it (delete the file to export again).")
else:
    step2_comments.to_excel("agreement_label.xlsx", index=False)
    print(f"Exported {len(step2_comments)} relevant comments for Step 2 labeling.")

### Run the code below after completing manual labelling

In [None]:
# Load the manually labeled Excel file (the `agree_label` column must be filled in first)
labeled_agree = pd.read_excel("agreement_label.xlsx")

In [None]:
# Cast the merge key to str on both sides so the values compare consistently
for frame in (df_comments_merged1, labeled_agree):
    frame['clean_comment'] = frame['clean_comment'].astype(str)

In [None]:
# Remove the all-NaN `agree_label` placeholder before merging the manual labels.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df_comments_merged1 = df_comments_merged1.drop(columns=['agree_label'], errors='ignore')

In [None]:
# Attach the manual agreement labels.
# Keep one label per distinct comment text: a repeated key in the label sheet
# would otherwise multiply the matching comment rows in a left merge.
agree_labels = labeled_agree.drop_duplicates(subset="clean_comment")

df_comments_merged2 = df_comments_merged1.merge(
    agree_labels,
    on="clean_comment",
    how="left",
    validate="many_to_one"   # fail loudly if the label sheet still has duplicate keys
)

In [None]:
# Save the comments together with the merged manual agreement labels
df_comments_merged2.to_csv("youtube_comments_agree_labled.csv", index=False)

### Train BERT for step 2 (agreement) classification

In [None]:
# Restrict to the rows that received a manual agreement label
has_agree_label = df_comments_merged2['agree_label'].notna()
df_labeled = df_comments_merged2.loc[has_agree_label].copy()
print(f"Labeled comments available for training: {len(df_labeled)}")

In [None]:
# Select the columns for SML
df = df_labeled[['clean_comment', 'agree_label']].dropna()

# Ensure label is integer
df['agree_label'] = df['agree_label'].astype(int)

# The codebook uses -1 / 0 / 1, but a 3-class head with cross-entropy loss needs
# class ids 0..2. Encode disagree -> 0, neither -> 1, agree -> 2 (the inverse of
# the mapping applied to the predictions later in this notebook). Labels that
# are already 0..2 are left untouched; anything else is rejected.
manual_codes = set(df['agree_label'].unique())
if -1 in manual_codes and manual_codes <= {-1, 0, 1}:
    df['agree_label'] = df['agree_label'].map({-1: 0, 0: 1, 1: 2})
elif not manual_codes <= {0, 1, 2}:
    raise ValueError(f"Unexpected agree_label values: {sorted(manual_codes)}")

In [None]:
# 70/15/15 split, stratified on the label:
# 15% is held out for test, then 0.1765 of the remaining 85% (~15% of the total)
# becomes the validation set.
texts = df['clean_comment'].tolist()
labels = df['agree_label'].tolist()

X_train_val, X_test, y_train_val, y_test = train_test_split(
    texts,
    labels,
    stratify=labels,
    test_size=0.15,
    random_state=42,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    stratify=y_train_val,
    test_size=0.1765,
    random_state=42,
)

In [None]:
# Wrap each split as a Hugging Face Dataset with `text` and `label` columns
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict({'text': split_texts, 'label': split_labels})
    for split_texts, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
# Tokenization
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the `text` entries of a batch with the global `tokenizer`.

    Sequences are truncated to at most 128 tokens and padded within the batch.
    """
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)


train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors
torch_columns = ['input_ids', 'attention_mask', 'label']
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format('torch', columns=torch_columns)

In [None]:
# Initialize BERT model
num_labels = 3  # 3 classes for agreement; manual codes are 1 = agree, 0 = neither agree nor disagree, -1 = disagree

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # automatically choose GPU if available, otherwise use CPU
print("Using device:", device)

# Fresh bert-base-uncased weights with a 3-way classification head, moved to the chosen device
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

In [None]:
# Class distribution of the agreement labels used for training
df["agree_label"].value_counts()

Note:
- 1 refers to 0 in manual coding (for neither agree nor disagree)
- 0 refers to -1 in manual coding (for disagree)
- 2 refers to 1 in manual coding (for agree)

I applied class weighting so that the classifier does not over-favor the dominant category, which in this case is "neither agree nor disagree".

In [None]:
# Compute "balanced" class weights from the labeled agreement data, so that
# rarer classes contribute more to the loss
labels = df["agree_label"].values
observed_classes = np.unique(labels)

balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=observed_classes,
    y=labels,
)

# The loss function expects the weights as a float tensor
class_weights = torch.tensor(balanced_weights, dtype=torch.float)
print("Class weights:", class_weights)

In [None]:
# Define a new trainer with weighted loss
class WeightedTrainer(Trainer):
    """Trainer whose training/evaluation loss is class-weighted cross-entropy."""

    def __init__(self, class_weights, *args, **kwargs):
        """Build the regular Trainer, then keep `class_weights` on the model's device."""
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights.to(self.model.device)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        The "labels" entry is removed from `inputs` before the forward pass, so
        the loss is computed here from the logits rather than inside the model.
        When `return_outputs` is true the result is `(loss, outputs)`.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        weighted_ce = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = weighted_ce(outputs.logits, targets)

        if return_outputs:
            return loss, outputs
        return loss

Source of code: [transformers_bert_classification_collab.ipynb](https://github.com/uvacw/teaching-bdaca/blob/main/modules/machinelearning-text-exercises/transformers_bert_classification_collab.ipynb)

In [None]:
# Training
# Keep Weights & Biases logging switched off for this run
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

# Metric used to pick the best checkpoint ('f1' is the macro F1 computed below)
metric_name = "f1"

training_args = TrainingArguments(
    # Where to save model + checkpoints, and where to log
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=20,

    # Optimisation
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # Evaluate and checkpoint every 50 steps
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,

    # Reload the checkpoint with the highest `metric_name` when training ends
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    greater_is_better=True,
)


# Define metrics
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Reads `p.predictions` (class scores, arg-maxed over axis 1) and `p.label_ids`.
    Undefined per-class scores count as 0 (`zero_division=0`).
    """
    y_pred = np.argmax(p.predictions, axis=1)
    y_true = p.label_ids
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }


trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Source of code: [BERTopic_demo.ipynb](https://github.com/uvacw/teaching-bdaca/blob/main/6ec-course/week05/exercises/BERTopic_demo.ipynb)

In [None]:
# Select text for classification: only comments predicted relevant in Step 1
df_full = df_comments.copy() # refers to "youtube_comments_relevance_trained.csv"

predicted_relevant = df_comments["relevant_pred"] == 1
df_relevant = df_comments.loc[predicted_relevant]
print("Relevant comments:", len(df_relevant))

relevant_text_column = df_relevant["clean_comment"].astype(str)
texts = relevant_text_column.tolist()

In [None]:
# Tokenize the relevant comments with the same tokenizer.
# `tokenize` is already defined in the Step 2 tokenization cell above and reads
# the global `tokenizer` at call time, so it is reused here instead of redefined.
full_dataset = Dataset.from_dict({"text": texts})
full_dataset = full_dataset.map(tokenize, batched=True)

In [None]:
# Expose only the model inputs as PyTorch tensors (there are no labels here)
model_input_columns = ["input_ids", "attention_mask"]
full_dataset.set_format(type="torch", columns=model_input_columns)

In [None]:
# Apply the model to the relevant comments (full_dataset was built from df_relevant)
preds = trainer.predict(full_dataset)
logits = torch.tensor(preds.predictions)

# Predicted class id (0, 1 or 2) and the softmax probability of every class
predicted_class_ids = torch.argmax(logits, dim=1).cpu().numpy()
all_probs = torch.softmax(logits, dim=1).cpu().numpy()

# Create the result columns as NaN, so comments that were not scored stay empty
prob_columns = ["agree_pred_prob_neg", "agree_pred_prob_neutral", "agree_pred_prob_pos"]
for column in ["agree_pred", *prob_columns]:
    df_comments[column] = np.nan

# Write the results into the rows the texts came from (the index of df_relevant)
df_comments.loc[df_relevant.index, "agree_pred"] = predicted_class_ids
for class_id, column in enumerate(prob_columns):
    df_comments.loc[df_relevant.index, column] = all_probs[:, class_id]

In [None]:
# Relabel the predicted class ids to align with the codebook:
# class 0 -> -1 (disagree), class 1 -> 0 (neither), class 2 -> 1 (agree).
# NOTE: run this cell exactly once after the prediction cell. It overwrites
# `agree_pred` in place, so a second run would remap the already-remapped codes
# (0 -> -1, 1 -> 0, and -1 -> NaN because -1 is not a key of `mapping`).
mapping = {0: -1, 1: 0, 2: 1}  # adjust based on your training labels
df_comments['agree_pred'] = df_comments['agree_pred'].map(mapping)

In [None]:
# Final check: share of each predicted agreement code
# (value_counts ignores NaN, i.e. the comments that were not scored)
df_comments["agree_pred"].value_counts(normalize=True)

In [None]:
# Save full dataset with the relevance and agreement predictions
df_comments.to_csv("youtube_comments_agree_trained.csv", index=False)

### *RQ1: How much do people agree, disagree, or neither agree nor disagree with “It’s embarrassing to have a boyfriend now”?*

In [None]:
# Reload the predictions saved above, so the analysis below can start from the CSV.
# NOTE(review): the author marked this cell for deletion ("delete!!!") - confirm
# whether it should stay before sharing the notebook.
df_comments = pd.read_csv("youtube_comments_agree_trained.csv")

In [None]:
# Absolute and relative frequency of each predicted agreement code
agree_pred_column = df_comments['agree_pred']

counts = agree_pred_column.value_counts()
print("Counts:\n", counts)

proportions = agree_pred_column.value_counts(normalize=True)
print("\nProportions:\n", proportions)

In [None]:
# Bar chart
# value_counts() orders the categories by frequency, so tick labels and colors
# are looked up from each bar's code instead of assuming a fixed order.
label_names = {-1: 'Disagree', 0: 'Neutral', 1: 'Agree'}
label_colors = {-1: 'red', 0: 'orange', 1: 'green'}
bar_codes = [int(code) for code in counts.index]   # codes may be stored as floats

colors = [label_colors[code] for code in bar_codes]
ax = counts.plot(kind='bar', color=colors)
plt.xlabel('Type of agreement')
plt.ylabel('Number of comments')
ax.set_xticks(range(len(counts)))
ax.set_xticklabels([label_names[code] for code in bar_codes], rotation=0)

# Save the figure locally
plt.savefig('agreement_bar_chart.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Summary statistics of comment likes within each predicted agreement code
df_comments.groupby('agree_pred')['comment_likes'].agg(['sum', 'mean', 'max', 'min', 'median', 'std'])

## Step 3: Topic Modelling of Neutral Comments (BERTopic)
### Prepare data

In [None]:
# Relevant comments whose predicted stance is "neither agree nor disagree" (code 0)
is_relevant = df_comments["relevant_pred"] == 1
is_neutral = df_comments["agree_pred"] == 0
neutral_df = df_comments[is_relevant & is_neutral].copy()

In [None]:
# Vectorization of the clean comments: English stop words removed, unigrams and bigrams
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,      # 1 keeps every term that occurs in at least one document; raise it to drop rare terms
    max_df=0.85      # ignore very frequent words
)

### Train a BERTopic model

In [None]:
# Load the SentenceBERT model that BERTopic uses to embed the comments
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Apply the BERTopic model
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=50,
    calculate_probabilities=True,
    verbose=True
)

# Cast to str, as in the classification steps: after reloading the CSV an empty
# comment is read back as NaN (a float), which the embedding model cannot encode.
docs = neutral_df['clean_comment'].astype(str).tolist()
topics, probs = topic_model.fit_transform(docs)

### Inspect the results

In [None]:
# See the topics: one row per topic; also saved to CSV for reference
topic_info = topic_model.get_topic_info()
topic_info.to_csv('bertopic_topic_info.csv', index=False)
topic_info

In [None]:
# Get information for each document (one row per comment in `docs`); also saved to CSV
doc_info = topic_model.get_document_info(docs)
doc_info.to_csv('bertopic_document_info.csv', index=False)
doc_info

### Visualize the results
Source of code: [visualization.ipynb](https://github.com/uvacw/teaching-bdaca/blob/main/modules/basics/visualization.ipynb)

In [None]:
# Bar chart of top keywords for up to 5 topics
fig = topic_model.visualize_barchart(top_n_topics=5)
fig.update_layout(title_text='')  # blank out the figure title
fig.write_html("bertopic_keyword_barchart.html")  # keep an HTML copy of the figure
fig.show()

In [None]:
# Wordcloud overview for each topic
topics = topic_model.get_topics()

# Take up to four topics that the model actually produced, excluding the
# outlier topic -1. Hard-coding [0, 1, 2, 3] raises a KeyError whenever fewer
# than four topics are found.
topic_ids = [topic_id for topic_id in sorted(topics) if topic_id != -1][:4]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()

# Hide every panel's axes up front so unused panels stay blank
for ax in axes:
    ax.axis('off')

for ax, topic_id in zip(axes, topic_ids):
    # Each topic is a list of (word, weight) pairs; the weights size the words
    word_freq = {word: weight for word, weight in topics[topic_id]}

    wc = WordCloud(
        width=600,
        height=400,
        background_color='white',
        max_words=30
    ).generate_from_frequencies(word_freq)

    ax.imshow(wc, interpolation='bilinear')
    ax.set_title(f'Topic {topic_id}', fontsize=12)

plt.tight_layout()

# Save locally
plt.savefig(
    'neutral_topics_wordcloud.png',
    dpi=300,
    bbox_inches='tight'
)

plt.show()