# Data analysis
### Prerequisites

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

In [None]:
# pd.set_option("display.max_colwidth", None) # turn ON full text
# pd.reset_option("display.max_colwidth") # turn OFF full text

### Load the dataset

In [2]:
# Read the cleaned YouTube comments (path is relative to the notebook's working directory).
df_comments = pd.read_csv(filepath_or_buffer="youtube_comments_clean.csv")

In [3]:
# Add placeholder columns that later cells fill in.
df_comments['relevance_label'] = np.nan  # Step 1: relevant = 1 / irrelevant = 0
df_comments['agree_label'] = np.nan      # Step 2: agree = 1 / neutral = 0 / disagree = -1

# The split column will hold the strings 'train' / 'val' / 'test'. Creating it
# with object dtype (instead of a float NaN column) avoids pandas' FutureWarning
# about writing values of an incompatible dtype into a float64 column when the
# split names are assigned later.
df_comments['dataset_split'] = pd.Series(np.nan, index=df_comments.index, dtype="object")

### Split the dataset into Train-Val-Test (60-20-20)

In [4]:
# Shuffle all rows once with a fixed seed and renumber the index from 0.
df_comments = df_comments.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 20% of the rows as the test set.
train_val, test = train_test_split(df_comments, test_size=0.2, random_state=42)

# A quarter of the remaining 80% becomes validation (20% overall), leaving 60% for training.
train, val = train_test_split(train_val, test_size=0.25, random_state=42)

# Record on the main frame which split every row ended up in.
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    df_comments.loc[split_frame.index, 'dataset_split'] = split_name

print("Train:", len(train), "Val:", len(val), "Test:", len(test))

Train: 3966 Val: 1322 Test: 1322


  df_comments.loc[train.index, 'dataset_split'] = 'train'


## Step 1: Relevance Classification (Relevant vs Irrelevant)
### Sampling

In [5]:
# Draw a reproducible sample of still-unlabeled training comments and export it
# for manual relevance labeling.
RELEVANCE_SAMPLE_SIZE = 1000
RELEVANCE_LABEL_FILE = "relevance_label_sample.xlsx"

unlabeled_train = df_comments[
    (df_comments['dataset_split'] == 'train') & (df_comments['relevance_label'].isna())
]

# Cap the request so .sample() does not raise when fewer rows are available.
n_to_label = min(RELEVANCE_SAMPLE_SIZE, len(unlabeled_train))
sample_to_label = unlabeled_train.sample(n_to_label, random_state=42)

# Export the original comment for the annotator to read, plus `clean_comment`,
# which the merge cell further down uses as the join key.
sample_to_label_export = sample_to_label[['comment', 'clean_comment']].copy()
sample_to_label_export['relevance_label'] = ""  # empty column to fill manually

# Re-running the notebook must not wipe a workbook that already holds manual labels.
if os.path.exists(RELEVANCE_LABEL_FILE):
    print(f"{RELEVANCE_LABEL_FILE} already exists - not overwriting it. "
          "Delete or rename the file to export a fresh sample.")
else:
    sample_to_label_export.to_excel(RELEVANCE_LABEL_FILE, index=False)
    print(f"Exported {n_to_label} comments for manual relevance labeling.")

Exported 1000 comments for manual relevance labeling.


### Run the code below after completing manual labelling

In [27]:
# Read the manually labeled Excel workbook back in.
labeled_relevance = pd.read_excel(io="relevance_label_sample.xlsx")

In [28]:
# Merge the manual labels back into df_comments, keyed on the cleaned comment text.

# Keep only the join key and the label from the labeled sheet, drop rows the
# annotator left blank, and keep one row per distinct text so the left merge
# cannot multiply rows of df_comments.
relevance_labels = (
    labeled_relevance[['clean_comment', 'relevance_label']]
    .dropna(subset=['relevance_label'])
    .drop_duplicates(subset='clean_comment')
)

# df_comments already has a `relevance_label` placeholder column (created as NaN
# earlier in the notebook). Merging with it in place would produce
# `relevance_label_x` / `relevance_label_y` instead of a single `relevance_label`,
# so the placeholder is dropped first.
# NOTE(review): assumes the placeholder holds no real labels at this point - confirm.
df_comments_merged1 = df_comments.drop(columns=['relevance_label'], errors='ignore').merge(
    relevance_labels,
    on="clean_comment",
    how="left",
    validate="many_to_one",  # fail loudly if the label table still has duplicate keys
)

In [29]:
# Write the merged frame (comments plus relevance labels) to disk without the index column.
df_comments_merged1.to_csv(path_or_buf="youtube_comments_relevance_labled.csv", index=False)

**Note:** Although I manually labeled 1000 comments for the first step of SML, after merging the labels back into the main dataset, only 973 labeled comments remained. This is because some comments in the dataset are duplicated, and duplicates were removed during the merge to ensure each comment has a unique label for training.

### Train BERT for step 1 (relevance) classification

In [33]:
# Restrict to the rows that carry a relevance label; dropna returns a new frame.
df_labeled = df_comments_merged1.dropna(subset=['relevance_label'])
print(f"Labeled comments available for training: {len(df_labeled)}")

Labeled comments available for training: 973


In [35]:
# Keep only the model input (cleaned text) and the target, drop incomplete rows,
# and store the label as an integer.
df = (
    df_labeled[['clean_comment', 'relevance_label']]
    .dropna()
    .astype({'relevance_label': int})
)

In [None]:
# Stratified 70 / 15 / 15 split of the labeled comments.
texts = df['clean_comment'].to_list()
labels = df['relevance_label'].to_list()

# First carve off 15% as the test set, keeping the class balance.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    texts,
    labels,
    test_size=0.15,
    stratify=labels,
    random_state=42,
)

# 0.1765 of the remaining 85% is roughly 15% of the whole, leaving about 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1765,
    stratify=y_train_val,
    random_state=42,
)

In [None]:
# Wrap each (texts, labels) pair in a Hugging Face Dataset with 'text' and 'label' columns.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict({'text': split_texts, 'label': split_labels})
    for split_texts, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
# Tokenization
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated to 128 tokens and padded to exactly that length.
    Padding to a fixed length (rather than ``padding=True``, which pads to the
    longest sequence of whichever batch ``Dataset.map`` happens to pass in)
    guarantees that every row of a dataset has the same length, so rows coming
    from different map batches can be stacked into one tensor at training time.

    Args:
        batch: Mapping with a ``'text'`` entry holding a list of strings, as
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch (includes ``input_ids`` and
        ``attention_mask``, which are selected below).
    """
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Set format for PyTorch: expose only the columns the model consumes, as tensors
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Set up the BERT sequence classifier for the binary relevance task.
num_labels = 2  # relevance: 1 = relevant, 0 = irrelevant

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = model.to(device)

In [None]:
# Training configuration

# Keep Weights & Biases from logging this run.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_MODE": "offline"})

# Metric used to pick the best checkpoint (e.g. switch to "f1" if preferred).
metric_name = "accuracy"

training_args = TrainingArguments(
    # Output locations: checkpoints/model and logs
    output_dir="./results",
    logging_dir="./logs",

    # Optimisation hyper-parameters
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=0,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # Log every 20 steps; evaluate and checkpoint every 50 steps
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,

    # After training, reload the checkpoint with the highest `metric_name`
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    greater_is_better=True,

    # No device argument here: the Trainer chooses CPU or GPU on its own.
)

# Define metrics
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for the binary relevance task.

    Args:
        p: Evaluation output passed in by the Trainer; ``p.predictions`` holds
            the per-class scores (one row per example) and ``p.label_ids`` the
            true labels.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 refer to the positive class
        (label 1) because ``average='binary'`` is used.
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    # zero_division=0 yields the same values as the default ("warn" also scores
    # an undefined metric as 0) but without emitting a warning at every
    # evaluation step in which the model predicts no positives.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Assemble the Trainer:
#   model           - the Hugging Face Transformers model instantiated above
#   args            - the TrainingArguments defined above
#   train_dataset   - tokenized training split
#   eval_dataset    - tokenized validation split (the test split is kept for the final evaluation)
#   compute_metrics - custom metric function defined above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

In [None]:
# Save the model
# `save_directory` was not defined anywhere above, so this cell raised a
# NameError on a fresh kernel. The model is now stored in a sub-folder of the
# Trainer's output directory.
save_directory = os.path.join(training_args.output_dir, "best_model")
trainer.save_model(save_directory)

In [None]:
# Final check: score the trained model on the held-out test split.
trainer.evaluate(eval_dataset=test_dataset)

## Step 2: Sentiment/Agreement Classification (Agree/Neutral/Disagree)

In [None]:
# Filter the relevant comments.
# The manual relevance labels were merged into `df_comments_merged1`; the
# `relevance_label` column of `df_comments` is still the NaN placeholder, so
# filtering `df_comments` would select no rows.
relevant_comments = df_comments_merged1[df_comments_merged1['relevance_label'] == 1]  # or predicted 1 if needed

### Sampling

In [None]:
# Draw a reproducible sample of relevant comments and export it for manual
# agree / neutral / disagree labeling.
AGREE_SAMPLE_SIZE = 300
AGREE_LABEL_FILE = "agree_label_sample.xlsx"

# Cap the request so .sample() does not raise when fewer relevant comments exist.
n_agree = min(AGREE_SAMPLE_SIZE, len(relevant_comments))
sample_agree = relevant_comments.sample(n_agree, random_state=42)
sample_agree_export = sample_agree[['comment']].copy()
sample_agree_export['agree_label'] = ""  # empty for manual labeling

if n_agree == 0:
    print("No relevant comments available - nothing exported.")
elif os.path.exists(AGREE_LABEL_FILE):
    # Re-running the notebook must not wipe a workbook that already holds manual labels.
    print(f"{AGREE_LABEL_FILE} already exists - not overwriting it. "
          "Delete or rename the file to export a fresh sample.")
else:
    sample_agree_export.to_excel(AGREE_LABEL_FILE, index=False)
    print(f"Exported {n_agree} comments for manual agreement labeling.")
