# Data analysis
### Prerequisites

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Optional display toggles, left disabled. Uncomment the first line to stop pandas
# from truncating long text cells (useful when reading whole comments); uncomment
# the second to restore the default column width.
# pd.set_option("display.max_colwidth", None) # turn ON full text
# pd.reset_option("display.max_colwidth") # turn OFF full text

### Load the dataset

In [3]:
# Read the cleaned YouTube comments into the working frame used by every later cell.
comments_csv_path = "youtube_comments_clean.csv"
df_comments = pd.read_csv(comments_csv_path)

In [4]:
# Add columns to track labels. All three start out empty (missing) for every row.
df_comments['relevance_label'] = np.nan  # Step 1: relevant = 1 / irrelevant = 0
df_comments['agree_label'] = np.nan      # Step 2: agree = 1 / neutral = 0 / disagree = -1

# dataset_split will hold the strings 'train' / 'val' / 'test'. Initialising it
# with a bare np.nan would create a float64 column, and writing strings into a
# float column triggers pandas' incompatible-dtype FutureWarning (an error in
# later pandas versions). Create it with object dtype so the strings fit;
# .isna() still reports the unassigned rows as missing.
df_comments['dataset_split'] = pd.Series(np.nan, index=df_comments.index, dtype=object)  # Track train/val/test

### Split the dataset into Train-Val-Test (60-20-20)

In [5]:
# Shuffle the rows reproducibly, then renumber them 0..N-1 so that the indices
# returned by the splits below are valid row labels in df_comments.
df_comments = df_comments.sample(frac=1, random_state=42)
df_comments = df_comments.reset_index(drop=True)

# Hold out 20% of all rows as the test set.
train_val, test = train_test_split(df_comments, test_size=0.2, random_state=42)

# Carve validation out of the remaining 80%: 0.25 * 80% = 20% of the full data,
# leaving 60% for training.
train, val = train_test_split(train_val, test_size=0.25, random_state=42)

# Record each row's split in the dataset_split column.
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    df_comments.loc[split_frame.index, 'dataset_split'] = split_name

print("Train:", len(train), "Val:", len(val), "Test:", len(test))

Train: 3966 Val: 1322 Test: 1322


  df_comments.loc[train.index, 'dataset_split'] = 'train'


## Step 1: Relevance Classification (Relevant vs Irrelevant)
### Sampling

In [6]:
# Candidate rows: in the training split and without a relevance label yet.
in_train_split = df_comments['dataset_split'] == 'train'
needs_relevance_label = df_comments['relevance_label'].isna()

# Draw a reproducible sample of 1000 candidates.
sample_to_label = df_comments[in_train_split & needs_relevance_label].sample(n=1000, random_state=42)

# Keep only the comment text and add a blank relevance_label column to fill in by hand.
sample_to_label_export = sample_to_label[['clean_comment']].assign(relevance_label="")

# Write the labeling sheet (without the DataFrame index).
sample_to_label_export.to_excel("relevance_label_sample.xlsx", index=False)
print("Exported 1000 comments for manual relevance labeling.")

Exported 1000 comments for manual relevance labeling.


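### Run the code below after completing manual labeling (fill in `relevance_label` in `relevance_label_sample.xlsx`, keep the row order, and save it as `relevance_label_sample_labeled.xlsx`)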

In [None]:
# Load the manually labeled sheet.
# NOTE(review): assumed to be the exported relevance_label_sample.xlsx with the
# relevance_label column filled in and the row order left unchanged — confirm.
labeled_relevance = pd.read_excel("relevance_label_sample_labeled.xlsx")

# The sheet was exported with index=False, so after reading it back
# labeled_relevance.index is just 0..N-1 and says nothing about which rows of
# df_comments were sampled. Using it as row labels would write the labels onto
# the first N rows of df_comments instead of the sampled ones. The sampled rows
# are identified by sample_to_label.index, in the same order they were exported,
# so align by position — after checking that the row counts agree.
if len(labeled_relevance) != len(sample_to_label):
    raise ValueError(
        f"Labeled file has {len(labeled_relevance)} rows but {len(sample_to_label)} "
        "comments were exported; cannot align labels with the sampled rows."
    )

# Merge back into df_comments (.to_numpy() drops the 0..N-1 index so the values
# are assigned positionally to the sampled rows).
df_comments.loc[sample_to_label.index, 'relevance_label'] = labeled_relevance['relevance_label'].to_numpy()

### Train BERT for step 1 (relevance) classification

## Step 2: Sentiment/Agreement Classification (Agree/Neutral/Disagree)

In [None]:
# Keep only the comments labeled as relevant (relevance_label == 1);
# swap in predicted labels here if needed.
is_relevant = df_comments['relevance_label'].eq(1)
relevant_comments = df_comments[is_relevant]

### Sampling

In [None]:
# Draw a reproducible sample of 300 relevant comments for agreement labeling.
sample_agree = relevant_comments.sample(n=300, random_state=42)

# Export only the comment text plus a blank agree_label column to fill in by hand.
sample_agree_export = sample_agree[['clean_comment']].assign(agree_label="")
sample_agree_export.to_excel("agree_label_sample.xlsx", index=False)
