# LABELLING - ACTIVE LEARNING

In [40]:
# Install the Hugging Face libraries imported below (transformers, datasets) into the kernel's environment
%pip install transformers datasets





[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [50]:
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset

## 1. Labelling and Finetuning functions

In [35]:
def label_data(model, tokenizer, dataset, round):
  """
  Label the data with the provided model and save the labelled data to a csv file.
  Additionally, save the 100 rows with the lowest RoBERTa confidence scores
  to a second csv file.

  Params:
  model - the model to be used for sentiment analysis
  tokenizer - the tokenizer to be used for sentiment analysis
  dataset - dataframe containing the entire dataset; it must have a "text" column.
            The "roberta_label" and "roberta_score" columns are added to it in place.
  round - active learning round (used in the output file names)

  Returns:
  The same dataframe that was passed in, with the two prediction columns added.
  """
  # Imported here so the function does not rely on a notebook-level torch import
  import torch

  # Use the first GPU when one is available; -1 tells the pipeline to run on the CPU
  device = 0 if torch.cuda.is_available() else -1

  # Initialize the sentiment analysis pipeline
  sentiment_pipeline = pipeline("text-classification",
                                model=model,
                                tokenizer=tokenizer,
                                device=device)

  # Extract the text column of the dataset as a list
  reviews = dataset["text"].tolist()

  # Calculate the sentiment of each of the reviews
  print(f"\nRound {round} - Automated Labelling ")
  print("Predicting sentiment labels of data...")

  # Tokenizer settings forwarded by the pipeline: pad each batch and cut inputs at 512 tokens
  kwargs = {'padding':True,'truncation':True,'max_length':512}
  results = sentiment_pipeline(reviews, **kwargs)

  print("Sentiment labels predicted.")
  print("Saving labeled data to a csv files...")

  # Add the sentiment (as a -1/0/1 code) and its score to the dataset DataFrame
  label2id = {"positive": 1, "negative": -1, "neutral": 0}
  dataset["roberta_label"] = [label2id[res["label"]] for res in results]
  dataset["roberta_score"] = [res["score"] for res in results]

  # Save the labeled data to a csv file
  dataset.to_csv(f'../Data/Labelling/round{round}_roberta_labelled_all_data.csv', index=False)

  # Save 100 rows with the lowest RoBERTa confidence scores to a new CSV file
  df_low_confidence = dataset.nsmallest(100, 'roberta_score')
  df_low_confidence.to_csv(f'../Data/Labelling/round{round}_roberta_labelled_low_confidence.csv', index=False)

  print(f"Completed Round {round} - Automated Labeling")

  return dataset

In [None]:
# Load, process and tokenize the manual data for each round
def process_manual_data(tokenizer, round):
  """
  Load the manually labelled data for this round and all earlier rounds,
  clean it, and tokenize it for fine-tuning.

  Params:
  tokenizer - the tokenizer of the model that will be fine-tuned
  round - active learning round whose manual labels were just added

  Returns:
  A tokenized Dataset with the "text" and "manual_label" columns plus the
  tokenizer outputs.
  """
  # Wait till the manually labelled data for the round is ready
  ready = input(f"Press 'y' when the manually labelled data for round {round} is added to the Data/Labelling/Manual folder: ")

  while ready.lower()!= 'y':
    print("Please add the manually labelled data to the Data/Labelling/Manual folder.")
    ready = input(f"Press 'y' when the manual data for round {round} is added to the Manual folder: ")

  # Load the manual data for the round, and all the rounds before it (to retain previously learnt patterns).
  # Rounds are read newest first so that, when duplicates are dropped below,
  # the label from the most recent round is always the one that survives.
  round_order = [round, *range(round - 1, 0, -1)]
  round_frames = [
    pd.read_csv(f'../Data/Labelling/Manual/round{i}_manual_low_confidence.csv')
    for i in round_order
  ]
  manual_data = pd.concat(round_frames, ignore_index=True)

  # Keep only the text and the manual label columns
  wanted_columns = ['text', 'manual_label']
  manual_data = manual_data[[col for col in manual_data.columns if col in wanted_columns]]

  # Drop the duplicates from the manual data, keeping the first occurrence (latest label)
  manual_data = manual_data.drop_duplicates(subset=['text'], keep='first')

  # Drop the rows which are NaN, or contain '2' values in the manual_label column (rows marked irrelevant during manual labelling)
  manual_data = manual_data[manual_data['manual_label'] != 2].dropna()

  # Reset the index so the filtered row numbers are not carried into the Dataset as an extra column
  manual_data = manual_data.reset_index(drop=True)

  # Convert to Dataset object
  manual_dataset = Dataset.from_pandas(manual_data)

  # Tokenize the data using the model's tokenizer
  manual_data_tokenized = manual_dataset.map(
    lambda instance: tokenizer(instance["text"], truncation=True, max_length=512),
    batched=True
  )

  print(f"Round {round} - Manual data loaded and processed.")

  return manual_data_tokenized
    

In [None]:
# Reference: https://huggingface.co/learn/nlp-course/en/chapter3/3?fw=pt
def finetune(model, train_data, tokenizer, round):
  """
  Fine-tune the model on the manually labelled data and save the result.

  Params:
  model - the sequence classification model to fine-tune
  train_data - tokenized Dataset with the manually labelled rows
  tokenizer - the model's tokenizer (used for dynamic padding)
  round - active learning round (used in the output paths)

  Returns:
  The fine-tuned model, so the caller can use it for the next round.
  """
  # Trainer drops every column the model's forward() does not accept, so the
  # -1/0/1 manual codes must be exposed as a `labels` column holding the
  # model's own class ids. Skipped when the data already has a label column.
  existing_label_columns = {"label", "labels", "label_ids"} & set(train_data.column_names)
  if "manual_label" in train_data.column_names and not existing_label_columns:
    code2name = {1: "positive", -1: "negative", 0: "neutral"}
    name2id = model.config.label2id
    train_data = train_data.map(
      lambda instance: {"labels": name2id[code2name[int(instance["manual_label"])]]}
    )

  # Define a data collator object for dynamic padding (padding to the maximum length of the batch)
  data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

  # Configure the training arguments
  training_arguments = TrainingArguments(
    output_dir = f'../Models/round{round}_finetuned_model_checkpoints/',
    num_train_epochs = 3
    )

  trainer = Trainer(
    model,
    training_arguments,
    train_dataset = train_data,
    data_collator = data_collator,
    tokenizer = tokenizer,
  )

  # TODO:
  # Create a validation set, add compute_metrics

  trainer.train()
  trainer.save_model(f'../Models/round{round}_finetuned_model')

  # Hand the trained model back; returning nothing would leave the caller with None
  return trainer.model

## 2. Run Active Learning Loop

Active learning lets us manually label the most informative parts of the dataset: the examples that confuse the model the most.

In [47]:
# Function to conduct the requested number of active learning rounds
def active_learning(model, tokenizer, dataset, rounds = 5):
  """
  Run the active learning loop: label, collect manual labels, fine-tune.

  Params:
  model - the model to start from
  tokenizer - the model's tokenizer
  dataset - dataframe containing the entire dataset
  rounds - number of active learning rounds to run (default 5)
  """
  for round_number in range(1, rounds+1):
    print (f"Round {round_number} of Active Learning")
    # 1. Using the model, automatically label the entire dataset
    label_data(model = model,
               dataset = dataset,
               tokenizer = tokenizer,
               round = round_number)

    # 2. Load the manually labeled data, including the newly labeled data from the previous round
    train_data = process_manual_data(tokenizer = tokenizer,
                                     round = round_number)

    # 3. Fine-tune the model on the manually labeled data
    finetuned_model = finetune(model = model,
                               train_data = train_data,
                               tokenizer = tokenizer,
                               round = round_number)

    # Keep the current model object if finetune() returns nothing,
    # so the next round never runs with model = None
    if finetuned_model is not None:
      model = finetuned_model

    print (f"Completed Round {round_number} of Active Learning")


In [None]:
# Read the selected data from its csv file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='../Data/selected_data.csv')

In [45]:
# Fetch the pretrained sentiment checkpoint and its matching tokenizer from Hugging Face
pretrained_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Conduct active learning (the keyword is `rounds`, matching the active_learning signature)
active_learning(model, tokenizer, dataset, rounds = 5)

## Create a Validation Set from previously labelled data

In [64]:
# Load the previously labelled data (forward slashes keep the path valid on every OS)
old_manual_label = pd.read_csv('../Data/Archived/labelled_data_manual_tfidf_transformer.csv')

In [65]:
# Load the selected data (forward slashes keep the path valid on every OS)
selected_dataset = pd.read_csv('../Data/selected_data.csv')

In [66]:
# Keep only the rows of old_manual_label whose m_label_1 is not 4
old_manual_label = old_manual_label.loc[old_manual_label['m_label_1'].ne(4)]

In [67]:
# Summary statistics of old_manual_label at this stage (row count and value ranges)
old_manual_label.describe()

Unnamed: 0,number_of_comments,number_of_upvotes,similarity,score_1,m_label_1
count,268.0,268.0,268.0,268.0,268.0
mean,150.951493,192.119403,0.287343,0.726031,-0.033582
std,386.466323,918.963661,0.121697,0.151392,0.831716
min,0.0,-19.0,0.101015,0.385341,-1.0
25%,11.0,2.0,0.191583,0.595328,-1.0
50%,46.0,7.0,0.258199,0.747033,0.0
75%,140.5,45.0,0.353553,0.859085,1.0
max,3958.0,8057.0,0.707107,0.975497,1.0


In [70]:
# Remove rows from old_manual_label that are present in selected_dataset (based on post_id and comment_id)
# The '_' separator keeps the combined key unambiguous: without it the pairs
# ("ab", "c") and ("a", "bc") would both turn into "abc" and be treated as the same row.
# assign() builds new frames, so no column is written onto a filtered slice.
old_manual_label = old_manual_label.assign(
  post_id_comment_id=old_manual_label['post_id'].astype(str) + '_' + old_manual_label['comment_id'].astype(str)
)
selected_dataset = selected_dataset.assign(
  post_id_comment_id=selected_dataset['post_id'].astype(str) + '_' + selected_dataset['comment_id'].astype(str)
)
old_manual_label = old_manual_label[~old_manual_label['post_id_comment_id'].isin(selected_dataset['post_id_comment_id'])]

In [71]:
# Summary statistics again, to see how many rows remain after removing the overlap with selected_dataset
old_manual_label.describe()

Unnamed: 0,number_of_comments,number_of_upvotes,similarity,score_1,m_label_1
count,199.0,199.0,199.0,199.0,199.0
mean,132.809045,136.396985,0.287043,0.722433,-0.025126
std,309.480069,703.868178,0.123452,0.150289,0.825339
min,0.0,-19.0,0.101015,0.385341,-1.0
25%,11.0,2.0,0.179605,0.59695,-1.0
50%,47.0,7.0,0.258199,0.74139,0.0
75%,125.0,39.0,0.377964,0.855132,1.0
max,3039.0,7595.0,0.707107,0.975497,1.0


In [None]:
# Remove the Cleaned Text, similarity, label_1, score_1 and post_id_comment_id columns
old_manual_label = old_manual_label.drop(
  columns=['Cleaned Text', 'similarity', 'label_1', 'score_1', 'post_id_comment_id']
)

In [77]:
# Strip the word None when it appears at the very end of the text field
old_manual_label = old_manual_label.assign(
  text=old_manual_label['text'].str.replace('None$', '', regex=True)
)

In [78]:
# Preview the first rows of the cleaned frame
old_manual_label.head()

Unnamed: 0,post_id,subreddit,post_title,post_body,number_of_comments,readable_datetime,post_author,number_of_upvotes,query,text,comment_id,comment_body,comment_author,m_label_1
1,1icahc2,ChatGPT,Why does deepseek keep calling itself chatgpt,"well I know identity doesn’t matter,",116,2025-01-28 23:50:33,baskerville_clan,166,,Why does deepseek keep calling itself chatgpt ...,,,,0.0
2,1ib7xft,ChatGPT,Please bro stop using the free better alternat...,,860,2025-01-27 16:40:08,analgerianabroad,124,,"""Faking"" low cost is literally what ChatGPT wa...",m9h65nu,"""Faking"" low cost is literally what ChatGPT wa...",Efrayl,-1.0
3,11r0qx0,OpenAI,API Throttling (not rate limit),I swear to god this is happening. I've been re...,5,2023-03-14 12:33:17,DocmodApp,2,rate limit,API Throttling (not rate limit) I swear to god...,,,,-1.0
4,1hgna9l,OpenAI,Google has overshadowed 12 days of open ai til...,The response open ai would have expected from ...,243,2024-12-18 02:55:27,No_Macaroon_7608,15,Sora,Google was so bad I thought did they get AI sc...,m2kzjk0,Google was so bad I thought did they get AI sc...,mike7seven,-1.0
5,11dto2l,OpenAI,Could Chat GPT create a pdf that has non-stand...,"Hi, I am reviewing a pdf and when I changed th...",0,2023-02-28 05:55:34,Few_Mathematician_13,0,ChatGPT review,Could Chat GPT create a pdf that has non-stand...,,,,0.0


In [80]:
# Save the old_manual_label to a new csv file (forward slashes keep the path valid on every OS)
old_manual_label.to_csv('../Data/Labelling/Manual/manual_val_test_set_tfidf.csv', index=False)