# LABELLING - ACTIVE LEARNING

In [1]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

## 1. Labelling and Finetuning functions

In [35]:
def label_data(model, tokenizer, dataset, round, n_low_confidence=100,
               output_dir='../Data/Labelling', device=None):
  '''
  Label every row of `dataset` with the provided model and save the
  labelled data to CSV. Additionally, save the rows with the lowest
  confidence scores to a second CSV file.

  Files written inside `output_dir`:
    round{round}_roberta_labelled_all_data.csv       - the full labelled dataset
    round{round}_roberta_labelled_low_confidence.csv - the lowest-scoring rows

  Params:
  model - the model to be used for sentiment analysis
  tokenizer - the tokenizer to be used for sentiment analysis
  dataset - dataframe containing the entire dataset; must have a "text"
            column. NOTE: it is modified in place ("roberta_label" and
            "roberta_score" columns are added or overwritten).
  round - active learning round (used in the output file names)
  n_low_confidence - number of lowest-confidence rows to save (default 100)
  output_dir - directory for the CSV files; created if it does not exist
  device - device handed to the pipeline; when None, GPU 0 is used if CUDA
           is available, otherwise the CPU (-1)

  Returns:
  the same `dataset` dataframe, now carrying the two prediction columns

  Raises:
  KeyError - if a predicted label is not "positive", "negative" or "neutral"
  '''
  import os  # local import: only needed to create the output directory

  if device is None:
    # A hard-coded device=0 fails on machines without a GPU, so fall back to CPU.
    import torch
    device = 0 if torch.cuda.is_available() else -1

  # Initialize the sentiment analysis pipeline
  sentiment_pipeline = pipeline("text-classification",
                                model=model,
                                tokenizer=tokenizer,
                                device=device)

  # Extract the text column of the dataset as a list
  reviews = dataset["text"].tolist()

  # Calculate the sentiment of each of the reviews
  print(f"\nRound {round} - Automated Labelling ")
  print("Predicting sentiment labels of data...")

  # Truncate long reviews to the 512-token limit set here
  kwargs = {'padding':True,'truncation':True,'max_length':512}
  results = sentiment_pipeline(reviews, **kwargs)

  print("Sentiment labels predicted.")
  print("Saving labeled data to a csv files...")

  # Add the sentiment and score to the dataset DataFrame
  label2id = {"positive": 1, "negative": -1, "neutral": 0}
  dataset["roberta_label"] = [label2id[res["label"]] for res in results]
  dataset["roberta_score"] = [res["score"] for res in results]

  # Make sure the target directory exists before writing
  os.makedirs(output_dir, exist_ok=True)

  # Save the labeled data to a csv file
  dataset.to_csv(f'{output_dir}/round{round}_roberta_labelled_all_data.csv', index=False)

  # Save the rows with the lowest confidence scores to a new CSV file
  df_low_confidence = dataset.nsmallest(n_low_confidence, 'roberta_score')
  df_low_confidence.to_csv(f'{output_dir}/round{round}_roberta_labelled_low_confidence.csv', index=False)

  print(f"Completed Round {round} - Automated Labeling")

  return dataset

In [36]:
def load_manual_data(directory, round):
  '''
  Placeholder for loading the manually labelled data.

  Params:
  directory - presumably the folder holding the manual labels (currently unused)
  round - active learning round (currently unused)

  Returns:
  None - TODO: loading is not implemented yet
  '''
  return None

In [37]:
def finetune(model, train_data, round=1):
  '''
  Fine-tune the model on the manually labelled data and save it to disk.

  NOTE: the training step itself is not implemented yet, so the model is
  currently saved unchanged.

  Params:
  model - the model to fine-tune; must provide `save_pretrained(path)`
  train_data - manually labelled training data (currently unused)
  round - active learning round, used to name the save directory (default 1)

  Returns:
  the model, so the caller can carry it into the next round
  '''
  # TODO: train `model` on `train_data` before saving.
  model.save_pretrained(f'../Models/round{round}_finetuned_model/')
  return model

## 2. Run Active Learning Loop

Active learning lets us focus manual labelling on the most informative parts of the dataset: the examples that confuse the model the most (here, the reviews with the lowest confidence scores).

In [39]:
def active_learning(model, tokenizer, dataset, rounds = 5):
  '''
  Run the active learning loop for the given number of rounds.

  Each round:
    1. labels the entire dataset with the current model,
    2. loads the manually labelled data,
    3. fine-tunes the model on that data.

  Params:
  model - the starting model
  tokenizer - the tokenizer passed to the labelling step
  dataset - dataframe containing the entire dataset
  rounds - number of active learning rounds to run (default 5)

  Returns:
  the model after the final round
  '''
  for round_number in range(1, rounds+1):
    print (f"Round {round_number} of Active Learning")
    # 1. Using the model, automatically label the entire dataset
    label_data(model = model,
               dataset = dataset,
               tokenizer = tokenizer,
               round = round_number)

    # 2. Load the manually labeled data, including the newly labeled data from the previous round
    train_data = load_manual_data(directory = '../Data/Labelling/Manual',
                                  round = round_number)

    # 3. Fine-tune the model on the manually labeled data
    finetuned_model = finetune(model = model,
                               train_data = train_data,
                               round = round_number)
    # Keep the current model if the fine-tuning step hands nothing back,
    # so the next round never runs with model=None.
    if finetuned_model is not None:
      model = finetuned_model

    print (f"Completed Round {round_number} of Active Learning")

  return model


In [None]:
# Read the pre-selected data that the active learning loop will label
dataset = pd.read_csv(filepath_or_buffer='../Data/selected_data.csv')

In [14]:
# Fetch the pretrained sentiment checkpoint from Hugging Face:
# its configuration, its tokenizer, and the classification model itself
pretrained_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
config = AutoConfig.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Conduct active learning.
# The keyword must be `rounds` (the name in active_learning's signature);
# passing `round` raises a TypeError for an unexpected keyword argument.
active_learning(model, tokenizer, dataset, rounds = 5)