# LABELLING - ACTIVE LEARNING

In [None]:
%pip install transformers

In [None]:
import pandas as pd
import torch
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


### 1. Labelling with a RoBERTa-based sentiment analysis model

In [None]:
# Load the selected data set from its CSV file into a DataFrame
selected_data_path = '../Data/selected_data.csv'
selected_data = pd.read_csv(selected_data_path)

In [None]:
# Initialize the sentiment analysis pipeline.
# Use the first CUDA GPU (device 0) when one is available; otherwise fall back
# to the CPU (device=-1) so the notebook still runs on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline("text-classification",
                              model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                              device=pipeline_device)




Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [None]:
# Pull the "text" column out of selected_data and convert it to a plain Python list
text_column = selected_data["text"]
reviews = text_column.to_list()

In [None]:
# Calculate the sentiment of each review.
# The options below are passed along with the call: padding on, truncation on,
# and a maximum length of 512.
kwargs = dict(padding=True, truncation=True, max_length=512)
results = sentiment_pipeline(reviews, **kwargs)

In [None]:
# Split each pipeline result into its predicted label and its score, then
# attach both as new columns (one entry per row of selected_data).
roberta_labels = [prediction["label"] for prediction in results]
roberta_scores = [prediction["score"] for prediction in results]
selected_data["roberta_label"] = roberta_labels
selected_data["roberta_score"] = roberta_scores

In [None]:
# Select the 100 rows with the lowest RoBERTa scores (ascending sort, first 100)
# and leave out the VADER columns. The drop is chained instead of using
# inplace=True, so the frame returned by head() is never mutated in place.
# NOTE: drop() raises KeyError if "vader_label" / "vader_score" are not present
# in selected_data.
low_confidence_rows = (
    selected_data
    .sort_values("roberta_score")
    .head(100)
    .drop(columns=["vader_label", "vader_score"])
)

In [None]:
# Write the low-confidence rows to the round-1 labelling CSV, omitting the index
labelling_round_path = '../Data/labelling-round_1.csv'
low_confidence_rows.to_csv(labelling_round_path, index=False)