In [1]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax

MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)


def polarity_score_roberta(data):
    """Score ``data`` with the module-level RoBERTa sentiment model.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    at most 512 tokens), passed through ``model``, and the values at
    ``output[0][0]`` are converted to probabilities with a softmax.

    Args:
        data: Text to score. Only the ``[0][0]`` entry of the model output
            is read, so presumably a single string is expected -- if a
            batch is passed, only its first item is scored.

    Returns:
        A dict with the keys ``roberta_neg``, ``roberta_neu`` and
        ``roberta_pos`` holding the softmax values at index 0, 1 and 2.
    """
    # Upper bound on the tokenized length; longer inputs are truncated.
    token_limit = 512

    model_inputs = tokenizer(
        data,
        return_tensors='tf',
        max_length=token_limit,
        truncation=True,
        padding=True,
    )

    # First element of the model output, first row (presumably the logits
    # of the first sequence in the batch), normalised into probabilities.
    probabilities = softmax(model(**model_inputs)[0][0].numpy())

    result_keys = ("roberta_neg", 'roberta_neu', 'roberta_pos')
    return {key: probabilities[index] for index, key in enumerate(result_keys)}

# Define the function to generate labels
def generate_roberta_labels(scores, positive_threshold=0.5, negative_threshold=0.5):
    """Turn a RoBERTa score dict into a human-readable sentiment label.

    The class with the highest score wins; on ties the precedence is
    positive, then negative, then neutral. A winning positive or negative
    class is reported as "Strongly ..." when its score is at least 0.8.

    Args:
        scores: Mapping with the keys ``roberta_neg``, ``roberta_neu`` and
            ``roberta_pos``.
        positive_threshold: Accepted for interface compatibility; not
            currently used by the labelling logic.
        negative_threshold: Accepted for interface compatibility; not
            currently used by the labelling logic.

    Returns:
        One of "Strongly Positive", "Positive", "Strongly Negative",
        "Negative" or "Neutral".

    Raises:
        KeyError: If any of the three expected keys is missing.
    """
    neg_score = scores['roberta_neg']
    neu_score = scores['roberta_neu']
    pos_score = scores['roberta_pos']

    # Order matters: max() returns the first maximal entry, so this
    # sequence defines the tie-breaking precedence.
    ranked = (
        ('positive', pos_score),
        ('negative', neg_score),
        ('neutral', neu_score),
    )
    top_label, top_score = max(ranked, key=lambda pair: pair[1])

    # (label when score >= 0.8, label otherwise) for the polar classes.
    polar_labels = {
        'positive': ("Strongly Positive", "Positive"),
        'negative': ("Strongly Negative", "Negative"),
    }
    if top_label not in polar_labels:
        return "Neutral"

    strong_label, mild_label = polar_labels[top_label]
    return strong_label if top_score >= 0.8 else mild_label


if __name__ == "__main__":
    # Script entry point: read one split of the movie-review dataset, label
    # every review with a RoBERTa sentiment string, and write the result
    # next to the input file.

    # TODO: CHANGE SPLIT NUMBER
    # Get the assigned file from https://drive.google.com/drive/folders/1G3ZPJGBr5JaSXtC-2cMWP8OA5NI-zSiT
    input_filepath = "./dataset/split_dataset/reviews_split_1.csv"
    # Relative to the working directory, like the input path. A leading "/"
    # would point at the filesystem root and make the final save fail (or
    # land in the wrong place) after the whole inference run has finished.
    output_filepath = './dataset/split_dataset/reviews_split_1_sentiment.csv'

    print("Reading the file...")
    movie_reviews_df = pd.read_csv(input_filepath)

    # Missing summaries/details are read as NaN; NaN + str stays NaN and the
    # tokenizer cannot handle a float, so treat missing text as empty.
    review_summaries = movie_reviews_df['review_summary'].fillna('').astype(str)
    review_details = movie_reviews_df['review_detail'].fillna('').astype(str)
    # Join with a space so the last word of the summary is not fused with
    # the first word of the detail; strip the padding left by empty parts.
    movie_reviews_df['combined_review'] = (
        review_summaries + ' ' + review_details
    ).str.strip()

    print("Processing Movie Reviews...")
    movie_reviews_df['roberta_sentiment'] = movie_reviews_df['combined_review'].apply(
        lambda review_text: generate_roberta_labels(polarity_score_roberta(review_text))
    )

    # The combined text was only needed as model input; keep the output lean.
    movie_reviews_df = movie_reviews_df.drop(columns=['combined_review'])
    movie_reviews_df.to_csv(output_filepath)

    print("\nSUCCESSFULLY SAVED THE FILE...")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Reading the file...


FileNotFoundError: [Errno 2] No such file or directory: './dataset/reviews_split_1.csv'