In [None]:
#set up packages
import pandas as pd
import numpy as np
import time # To put the system to sleep

In [None]:
# Load the raw comment export from disk into a pandas DataFrame
raw_data_path = 'data/raw_data.csv'
df = pd.read_csv(raw_data_path)

In [None]:
# Keep only the columns needed downstream.
# .copy() makes `comments` an independent frame, so the column assignment
# below does not write into a slice of `df` (avoids SettingWithCopyWarning).
comments = df[['game', 'video_id', 'comment_like_count', 'text', 'reply_count', 'comment_date']].copy()

# Strip the tournament name from each game title; any non-string value
# (e.g. a missing entry) is replaced with an empty string.
comments['game'] = [
    title.replace("FIFA Women's World Cup 2023", "") if isinstance(title, str) else ''
    for title in comments['game']
]

# Display the DataFrame
comments.head()

In [None]:
# Discard rows with no comment text, then add a copy of the text column
# that is guaranteed to be of string type.
comments_cleaned = (
    comments
    .dropna(subset=['text'])
    .assign(text_string=lambda frame: frame["text"].astype(str))
)

In [None]:
# RoBERTa tooling: the Hugging Face `pipeline` factory plus tqdm progress bars
from transformers import pipeline
from tqdm import tqdm
import numpy as np
# Register tqdm with pandas so that `progress_apply` (used in later cells) is available
tqdm.pandas()

In [None]:
# Build the text classification pipeline backed by the RoBERTa hate-speech checkpoint
hate_speech = pipeline(
    task="sentiment-analysis",
    model="facebook/roberta-hate-speech-dynabench-r4-target",
)

In [None]:
# Run the hate-speech pipeline on every comment, one text at a time.
# Inputs are truncated because transformers can only handle up to 512 tokens.
comment_texts = comments_cleaned["text_string"]
comments_cleaned["roberta_hs_score"] = comment_texts.progress_apply(
    lambda text: hate_speech(text, truncation=True, max_length=512)
)

In [None]:
# Preview the first rows to check the new roberta_hs_score column
comments_cleaned.head()

In [None]:
# Extract label dummy: 1 when the top prediction is "hate", otherwise 0.
# int(...) stores plain integers (np.where on a scalar condition produced a
# 0-d array per row); an empty pipeline result yields None, matching the
# guard used for the score below.
comments_cleaned["roberta_label"] = comments_cleaned["roberta_hs_score"].progress_apply(
    lambda preds: int(preds[0]["label"] == "hate") if preds else None
)

# Extract score of the top prediction.
# NOTE: this is the score attached to whichever label was predicted, so for
# rows with roberta_label == 0 it is not a "hate" score.
comments_cleaned["roberta_score"] = comments_cleaned["roberta_hs_score"].progress_apply(
    lambda preds: preds[0]["score"] if preds else None
)

In [None]:
# Preview the first rows to check the extracted roberta_label and roberta_score columns
comments_cleaned.head()

In [None]:
# Persist the scored comments to CSV, leaving out the DataFrame index
scored_output_path = 'data/data_w_roberta_score.csv'
comments_cleaned.to_csv(scored_output_path, index=False)