In [4]:
from openai import OpenAI
import pandas as pd
import json
import ast
from dotenv import load_dotenv
from tqdm import tqdm
import os

# Pull variables from a local .env file into the process environment first,
# so the lookup below can see them.
load_dotenv()

# API key used to build the OpenAI client; None when MY_API_KEY is not set.
my_key = os.environ.get("MY_API_KEY")

In [5]:
# Single shared API client, authenticated with the key read from the environment.
client = OpenAI(
    api_key=my_key,
)

In [6]:
# Accumulator for the per-message labels, filled by the labeling loop.
labels = list()

# Number of chat messages sent to the model in a single API call.
batch_size = 50

In [None]:
MODEL = "gpt-4o-mini"
df = pd.read_csv("../Data/ishowspeed_chat_messages.csv")

# Start from an empty list every time this cell runs; otherwise re-running the
# cell appends a second copy of the labels and they stop lining up with df.
labels = []


def _parse_batch_labels(output, expected_count):
    """Turn the model's reply into a validated list of 0/1 labels.

    Args:
        output: Raw text returned by the model for one batch.
        expected_count: Number of messages that were sent in the batch.

    Returns:
        A list of ints (each 0 or 1) with exactly ``expected_count`` entries.

    Raises:
        ValueError: If the reply cannot be parsed, has the wrong number of
            labels, or contains a value other than 0 or 1.
    """
    # Keep only the outermost [...] so code fences or stray prose around the
    # list do not break parsing. Without brackets, use the reply as-is.
    start, end = output.find("["), output.rfind("]")
    candidate = output[start:end + 1] if 0 <= start < end else output

    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError):
        # Fallback for replies that are not a valid Python literal:
        # strip brackets and split on commas, ignoring empty tokens.
        parsed = [
            token
            for token in candidate.replace("[", "").replace("]", "").split(",")
            if token.strip()
        ]

    # A bare scalar reply (e.g. "1" for a one-message batch) becomes a list.
    if not isinstance(parsed, (list, tuple)):
        parsed = [parsed]

    try:
        batch_labels = [int(value) for value in parsed]
    except (ValueError, TypeError) as exc:
        raise ValueError(
            f"Could not parse labels from model output: {output!r}"
        ) from exc

    # Fail here rather than letting misaligned labels be written next to the
    # wrong messages later on.
    if len(batch_labels) != expected_count:
        raise ValueError(
            f"Expected {expected_count} labels but got {len(batch_labels)}: {output!r}"
        )
    if any(label not in (0, 1) for label in batch_labels):
        raise ValueError(f"Labels must be 0 or 1, got: {output!r}")

    return batch_labels


for i in tqdm(range(0, len(df), batch_size), desc="Batch labeling"):
    batch_texts = df["message"].iloc[i:i + batch_size].tolist()

    # Construct the batch prompt
    prompt = "Classify each of the following chat messages as Offensive (1) or Not Offensive (0).\n"
    prompt += "Return only a Python-style list of 0s and 1s, in the same order.\n\n"

    for j, text in enumerate(batch_texts, start=1):
        # Flatten line breaks so every message occupies exactly one numbered line.
        single_line = str(text).replace("\r", " ").replace("\n", " ")
        prompt += f"{j}. {single_line}\n"

    # Call OpenAI using the model configured at the top of this cell.
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a moderation classifier."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
    )

    # Parse and validate the output; content may be None, so guard before strip().
    output = (response.choices[0].message.content or "").strip()
    batch_labels = _parse_batch_labels(output, len(batch_texts))

    labels.extend(batch_labels)

Batch labeling: 100%|██████████| 2/2 [00:09<00:00,  4.91s/it]


In [11]:
# Destination for the GPT-labeled chat messages.
output_name = "../Data/twitch_chat_labeled_by_GPT.csv"

# Attach the collected labels as a new column, then persist without the index.
df["label"] = labels
df.to_csv(path_or_buf=output_name, index=False)

In [15]:
# Show the last 50 messages that were labeled Not Offensive (0).
df.loc[df["label"].eq(0)].tail(50)

Unnamed: 0,message,label
49,TODAY TOOK A TOLL ON HIM SHEESHHH,0
50,hpduke222 hpduke222 hpduke222 hpduke222 hpduke...,0
51,Stop give Nightmare Fuel CaitlynS NotLikeThis,0
52,aye nah I think Kodak done got too him,0
53,ishowspeedClap,0
54,HE LICKED ME,0
55,is this fr live ???,0
56,LUL,0
57,LET ME OUT OF THIS VAN WutFace,0
58,GREAT SPEED,0


In [34]:
# Reference dataset with per-tweet annotation counts and a `class` column.
test_df = pd.read_csv(
    filepath_or_buffer="../Data/hate_speech_and_offensive_language_dataset/labeled_data.csv",
)

# Reference dataset with `Content` and `Label` columns.
test_df2 = pd.read_csv(
    filepath_or_buffer="../Data/hate_speech_detection_curated_dataset/HateSpeechDatasetBalanced.csv",
)

In [32]:
# Rows of the first reference dataset whose `class` value is 1.
test_df.loc[test_df["class"].eq(1)]

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
...,...,...,...,...,...,...,...
24774,25287,3,0,3,0,1,you really care bout dis bitch. my dick all in...
24775,25288,3,0,3,0,1,"you worried bout other bitches, you need me for?"
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...


In [41]:
# Rows of the second reference dataset whose `Label` value is 0.
test_df2.loc[test_df2["Label"].eq(0)]

Unnamed: 0,Content,Label
523,simply copy and paste the following text into ...,0
524,in order to help increase the booklets downloa...,0
525,as of the booklet had been downloaded over tim...,0
527,click on the download my bad green banner link,0
528,booklet updated on,0
...,...,...
440894,crash another movie from left field i have to ...,0
440895,i why do not you debate first before starting ...,0
440896,removal of i reverted the removal of the above...,0
440897,i have unblocked you eddie as i discussed on u...,0
