# Initialize

In [73]:
from openai import OpenAI
from google import genai
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import ast
import time
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# The API key comes from the environment rather than being hardcoded;
# os.getenv returns None when the variable is not set.
my_key = os.getenv("GOOGLE_CLOUD_KEY_2")

# Disabled OpenAI variant, kept for reference:
# client = OpenAI(api_key=my_key)
# Gemini client used by the generation loop further down.
client = genai.Client(api_key=my_key)

# Prompting prep

In [28]:
# Read the labelled tweets from disk
df = pd.read_csv("../Data/hate_speech_and_offensive_language_dataset/labeled_data.csv")

# Working copy with the two columns used for prompting:
#   message - the raw tweet text
#   labels  - 0 when the dataset class is 2, otherwise 1
df2 = df.assign(
    message=df["tweet"],
    labels=df["class"].ne(2).astype("int64"),
)

# Target composition of the example pool: 80% label 0, the remainder label 1
sample_size = 1000
n_zero = int(sample_size * 0.8)
n_one = sample_size - n_zero

# One frame per label value
df_zero, df_one = (df2[df2["labels"] == value] for value in (0, 1))

# Fall back to sampling with replacement whenever a class has too few rows
replace_zero = n_zero > len(df_zero)
replace_one = n_one > len(df_one)

sample_zero = df_zero.sample(n=n_zero, replace=replace_zero, random_state=42)
sample_one = df_one.sample(n=n_one, replace=replace_one, random_state=42)

# API Calling

In [None]:
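# NOTE: superseded single-prompt draft, kept for reference only; the batched
# prompt built in the following cells is the one that actually runs.
# Format examples as text for the prompt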
# example_text = "\n".join(
#     [f"Message: {row['message']}\nLabel: {row['labels']}" for _, row in examples.iterrows()]
# )

# # Create improved synthesis prompt
# prompt = f"""
# You are an expert data generator specializing in creating realistic chat messages for training
# an offensive language detection model.

# Below are examples of real data (Message and Label pairs).
# Label '1' means the message is offensive, and label '0' means it is not offensive.

# Examples:
# {example_text}

# Now, generate 5000 new synthetic examples that follow the same style, tone, and balance between classes.
# Try to make the data realistic — include slang, emojis, mild and strong insults, as well as casual and polite non-offensive chats.
# Avoid repetition or overly similar phrasing.

# Output your response **strictly** in this format (CSV-like, no extra commentary):

# Message,Label
# <message_1>,<label>
# <message_2>,<label>
# ...
# <message_20>,<label>
# """

In [74]:
# Batch layout: how many few-shot batches to build and the class mix of each.
num_batches = 100
samples_per_batch = 50
ratio_non_off = 0.8
ratio_off = 0.2

# round() rather than int(): a product such as 100 * 0.29 lands just below the
# intended integer in floating point and int() would silently truncate it.
count_zero_per_batch = round(samples_per_batch * ratio_non_off)
count_one_per_batch  = round(samples_per_batch * ratio_off)


def _take_wrapped(pool, start, count):
    """Return `count` rows of `pool` beginning at position `start`.

    Positions wrap around the end of `pool`, so a request that runs past the
    last row continues from the first row instead of coming back short
    (a plain `iloc[start:end]` slice past the end yields an empty frame).

    Raises:
        ValueError: if rows are requested from an empty pool.
    """
    if count == 0:
        return pool.iloc[0:0]
    if len(pool) == 0:
        raise ValueError("cannot draw few-shot examples from an empty sample pool")
    positions = [(start + offset) % len(pool) for offset in range(count)]
    return pool.iloc[positions]


# The pools may hold fewer rows than num_batches * rows-per-batch; in that
# case later batches reuse earlier rows (shuffled with a different seed).
if (num_batches * count_zero_per_batch > len(sample_zero)
        or num_batches * count_one_per_batch > len(sample_one)):
    print("Note: example pool is smaller than the rows requested; "
          "later batches reuse examples from earlier ones.")

batches = []
for i in range(num_batches):
    batch_zero = _take_wrapped(sample_zero, i * count_zero_per_batch, count_zero_per_batch)
    batch_one  = _take_wrapped(sample_one, i * count_one_per_batch, count_one_per_batch)

    # Shuffle so the two classes are interleaved; seeding with the batch
    # number keeps every batch reproducible.
    batch = pd.concat([batch_zero, batch_one]).sample(frac=1, random_state=i).reset_index(drop=True)
    batches.append(batch)

In [75]:
# Ask the model for one block of synthetic messages per few-shot batch.
#   results        - raw response text of every batch that succeeded
#   failed_batches - 1-based numbers of the batches that never succeeded
max_attempts = 5
retry_delay_seconds = 5

results = []
failed_batches = []
for i, batch in tqdm(enumerate(batches), total=len(batches), desc="Processing Batches"):
    example_text = "\n".join([
        f"Message: {row['message']}\nLabel: {row['labels']}"
        for _, row in batch.iterrows()
    ])

    prompt = f"""
    You are generating labeled twitch chat messages for an offensive language detection model.
    Here are some examples (80% non-offensive, 20% offensive):
    {example_text}

    These examples are from twitter, but I want you to try making the generated content imitate a platform behavior from Twitch chat.
    So include typical Twitch behavior such as emotes, short phrases, hype words, and slang.

    Generate 50 new examples following the same 80:20 ratio:
    - 40 non-offensive messages
    - 10 offensive messages

    Output strictly as CSV with columns message,labels:
    - Use a comma to separate the message from its label
    - Do NOT include headers
    - Do NOT include ',' in the messages themselves
    - Each line should contain only the message and its label
    - The label should be 0 for non-offensive and 1 for offensive messages
    - Use the format specified below

    Here is the format:
    <message_1>,<label>
    <message_2>,<label>
    ...
    <message_50>,<label>

    Do not add any extra text or explanation, just output the CSV lines.
    """

    # Reset per batch so a failed batch can never reuse the previous batch's
    # response (or hit a NameError when the very first batch fails).
    response_text = None
    for attempt in range(max_attempts):
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=prompt,
                config={"temperature": 0.9}
            )
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")
        else:
            response_text = response.text
            if response_text:
                break  # success
            # A response without text is useless downstream; retry it.
            print(f"Attempt {attempt+1} failed: empty response")
        if attempt < max_attempts - 1:
            time.sleep(retry_delay_seconds)  # wait before the next attempt

    if not response_text:
        failed_batches.append(i + 1)
        print(f"Batch {i+1} failed after {max_attempts} attempts; skipping.")
        continue

    results.append(response_text)
    print(f"Batch {i+1} complete.")

if failed_batches:
    print(f"Batches without a usable response: {failed_batches}")

Processing Batches:   1%|          | 1/100 [00:04<07:14,  4.39s/it]

Batch 1 complete.


Processing Batches:   2%|▏         | 2/100 [00:08<07:01,  4.30s/it]

Batch 2 complete.


Processing Batches:   3%|▎         | 3/100 [00:13<07:39,  4.74s/it]

Batch 3 complete.


Processing Batches:   4%|▍         | 4/100 [00:17<07:11,  4.49s/it]

Batch 4 complete.


Processing Batches:   5%|▌         | 5/100 [00:23<07:37,  4.82s/it]

Batch 5 complete.


Processing Batches:   6%|▌         | 6/100 [00:27<07:18,  4.66s/it]

Batch 6 complete.


Processing Batches:   7%|▋         | 7/100 [00:32<07:06,  4.59s/it]

Batch 7 complete.


Processing Batches:   8%|▊         | 8/100 [00:36<06:44,  4.40s/it]

Batch 8 complete.


Processing Batches:   9%|▉         | 9/100 [00:41<07:09,  4.72s/it]

Batch 9 complete.


Processing Batches:  10%|█         | 10/100 [00:45<06:47,  4.53s/it]

Batch 10 complete.


Processing Batches:  11%|█         | 11/100 [00:49<06:35,  4.45s/it]

Batch 11 complete.


Processing Batches:  12%|█▏        | 12/100 [01:42<27:54, 19.03s/it]

Batch 12 complete.


Processing Batches:  13%|█▎        | 13/100 [01:46<20:57, 14.45s/it]

Batch 13 complete.


Processing Batches:  14%|█▍        | 14/100 [01:51<16:36, 11.58s/it]

Batch 14 complete.


Processing Batches:  15%|█▌        | 15/100 [01:56<13:52,  9.80s/it]

Batch 15 complete.


Processing Batches:  16%|█▌        | 16/100 [02:01<11:30,  8.22s/it]

Batch 16 complete.


Processing Batches:  17%|█▋        | 17/100 [02:06<10:13,  7.39s/it]

Batch 17 complete.


Processing Batches:  18%|█▊        | 18/100 [02:10<08:43,  6.38s/it]

Batch 18 complete.


Processing Batches:  19%|█▉        | 19/100 [02:14<07:34,  5.61s/it]

Batch 19 complete.


Processing Batches:  20%|██        | 20/100 [02:20<07:29,  5.62s/it]

Batch 20 complete.


Processing Batches:  21%|██        | 21/100 [02:24<06:40,  5.07s/it]

Batch 21 complete.


Processing Batches:  22%|██▏       | 22/100 [02:28<06:10,  4.75s/it]

Batch 22 complete.


Processing Batches:  23%|██▎       | 23/100 [02:31<05:42,  4.45s/it]

Batch 23 complete.


Processing Batches:  24%|██▍       | 24/100 [02:36<05:31,  4.36s/it]

Batch 24 complete.


Processing Batches:  25%|██▌       | 25/100 [02:39<05:14,  4.20s/it]

Batch 25 complete.


Processing Batches:  26%|██▌       | 26/100 [02:43<05:00,  4.06s/it]

Batch 26 complete.


Processing Batches:  27%|██▋       | 27/100 [02:47<04:43,  3.88s/it]

Batch 27 complete.


Processing Batches:  28%|██▊       | 28/100 [02:51<04:40,  3.90s/it]

Batch 28 complete.


Processing Batches:  29%|██▉       | 29/100 [02:54<04:27,  3.76s/it]

Batch 29 complete.


Processing Batches:  30%|███       | 30/100 [02:58<04:22,  3.74s/it]

Batch 30 complete.


Processing Batches:  31%|███       | 31/100 [03:02<04:28,  3.90s/it]

Batch 31 complete.


Processing Batches:  32%|███▏      | 32/100 [03:05<04:10,  3.69s/it]

Batch 32 complete.


Processing Batches:  33%|███▎      | 33/100 [03:10<04:20,  3.89s/it]

Batch 33 complete.


Processing Batches:  34%|███▍      | 34/100 [03:13<04:17,  3.90s/it]

Batch 34 complete.


Processing Batches:  35%|███▌      | 35/100 [03:17<04:04,  3.77s/it]

Batch 35 complete.


Processing Batches:  36%|███▌      | 36/100 [03:21<04:00,  3.75s/it]

Batch 36 complete.


Processing Batches:  37%|███▋      | 37/100 [03:24<03:56,  3.75s/it]

Batch 37 complete.


Processing Batches:  38%|███▊      | 38/100 [03:28<03:48,  3.69s/it]

Batch 38 complete.


Processing Batches:  39%|███▉      | 39/100 [03:32<03:46,  3.71s/it]

Batch 39 complete.


Processing Batches:  40%|████      | 40/100 [03:36<03:51,  3.85s/it]

Batch 40 complete.


Processing Batches:  41%|████      | 41/100 [03:40<03:59,  4.06s/it]

Batch 41 complete.


Processing Batches:  42%|████▏     | 42/100 [03:44<03:47,  3.93s/it]

Batch 42 complete.


Processing Batches:  43%|████▎     | 43/100 [03:49<03:55,  4.14s/it]

Batch 43 complete.


Processing Batches:  44%|████▍     | 44/100 [03:52<03:43,  3.99s/it]

Batch 44 complete.


Processing Batches:  45%|████▌     | 45/100 [03:56<03:36,  3.94s/it]

Batch 45 complete.


Processing Batches:  46%|████▌     | 46/100 [04:00<03:36,  4.01s/it]

Batch 46 complete.


Processing Batches:  47%|████▋     | 47/100 [04:04<03:21,  3.81s/it]

Batch 47 complete.


Processing Batches:  48%|████▊     | 48/100 [04:07<03:18,  3.82s/it]

Batch 48 complete.


Processing Batches:  49%|████▉     | 49/100 [04:11<03:04,  3.63s/it]

Batch 49 complete.


Processing Batches:  50%|█████     | 50/100 [04:14<02:59,  3.60s/it]

Batch 50 complete.


Processing Batches:  51%|█████     | 51/100 [04:18<03:00,  3.69s/it]

Batch 51 complete.


Processing Batches:  52%|█████▏    | 52/100 [04:22<03:02,  3.79s/it]

Batch 52 complete.


Processing Batches:  53%|█████▎    | 53/100 [04:26<03:02,  3.88s/it]

Batch 53 complete.


Processing Batches:  54%|█████▍    | 54/100 [04:30<02:54,  3.79s/it]

Batch 54 complete.


Processing Batches:  55%|█████▌    | 55/100 [04:34<02:50,  3.80s/it]

Batch 55 complete.


Processing Batches:  56%|█████▌    | 56/100 [04:38<02:49,  3.86s/it]

Batch 56 complete.


Processing Batches:  57%|█████▋    | 57/100 [04:41<02:42,  3.79s/it]

Batch 57 complete.


Processing Batches:  58%|█████▊    | 58/100 [04:45<02:39,  3.79s/it]

Batch 58 complete.


Processing Batches:  59%|█████▉    | 59/100 [04:49<02:35,  3.80s/it]

Batch 59 complete.


Processing Batches:  60%|██████    | 60/100 [04:52<02:21,  3.54s/it]

Batch 60 complete.


Processing Batches:  61%|██████    | 61/100 [04:56<02:21,  3.63s/it]

Batch 61 complete.


Processing Batches:  62%|██████▏   | 62/100 [05:00<02:26,  3.85s/it]

Batch 62 complete.


Processing Batches:  63%|██████▎   | 63/100 [05:03<02:15,  3.65s/it]

Batch 63 complete.


Processing Batches:  64%|██████▍   | 64/100 [05:07<02:11,  3.66s/it]

Batch 64 complete.


Processing Batches:  65%|██████▌   | 65/100 [05:12<02:19,  3.99s/it]

Batch 65 complete.


Processing Batches:  66%|██████▌   | 66/100 [05:15<02:12,  3.88s/it]

Batch 66 complete.


Processing Batches:  67%|██████▋   | 67/100 [05:19<02:02,  3.70s/it]

Batch 67 complete.


Processing Batches:  68%|██████▊   | 68/100 [05:22<02:00,  3.77s/it]

Batch 68 complete.


Processing Batches:  69%|██████▉   | 69/100 [05:26<01:53,  3.66s/it]

Batch 69 complete.


Processing Batches:  70%|███████   | 70/100 [05:30<01:54,  3.81s/it]

Batch 70 complete.


Processing Batches:  71%|███████   | 71/100 [05:34<01:49,  3.77s/it]

Batch 71 complete.


Processing Batches:  72%|███████▏  | 72/100 [05:38<01:46,  3.80s/it]

Batch 72 complete.


Processing Batches:  73%|███████▎  | 73/100 [05:41<01:40,  3.71s/it]

Batch 73 complete.


Processing Batches:  74%|███████▍  | 74/100 [05:46<01:44,  4.02s/it]

Batch 74 complete.


Processing Batches:  75%|███████▌  | 75/100 [05:50<01:40,  4.00s/it]

Batch 75 complete.


Processing Batches:  76%|███████▌  | 76/100 [05:54<01:34,  3.93s/it]

Batch 76 complete.


Processing Batches:  77%|███████▋  | 77/100 [05:57<01:29,  3.90s/it]

Batch 77 complete.


Processing Batches:  78%|███████▊  | 78/100 [06:01<01:21,  3.69s/it]

Batch 78 complete.


Processing Batches:  79%|███████▉  | 79/100 [06:04<01:18,  3.72s/it]

Batch 79 complete.


Processing Batches:  80%|████████  | 80/100 [06:09<01:20,  4.01s/it]

Batch 80 complete.


Processing Batches:  81%|████████  | 81/100 [06:13<01:15,  3.99s/it]

Batch 81 complete.


Processing Batches:  82%|████████▏ | 82/100 [06:17<01:12,  4.04s/it]

Batch 82 complete.


Processing Batches:  83%|████████▎ | 83/100 [06:20<01:05,  3.84s/it]

Batch 83 complete.


Processing Batches:  84%|████████▍ | 84/100 [06:24<00:57,  3.62s/it]

Batch 84 complete.


Processing Batches:  85%|████████▌ | 85/100 [06:27<00:53,  3.55s/it]

Batch 85 complete.


Processing Batches:  86%|████████▌ | 86/100 [06:31<00:51,  3.65s/it]

Batch 86 complete.


Processing Batches:  87%|████████▋ | 87/100 [06:35<00:48,  3.74s/it]

Batch 87 complete.


Processing Batches:  88%|████████▊ | 88/100 [06:39<00:44,  3.74s/it]

Batch 88 complete.


Processing Batches:  89%|████████▉ | 89/100 [06:43<00:43,  3.95s/it]

Batch 89 complete.


Processing Batches:  90%|█████████ | 90/100 [06:48<00:41,  4.13s/it]

Batch 90 complete.


Processing Batches:  91%|█████████ | 91/100 [06:51<00:36,  4.06s/it]

Batch 91 complete.


Processing Batches:  92%|█████████▏| 92/100 [06:55<00:32,  4.02s/it]

Batch 92 complete.


Processing Batches:  93%|█████████▎| 93/100 [06:59<00:27,  3.90s/it]

Batch 93 complete.


Processing Batches:  94%|█████████▍| 94/100 [07:02<00:22,  3.77s/it]

Batch 94 complete.


Processing Batches:  95%|█████████▌| 95/100 [07:06<00:19,  3.81s/it]

Batch 95 complete.


Processing Batches:  96%|█████████▌| 96/100 [07:11<00:15,  3.96s/it]

Batch 96 complete.


Processing Batches:  97%|█████████▋| 97/100 [07:15<00:11,  3.93s/it]

Batch 97 complete.


Processing Batches:  98%|█████████▊| 98/100 [07:18<00:07,  3.77s/it]

Batch 98 complete.


Processing Batches:  99%|█████████▉| 99/100 [07:22<00:03,  3.85s/it]

Batch 99 complete.


Processing Batches: 100%|██████████| 100/100 [07:26<00:00,  4.47s/it]

Batch 100 complete.





In [None]:
# Persist the raw model responses as a two-column CSV (message,label).
# Only lines of the form "<message>,<0|1>" are kept: the model occasionally
# emits commentary, headers or code fences, and a commentary line that happens
# to contain a comma would otherwise end up as a row with a junk label.
valid_labels = {"0", "1"}
rows_written = 0
lines_skipped = 0

with open("../Data/synthesized/synthesized_data_alt_fixed.csv", "w", encoding="utf-8") as f:
    f.write("message,label\n")
    for res in results:
        if not res:  # a response may carry no text at all
            continue
        # Each res is multiple lines
        for line in res.splitlines():
            if not line.strip():  # skip empty lines
                continue
            if "," not in line:
                lines_skipped += 1  # no separator -> malformed line
                continue
            message, label = line.rsplit(",", 1)  # split only on last comma
            message = message.strip()
            label = label.strip()
            if not message or label not in valid_labels:
                lines_skipped += 1  # commentary, header, or bad label
                continue
            # Escape inner quotes in message
            message = message.replace('"', '""')
            f.write(f'"{message}",{label}\n')
            rows_written += 1

print(f"Wrote {rows_written} rows; skipped {lines_skipped} malformed lines.")

In [92]:
# Read the synthesized data back in for a sanity check.
# NOTE(review): this reads from the working directory, whereas the writer cell
# above saves to "../Data/synthesized/synthesized_data_alt_fixed.csv" — confirm
# that this is the intended file and not a stale copy.
test = pd.read_csv("synthesized_data_alt_fixed.csv")

In [93]:
# Preview up to the first 50 rows of the loaded frame.
test.head(50)

Unnamed: 0,message,labels
0,PogChamp sheesh that play was insane!,0
1,LUL that fail!,0
2,FeelsBadMan they almost had it!,0
3,Hype! Hype! Hype!,0
4,ResidentSleeper zzzzzz,0
5,4Head that's what he gets,0
6,OMEGALUL,0
7,EZ Clap,0
8,KEKW,0
9,monkaS,0


In [87]:
# Class balance of the loaded data.
# The label column has been spelled differently across versions of the file
# ("Label", "label", "labels"), so resolve whichever one is present instead of
# hard-coding a name that raises KeyError on the current file.
label_column = next(
    (name for name in ("Label", "label", "labels") if name in test.columns),
    None,
)
if label_column is None:
    raise KeyError(f"no label column found; columns are {list(test.columns)}")

# Any value other than 0 or 1 here means a malformed line got into the CSV.
test[label_column].value_counts()

Label
0                                                                     4788
1                                                                     1006
 maintaining an 80:20 ratio of non-offensive to offensive content.       1
Name: count, dtype: int64