In [12]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Literal, List
from langchain_openai import ChatOpenAI
import pandas as pd 
import os
from dotenv import load_dotenv

load_dotenv()

# Load the labeled Reddit data and keep the texts of the first 8 rows whose
# Label is "NEG"; these are the examples interpolated into the generation prompt.
df_reddit_labeled = pd.read_csv("reddit_labeled.csv")
sample_neg_examples = (
    df_reddit_labeled.loc[df_reddit_labeled["Label"] == "NEG", "Text"]
    .head(8)
    .tolist()
)

# Schema for one generated row. Both label fields are typed Literal["NEG"],
# so validation accepts only the value "NEG" for them.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# this model is reachable from the schema passed to with_structured_output(), and a
# docstring may end up in the schema description sent to the LLM. Confirm before converting.
class NegativeRow(BaseModel):
    # The generated post/comment text.
    Text: str = Field(description="Text to classify")
    # Label column value; constrained to "NEG".
    Label: Literal["NEG"] = Field(description="Label for the text")
    # Human-label column value; also constrained to "NEG" (the prompt tells the model to set it).
    HumanLabel: Literal["NEG"] = Field(description="Human label for the text")
    
# Top-level container for the structured response: a list of NegativeRow items under `Data`.
# NOTE(review): kept as `#` comments rather than a class docstring, since a docstring on a
# model passed to with_structured_output() may be sent to the LLM as the schema description.
class NegativeExample(BaseModel):
    # Every generated row; the list length is not constrained by the schema.
    Data: List[NegativeRow] = Field(description="List of negative rows")

# prompt generated by gpt-4o
# The {examples} placeholder is filled with one "- <text>" bullet line per
# sampled negative example (each line newline-terminated).
prompt = """
You are an expert in generating synthetic textual data that mimics the style of Reddit posts/comments.
Below are a few examples of posts/comments with clearly negative sentiment (NEG):
{examples}

Based on these examples, generate 50 new and unique synthetic Reddit posts/comments that clearly express negative sentiment (NEG). 
Each post should mimic the style and tone of the examples above.
For each generated post, output an object with the following keys:
    "Text": the generated post/comment text,
    "Label": set to "NEG",
    "HumanLabel": set to "NEG".
Output the resulting list as a JSON array.
""".format(
    examples="".join(map("- {}\n".format, sample_neg_examples))
)

# Build a gpt-4o chat model whose responses are parsed into the NegativeExample schema.
# The API key is read from the environment; a missing OPENAI_API_KEY raises KeyError here
# rather than failing later inside the request.
model = ChatOpenAI(model="gpt-4o", api_key = os.environ['OPENAI_API_KEY'])
structured_model = model.with_structured_output(NegativeExample)

# Single request; `output.Data` holds the generated NegativeRow items.
# (Unused scratch variables that used to be initialised here — a 500-row slice of the
# labeled frame, a counter and several empty lists — were removed: nothing read them.)
output = structured_model.invoke(prompt)
print(output)



Data=[NegativeRow(Text="Honestly, I don't see the point in trying anymore; it's just not worth the stress.", Label='NEG', HumanLabel='NEG'), NegativeRow(Text="It's frustrating how nothing ever goes according to plan, no matter how much effort I put in.", Label='NEG', HumanLabel='NEG'), NegativeRow(Text="I've given up hoping for anything positive to happen; it's just disappointment after disappointment.", Label='NEG', HumanLabel='NEG'), NegativeRow(Text='Wow, another day, another failure. Just fantastic.', Label='NEG', HumanLabel='NEG'), NegativeRow(Text="Why do I even bother? It's clear nobody appreciates what I do anyway.", Label='NEG', HumanLabel='NEG'), NegativeRow(Text="I guess I'm invisible, because nobody seems to notice or care about what I'm going through.", Label='NEG', HumanLabel='NEG'), NegativeRow(Text="Sick and tired of being let down every single time. It's exhausting.", Label='NEG', HumanLabel='NEG'), NegativeRow(Text='Not surprised that once again things have taken a tu

In [None]:
from datetime import datetime
import uuid
import pandas as pd

# Seed for the final shuffle so that re-running the cell on the same inputs
# yields the same row order.
SHUFFLE_SEED = 42

# Flatten the structured response into one list per column.
total_examples = len(output.Data)
text_list = [row.Text for row in output.Data]
label_list = [row.Label for row in output.Data]
human_label_list = [row.HumanLabel for row in output.Data]

df_synthetic = pd.DataFrame({
    'Text': text_list,
    'Label': label_list,
    'HumanLabel': human_label_list
})

# Each synthetic row gets a fresh random UUID; the whole batch shares one
# creation timestamp (local time, second resolution).
df_synthetic['Id'] = [str(uuid.uuid4()) for _ in range(total_examples)]
df_synthetic['Created'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# If the labeled data carries an 'Unnamed: 0' column (presumably a saved index — confirm),
# continue its numbering from the current maximum.
if 'Unnamed: 0' in df_reddit_labeled.columns:
    start_idx = df_reddit_labeled['Unnamed: 0'].max() + 1
    df_synthetic['Unnamed: 0'] = range(start_idx, start_idx + total_examples)

# Put the synthetic columns in the same order as the labeled frame before appending.
# Raises KeyError if the labeled frame has a column the synthetic frame lacks.
df_synthetic = df_synthetic[df_reddit_labeled.columns]
df_augmented = pd.concat([df_reddit_labeled, df_synthetic], ignore_index=True)

# Shuffle real and synthetic rows together, then renumber the index 0..n-1.
df_augmented = df_augmented.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# index=False: the index is only the post-shuffle row position and carries no information.
# Writing it would add an unnamed leading column that pandas reads back as 'Unnamed: 0',
# clashing with any existing column of that name.
df_augmented.to_csv("reddit_labeled_augmented.csv", index=False)