In [11]:
import pandas as pd
from sklearn.utils import resample

csv_path = "data/train.csv"  
data = pd.read_csv(csv_path)

nta_data = data[data['target'] == 'NTA']
other_data = data[data['target'] != 'NTA']

nta_sample = resample(nta_data, n_samples=3000, random_state=42)
other_sample = resample(other_data, n_samples=3000, random_state=42)

balanced_data = pd.concat([nta_sample, other_sample])

balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

balanced_data.to_csv("data/balanced_train_dataset.csv", index=False)

In [14]:
import pandas as pd
from sklearn.utils import resample

csv_path = "data/dev.csv"  
data = pd.read_csv(csv_path)

nta_data = data[data['target'] == 'NTA']
other_data = data[data['target'] != 'NTA']

nta_sample = resample(nta_data, n_samples=375, random_state=42)
other_sample = resample(other_data, n_samples=375, random_state=42)

balanced_data = pd.concat([nta_sample, other_sample])

balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

balanced_data.to_csv("data/balanced_dev_dataset.csv", index=False)

In [None]:
import pandas as pd
from sklearn.utils import resample

csv_path = "data/test.csv"  
data = pd.read_csv(csv_path)

nta_data = data[data['target'] == 'NTA']
other_data = data[data['target'] != 'NTA']

nta_sample = resample(nta_data, n_samples=375, random_state=42)
other_sample = resample(other_data, n_samples=375, random_state=42)

balanced_data = pd.concat([nta_sample, other_sample])

balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

balanced_data.to_csv("data/balanced_test_dataset.csv", index=False)

In [None]:
from openai import OpenAI
import os
import pandas as pd

csv_path = "data/dev.csv" 
data = pd.read_csv(csv_path)

test_data = data[0:1500] # insert assigned chunk here 

os.environ["OPENAI_API_KEY"] = "here"

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def pred_verdict(text):
    try:
        # columns = [
        #     "body", "num_comments", "score", "upvote_ratio",
        #     "title", "id", "length_body"
        # ]

        # # Creating a readable text format for each column
        # text_parts = [
        #     f"The {col.replace('_', ' ')} is: {{test_data['{col}']}}"
        #     for col in columns
        # ]

        # # Joining all parts into a single string
        # final_text = ". ".join(text_parts) + "."
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful agent."},
                {"role": "user", "content": f"You are a classifier for posts in the Am I the Asshole subreddit. Your task is to classify each post into one of the following categories: \"NTA\" (Not the Asshole), \"YTA\" (You're the Asshole), \"ESH\" (Everyone's the Asshole), and \"NAH\" (No Assholes Here), primarily based on the interpersonal behavior the user describes in the body text. Provide only the category name as the output. :\n{text}. Category: "}
            ]
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating summary: {e}")
        return None

test_data['pred_verdict'] = test_data['body'].apply(pred_verdict)

test_data.to_csv("data/dev_predicted_100.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['pred_verdict'] = test_data['body'].apply(pred_verdict)


In [11]:
print(test_data['pred_verdict'])

0    NTA
1    YTA
2    NTA
3    NTA
4    NTA
5    YTA
6    NTA
7    NTA
8    NAH
9    ESH
Name: pred_verdict, dtype: object


In [12]:
print(test_data['target'])

0    NTA
1    YTA
2    NTA
3    NTA
4    NTA
5    YTA
6    NTA
7    NTA
8    NTA
9    NTA
Name: target, dtype: object


In [21]:
csv_path = "data/dev_predicted_100.csv" 
data = pd.read_csv(csv_path)

with open("labels/dev_100_pred.txt", "w") as pred_file:
    for pred in data['pred_verdict']:
        pred_file.write(f"{pred}\n")

with open("labels/dev_100_true.txt", "w") as true_file:
    for true_label in data['target']:
        true_file.write(f"{true_label}\n")
