In [None]:
import pandas as pd
import openai
import os

# Create the shared OpenAI client; the key comes from the OPENAI_API_KEY
# environment variable so that no secret is stored in the notebook itself.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [None]:
# Read the survey export (tab-delimited text, UTF-16 encoded):
file_path = "//Users//louis//Downloads//test1.txt"
df = pd.read_csv(file_path, sep="\t", encoding="utf-16")

# Discard columns, and then rows, that contain no data at all:
df = df.dropna(axis="columns", how="all")
df = df.dropna(axis="index", how="all")

# The free-text question whose answers will be categorized:
response_column = "6. Does the mode that you use to travel to work change seasonally? If so, explain below. (i.e. I bike to work during the warmer months and drive alone once it starts to snow)."

# Keep only that question and drop respondents who left it unanswered:
df = df.loc[:, [response_column]].dropna()


In [None]:
# Prompt for GPT:
def categorize_response(response):
    """Classify one survey response into a standardized commuting label.

    Sends the response text to the OpenAI chat completions API through the
    module-level ``client`` and maps the model's answer onto the fixed label
    list given in the system prompt.

    Args:
        response: Free-text survey answer. ``None`` or blank text is not sent
            to the API.

    Returns:
        str: One of "Always Drive", "Seasonal Biker", "Year-Round Biker",
        "Public Transit User", "Carpooler" or "Other". "Other" is also the
        fallback when the input is blank, the model returns no text, or the
        returned text does not match any label.

    Raises:
        Whatever ``client.chat.completions.create`` raises (network, auth or
        rate-limit errors are not caught here).
    """
    # Must stay in sync with the label list inside `system_prompt` below.
    allowed_labels = (
        "Always Drive",
        "Seasonal Biker",
        "Year-Round Biker",
        "Public Transit User",
        "Carpooler",
        "Other",
    )
    fallback_label = "Other"

    # Nothing to classify: skip the API round-trip entirely.
    if response is None or not str(response).strip():
        return fallback_label

    system_prompt = """
    You are an expert in survey response analysis. You will categorize survey responses
    about seasonal commuting behavior into standardized labels. The possible labels are:
    
    - Always Drive
    - Seasonal Biker
    - Year-Round Biker
    - Public Transit User
    - Carpooler
    - Other
    
    ### **Classification Process:**
    1. **Identify the main idea** of the response.
    2. **Select the most appropriate label** from the list above.
    3. **Return only the label, nothing else.**
    """

    # A distinct name avoids rebinding the `response` parameter to the API result.
    completion = client.chat.completions.create(
        model="gpt-4o",
        # Zero temperature keeps labels as repeatable as the API allows.
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Response: {response}\nLabel: "},
        ],
    )

    # `content` is optional in the API response, so guard before stripping.
    raw_label = (completion.choices[0].message.content or "").strip()

    # Tolerate cosmetic decoration around the label (bullets, quotes, markdown
    # emphasis, trailing period) and differences in letter case.
    normalized = raw_label.strip(" \t\r\n\"'`*.:-").casefold()
    for label in allowed_labels:
        if label.casefold() == normalized:
            return label

    # Anything off-list would break the "standardized" contract of the column.
    return fallback_label

# Apply AI categorization:
# Each distinct response is classified exactly once and the result is mapped
# back onto every row, so duplicate answers (e.g. many respondents writing
# "No") do not trigger repeated API calls and always share the same label.
labels_by_response = {
    text: categorize_response(text) for text in df[response_column].unique()
}
df["Standardized_Label"] = df[response_column].map(labels_by_response)

In [None]:
# Write the labeled responses back out in the same layout as the input
# (tab-separated, UTF-16), omitting the DataFrame index:
df.to_csv(
    path_or_buf="//Users//louis//Downloads//categorized_survey_data.csv",
    sep="\t",
    encoding="utf-16",
    index=False,
)

print("Categorization complete! Results saved to 'categorized_survey_data.csv'")