<a href="https://colab.research.google.com/github/casllmproject/dialectic_intersubjectivity/blob/main/FineTuning_ANES2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import openai
import numpy as np
import json
import time
import os
from dotenv import load_dotenv
from collections import Counter
from collections import deque
from tqdm import tqdm
import re
import tiktoken # for token counting
from collections import defaultdict

In [None]:
# Load the ANES 2020-22 Social Media Panel Survey into a DataFrame.
# NOTE(review): "{ANES}.csv" looks like a redacted placeholder for the real
# file path — confirm and substitute before running.
anes = pd.read_csv("{ANES}.csv")

In [None]:
# Inspect the first 60 column names of the survey DataFrame
anes.columns[:60]

In [None]:
# Mapping of scales to human-readable labels.
# Label lists are ordered by survey code; enumerate() assigns the codes.
gender_map = dict(enumerate(['Unknown', 'Male', 'Female']))
race_map = dict(enumerate([
    'White, non-hispanic',
    'Black, non-hispanic',
    'Other, non-Hispanic (includes Asian non-hispanic)',
    'Hispanic',
], start=1))
educ_map = dict(enumerate([
    'Less than HS',
    'HS graduate or equivalent',
    'Vocational/tech school/some college/ associates',
    'Bachelor degree',
    'Post grad study/professional degree',
], start=1))
income_map = dict(enumerate([
    'Less than $5,000',
    '$5,000 to $9,999',
    '$10,000 to $14,999',
    '$15,000 to $19,999',
    '$20,000 to $24,999',
    '$25,000 to $29,999',
    '$30,000 to $34,999',
    '$35,000 to $39,999',
    '$40,000 to $49,999',
    '$50,000 to $59,999',
    '$60,000 to $74,999',
    '$75,000 to $84,999',
    '$85,000 to $99,999',
    '$100,000 to $124,999',
    '$125,000 to $149,999',
    '$150,000 to $174,999',
    '$175,000 to $199,999',
    '$200,000 or more',
], start=1))

# Missing-data codes shared by the ideology and trust scales
skip_codes = {-7: 'No answer', -6: 'Unit non-response', -1: 'Inapplicable, legitimate skip'}
refusal_codes = {77: "DON'T KNOW", 98: "SKIPPED ON WEB", 99: "REFUSED"}

party_map = {
    **skip_codes,
    **dict(enumerate([
        'Very liberal',
        'Somewhat liberal',
        'Closer to liberal',
        'Neither liberal nor conservative',
        'Closer to conservatives',
        'Somewhat conservative',
        'Very conservative',
    ], start=1)),
    **refusal_codes,
}
trust_map = {
    **skip_codes,
    **dict(enumerate([
        'Not at all',
        'A little',
        'A moderate amount',
        'A lot',
        'A great deal',
    ], start=1)),
    **refusal_codes,
}

# Apply mapping to the DataFrame, replacing each coded column with its labels.
# Codes that are absent from a map come out as NaN (pandas Series.map with a
# dict), so running this cell a second time on already-labelled columns blanks them.
column_label_maps = {
    'profile_gender': gender_map,
    'profile_racethnicity': race_map,
    'profile_educ5': educ_map,
    'profile_income': income_map,
    'w3lcself': party_map,
    'w3lcd': party_map,
    'w3lcr': party_map,
    'w3trustfox': trust_map,
    'w3trustmscnbc': trust_map,
}
for column, label_map in column_label_maps.items():
    anes[column] = anes[column].map(label_map)

# Define the questions for each variable
ideology_question = ('When it comes to politics, would you describe {} as liberal, '
                     'conservative, or neither liberal nor conservative?')
trust_question = 'How much do you think political information from {} can be trusted?'

questions = {
    'profile_gender': 'What is your gender?',
    'profile_age': 'What is your age?',
    'profile_racethnicity': 'What is your race/ethnicity?',
    'profile_educ5': 'What is your highest level of education?',
    'profile_income': 'What is your income level?',
    'w3lcself': ideology_question.format('yourself'),
    'w3lcd': ideology_question.format('Democratic Party'),
    'w3lcr': ideology_question.format('Republican Party'),
    'w3trustfox': trust_question.format('Fox News'),
    'w3trustmscnbc': trust_question.format('MSNBC'),
}

In [None]:
# Function to save the chat training data to a JSONL file
def save_to_jsonl(data, filename):
    """Write every entry of `data` to `filename`, one JSON document per line.

    An existing file at `filename` is overwritten.
    """
    with open(filename, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

In [None]:
# Prepare chat training data: one user/assistant exchange per survey row
demographic_vars = ['profile_gender', 'profile_age', 'profile_racethnicity',
                    'profile_educ5', 'profile_income']
response_vars = ['w3lcself', 'w3lcd', 'w3lcr', 'w3trustfox', 'w3trustmscnbc']

# The assistant turn is the same fixed sentence for every respondent
assistant_message = "Provide insights based on the traits and responses."

chat_training_data = []
for _, row in anes.iterrows():
    # Each item is "<question text> <respondent's answer>", comma-separated
    demographics = ", ".join(f"{questions[var]} {row[var]}" for var in demographic_vars)
    responses = ", ".join(f"{questions[var]} {row[var]}" for var in response_vars)
    user_message = f"Demographics: {demographics}. Responses: {responses}."

    chat_training_data.append({
        "messages": [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": assistant_message},
        ]
    })

# Save to JSONL format
save_to_jsonl(chat_training_data, "{folder}_chat_training_data.jsonl")

In [None]:
# Function to test if the JSONL file is correctly formatted
def test_jsonl_file(filename, allowed_roles=("user", "assistant")):
    """Check that every line of a JSONL file is a chat-format training record.

    A line is accepted when it parses as a JSON object whose "messages" value
    is a list of objects, each having a "role" and a "content" key and a role
    listed in `allowed_roles`. Validation stops at the first problem, which is
    printed together with its 1-based line number.

    Args:
        filename: Path of the JSONL file to validate.
        allowed_roles: Role values accepted in a message. Pass a wider
            collection (e.g. including "system") if the data uses other roles.

    Returns:
        True if every line is valid, False on the first invalid line or when
        the file does not exist.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, 1):
                try:
                    json_data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Error on line {i}: Invalid JSON format.")
                    return False

                # A line can be valid JSON without being an object (e.g. `5`
                # or `null`); report it instead of failing with a TypeError.
                if not isinstance(json_data, dict) or not isinstance(json_data.get("messages"), list):
                    print(f"Error on line {i}: 'messages' field missing or not a list.")
                    return False

                for message in json_data["messages"]:
                    # Same guard for list entries that are not objects.
                    if not isinstance(message, dict) or "role" not in message or "content" not in message:
                        print(f"Error on line {i}: 'role' or 'content' field missing.")
                        return False
                    if message["role"] not in allowed_roles:
                        print(f"Error on line {i}: Invalid 'role' value.")
                        return False
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return False

    print("File is valid.")
    return True


# The name starts with `test_`, so pytest would try to collect this helper as
# a test case when it is imported into a test module; opt out explicitly.
test_jsonl_file.__test__ = False

# Test the generated JSONL file.
# Uses the same path the training data is written to and uploaded from, so
# the file being validated is the one that is actually sent for fine-tuning.
test_jsonl_file("{folder}_chat_training_data.jsonl")

In [None]:
# Show the first 5 rows of the generated JSONL file
def preview_jsonl_file(filename, num_lines=5):
    """Return the first `num_lines` records of a JSONL file as parsed objects.

    Returns a list of decoded JSON values on success. If the file does not
    exist, or holds fewer than `num_lines` lines, a descriptive message string
    is returned instead of a list.
    """
    try:
        with open(filename, "r") as handle:
            records = []
            for _ in range(num_lines):
                # next() raises StopIteration once the file runs out of lines
                raw_line = next(handle)
                records.append(json.loads(raw_line.strip()))
    except FileNotFoundError:
        return f"File {filename} not found."
    except StopIteration:
        return "The file contains fewer lines than requested."
    return records


In [None]:
# Preview the first record of the generated training file
preview_jsonl_file("{folder}_chat_training_data.jsonl", 1)

In [None]:
# Set the API key by reading from the file.
# The context manager closes the key file; strip() drops any surrounding
# whitespace, including a trailing "\r\n" from a Windows-edited file.
with open("{folder}_apikey.txt", "r") as key_file:
    openai.api_key = key_file.read().strip()

# Upload the JSONL file for fine-tuning; the handle is closed once the upload
# call returns.
with open("{folder}_chat_training_data.jsonl", "rb") as training_file:
    response = openai.File.create(
        file=training_file,
        purpose="fine-tune"
    )

# Print the response to check if the file was uploaded successfully
print(f"File uploaded successfully: {response['id']}")


In [None]:
# Create the fine-tuning job using the provided file ID.
# NOTE(review): the training file ID is hard-coded rather than taken from the
# upload response — confirm it matches the file uploaded for this run.
job_params = {
    "training_file": "file-tsXROXOjcqq798LzzUPIgKbc",
    "model": "gpt-4o-2024-08-06",
}
fine_tune_response = openai.FineTuningJob.create(**job_params)
print("Fine-tuning job created:", fine_tune_response)

In [None]:
# Check the fine-tuning job using its job ID.
# NOTE(review): "{fine_tunning_ID}" looks like a placeholder to be replaced
# with the ID of the job created earlier — confirm before running.
job_id = "{fine_tunning_ID}"
fine_tune_status = openai.FineTuningJob.retrieve(id=job_id)
print("Fine-tuning status:", fine_tune_status)