In [1]:
import os
import re

import numpy as np
import pandas as pd

In [2]:
from openai import OpenAI

# Shared OpenAI client used by get_survey_response.
# The key is taken from the OPENAI_API_KEY environment variable so the real secret
# never has to be pasted into (and saved with) the notebook. The literal placeholder
# is kept only as a fallback so the original edit-in-place workflow still works.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

def get_survey_response(sex, age, questions_to_answer):
    """Ask the chat model to answer the survey as a respondent of the given sex and age.

    Args:
        sex: Value interpolated into the prompt as the respondent's sex.
        age: Value interpolated into the prompt as the respondent's age.
        questions_to_answer: Text of the survey questions, inserted verbatim
            into the prompt.

    Returns:
        The message content of the first completion choice, exactly as the
        API returned it (no parsing is done here).
    """
    # System message: fixes the persona for the whole conversation.
    system_message = """
        You are a Spanish netizen, answering this survey in early 2017. Answer all questions based on the provided demographic information(gender and age).
    """
    # User message: demographics, answer-format rules, the questions and an example.
    user_message = f"""
        **Data Provided:**

        The survey respondent is {sex} and his(or her) age is {age}.

        **Instructions:**

        Consider general perspectives associated with your age and gender.
        - Answer each question considering general perspectives associated with your age and gender. 
        - Format each response as follows: "'Q<number>': <Answer> (reason for answer)". Strictly adhere to the required response format without adding extra text or elaboration outside this structure.


        **Survey Questions to Answer:**
        {questions_to_answer}

        **Example Response:**
        [
            'Q1': 7 (The respondent might show a very strong positive attitude towards this statement based on his age.),
            'Q2': 1 (May indicate strong skepticism or disagreement with online ad reliability),
            'Q3': 4 (Showing a neutral stance based in respondent's gender).
        ]

        Use a Likert scale from 1 to 7 (1 = Completely disagree to 7 = Completely agree) for each answer.
        Respond in a structured format, outlining each step to form a well-supported evaluation.

    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(model="gpt-4o", messages=conversation)

    # Only the first choice is used; its text is handed back untouched.
    return completion.choices[0].message.content

In [3]:
def generate_answers(df, idx):
    """Render the values of row ``idx`` as numbered ``Q<n>: <value>`` lines.

    Args:
        df: DataFrame holding one row of answers per index label.
        idx: Index label of the row to render.

    Returns:
        A newline-joined string with one ``Q<n>: <value>`` entry per value in
        the row, numbered from 1 in column order.
    """
    numbered_values = enumerate(df.loc[idx], start=1)
    return "\n".join(f"Q{number}: {value}" for number, value in numbered_values)

In [4]:
# Load both survey tables, using the first CSV column of each as the row index.
prior_df, target_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../../../data/dv_total.csv', '../../../data/idv_total.csv')
)

# Row labels of the first table drive the generation loop; the second table is
# expected to carry the same labels in the same order (a mismatch is only reported).
idx_list = list(prior_df.index)
indexes_match = list(target_df.index) == idx_list
if not indexes_match:
    print("Error!")

In [5]:
# Read both question sheets as raw UTF-8 text; they are used verbatim downstream.
with open("../../prior_questions.txt", "r", encoding="utf-8") as prior_questions_file:
    prior_questions = prior_questions_file.read()

with open("../../questions_to_answer.txt", "r", encoding="utf-8") as target_questions_file:
    questions_to_answer = target_questions_file.read()

In [None]:
# Query the model once per row label, parse its answers, and write one result
# CSV per label. `answer_list` keeps the parsed LLM answers for the final export.
save_path = 'PATH_TO_SAVE_RESPONSES'
answer_list = {}
progress = 0

# An LLM answer looks like 'Q<n>': <digits>; a human answer line looks like Q<n>: <digits>.
llm_answer_pattern = re.compile(r"[\"']Q\d+[\"']:\s*(\d+)")
human_answer_pattern = re.compile(r': (\d+)')

for progress, idx in enumerate(idx_list, start=1):
    # Report progress every 50 rows.
    if progress % 50 == 0:
        print(f"progres {progress} done")

    # Responses generated by the LLM for this row's sex and age.
    gpt_answers_raw = get_survey_response(
        prior_df.loc[idx, 'Sex'],
        prior_df.loc[idx, 'Age'],
        questions_to_answer,
    )
    gpt_answers = [int(score) for score in llm_answer_pattern.findall(gpt_answers_raw)]

    # Original human responses, rendered to text and parsed back to integers.
    human_answers_with_Q = generate_answers(target_df, idx)
    human_answers = [
        int(score) for score in human_answer_pattern.findall(str(human_answers_with_Q))
    ]

    answer_list[idx] = gpt_answers

    if len(gpt_answers) == len(human_answers):
        # Same number of answers: add a per-question exact-match flag (1 or 0).
        result_columns = {
            "raw_answer": gpt_answers_raw,
            "gpt_answers": gpt_answers,
            "human_answers": human_answers,
            "check": [int(llm == human) for llm, human in zip(gpt_answers, human_answers)],
        }
    else:
        # Counts differ: keep only the raw reply and the human answers.
        print(f"The lengths of gpt answers and human answers for {idx} do not match.")
        result_columns = {
            "raw_answer": gpt_answers_raw,
            "human_answers": human_answers,
        }

    pd.DataFrame(result_columns).to_csv(f'{save_path}idx_{idx}_result.csv', index=False)
    

In [11]:
# Export the parsed LLM answers, one row per index label and one column per question.
#
# The generation loop tolerates replies with the wrong number of answers, but a fixed
# four-name `columns` list makes from_dict raise ValueError whenever the widest parsed
# list is not exactly four items long. Rows are therefore padded with None (rendered as
# missing values) and extra Q5, Q6, ... columns are added only if some reply is longer
# than expected. When every reply has four answers the output is unchanged.
expected_question_count = 4
widest_reply = max(
    [expected_question_count, *(len(answers) for answers in answer_list.values())]
)
question_columns = [f"Q{number}" for number in range(1, widest_reply + 1)]
padded_answers = {
    row_label: list(answers) + [None] * (widest_reply - len(answers))
    for row_label, answers in answer_list.items()
}

gpt_aoa = pd.DataFrame.from_dict(padded_answers, orient="index", columns=question_columns)
gpt_aoa.to_csv(f'{save_path}llm_responses.csv')