In [1]:
import os
import re

import numpy as np
import pandas as pd

In [2]:
def generate_answers(df, idx):
    """Render one row of ``df`` as newline-separated ``Q<n>: <value>`` entries.

    Args:
        df: Frame whose columns are numbered in order, starting at Q1.
        idx: Index label of the row to render (looked up with ``df.loc``).

    Returns:
        A single string with one ``Q<n>: <value>`` entry per column, joined by
        newlines; ``n`` is the 1-based column position, not the column name.
    """
    row_values = list(df.loc[idx])
    return "\n".join(
        f"Q{number}: {value}" for number, value in enumerate(row_values, start=1)
    )

## **PHASE1: Generate user persona descriptions using LLM**

In [2]:
from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable so the secret never has to be
# pasted into (and saved with) the notebook. The original placeholder is kept as the
# fallback, so behaviour is unchanged when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

def get_response_attribute(sex, age, prior_questions, prior_answers):
    """Ask the chat model for a five-bullet persona summary of one respondent.

    The prior survey items are grouped into five sections by question number
    (1-4, 5-8, 9-11, 12-15 and 16-19). For each section the question text and
    the respondent's matching answers are embedded in the user prompt.

    Uses the module-level ``client`` to call the chat completions endpoint.

    Args:
        sex: Respondent's sex, interpolated verbatim into the prompt.
        age: Respondent's age, interpolated verbatim into the prompt.
        prior_questions: Sliceable sequence of question strings ordered from
            Q1; the strings of one section are concatenated without separator.
        prior_answers: Text containing ``Q<n>: <answer>`` entries, one per line.

    Returns:
        The ``content`` of the first choice of the chat completion.
    """

    def section_questions(first, last):
        # Question numbers are 1-based and inclusive; sequence positions are 0-based.
        return "".join(prior_questions[first - 1:last])

    poa_questions = section_questions(1, 4)
    coa_questions = section_questions(5, 8)
    eoa_questions = section_questions(9, 11)
    ioa_questions = section_questions(12, 15)
    oac_questions = section_questions(16, 19)

    # Parse every "Q<n>: <answer>" entry once and keep its number for range filtering.
    numbered_answers = [
        (label, answer, int(label[1:]))
        for label, answer in re.findall(r"(Q\d+): ([^\n]+)", prior_answers)
    ]

    def section_answers(first, last):
        # Rendered as the repr of a {"Q<n>": "<answer>"} dict; answers stay strings.
        return str({
            label: answer
            for label, answer, number in numbered_answers
            if first <= number <= last
        })

    poa_answers = section_answers(1, 4)
    coa_answers = section_answers(5, 8)
    eoa_answers = section_answers(9, 11)
    ioa_answers = section_answers(12, 15)
    oac_answers = section_answers(16, 19)

    role = """
        Describe the respondent based on the prior responses. The respondent is a Spanish netizen, answering the survey in early 2017.
    """

    prompt = f"""
        **Data Provided:**

        The survey respondent is {sex} and his(or her) age is {age}.
        The respondent answered in likert scale (1 = Completely disagree to 7 = Completely agree, 4 = Neutral)

        **Survey Sections and Responses:**

        1. **Pleasure induced by online advertising**
        - Survey questions:
            {poa_questions}
        - Respondent's answers:
            {poa_answers}

        2. **Perceived credibility of online advertising**
        - Survey questions:
            {coa_questions}
        - Respondent's answers:
            {coa_answers}

        3. **Economic evaluation of online advertising**
        - Survey questions:
            {eoa_questions}
        - Respondent's answers:
            {eoa_answers}

        4. **Perceived intrusiveness of online advertising**
        - Survey questions:
            {ioa_questions}
        - Respondent's answers:
            {ioa_answers}

        5. **Perceived online advertising clutter**
        - Survey questions:
            {oac_questions}
        - Respondent's answers:
            {oac_answers}

        **Instructions:**
        Consider general perspectives associated with the age and gender, along with prior responses
        - Analyze the questions and answers carefully, aiming to provide insightful summaries that reflect the respondent’s likely views and attitudes.
        - Format your response as a text with 5 bullet points. Each bullet point should briefly summarize the respondent's attitude toward each latent variable.

        **Example Response Format:**
        [
            - Regarding Pleasure induced by online advertising, ~~
            - Regarding Perceived credibility of online advertising, ~~
            - Regarding Economic evaluation of online advertising, ~~
            - Regarding Perceived intrusiveness of online advertising, ~~
            - Regarding Perceived online advertising clutter, ~~
        ]
        """

    # System message sets the describing task; user message carries the survey data.
    messages = [
        {"role": "system", "content": role},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model="gpt-4o", messages=messages)
    return completion.choices[0].message.content

In [3]:
# Read the prior survey questions, one list element per line. readlines() keeps each
# line's trailing newline, which matters because get_response_attribute concatenates the
# questions of a section with an empty separator.
# The encoding is pinned to UTF-8 (as the questions_to_answer.txt read already does)
# so the text does not depend on the platform's default locale encoding.
with open('../../../data/prior_questions.txt', 'r', encoding='utf-8') as file:
    prior_questions = file.readlines()

In [4]:
# Load both survey tables with their first column as the respondent index:
# prior_df comes from dv_total.csv and target_df from idv_total.csv.
prior_df, target_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../../../data/dv_total.csv', '../../../data/idv_total.csv')
)

# Respondent labels in prior_df order; the loops below iterate over this list.
idx_list = [*prior_df.index]

# Sanity check only: a mismatch (including a different order) is reported, not raised.
if not idx_list == [*target_df.index]:
    print("Error!")

In [None]:
progress = 0
descriptions_list = {}  # respondent index -> persona description text (a dict, despite the name)

# enumerate(..., start=1) keeps `progress` equal to the 1-based position of the respondent.
for progress, idx in enumerate(idx_list, start=1):
    if not progress % 50:
        # Printed at the start of every 50th iteration, before that respondent is processed.
        print(f"progres {progress} done")

    sex, age = (prior_df.loc[idx, column] for column in ('Sex', 'Age'))

    # Columns from position 2 onward are formatted as the prior "Q<n>: <value>" answers.
    prior_answers = generate_answers(prior_df.iloc[:, 2:], idx)

    # One chat-completion call per respondent yields the persona description.
    gpt_summary = get_response_attribute(sex, age, prior_questions, prior_answers)
    descriptions_list[idx] = gpt_summary

In [None]:
# Turn the {respondent index: persona text} mapping into a one-column frame.
description_df = pd.DataFrame.from_dict(
    descriptions_list,
    orient='index',
    columns=['description'],
)

# Place Sex and Age next to each persona, aligned on the respondent index.
final_description_df = pd.concat(
    objs=[prior_df[['Sex', 'Age']], description_df],
    axis='columns',
)

# save_path is prepended to the file name without a separator, so it has to end with
# one (or be meant as a file-name prefix).
save_path = 'PATH_TO_SAVE_PERSONA_DESCRIPTON'
final_description_df.to_csv(f'{save_path}description.csv')

## **PHASE2: Generate responses using USER PERSONA (generated by LLM)**

In [5]:
from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable so the secret never has to be
# pasted into (and saved with) the notebook. The original placeholder is kept as the
# fallback, so behaviour is unchanged when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

def get_survey_response(description, questions_to_answer):
    """Have the chat model answer the target survey questions as the described respondent.

    Uses the module-level ``client`` to call the chat completions endpoint.

    Args:
        description: Text describing the respondent's attitudes; inserted
            verbatim into the user prompt.
        questions_to_answer: Text of the questions to answer; inserted
            verbatim into the user prompt.

    Returns:
        The ``content`` of the first choice of the chat completion. The prompt
        requests ``'Q<number>': <Answer> (reason)`` entries, but the reply is
        returned unparsed.
    """
    role=f"""
        You are a Spanish netizen, answering this survey in early 2017. Answer all questions based on the given description to maintain consisteny with the respondent. 
        """
    prompt = f"""
        **Data Provided**

        Description about respondent's attitudes toward online advertising:
        {description}

        **Instructions:**

        Consider general perspectives associated with your age and gender, along with you attitudes described in the given description
        - Answer each question considering general perspectives associated with your age and gender, along with the description.
        - Format each response as follows: "'Q<number>': <Answer> (reason for answer)". Strictly adhere to the required response format without adding extra text or elaboration outside this structure.

        **Survey Questions to Answer:**
        {questions_to_answer}

        ***Example Response:**
        [
            'Q1': 7 (The respondent shows a very strong positive attitude towards this statement based on prior answers),
            'Q2': 1 (Indicating strong skepticism or disagreement with online ad reliability based on demographic information),
            'Q3': 4 (Showing a neutral stance as in prior responses)
        ]

        Use a Likert scale from 1 to 7 (1 = Completely disagree to 7 = Completely agree) for each answer.
        Respond in a structured format, outlining each step to form a well-supported evaluation.
        """

    # System message sets the respondent role; user message carries persona and questions.
    messages = [
        {"role": "system", "content": role},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model="gpt-4o", messages=messages)
    return completion.choices[0].message.content

In [6]:
# Output prefix for the per-respondent result files and the final response table;
# it is prepended to file names without a separator.
save_path = 'PATH_TO_SAVE_RESPONSES'

# Persona descriptions written by Phase 1 (index = respondent).
desc_path = 'DESCRIPTION_PATH'
description_df = pd.read_csv(f'{desc_path}description.csv', index_col=0)

# Human answers to the target questions.
# NOTE(review): Phase 1 reads idv_total.csv from '../../../data/' — confirm this
# shorter relative path is intended.
target_df = pd.read_csv('../../idv_total.csv', index_col=0)

# The target questions are kept as one block of text and pasted into the prompt as-is.
with open(f"../../questions_to_answer.txt", "r", encoding="utf-8") as file:
    questions_to_answer = file.read()

In [None]:
progress = 0
answer_list = {}  # respondent index -> list of integer answers parsed from the LLM reply

# enumerate(..., start=1) keeps `progress` equal to the 1-based position of the respondent.
for progress, idx in enumerate(idx_list, start=1):
    if not progress % 50:
        # Printed at the start of every 50th iteration, before that respondent is processed.
        print(f"progres {progress} done")

    # NOTE(review): sex and age are read but not passed to get_survey_response.
    sex = description_df.loc[idx, 'Sex']
    age = description_df.loc[idx, 'Age']
    description = description_df.loc[idx, 'description']

    # LLM side: take the integer that follows each quoted 'Q<n>': label in the raw reply.
    gpt_answers_raw = get_survey_response(description, questions_to_answer)
    gpt_answers = [
        int(score)
        for score in re.findall(r"[\"']Q\d+[\"']:\s*(\d+)", gpt_answers_raw)
    ]

    # Human side: take the integers out of the "Q<n>: <value>" rendering of the target row.
    human_answers_with_Q = generate_answers(target_df, idx)
    human_answers = [
        int(score)
        for score in re.findall(r': (\d+)', str(human_answers_with_Q))
    ]

    answer_list[idx] = gpt_answers

    # The raw reply is always saved; the per-question comparison only when both sides
    # produced the same number of answers.
    result_columns = {"raw_answer": gpt_answers_raw}
    if len(gpt_answers) != len(human_answers):
        print(f"The lengths of gpt answers and human answers for {idx} do not match.")
        result_columns["human_answers"] = human_answers
    else:
        # 1 where the LLM answer equals the human answer, else 0.
        check = [int(llm == human) for llm, human in zip(gpt_answers, human_answers)]
        result_columns["gpt_answers"] = gpt_answers
        result_columns["human_answers"] = human_answers
        result_columns["check"] = check

    df = pd.DataFrame(result_columns)
    df.to_csv(f'{save_path}idx_{idx}_result.csv', index=False)
    

In [11]:
# Column labels of the exported table, one per target question.
answer_columns = ["Q1", "Q2", "Q3", "Q4"]

# The parsing loop can store replies with more or fewer answers than expected (it
# already reports length mismatches), so list them here instead of failing below.
malformed = [key for key, answers in answer_list.items() if len(answers) != len(answer_columns)]
if malformed:
    print(f"{len(malformed)} respondent(s) without exactly {len(answer_columns)} parsed answers: {malformed}")

# Passing columns= directly to from_dict raises when the widest stored list is not
# exactly len(answer_columns) long. Build the frame first, then pad missing positions
# with NaN and drop surplus ones, so the table is still written after the API run.
# When every reply has four answers the result is identical to the original call.
gpt_aoa = pd.DataFrame.from_dict(answer_list, orient="index")
gpt_aoa = gpt_aoa.reindex(columns=range(len(answer_columns)))
gpt_aoa.columns = answer_columns
gpt_aoa.to_csv(f'{save_path}llm_responses.csv')