In [14]:
import numpy as np
import pandas as pd
import re
import os
from openai import OpenAI
import warnings
warnings.filterwarnings("ignore")

In [15]:
# Load the survey responses from the Excel workbook; the relative path is
# resolved against the notebook's working directory.
df = pd.read_excel("DataInBrief_Bankdata.xlsx")
# Bare last expression: the notebook renders a preview of the loaded frame.
df

Unnamed: 0,A_1_AGE,LOY_1,QUAL_1,QUAL_2,QUAL_4,QUAL_5,QUAL_6,QUA_7,PERF_1,PERF_2,...,LOY_3,TRUST_1,TRUST_2,TRUST_3,TRUST_4,D_1_GENDER,D_2_FAMILY,D_3_EDUCATION,D_4_EMPLOYMENT,D_6_INCOME
0,4,7,7,7,7,6,6,6,7,7,...,4,7,7,7,7,1,4,2,3,4
1,5,6,4,4,5,4,4,4,4,4,...,3,4,4,4,4,1,0,6,0,0
2,3,7,7,7,7,7,7,1,7,7,...,7,7,7,7,7,0,4,5,7,4
3,3,7,7,7,7,6,6,4,7,6,...,7,6,6,4,4,1,4,3,7,4
4,2,7,6,6,6,5,6,4,7,7,...,4,7,6,6,6,1,2,6,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,5,7,7,7,7,7,7,7,7,7,...,4,7,7,7,7,1,5,2,7,2
671,4,6,5,4,5,4,3,4,5,4,...,2,7,6,4,5,1,1,3,2,0
672,6,6,5,5,4,4,4,3,5,5,...,4,5,4,5,4,0,4,6,2,5
673,5,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,1,6,6,7,3


In [16]:
# Columns kept for the analysis, in a fixed order: the six demographic items
# first (positions 0-5), then the LIKE / COMP / SAT / TRUST item groups, and the
# three LOY items last (positions 18-20).
# NOTE(review): the order matters if any downstream code selects columns by
# position, so do not reorder without checking positional selections.
ordered_columns = ['A_1_AGE','D_1_GENDER','D_2_FAMILY','D_3_EDUCATION','D_4_EMPLOYMENT','D_6_INCOME',
    'LIKE_1', 'LIKE_2',
    'COMP_1', 'COMP_2', 'COMP_3',
    'SAT_1', 'SAT_2', 'SAT_3',
    'TRUST_1', 'TRUST_2', 'TRUST_3', 'TRUST_4',
    'LOY_1', 'LOY_2', 'LOY_3',
]

In [17]:
# Restrict the frame to the selected columns and put them in the order given by
# `ordered_columns` (raises KeyError if any listed column is missing).
df = df[ordered_columns]

In [18]:
# Demographic columns only. `.copy()` makes this an independent frame, so later
# cells can modify it without touching `df` or triggering pandas'
# SettingWithCopyWarning (which the global warnings filter would otherwise hide).
df_demo = df[['A_1_AGE','D_1_GENDER','D_2_FAMILY','D_3_EDUCATION','D_4_EMPLOYMENT','D_6_INCOME']].copy()

In [19]:
# Code book for the demographic columns: for each column name, a mapping from the
# integer code stored in the data to its human-readable label. Used with
# Series.map, so any code that is not listed here is decoded to NaN.
# NOTE(review): the data preview shows only the values 0 and 1 in D_1_GENDER,
# which would decode to "Prefer not to answer" / "Male" -- confirm this coding
# against the data set's official code book.
demo_dict={
    "A_1_AGE": {
        0: "<18",
        1: "18-24",
        2: "25-34",
        3: "35-44",
        4: "45-54",
        5: "55-65",
        6: ">65"
    },
    "D_1_GENDER": {
        0: "Prefer not to answer",
        1: "Male",
        2: "Female",
        3: "Diverse"
    },
    "D_2_FAMILY": {
        0: "Prefer not to answer",
        1: "Living alone",
        2: "Living with a partner",
        3: "Registered civil partnership",
        4: "Married",
        5: "Divorced",
        6: "Widowed"
    },
    "D_3_EDUCATION": {
        0: "Prefer not to answer",
        1: "No education",
        2: "Completed 9th grade",
        3: "Completed 10th grade",
        4: "Completed 12th grade",
        5: "High School Diploma",
        6: "Vocational training",
        7: "University degree"
    },
    "D_4_EMPLOYMENT": {
        0: "Prefer not to answer",
        1: "Unemployed",
        2: "Retired",
        3: "Houseman/housewife",
        4: "In education",
        5: "Studying at a university",
        6: "Self-employed",
        7: "Employed"
    },
    "D_6_INCOME": {
        0: "Prefer not to answer",
        1: "< EUR 750",
        2: "EUR 750–1250",
        3: "EUR 1250–2000",
        4: "EUR 2000–3500",
        5: "EUR 3500–5000",
        6: "> EUR 5000"
    }
}


In [20]:
# Decode the numeric survey codes into their text labels.
# The decoded frame is rebuilt from the untouched numeric `df` instead of being
# assigned into `df_demo` column by column. This keeps the cell idempotent:
# re-running it no longer maps already-decoded strings a second time (which would
# turn every value into NaN), and nothing is written into a slice of `df`.
# Codes that are missing from `demo_dict` still become NaN (Series.map semantics).
df_demo = df[df_demo.columns].apply(lambda codes: codes.map(demo_dict[codes.name]))


In [21]:
# Survey questions the LLM is asked to answer. The text is inserted verbatim into
# the prompt, and the parsed answers are compared with the human answers in the
# LOY_1..LOY_3 columns, so keep exactly three questions numbered Q1-Q3.
questions_to_ans='''
Q1. How likely is it that you will remain a customer of your bank?
Q2. I will purchase new banking products in the future
Q3. In the future, I will make use of other banking products or financial services offered by my bank 
'''

In [22]:
def generate_answers(df, idx):
    """Format one row of ``df`` as numbered answer lines.

    Args:
        df: DataFrame holding the answer columns of interest.
        idx: Index label of the row to format (looked up with ``df.loc``).

    Returns:
        A string with one line per value in the row, in column order, of the
        form ``"Q1: <value>\\nQ2: <value>..."``; numbering starts at 1.
    """
    row_values = list(df.loc[idx])
    return "\n".join(
        f"Q{position}: {value}"
        for position, value in enumerate(row_values, start=1)
    )

In [23]:
# Demographic form
def generate_demo_txt(demo_ans):
    """Render the decoded demographic answers as a question/answer text block.

    Args:
        demo_ans: Indexable with positions 0-5 holding, in order, the answers
            for age, gender, marital status, education, occupation and income.

    Returns:
        A multi-line string with the six questions (Q1-Q6), each followed by its
        answer line (A1-A6).
    """
    age, gender, marital_status, education, occupation, income = (
        demo_ans[position] for position in range(6)
    )
    text=f"""
    Q1: How old are you?
    A1:{age}

    Q2: What gender are you?
    A2:{gender}

    Q3: What is your marital status?
    A3:{marital_status}

    Q4: What is your highest educational qualification?
    A4:{education}

    Q5: What best describes your main occupation at the moment?
    A5:{occupation}

    Q6:In which of the following groups does your monthly net income fall?
    A6:{income}
    """
    return text

In [None]:
# Generate responses for the surveys.
# The API key is read from the OPENAI_API_KEY environment variable so the real
# secret never has to be written into (and committed with) the notebook. The
# original placeholder is kept as a fallback for backward compatibility.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR-API-KEY"))

def get_survey_response(demo_info, questions_to_ans):
    """Ask the chat model to answer the survey questions as a given respondent.

    Sends one chat-completion request (model ``gpt-4o``) through the module-level
    ``client``: the system message sets the respondent persona, the user message
    carries the demographic profile, the answering instructions and the questions.

    Args:
        demo_info: Text describing the respondent's demographic answers.
        questions_to_ans: Text block with the questions the model must answer.

    Returns:
        The text content of the first returned choice. An empty string is
        returned when the API delivers no text content (``content`` is ``None``),
        so callers can always treat the result as a string.
    """
    role = f"""
        You are a German online survey respondent, customer of cooperative banks in Germany, evaluating your main bank with regard to several characteristics. Answer all questions based on the provided information to maintain a consistent and personalized answering pattern. Do not skip any questions.
    """
    # NOTE(review): the prompt states the 1-7 scale three times with slightly
    # different anchor wording and both forbids and requests extra elaboration.
    # It is deliberately left byte-for-byte unchanged, because editing it would
    # change the generated data.
    prompt = f"""
        Survey Respondent Information:
        - {demo_info}

        **Instructions:**

        Consider general perspectives associated with your demographic information.
        - Answer each question considering general perspectives associated with your demographic information. 
        - Format each response as follows: "'Q<number>': <Answer> (reason for answer)". Strictly adhere to the required response format without adding extra text or elaboration outside this structure.

        **Survey Questions to Answer:**
        There are 3 questions to answer.
        {questions_to_ans}
        (The responses are on a Likert scale from 1 to 7, where 1 = Do not at all agree, 4 = Neutral, 7 = Do completely agree)
        For **question 1**, use a Likert scale from 1 to 7, where 1=Very unlikely, 4= Neutral, 7=Very likely.
        **Example Response:**
        [
        'Q1': 6 (The respondent has shown a generally positive attitude towards main bank, and this question aligns with that tendency, so a higher value is expected),
        'Q2': 2 (The respondent tends to be skeptical about main bank, reflected in their prior answers, and this question aligns with that skepticism),
        'Q3': 4 (Neutral response, based on the respondent's previous tendency to give neutral answers on similar questions)
        ]

        Use a Likert scale from 1 to 7 (1 = Completely disagree to 7 = Completely agree) for each answer.
        Respond in a structured format, outlining each step to form a well-supported evaluation.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": role},
            {"role": "user", "content": prompt}
        ]
    )

    result = response.choices[0].message.content
    # `content` is optional in the API response; normalise a missing value to ""
    # so that downstream regex parsing does not fail with a TypeError.
    return result if result is not None else ""

In [None]:
# Query the LLM once per respondent, parse its answers and write one comparison
# file (LLM vs. human answers) per respondent into `save_path`.
answer_list = {}  # respondent index -> list of parsed LLM answers (ints)
save_path = 'YOUR-PATH'
os.makedirs(save_path, exist_ok=True)
idx_list = list(df_demo.index)

# Human reference answers: the three loyalty items, selected by name rather than
# by column position so the selection does not depend on the column order.
human_answer_columns = ['LOY_1', 'LOY_2', 'LOY_3']
human_answers_df = df[human_answer_columns]

for progress, idx in enumerate(idx_list, start=1):
    if progress%50 == 0:
        print(f"progres {progress} done")

    demo_ans = df_demo.loc[idx, 'A_1_AGE':'D_6_INCOME'].values.tolist()
    demo_info = generate_demo_txt(demo_ans)
    gpt_answers_raw = get_survey_response(demo_info, questions_to_ans)  # Generate LLM survey responses
    # Pull the integer that follows each quoted question tag, e.g. 'Q1': 6
    gpt_answers = [int(m) for m in re.findall(r"[\"']Q\d+[\"']:\s*(\d+)", gpt_answers_raw)]
    human_answers_with_Q = generate_answers(human_answers_df, idx)  # Original human answers to compare
    human_answers = [int(m) for m in re.findall(r': (\d+)', str(human_answers_with_Q))]

    answer_list[idx] = gpt_answers

    if len(gpt_answers) == len(human_answers):
        # 1 where the LLM answer equals the human answer, else 0
        check = [int(gpt == human) for gpt, human in zip(gpt_answers, human_answers)]
        results = pd.DataFrame({
            "raw_answer": gpt_answers_raw,
            "gpt_answers": gpt_answers,
            "human_answers": human_answers,
            "check": check
        })
    else:
        print(f"Index {idx} gpt_answers length does not match to human_answers.")
        results = pd.DataFrame({
            "raw_answer": gpt_answers_raw,
            "human_answers": human_answers,
        })

    # os.path.join inserts the path separator that the former f'{save_path}idx_...'
    # string omitted; without it the files were written next to the output
    # directory instead of into it.
    results.to_csv(os.path.join(save_path, f'idx_{idx}_result.csv'), index=False)

In [None]:
# Collect the parsed LLM answers into one table with one row per respondent.
question_labels = ["Q1", "Q2", "Q3"]
n_questions = len(question_labels)

# A respondent may have yielded fewer or more parsed answers than questions.
# Building the frame with a fixed column list fails on rows that are too long,
# so report such respondents and normalise every row to exactly three entries
# (short rows are padded with missing values, long rows keep the first three).
irregular_idx = [idx for idx, answers in answer_list.items() if len(answers) != n_questions]
if irregular_idx:
    print(f"{len(irregular_idx)} respondents without exactly {n_questions} parsed answers: {irregular_idx}")

normalized_answers = {
    idx: (list(answers) + [None] * n_questions)[:n_questions]
    for idx, answers in answer_list.items()
}
gpt_aoa = pd.DataFrame.from_dict(normalized_answers, orient="index", columns=question_labels)
# os.path.join adds the path separator that f'{save_path}llm_responses.csv' omitted.
gpt_aoa.to_csv(os.path.join(save_path, 'llm_responses.csv'))