<a href="https://colab.research.google.com/github/antndlcrx/LLM-for-Social-Science-Research/blob/surveys-experimental/survey_mappings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import json
import random

In [None]:
#@title Helper Functions


def create_random_profile(survey_mappings: dict,
                          max_features: int = 3,
                          fixed_features: dict = None) -> "tuple[dict, str | None]":
    """
    Build a random, flattened respondent profile and pick one of its features as the response.

    Sections listed in `fixed_features` always contribute exactly the listed features
    (those that exist in `survey_mappings`). From the remaining sections a random,
    non-empty subset is drawn, and each drawn section contributes between 0 and
    `max_features` randomly chosen features.

    Parameters
    ----------
    survey_mappings : dict
        Nested mapping ``{section: {feature: feature_data}}``.
    max_features : int
        Upper bound on the number of features sampled from each non-fixed section.
    fixed_features : dict, optional
        ``{section: [feature, ...]}`` to include in every profile. Unknown sections
        and features are ignored.

    Returns
    -------
    tuple
        ``(flattened_profile, response_feature)`` where `flattened_profile` maps
        ``"<section>_<feature>"`` to the feature's mapping data, and `response_feature`
        is one randomly chosen key of that dict (``None`` when the profile is empty).
    """
    fixed_features = fixed_features or {}

    # Fixed part of the profile: keep only sections/features the mappings know about.
    respondent_profile = {
        section: {
            feature: survey_mappings[section][feature]
            for feature in features
            if feature in survey_mappings[section]
        }
        for section, features in fixed_features.items()
        if section in survey_mappings
    }

    # Keep the mappings' own key order rather than going through a set difference:
    # set iteration order of strings depends on per-process hash randomisation, which
    # would make `random.seed(...)` runs irreproducible across sessions.
    remaining_sections = [
        section for section in survey_mappings if section not in respondent_profile
    ]

    if remaining_sections:
        num_sections = random.randint(1, len(remaining_sections))
        for section in random.sample(remaining_sections, k=num_sections):
            features = list(survey_mappings[section])
            max_k = min(max_features, len(features))
            if max_k <= 0:
                continue
            # k may be 0, in which case the section is left out of the profile.
            selected_features = random.sample(features, k=random.randint(0, max_k))
            if selected_features:
                respondent_profile[section] = {
                    feature: survey_mappings[section][feature]
                    for feature in selected_features
                }

    # Flatten to "<section>_<feature>" keys and pick one of them as the response variable.
    flattened_profile = {
        f"{section}_{feature}": data
        for section, features in respondent_profile.items()
        for feature, data in features.items()
    }
    response_feature = random.choice(list(flattened_profile)) if flattened_profile else None

    return flattened_profile, response_feature


def generate_flattened_profiles(df: pd.DataFrame,
                                id_var: str,
                                survey_mappings: dict,
                                max_features: int = 3,
                                fixed_features: dict = None) -> pd.DataFrame:
    """
    Draw a random profile for every row of `df` and fill it with that respondent's answers.

    For each respondent a profile is created with `create_random_profile`; every profile
    key except the chosen response feature becomes a column holding the respondent's
    value for the corresponding `df` column.

    Parameters
    ----------
    df : pd.DataFrame
        Survey data with one row per respondent.
    id_var : str
        Name of the `df` column holding the respondent identifier.
    survey_mappings : dict
        Nested mapping ``{section: {feature: feature_data}}``.
    max_features : int
        Forwarded to `create_random_profile`.
    fixed_features : dict, optional
        Forwarded to `create_random_profile`.

    Returns
    -------
    pd.DataFrame
        Indexed by ``respondent_id``, with a ``response_feature`` column plus one
        ``"<section>_<feature>"`` column per feature used by at least one respondent.
        An empty `df` yields an empty frame with the same index name and a
        ``response_feature`` column.
    """

    def _column_name(feature_key: str) -> str:
        # Profile keys are "<section>_<feature>", but section names may themselves
        # contain underscores (e.g. "well_being_emot_attachment"), so splitting on the
        # first "_" gives a wrong column name. Resolve against the known sections.
        for section, features in survey_mappings.items():
            prefix = f"{section}_"
            if feature_key.startswith(prefix) and feature_key[len(prefix):] in features:
                return feature_key[len(prefix):]
        # Unknown key: fall back to the naive split.
        return feature_key.split('_', 1)[-1]

    profiles = []

    for _, respondent in df.iterrows():
        profile, response_feature = create_random_profile(
            survey_mappings, max_features=max_features, fixed_features=fixed_features
        )

        row = {
            'respondent_id': respondent[id_var],
            'response_feature': response_feature,
        }
        # The response feature is deliberately left out of the profile columns.
        for feature_key in profile:
            if feature_key != response_feature:
                row[feature_key] = respondent.get(_column_name(feature_key), None)

        profiles.append(row)

    if not profiles:
        # pd.DataFrame([]) has no 'respondent_id' column, so set_index would raise.
        return pd.DataFrame(columns=['respondent_id', 'response_feature']).set_index('respondent_id')

    return pd.DataFrame(profiles).set_index('respondent_id')


def create_preamble(profile: dict, survey_mappings: dict, response_feature: str) -> str:
    """
    Build the preamble text ("description: value, ...") for one respondent's profile.

    Parameters
    ----------
    profile : dict or pd.Series
        Maps ``"<section>_<feature>"`` keys to the respondent's raw values. A
        ``'response_feature'`` entry, if present, is ignored.
    survey_mappings : dict
        Nested mapping ``{section: {feature: feature_data}}``; `feature_data` may carry
        a ``"description"`` and a ``"values"`` dict keyed by the stringified raw value.
    response_feature : str
        Profile key reserved for the response; it is never included in the preamble.

    Returns
    -------
    str
        Comma-separated ``"<description>: <value text>"`` parts. Entries whose value is
        missing (None/NaN) or whose key is not found in `survey_mappings` are skipped.
    """

    def _lookup(feature_key):
        # Section names may contain underscores (e.g. "well_being_emot_attachment"),
        # so match the key against the known sections instead of splitting on "_".
        for section, features in survey_mappings.items():
            prefix = f"{section}_"
            if feature_key.startswith(prefix) and feature_key[len(prefix):] in features:
                name = feature_key[len(prefix):]
                return name, features[name]
        return None, None

    preamble_parts = []

    for feature_key, feature_value in profile.items():
        if feature_key == 'response_feature' or feature_key == response_feature:
            continue

        # A row taken from a profiles DataFrame holds NaN for every feature that was not
        # drawn for this respondent; those must not show up as "...: nan".
        if pd.api.types.is_scalar(feature_value) and pd.isna(feature_value):
            continue

        feature, feature_data = _lookup(str(feature_key))
        if not feature_data:
            continue

        description = feature_data.get("description", feature)
        values_map = feature_data.get("values") or {}

        # Columns containing NaN are upcast to float, so a code stored as 4 arrives as
        # 4.0; retry the lookup with the integer form when "4.0" is not a key.
        value_key = str(feature_value)
        if (value_key not in values_map
                and isinstance(feature_value, float)
                and feature_value.is_integer()):
            value_key = str(int(feature_value))

        value_text = values_map.get(value_key, value_key)
        preamble_parts.append(f"{description}: {value_text}")

    return ", ".join(preamble_parts)


def create_response(df: pd.DataFrame, respondent_id, survey_mappings: dict, response_feature: str) -> str:
    """
    Build the "question: answer" text for a respondent's pre-selected response feature.

    Parameters
    ----------
    df : pd.DataFrame
        Survey data whose index contains `respondent_id` and whose columns are the raw
        feature names.
    respondent_id
        Index label of the respondent in `df`.
    survey_mappings : dict
        Nested mapping ``{section: {feature: feature_data}}``; `feature_data` may carry
        a ``"question"`` and a ``"values"`` dict keyed by the stringified raw value.
    response_feature : str
        Profile key of the form ``"<section>_<feature>"``.

    Returns
    -------
    str
        ``"<question>: <answer text>"``, or a fixed fallback sentence when no response
        feature is given or it cannot be found in `survey_mappings`.

    Raises
    ------
    KeyError
        If `respondent_id` or the feature column is not present in `df`.
    """
    if not response_feature or not isinstance(response_feature, str):
        return "No response feature available."

    # Section names may contain underscores (e.g. "well_being_emot_attachment"), so the
    # key is matched against the known sections rather than split on the first "_".
    feature, feature_data = None, None
    for section, features in survey_mappings.items():
        prefix = f"{section}_"
        if response_feature.startswith(prefix) and response_feature[len(prefix):] in features:
            feature = response_feature[len(prefix):]
            feature_data = features[feature]
            break

    if not feature_data:
        return "No available question for the selected response."

    question = feature_data.get("question", "No question available")
    respondent_value = df.loc[respondent_id, feature]
    values_map = feature_data.get("values") or {}

    # Float-typed columns deliver codes as e.g. 4.0; retry with the integer form when
    # the float spelling is not a key of the value map.
    value_key = str(respondent_value)
    if (value_key not in values_map
            and isinstance(respondent_value, float)
            and respondent_value.is_integer()):
        value_key = str(int(respondent_value))

    response_text = values_map.get(value_key, value_key)

    return f"{question}: {response_text}"


def generate_text_profiles(df: pd.DataFrame, profiles_df: pd.DataFrame, survey_mappings: dict, id_var: str) -> dict:
    """
    Render every row of `profiles_df` as a single text line with a preamble and a response.

    Parameters
    ----------
    df : pd.DataFrame
        Survey data. If it has an `id_var` column, rows are looked up by that column;
        otherwise `df` is assumed to be indexed by respondent id already.
    profiles_df : pd.DataFrame
        One row per respondent, indexed by respondent id, with a ``response_feature``
        column and one column per profile feature.
    survey_mappings : dict
        Nested mapping ``{section: {feature: feature_data}}``.
    id_var : str
        Name of the respondent-id column in `df`.

    Returns
    -------
    dict
        ``{respondent_id: text}`` where text has the form
        ``respondent id: <id>, preamble: "<...>", response: "<...>"``.
    """
    # create_response looks respondents up with df.loc[respondent_id, ...]; with the
    # default RangeIndex that would address rows by position label, not by id.
    # drop=False keeps the id column available as a regular feature column.
    if id_var in df.columns:
        lookup_df = df.set_index(id_var, drop=False)
    else:
        lookup_df = df

    text_profiles = {}

    for respondent_id, profile in profiles_df.iterrows():
        # `profile` is a per-row copy, so popping does not modify profiles_df.
        response_feature = profile.pop('response_feature')
        preamble = create_preamble(profile, survey_mappings, response_feature)
        response = create_response(lookup_df, respondent_id, survey_mappings, response_feature)

        text_profile = f"respondent id: {respondent_id}, preamble: \"{preamble}\", response: \"{response}\""
        text_profiles[respondent_id] = text_profile

    return text_profiles


In [None]:
# Folder holding one JSON mapping file per survey section.
directory = 'mappings'

# {section_name: parsed JSON content}, where section_name is the file name without ".json".
survey_mappings = {}

# os.listdir returns entries in arbitrary order; sorting keeps the section order (and
# anything sampled from it later) stable across runs and machines.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.json'):
        section_name = os.path.splitext(filename)[0]

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            survey_mappings[section_name] = json.load(file)


In [None]:
# Load the survey data into `ess`.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the CSV
# was saved together with its index; consider `index_col=0` if that column is not needed.
ess = pd.read_csv('ESS10.csv')

  ess = pd.read_csv('ESS10.csv')


In [None]:
# Report the dataset dimensions (rows, columns) and preview the first two rows.
print(ess.shape)
ess.head(2)

(37611, 618)


Unnamed: 0,name,essround,edition,proddate,idno,cntry,dweight,pspwght,pweight,anweight,...,vinwe,inwde,jinws,jinwe,inwtm,mode,domain,prob,stratum,psu
0,ESS10e03_2,10,3.2,02.11.2023,10038,BE,0.88222,0.972276,0.718075,0.698167,...,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,36.0,1,1.0,0.000397,188,2596
1,ESS10e03_2,10,3.2,02.11.2023,10053,BE,1.047643,0.888635,0.718075,0.638107,...,2022-04-08 11:07:00,2022-04-08 11:10:00,2022-04-08 11:07:00,2022-04-08 11:10:00,54.0,2,2.0,0.000334,194,2206


In [None]:
# Features that must appear in every generated profile, grouped by section.
fixed_features = {"demographics": ["cntry", "gndr"]}
# create_random_profile returns (flattened_profile, response_feature); unpack it so that
# `respondent_profile` is the profile dict itself rather than a tuple.
respondent_profile, response_feature = create_random_profile(survey_mappings, max_features=3, fixed_features=fixed_features)

In [None]:
# NOTE(review): create_random_profile returns a (profile, response_feature) tuple, so
# this call only works when `respondent_profile` holds the profile dict itself.
respondent_profile.keys()

dict_keys(['demographics', 'well_being_emot_attachment'])

In [None]:
# Build profiles for the first five respondents. The helper defined in this notebook is
# generate_flattened_profiles, which also needs the name of the respondent-id column.
generate_flattened_profiles(ess[0:5], 'idno', survey_mappings, max_features=3, fixed_features=fixed_features)

Unnamed: 0_level_0,demographics_cntry,demographics_gndr,human_values_ipfrule,political_opinions_hmsacld,political_opinions_euftf,political_opinions_freehms,activism_bctprd,activism_pstplonl,relationship_parents_and_at_work_manwrkpl,relationship_parents_and_at_work_mancom,...,understanding_democracy_stpldmc,understanding_democracy_wpestop,discriminated_by_char_lang_country_origin_dscrdsb,well_being_emot_attachment_aesfdrk,well_being_emot_attachment_atcherp,well_being_emot_attachment_sclmeet,human_values_ipadvnt,human_values_ipbhprp,party_affiliation_prtclecz,voting_behavior_prtvbhr
idno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10038,BE,2,4.0,,,,,,,,...,,,,,,,,,,
10053,BE,2,,3.0,5.0,1.0,2.0,2.0,66.0,66.0,...,,,,,,,,,,
10055,BE,1,,,,,,,,,...,,,,,,,,,,
10062,BE,1,,,,,,,,,...,,,,,,,,,,
10064,BE,1,,,,,,,,,...,66.0,3.0,0.0,1.0,9.0,6.0,5.0,1.0,,


## Next Steps

In [None]:
!pip install -q -U google-generativeai

In [None]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

# Read the key stored under 'GOOGLE_API_KEY' in Colab's userdata and pass it to the SDK.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Create the GenerativeModel handle for the 'gemini-1.5-flash' model.
model = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
# Instruction text for the model.
# NOTE(review): it refers to "the profile below", but no profile text is appended to it
# in the visible cells — presumably a generated text profile should follow; confirm.
prompt = "Create a short biographic description for the profile below."

In [None]:
# generate_content needs the content to send; pass the prompt defined above.
model.generate_content(prompt)

In [None]:
#@title Dealing with Bad Mappings

def find_numeric_to_numeric_mappings(survey_mappings: dict) -> dict:
    """
    Identifies mappings with numeric-to-numeric key-value pairs in the nested survey_mappings dictionary.
    Returns a dictionary with the section and feature names for each problematic mapping.

    A pair counts as numeric-to-numeric when the key is a string of digits and the value is
    either an int or a string of digits. Values of any other type (None, float, list, ...)
    are never flagged.

    Parameters:
    -----------
    survey_mappings : dict
        The nested dictionary containing sections and features of survey mappings.

    Returns:
    --------
    dict
        A dictionary where each key is a section name and each value is a list of features that contain
        at least one numeric-to-numeric pair in their "values" field. Sections without such
        features are omitted.
    """

    def _is_numeric(value) -> bool:
        # Check the type before calling .isdigit(): ints (and None, floats, ...) have no
        # such method, so testing `value.isdigit()` first raises AttributeError.
        if isinstance(value, int):
            return True
        return isinstance(value, str) and value.isdigit()

    problematic_mappings = {}

    for section, features in survey_mappings.items():
        for feature, feature_data in features.items():
            values = feature_data["values"] if "values" in feature_data else None
            if not isinstance(values, dict):
                continue

            # One offending pair is enough to flag the feature.
            if any(str(key).isdigit() and _is_numeric(value) for key, value in values.items()):
                problematic_mappings.setdefault(section, []).append(feature)

    return problematic_mappings


In [None]:
# Collect the features flagged by find_numeric_to_numeric_mappings (grouped by section)
# and display the result.
problematic_mappings = find_numeric_to_numeric_mappings(survey_mappings)
problematic_mappings

{'relationship_parents_and_at_work': ['stfmjob',
  'mansupp',
  'teamfeel',
  'wrkextra'],
 'political_opinions': ['lrscale',
  'stflife',
  'stfeco',
  'stfgov',
  'stfdem',
  'stfedu',
  'stfhlth',
  'euftf',
  'imbgeco',
  'imueclt',
  'imwbcnt'],
 'internet_use_social_trust': ['ppltrst',
  'pplfair',
  'pplhlp',
  'trstprl',
  'trstlgl',
  'trstplc',
  'trstplt',
  'trstprt',
  'trstep',
  'trstun',
  'trstsci'],
 'well_being_emot_attachment': ['happy', 'inprdsc', 'atchctr', 'atcherp'],
 'religion': ['rlgrl'],
 'climate_change_eu': ['ccrdprs', 'testic34', 'testic35', 'testic36'],
 'understanding_democracy': ['fairelc',
  'dfprtal',
  'medcrgv',
  'rghmgpr',
  'votedir',
  'cttresa',
  'gptpel',
  'gvctzpv',
  'grdfinc',
  'viepol',
  'wpestop',
  'keydec',
  'fairelcc',
  'dfprtalc',
  'medcrgvc',
  'rghmgprc',
  'votedirc',
  'cttresac',
  'gptpelcc',
  'gvctzpvc',
  'grdfincc',
  'viepolc',
  'wpestopc',
  'keydecc',
  'chpldmi',
  'chpldmc',
  'stpldmi',
  'stpldmc',
  'accalaw'

In [None]:
# NOTE(review): scratch expression that nothing else in the notebook uses; presumably a
# kernel sanity check — safe to delete.
1+1

2