In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
from typing import List, Dict, Tuple, Any

In [2]:
!git clone https://github.com/antndlcrx/LLM-for-Social-Science-Research.git

Cloning into 'LLM-for-Social-Science-Research'...
remote: Enumerating objects: 126, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 126 (delta 65), reused 56 (delta 21), pack-reused 0 (from 0)[K
Receiving objects: 100% (126/126), 10.68 MiB | 5.28 MiB/s, done.
Resolving deltas: 100% (65/65), done.


In [3]:
directory = 'LLM-for-Social-Science-Research/mappings'

survey_mappings = {}

for filename in os.listdir(directory):
    if filename.endswith('.json'):
        section_name = os.path.splitext(filename)[0]

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            survey_mappings[section_name] = json.load(file)


ess = pd.read_csv('LLM-for-Social-Science-Research/ESS10.csv')

  ess = pd.read_csv('LLM-for-Social-Science-Research/ESS10.csv')


In [98]:
survey_mappings['voting_behavior']['prtvtebe'].items()

dict_items([('description', 'Party voted for in last national election, Belgium'), ('question', 'Which party did you vote for in that election? (Belgium)'), ('values', {'1': 'Groen!', '2': 'CD&V', '3': 'N-VA', '4': 'Sp.a (Vooruit)', '5': 'PVDA', '6': 'Vlaams Belang', '7': 'Open VLD', '8': 'cdH (Les Engagés)', '9': 'Ecolo', '10': 'MR', '11': 'PS', '12': 'PTB', '13': 'DéFI', '14': 'Blank', '15': 'Invalid', '16': 'Other', '66': 'Not applicable', '77': 'Refusal', '88': "Don't know", '99': 'No answer'})])

In [20]:
ess.head(3)

Unnamed: 0,name,essround,edition,proddate,idno,cntry,dweight,pspwght,pweight,anweight,...,vinwe,inwde,jinws,jinwe,inwtm,mode,domain,prob,stratum,psu
0,ESS10e03_2,10,3.2,02.11.2023,10038,BE,0.88222,0.972276,0.718075,0.698167,...,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,36.0,1,1.0,0.000397,188,2596
1,ESS10e03_2,10,3.2,02.11.2023,10053,BE,1.047643,0.888635,0.718075,0.638107,...,2022-04-08 11:07:00,2022-04-08 11:10:00,2022-04-08 11:07:00,2022-04-08 11:10:00,54.0,2,2.0,0.000334,194,2206
2,ESS10e03_2,10,3.2,02.11.2023,10055,BE,1.087741,0.722811,0.718075,0.519033,...,2022-05-20 11:08:00,2022-05-20 11:10:00,2022-05-20 11:08:00,2022-05-20 11:10:00,77.0,1,2.0,0.000322,198,2114


In [4]:
class SurveyProfileGenerator:
    def __init__(self,
                 data: pd.DataFrame,
                 respondent_id: str,
                 survey_mappings: Dict[str, Dict[str, Any]],
                 max_sections: int = 3,
                 max_features: int = 3,
                 fixed_features: List[str] = None,
                 country_field: str = None,
                 country_specific_variables: Dict[str, Dict[str, Dict[str, str]]] = None):
        """
        Initializes the SurveyProfileGenerator with survey mappings, maximum number of features per section,
        any fixed features that should always be included in the profiles, and optional country-specific variables.

        Parameters:
        - data (pd.DataFrame): The survey dataset.
        - respondent_id (str): The column name for respondent IDs.
        - survey_mappings (dict): Nested dictionary mapping of survey questions.
        - max_sections (int): Maximum number of sections to randomly select.
        - max_features (int): Maximum number of features to randomly select per section.
        - fixed_features (List[str]): List of feature names that are fixed and always included.
        - country_field (str, optional): The column name for country information. If None, country-specific variables are not adjusted.
        - country_specific_variables (dict, optional): Dictionary of country-specific variables. If None, no country-specific variables are used.
        """
        self.data = data
        self.respondent_id = respondent_id
        self.survey_mappings = survey_mappings
        self.max_sections = max_sections
        self.max_features = max_features
        self.fixed_features = fixed_features or []
        self.country_field = country_field
        self.country_specific_variables = country_specific_variables or {}

        # Build a mapping from feature names to their sections
        self.feature_to_section = {
            feature: section
            for section, features in self.survey_mappings.items()
            for feature in features
        }

    def create_random_profile(self, respondent: pd.Series) -> Dict[str, Any]:
        profile = {'respondent_id': respondent[self.respondent_id]}

        # Add fixed features
        for feature in self.fixed_features:
            if feature in respondent:
                profile[feature] = respondent[feature]

        # Get respondent's country if country_field is provided
        if self.country_field and self.country_field in respondent:
            respondent_country = respondent[self.country_field]
        else:
            respondent_country = None

        available_sections = list(self.survey_mappings.keys())

        # Randomly select sections
        num_sections_to_select = min(self.max_sections, len(available_sections))
        random_sections = random.sample(available_sections, num_sections_to_select)

        # Collect selected features
        selected_features = []
        for section in random_sections:
            features_in_section = list(self.survey_mappings[section].keys())

            adjusted_features = []

            for feature in features_in_section:
                is_country_specific = False

                # Adjust features for country-specific variables if country_field is provided
                if self.country_field and self.country_specific_variables:
                    for feature_type, country_vars in self.country_specific_variables.items():
                        if feature in country_vars.values():
                            is_country_specific = True
                            # Only include if it matches the respondent's country
                            if respondent_country and feature == country_vars.get(respondent_country):
                                adjusted_features.append(feature)
                            break  # No need to check other feature types

                if not is_country_specific:
                    adjusted_features.append(feature)

            num_features_to_select = min(self.max_features, len(adjusted_features))
            if num_features_to_select > 0:
                selected_in_section = random.sample(adjusted_features, num_features_to_select)
                selected_features.extend(selected_in_section)

        # Remove any fixed features from selected features
        selected_features = [f for f in selected_features if f not in self.fixed_features]

        if not selected_features:
            # If no features are left after removing fixed features
            return profile

        # Select one feature as the response feature
        response_feature = random.choice(selected_features)
        selected_features.remove(response_feature)

        # Add predictor features
        for feature in selected_features:
            if feature in respondent:
                profile[feature] = respondent[feature]

        # Add the response feature
        if response_feature in respondent:
            profile['response_feature'] = respondent[response_feature]
            profile['response_feature_name'] = response_feature

        return profile

    def generate_profiles(self, num_profiles_per_respondent: int) -> List[Dict[str, Any]]:
        profiles = []
        for _, respondent in self.data.iterrows():
            for _ in range(num_profiles_per_respondent):
                profile = self.create_random_profile(respondent)
                profiles.append(profile)
        return profiles

    def profile_to_text(self, profile: Dict[str, Any]) -> Tuple[str, str, str]:
        lines = []

        # Extract the response feature name and value
        response_feature_name = profile.get('response_feature_name')
        response_feature_value = profile.get('response_feature')

        # Iterate over predictor features
        for feature, value in profile.items():
            if feature in ['respondent_id', 'response_feature', 'response_feature_name']:
                continue  # Skip non-feature fields

            if pd.isnull(value):
                continue  # Skip features with NaN values

            section = self.feature_to_section.get(feature)
            if not section:
                continue  # Skip if section is not found

            feature_mapping = self.survey_mappings.get(section, {}).get(feature)
            if not feature_mapping:
                continue  # Skip if feature mapping is not found

            description = feature_mapping.get('description', feature)
            values_mapping = feature_mapping.get('values', {})

            # Convert value to match the keys in values_mapping
            if isinstance(value, float) and value.is_integer():
                value_key = str(int(value))
            elif isinstance(value, (int, np.integer)):
                value_key = str(value)
            else:
                value_key = str(value)

            value_text = values_mapping.get(value_key, str(value))

            lines.append(f"{description}: {value_text}")

        # Get the question and response for the response feature
        if response_feature_name and response_feature_value is not None:
            section = self.feature_to_section.get(response_feature_name)
            if section:
                feature_mapping = self.survey_mappings.get(section, {}).get(response_feature_name)
                if feature_mapping:
                    question = feature_mapping.get('question', f"Please answer the following question about {response_feature_name}:")
                    values_mapping = feature_mapping.get('values', {})

                    # Convert response_feature_value to match the keys
                    if isinstance(response_feature_value, float) and response_feature_value.is_integer():
                        response_value_key = str(int(response_feature_value))
                    elif isinstance(response_feature_value, (int, np.integer)):
                        response_value_key = str(response_feature_value)
                    else:
                        response_value_key = str(response_feature_value)

                    response_text = values_mapping.get(response_value_key, str(response_feature_value))
                else:
                    question = f"Please answer the following question about {response_feature_name}:"
                    response_text = str(response_feature_value)
            else:
                question = f"Please answer the following question about {response_feature_name}:"
                response_text = str(response_feature_value)
        else:
            # If no response feature is available
            response_text = ""
            question = ""

        preamble = '\n'.join(lines)
        return preamble, question, response_text


In [5]:
#@title add country specific var dictionary for ess
country_specific_variables = {
            'religion_present': {
                'CH': 'rlgdnach',
                'FI': 'rlgdnafi',
                'GR': 'rlgdnagr',
                'HU': 'rlgdnhu',
                'IS': 'rlgdnais',
                'IE': 'rlgdnie',
                'LT': 'rlgdnlt',
                'ME': 'rlgdme',
                'NL': 'rlgdnanl',
                'MK': 'rlgdnmk',
                'NO': 'rlgdnno',
                'SK': 'rlgdnbsk',
                'GB': 'rlgdngb',
            },
            'religion_past': {
                'CH': 'rlgdeach',
                'FI': 'rlgdeafi',
                'GR': 'rlgdeagr',
                'HU': 'rlgdehu',
                'IS': 'rlgdeais',
                'IE': 'rlgdeie',
                'LT': 'rlgdelt',
                'ME': 'rlgdeme',
                'NL': 'rlgdeanl',
                'MK': 'rlgdemk',
                'NO': 'rlgdeno',
                'SK': 'rlgdebsk',
                'GB': 'rlgdegb',
            },
            'voted': {
                'BE': 'prtvtebe',
                'BG': 'prtvtebg',
                'CH': 'prtvthch',
                'HR': 'prtvbhr',
                'CZ': 'prtvtecz',
                'EE': 'prtvthee',
                'FI': 'prtvtefi',
                'FR': 'prtvfr',
                'GR': 'prtvtdgr',
                'HU': 'prtvtghu',
                'IS': 'prtvic',
                'IE': 'prtvie',
                'IT': 'prtvti',
                'LT': 'prtvlt',
                'ME': 'prtvme',
                'NL': 'prtvthnl',
                'MK': 'prtvtmk',
                'NO': 'prtvtbno',
                'PT': 'prtvtdpt',
                'SI': 'prtvtfsi',
                'SK': 'prtvtesk',
                'GB': 'prtvttdgb',
            },
            'party_affiliation': {
                'BE': 'prtclebe',
                'BG': 'prtclebg',
                'CH': 'prtclhch',
                'HR': 'prtclbhr',
                'CZ': 'prtclecz',
                'EE': 'prtclhee',
                'FI': 'prtclffi',
                'FR': 'prtclffr',
                'GR': 'prtcldgr',
                'HU': 'prtclhhu',
                'IS': 'prtcldis',
                'IE': 'prtclfie',
                'IT': 'prtcleit',
                'LT': 'prtclclt',
                'ME': 'prtclame',
                'NL': 'prtclgnl',
                'MK': 'prtclmk',
                'NO': 'prtclbno',
                'PT': 'prtclfpt',
                'SI': 'prtclfsi',
                'SK': 'prtclesk',
                'GB': 'prtcldgb',
            }
        }

In [6]:
prof_generator = SurveyProfileGenerator(ess[10:15], survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=3, max_features=3,
                                        fixed_features=['cntry', 'gndr', 'essround'], country_field='cntry',
                                        country_specific_variables=country_specific_variables)

In [10]:
profiles = prof_generator.generate_profiles(3)

In [8]:
profiles[1]

{'respondent_id': 10112,
 'cntry': 'BE',
 'gndr': 1,
 'essround': 10,
 'uemp3m': 2,
 'wkhct': 36,
 'pplfair': 7,
 'psppipla': 2,
 'cptppola': 1,
 'colspeak': 66,
 'closepnt': 6,
 'scrnpnt': 66,
 'response_feature': 1,
 'response_feature_name': 'hincsrca'}

In [11]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: {preambule}. \n\nQuestion: {question} \n\nResponse: {response}")
    print("-" * 40)

Profile: Country: Belgium
Gender: Male
Year: 2020
Trust in country's parliament: 5
Confident in own ability to participate in politics: Not at all confident
Voted last national election: Yes
Party voted for in last national election, Belgium: PVDA
How close to party: Quite close
Feel closer to a particular party than all other parties: Yes
Which party feel closer to, Belgium: PVDA. 

Question: How much would you say the political system in [country] allows people like you to have a say in what the government does? 

Response: Very little
----------------------------------------
Profile: Country: Belgium
Gender: Male
Year: 2020
Important to try new and different things in life: Not like me
Important to care for nature and environment: Very much like me
Important to think new ideas and being creative: Like me
How often attend religious services apart from special occasions: More than once a week
Religion or denomination belonging to at present: Islam
Worn or displayed campaign badge/stic

## Next Steps

In [9]:
!pip install -q -U google-generativeai

In [10]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [17]:
model = genai.GenerativeModel('gemini-1.5-flash')

In [12]:
def create_prompts(profiles: Dict[str, Any]):
    """
    """

    prompt_main = "Edit the text below to create a dialog. First part should prompt a model to take on a given personality from features given in the profile. Then, this model with the personality should be asked a question stated in the prompt. Then, the model should answer the qestion with the probided response."

    prompts = []
    for profile in profiles:
        preamble, question, response = prof_generator.profile_to_text(profile)
        prompt = f"{prompt_main}\n<<<Profile: {preamble}>>>. \n<<<Question: {question}>>>. \n<<<Response: {response}>>>"
        prompts.append(prompt)
    return prompts

In [13]:
len(profiles)

5

In [14]:
prompts = create_prompts(profiles)

In [18]:
outputs = [model.generate_content(prompt) for prompt in prompts]
text_outputs = [output.text for output in outputs]

In [19]:
for output in text_outputs:
    print(output)
    print("-" * 40)

**Interviewer:**  Okay, let's imagine we're talking to a 2020 Belgian man, Muslim, who spends 210 minutes a day online, has low trust in political parties (a 2 out of 10), and believes most people are mostly looking out for themselves (also a 2 out of 10). He doesn't work with colleagues in person or remotely.  Assume this persona and answer the following question.

**Belgian Man (2020):**  (Adjusts imaginary phone to his ear)  Okay, I'm ready.  Ask away.

**Interviewer:** Have you ever considered yourself as belonging to any particular religion or denomination?

**Belgian Man (2020):** Not applicable.

----------------------------------------
**Interviewer:**  Okay, let's imagine we're interviewing a Belgian male, working in financial services in 2020. He's a former trade union member, values proper behavior and loyalty to close friends, but isn't particularly concerned with a strong government.  Now, embodying that persona, how likely would your colleagues be to give you work-related

In [108]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: {preambule}. \n\nQuestion: {question} \n\nResponse: {response}")
    print("-" * 40)

Profile: Country: Belgium
Gender: Male
Important to try new and different things in life: Not like me
Important to understand different people: Somewhat like me
Important to be humble and modest, not draw attention: Don't know
In country key decisions are made by national governments rather than the European Union: 4
In country citizens have the final say on political issues by voting directly in referendums: Refusal
Worn or displayed campaign badge/sticker last 12 months: Yes
Donated to or participated in political party or pressure group last 12 months: No
Volunteered for not-for-profit or charitable organisation: Yes. 

Question: How much do you think the views of ordinary people in [country] prevail over the views of the political elite? 

Response: 6
----------------------------------------
Profile: Country: Belgium
Gender: Male
Main source of household income: Wages or salaries
Total hours normally worked per week in main job, overtime included: 42
Employment relation: Employee
V

In [None]:
profiles[1]

{'respondent_id': 10053,
 'cntry': 'BE',
 'gndr': 2,
 'bctprd': 2,
 'pbldmna': 2,
 'pstplonl': 2,
 'lrscale': 5,
 'euftf': 5,
 'wkhct': 40,
 'uemp3m': 2,
 'hinctnta': 88,
 'response_feature': 5,
 'response_feature_name': 'stfeco'}

In [None]:
#@title Dealing with bad Mappings

def find_numeric_to_numeric_mappings(survey_mappings: dict) -> dict:
    """
    Identifies mappings with numeric-to-numeric key-value pairs in the nested survey_mappings dictionary.
    Returns a dictionary with the section and feature names for each problematic mapping.

    Parameters:
    -----------
    survey_mappings : dict
        The nested dictionary containing sections and features of survey mappings.

    Returns:
    --------
    dict
        A dictionary where each key is a section name and each value is a list of features that contain
        numeric-to-numeric mappings in their "values" field.
    """

    problematic_mappings = {}

    for section, features in survey_mappings.items():
        for feature, feature_data in features.items():
            # Check if "values" key exists in feature data
            if "values" in feature_data:
                # Flag to indicate if this feature has numeric-to-numeric mappings
                has_numeric_to_numeric = False

                for key, value in feature_data["values"].items():
                    # Check if both key and value are numeric (integer-like or string numeric)
                    if key.isdigit() and (value.isdigit() or isinstance(value, int)):
                        has_numeric_to_numeric = True
                        break

                if has_numeric_to_numeric:
                    if section not in problematic_mappings:
                        problematic_mappings[section] = []
                    problematic_mappings[section].append(feature)

    return problematic_mappings


In [None]:
problematic_mappings = find_numeric_to_numeric_mappings(survey_mappings)
problematic_mappings

{'relationship_parents_and_at_work': ['stfmjob',
  'mansupp',
  'teamfeel',
  'wrkextra'],
 'political_opinions': ['lrscale',
  'stflife',
  'stfeco',
  'stfgov',
  'stfdem',
  'stfedu',
  'stfhlth',
  'euftf',
  'imbgeco',
  'imueclt',
  'imwbcnt'],
 'internet_use_social_trust': ['ppltrst',
  'pplfair',
  'pplhlp',
  'trstprl',
  'trstlgl',
  'trstplc',
  'trstplt',
  'trstprt',
  'trstep',
  'trstun',
  'trstsci'],
 'well_being_emot_attachment': ['happy', 'inprdsc', 'atchctr', 'atcherp'],
 'religion': ['rlgrl'],
 'climate_change_eu': ['ccrdprs', 'testic34', 'testic35', 'testic36'],
 'understanding_democracy': ['fairelc',
  'dfprtal',
  'medcrgv',
  'rghmgpr',
  'votedir',
  'cttresa',
  'gptpel',
  'gvctzpv',
  'grdfinc',
  'viepol',
  'wpestop',
  'keydec',
  'fairelcc',
  'dfprtalc',
  'medcrgvc',
  'rghmgprc',
  'votedirc',
  'cttresac',
  'gptpelcc',
  'gvctzpvc',
  'grdfincc',
  'viepolc',
  'wpestopc',
  'keydecc',
  'chpldmi',
  'chpldmc',
  'stpldmi',
  'stpldmc',
  'accalaw'

In [16]:
ess.iloc[:, 0:15].head(2)

Unnamed: 0,name,essround,edition,proddate,idno,cntry,dweight,pspwght,pweight,anweight,nwspol,netusoft,netustm,ppltrst,pplfair
0,ESS10e03_2,10,3.2,02.11.2023,10038,BE,0.88222,0.972276,0.718075,0.698167,30,5,8,6,7
1,ESS10e03_2,10,3.2,02.11.2023,10053,BE,1.047643,0.888635,0.718075,0.638107,10,5,240,3,4
