In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
from typing import List, Dict, Tuple, Any

In [2]:
!git clone https://github.com/antndlcrx/LLM-for-Social-Science-Research.git

Cloning into 'LLM-for-Social-Science-Research'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 76 (delta 31), reused 53 (delta 19), pack-reused 0 (from 0)[K
Receiving objects: 100% (76/76), 76.09 KiB | 10.87 MiB/s, done.
Resolving deltas: 100% (31/31), done.


In [3]:
directory = 'LLM-for-Social-Science-Research/mappings'

survey_mappings = {}

for filename in os.listdir(directory):
    if filename.endswith('.json'):
        section_name = os.path.splitext(filename)[0]

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            survey_mappings[section_name] = json.load(file)


ess = pd.read_csv('ESS10.csv')

  ess = pd.read_csv('ESS10.csv')


In [None]:
survey_mappings.items()

In [157]:
class SurveyProfileGenerator:
    def __init__(self,
                 data: pd.DataFrame,
                 respondent_id: str,
                 survey_mappings: Dict[str, Dict[str, Any]],
                 max_sections: int = 3,
                 max_features: int = 3,
                 fixed_features: List[str] = None):
        """
        Initializes the SurveyProfileGenerator with survey mappings, maximum number of features per section,
        and any fixed features that should always be included in the profiles.

        Parameters:
        - data (pd.DataFrame): The survey dataset.
        - respondent_id (str): The column name for respondent IDs.
        - survey_mappings (dict): Nested dictionary mapping of survey questions.
        - max_sections (int): Maximum number of sections to randomly select.
        - max_features (int): Maximum number of features to randomly select per section.
        - fixed_features (List[str]): List of feature names that are fixed and always included.
        """
        self.data = data
        self.respondent_id = respondent_id
        self.survey_mappings = survey_mappings
        self.max_sections = max_sections
        self.max_features = max_features
        self.fixed_features = fixed_features or []

        # Build a mapping from feature names to their sections
        self.feature_to_section = {
            feature: section
            for section, features in self.survey_mappings.items()
            for feature in features
        }

    def create_random_profile(self, respondent: pd.Series) -> Dict[str, Any]:
        """
        Creates a random profile for a single respondent.

        Parameters:
        - respondent (pd.Series): A row from the DataFrame representing a respondent.

        Returns:
        - profile (dict): A dictionary representing the respondent's profile.
        """
        profile = {'respondent_id': respondent[self.respondent_id]}

        # Add fixed features
        predictor_features = []
        for feature in self.fixed_features:
            if feature in respondent:
                profile[feature] = respondent[feature]
                predictor_features.append(feature)

        available_sections = list(self.survey_mappings.keys())

        # Randomly select sections
        num_sections_to_select = min(self.max_sections, len(available_sections))
        random_sections = random.sample(available_sections, num_sections_to_select)

        # Collect selected features
        selected_features = []
        for section in random_sections:
            features_in_section = list(self.survey_mappings[section].keys())
            num_features_to_select = min(self.max_features, len(features_in_section))
            selected_in_section = random.sample(features_in_section, num_features_to_select)
            selected_features.extend(selected_in_section)

        # Remove any fixed features from selected features
        selected_features = [f for f in selected_features if f not in self.fixed_features]

        if not selected_features:
            # If no features are left after removing fixed features
            return profile

        # Select one feature as the response feature
        response_feature = random.choice(selected_features)
        selected_features.remove(response_feature)

        # Add predictor features
        for feature in selected_features:
            if feature in respondent:
                profile[feature] = respondent[feature]
                predictor_features.append(feature)

        # Add the response feature
        if response_feature in respondent:
            profile['response_feature'] = respondent[response_feature]
            profile['response_feature_name'] = response_feature

        return profile

    def generate_profiles(self, num_profiles_per_respondent: int) -> List[Dict[str, Any]]:
        """
        Generates profiles for all respondents in the dataset.

        Parameters:
        - num_profiles_per_respondent (int): Number of profiles to generate per respondent.

        Returns:
        - profiles (List[dict]): A list of respondent profiles.
        """
        profiles = []
        for _, respondent in self.data.iterrows():
            for _ in range(num_profiles_per_respondent):
                profile = self.create_random_profile(respondent)
                profiles.append(profile)
        return profiles

    def profile_to_text(self, profile: Dict[str, Any]) -> Tuple[str, str]:
        """
        Converts a profile into text form, turning profile features into a text description
        and the response feature into an answer to an interview question.

        Parameters:
        - profile (dict): A respondent's profile.

        Returns:
        - preamble (str): The text description of the respondent.
        - response (str): The answer to the interview question.
        """
        lines = []

        # Extract the response feature name and value
        response_feature_name = profile.get('response_feature_name')
        response_feature_value = profile.get('response_feature')

        # Iterate over predictor features
        for feature, value in profile.items():
            if feature in ['respondent_id', 'response_feature', 'response_feature_name']:
                continue  # Skip non-feature fields

            if pd.isnull(value):
                continue  # Skip features with NaN values

            section = self.feature_to_section.get(feature)
            if not section:
                continue  # Skip if section is not found

            feature_mapping = self.survey_mappings.get(section, {}).get(feature)
            if not feature_mapping:
                continue  # Skip if feature mapping is not found

            description = feature_mapping.get('description', feature)
            values_mapping = feature_mapping.get('values', {})
            value_text = values_mapping.get(str(value), str(value))

            lines.append(f"{description}: {value_text}")

        # Get the question and response for the response feature
        if response_feature_name and response_feature_value is not None:
            section = self.feature_to_section.get(response_feature_name)
            if section:
                feature_mapping = self.survey_mappings.get(section, {}).get(response_feature_name)
                if feature_mapping:
                    question = feature_mapping.get('question', f"Please answer the following question about {response_feature_name}:")
                    values_mapping = feature_mapping.get('values', {})
                    response_text = values_mapping.get(str(response_feature_value), str(response_feature_value))
                else:
                    question = f"Please answer the following question about {response_feature_name}:"
                    response_text = str(response_feature_value)
            else:
                question = f"Please answer the following question about {response_feature_name}:"
                response_text = str(response_feature_value)

        else:
            # If no response feature is available
            response_text = ""

        preamble = '\n'.join(lines)
        return preamble, question, response_text


In [158]:
prof_generator = SurveyProfileGenerator(ess[0:3], survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=3, max_features=3,
                                        fixed_features=['cntry', 'gndr'])

In [183]:
profiles = prof_generator.generate_profiles(1)

In [184]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: {preambule}. \n\nQuestion: {question} \n\nResponse: {response}")
    print("-" * 40)

Profile: Country: Belgium
Gender: Female
Country of birth, mother: Nigeria
Discrimination of respondent's group: religion: Not marked
Discrimination of respondent's group: disability: Not marked
In country government sticks to policies regardless of what most people think: Not applicable
In country the will of the people cannot be stopped: 7
Different political parties offer clear alternatives to one another: 8. 

Question: Which party did you vote for in that election? (Switzerland) 

Response: nan
----------------------------------------
Profile: Country: Belgium
Gender: Female
Boycotted certain products last 12 months: No
Taken part in public demonstration last 12 months: No
Posted or shared anything about politics online last 12 months: No
Placement on left right scale: 5
European Union: European unification go further or gone too far: 5
Total contracted hours per week in main job, overtime excluded: 40
Ever unemployed and seeking work for a period more than three months: No
Househ

## Next Steps

In [13]:
!pip install -q -U google-generativeai

In [14]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [15]:
model = genai.GenerativeModel('gemini-1.5-flash')

In [185]:
def create_prompts(profiles: Dict[str, Any]):
    """
    """

    prompt_main = "Edit the text below to create a dialog. First part should prompt a model to take on a given personality from features given in the profile. Then, this model with the personality should be asked a question stated in the prompt. Then, the model should answer the qestion with the probided response."

    prompts = []
    for profile in profiles:
        preamble, question, response = prof_generator.profile_to_text(profile)
        prompt = f"{prompt_main}\n<<<Profile: {preamble}>>>. \n<<<Question: {question}>>>. \n<<<Response: {response}>>>"
        prompts.append(prompt)
    return prompts

In [186]:
profiles[0]

{'respondent_id': 10038,
 'cntry': 'BE',
 'gndr': 2,
 'mbrncntc': 'NG',
 'dscrrlg': 0,
 'dscrdsb': 0,
 'stpldmc': 66,
 'wpestopc': 7,
 'dfprtal': 8,
 'prtvtghu': nan,
 'prtvtdgr': nan,
 'response_feature': nan,
 'response_feature_name': 'prtvthch'}

In [187]:
prompts = create_prompts(profiles)

In [188]:
outputs = [model.generate_content(prompt) for prompt in prompts]
text_outputs = [output.text for output in outputs]

In [189]:
text_outputs[1]

"**You:** Okay, I'm going to introduce you to someone. Imagine you're a woman living in Belgium. You work full-time, and you're not particularly active in politics. You're pretty neutral on things like European unification. \n\n**Model (as Belgian woman):** Okay, I'm ready. \n\n**You:**  How satisfied are you with the current state of the economy in Belgium? \n\n**Model (as Belgian woman):** Hmm, I'd say I'm about a 5 out of 10 satisfied.  It's not great, but it's not terrible either. \n"

In [191]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: {preambule}. \n\nQuestion: {question} \n\nResponse: {response}")
    print("-" * 40)

Profile: Country: Belgium
Gender: Female
Country of birth, mother: Nigeria
Discrimination of respondent's group: religion: Not marked
Discrimination of respondent's group: disability: Not marked
In country government sticks to policies regardless of what most people think: Not applicable
In country the will of the people cannot be stopped: 7
Different political parties offer clear alternatives to one another: 8. 

Question: Which party did you vote for in that election? (Switzerland) 

Response: nan
----------------------------------------
Profile: Country: Belgium
Gender: Female
Boycotted certain products last 12 months: No
Taken part in public demonstration last 12 months: No
Posted or shared anything about politics online last 12 months: No
Placement on left right scale: 5
European Union: European unification go further or gone too far: 5
Total contracted hours per week in main job, overtime excluded: 40
Ever unemployed and seeking work for a period more than three months: No
Househ

In [190]:
profiles[1]

{'respondent_id': 10053,
 'cntry': 'BE',
 'gndr': 2,
 'bctprd': 2,
 'pbldmna': 2,
 'pstplonl': 2,
 'lrscale': 5,
 'euftf': 5,
 'wkhct': 40,
 'uemp3m': 2,
 'hinctnta': 88,
 'response_feature': 5,
 'response_feature_name': 'stfeco'}

In [None]:
#@title Dealing with bad Mappings

def find_numeric_to_numeric_mappings(survey_mappings: dict) -> dict:
    """
    Identifies mappings with numeric-to-numeric key-value pairs in the nested survey_mappings dictionary.
    Returns a dictionary with the section and feature names for each problematic mapping.

    Parameters:
    -----------
    survey_mappings : dict
        The nested dictionary containing sections and features of survey mappings.

    Returns:
    --------
    dict
        A dictionary where each key is a section name and each value is a list of features that contain
        numeric-to-numeric mappings in their "values" field.
    """

    problematic_mappings = {}

    for section, features in survey_mappings.items():
        for feature, feature_data in features.items():
            # Check if "values" key exists in feature data
            if "values" in feature_data:
                # Flag to indicate if this feature has numeric-to-numeric mappings
                has_numeric_to_numeric = False

                for key, value in feature_data["values"].items():
                    # Check if both key and value are numeric (integer-like or string numeric)
                    if key.isdigit() and (value.isdigit() or isinstance(value, int)):
                        has_numeric_to_numeric = True
                        break

                if has_numeric_to_numeric:
                    if section not in problematic_mappings:
                        problematic_mappings[section] = []
                    problematic_mappings[section].append(feature)

    return problematic_mappings


In [None]:
problematic_mappings = find_numeric_to_numeric_mappings(survey_mappings)
problematic_mappings

{'relationship_parents_and_at_work': ['stfmjob',
  'mansupp',
  'teamfeel',
  'wrkextra'],
 'political_opinions': ['lrscale',
  'stflife',
  'stfeco',
  'stfgov',
  'stfdem',
  'stfedu',
  'stfhlth',
  'euftf',
  'imbgeco',
  'imueclt',
  'imwbcnt'],
 'internet_use_social_trust': ['ppltrst',
  'pplfair',
  'pplhlp',
  'trstprl',
  'trstlgl',
  'trstplc',
  'trstplt',
  'trstprt',
  'trstep',
  'trstun',
  'trstsci'],
 'well_being_emot_attachment': ['happy', 'inprdsc', 'atchctr', 'atcherp'],
 'religion': ['rlgrl'],
 'climate_change_eu': ['ccrdprs', 'testic34', 'testic35', 'testic36'],
 'understanding_democracy': ['fairelc',
  'dfprtal',
  'medcrgv',
  'rghmgpr',
  'votedir',
  'cttresa',
  'gptpel',
  'gvctzpv',
  'grdfinc',
  'viepol',
  'wpestop',
  'keydec',
  'fairelcc',
  'dfprtalc',
  'medcrgvc',
  'rghmgprc',
  'votedirc',
  'cttresac',
  'gptpelcc',
  'gvctzpvc',
  'grdfincc',
  'viepolc',
  'wpestopc',
  'keydecc',
  'chpldmi',
  'chpldmc',
  'stpldmi',
  'stpldmc',
  'accalaw'

In [None]:
1+1

2

In [64]:
ess['rlgdnbsk'].value_counts()

Unnamed: 0_level_0,count
rlgdnbsk,Unnamed: 1_level_1
9.0,703
6666.0,340
1.0,138
2.0,100
4.0,80
9999.0,21
3.0,17
10.0,5
7777.0,4
14.0,4


In [78]:
ess['rlgdeme'].value_counts()

Unnamed: 0_level_0,count
rlgdeme,Unnamed: 1_level_1
6666.0,1261
8.0,6
5.0,6
3.0,3
1.0,1
4.0,1
