# Set Up

In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
from typing import List, Dict, Tuple, Any

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
!git clone https://github.com/antndlcrx/LLM-for-Social-Science-Research.git

Cloning into 'LLM-for-Social-Science-Research'...
remote: Enumerating objects: 339, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 339 (delta 52), reused 74 (delta 33), pack-reused 239 (from 1)[K
Receiving objects: 100% (339/339), 16.64 MiB | 9.30 MiB/s, done.
Resolving deltas: 100% (193/193), done.


In [21]:
class SurveyProfileGenerator:
    def __init__(self,
                 data: pd.DataFrame,
                 respondent_id: str,
                 survey_mappings: Dict[str, Dict[str, Any]],
                 max_sections: int = 3,
                 max_features: int = 3,
                 fixed_features: List[str] = None,
                 country_field: str = None,
                 country_specific_variables: Dict[str, Dict[str, Dict[str, str]]] = None,
                 random_state: int = None):
        """
        Initializes the SurveyProfileGenerator with survey mappings, maximum number of features per section,
        any fixed features that should always be included in the profiles, and optional country-specific variables.

        Parameters:
        - data (pd.DataFrame): The survey dataset.
        - respondent_id (str): The column name for respondent IDs.
        - survey_mappings (dict): Nested dictionary mapping of survey questions.
        - max_sections (int): Maximum number of sections to randomly select.
        - max_features (int): Maximum number of features to randomly select per section.
        - fixed_features (List[str], optional): List of feature names that are fixed and always included.
        - country_field (str, optional): The column name for country information. If None, country-specific variables are not adjusted.
        - country_specific_variables (dict, optional): Dictionary of country-specific variables. If None, no country-specific variables are used.
        - random_state (int, optional): Seed for random number generators to ensure reproducibility.
        """
        self.data = data
        self.respondent_id = respondent_id
        self.survey_mappings = survey_mappings
        self.max_sections = max_sections
        self.max_features = max_features
        self.fixed_features = fixed_features or []
        self.country_field = country_field
        self.country_specific_variables = country_specific_variables or {}

        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)

        # Build a mapping from feature names to their sections
        self.feature_to_section = {}
        for section, features in self.survey_mappings.items():
            for feature in features:
                self.feature_to_section[feature] = section

        # Build a mapping from generic feature names to actual variable names
        self.generic_to_actual_features = {}
        for generic_feature, country_vars in self.country_specific_variables.items():
            for country_code, actual_feature in country_vars.items():
                self.generic_to_actual_features[actual_feature] = generic_feature

        # Precompute adjusted features if country_field is not used
        if not self.country_field or not self.country_specific_variables:
            self.adjusted_features_cache = {
                section: list(features.keys())
                for section, features in self.survey_mappings.items()
            }
        else:
            self.adjusted_features_cache = None
            # Build reverse mapping from feature to countries
            self.feature_to_countries = {}
            for feature_type, country_vars in self.country_specific_variables.items():
                for country, feature in country_vars.items():
                    self.feature_to_countries.setdefault(feature, set()).add(country)

    def select_random_sections(self, available_sections: List[str]) -> List[str]:
        num_sections_to_select = min(self.max_sections, len(available_sections))
        return random.sample(available_sections, num_sections_to_select)

    def adjust_features_for_country(self, features: List[str], respondent_country: Any) -> List[str]:
        adjusted_features = []
        for feature in features:
            countries = self.feature_to_countries.get(feature)
            if countries:
                if respondent_country in countries:
                    adjusted_features.append(feature)
            else:
                adjusted_features.append(feature)
        return adjusted_features

    def select_features_in_sections(self, sections: List[str], respondent_country: Any) -> List[str]:
        selected_features = []
        for section in sections:
            if self.adjusted_features_cache:
                adjusted_features = self.adjusted_features_cache[section]
            else:
                features_in_section = list(self.survey_mappings[section].keys())
                adjusted_features = self.adjust_features_for_country(features_in_section, respondent_country)

            num_features_to_select = min(self.max_features, len(adjusted_features))
            if num_features_to_select > 0:
                selected_in_section = random.sample(adjusted_features, num_features_to_select)
                selected_features.extend(selected_in_section)
        return selected_features

    def filter_valid_features(self, features: List[str], respondent: pd.Series) -> List[str]:
        valid_features = []
        for feature in features:
            if feature in respondent:
                value = respondent[feature]
                if pd.isnull(value):
                    continue  # Skip features with NaN values

                # Map value to text
                section = self.feature_to_section.get(feature)
                if not section:
                    continue  # Skip if section is not found
                feature_mapping = self.survey_mappings.get(section, {}).get(feature)
                if not feature_mapping:
                    continue  # Skip if feature mapping is not found
                values_mapping = feature_mapping.get('values', {})

                # Convert value to match the keys in values_mapping
                value_key = str(int(value)) if isinstance(value, (int, float)) and not pd.isnull(value) else str(value)

                value_text = values_mapping.get(value_key, str(value))
                # Skip missing values
                skip_values = {"not applicable", "not asked", "Not asked  in this country", "-3", "-4"}

                if value_text.strip().lower() not in skip_values:
                    valid_features.append(feature)
        return valid_features

    def create_random_profile(self, respondent: pd.Series) -> Dict[str, Any]:
        profile = {'respondent_id': respondent[self.respondent_id]}

         # Get respondent's country if country_field is provided
        respondent_country = respondent.get(self.country_field) if self.country_field else None

        # Add fixed features
        for feature in self.fixed_features:
            # Check if feature is a country-specific variable
            if feature in self.country_specific_variables:
                # Get the country-specific variable name for this respondent
                country_vars = self.country_specific_variables[feature]
                actual_feature = country_vars.get(respondent_country)
                if actual_feature and actual_feature in respondent:
                    # Store using the actual variable name
                    profile[actual_feature] = respondent[actual_feature]
            else:
                # Regular feature
                if feature in respondent:
                    profile[feature] = respondent[feature]

        # Get respondent's country if country_field is provided
        respondent_country = respondent.get(self.country_field) if self.country_field else None

        available_sections = list(self.survey_mappings.keys())

        # Select random sections
        random_sections = self.select_random_sections(available_sections)

        # Select features within these sections
        selected_features = self.select_features_in_sections(random_sections, respondent_country)

        # Remove any fixed features from selected features
        selected_features = [f for f in selected_features if f not in self.fixed_features]

        if not selected_features:
            # If no features are left after removing fixed features
            return profile

        # Filter out features where the respondent's value is "Not applicable"
        filtered_features = self.filter_valid_features(selected_features, respondent)

        if not filtered_features:
            # If no features are left after filtering
            return profile

        # Decide whether to pick response feature from filtered_features or from all possible features
        if random.random() < 0.5:
            # Option 1: Select response feature from filtered_features
            response_feature_pool = set(filtered_features)
        else:
            # Option 2: Select response feature from all available features
            if self.adjusted_features_cache:
                all_features = set()
                for features in self.adjusted_features_cache.values():
                    all_features.update(features)
            else:
                all_features = set()
                for section in self.survey_mappings:
                    features_in_section = list(self.survey_mappings[section].keys())
                    adjusted_features = self.adjust_features_for_country(features_in_section, respondent_country)
                    all_features.update(adjusted_features)

            # Exclude features already in filtered_features and fixed_features
            response_feature_pool = all_features - set(filtered_features) - set(self.fixed_features)

            # Filter out invalid features
            response_feature_pool = self.filter_valid_features(response_feature_pool, respondent)

            if not response_feature_pool:
                # If no valid response features are available, fallback to filtered_features
                response_feature_pool = set(filtered_features)

        if not response_feature_pool:
            # If no valid response features are available, return profile
            return profile

        # Select the response feature
        response_feature = random.choice(list(response_feature_pool))

        # Remove the response feature from filtered_features if it's there
        filtered_features = [f for f in filtered_features if f != response_feature]

        # Add predictor features
        for feature in filtered_features:
            if feature in respondent:
                profile[feature] = respondent[feature]

        # Add the response feature
        if response_feature in respondent:
            profile['response_feature'] = respondent[response_feature]
            profile['response_feature_name'] = response_feature

        return profile

    def generate_profiles(self, num_profiles_per_respondent: int) -> List[Dict[str, Any]]:
        profiles = []
        seen_profiles = set()
        for _, respondent in self.data.iterrows():
            attempts = 0
            profiles_generated = 0
            max_attempts = num_profiles_per_respondent * 10  # Adjust as needed
            while profiles_generated < num_profiles_per_respondent and attempts < max_attempts:
                profile = self.create_random_profile(respondent)
                attempts += 1
                # Ensure the profile has a response feature and the response is not NaN
                if "response_feature" in profile and not pd.isnull(profile["response_feature"]):
                    # Create a signature for the profile
                    feature_items = [
                        (feature, profile[feature])
                        for feature in profile
                        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
                    ]
                    feature_items.sort()
                    profile_signature = tuple(feature_items)
                    if profile_signature not in seen_profiles:
                        seen_profiles.add(profile_signature)
                        profiles.append(profile)
                        profiles_generated += 1
                    else:
                        print(f"Duplicate profile encountered for respondent {respondent[self.respondent_id]}.")
        return profiles

    def profile_to_text(self, profile: Dict[str, Any]) -> Tuple[str, str, str]:
        lines = []

        # Extract the response feature name and value
        response_feature_name = profile.get('response_feature_name')
        response_feature_value = profile.get('response_feature')

        # Iterate over predictor features
        for feature, value in profile.items():
            if feature in ['respondent_id', 'response_feature', 'response_feature_name']:
                continue  # Skip non-feature fields

            if pd.isnull(value):
                continue  # Skip features with NaN values

            description, mapped_value = self.map_value(feature, value)
            lines.append(f"{description}: {mapped_value}")

        # Get the question and response for the response feature
        if response_feature_name and response_feature_value is not None:
            description, mapped_value = self.map_value(response_feature_name, response_feature_value)

            # Retrieve the question from the mappings
            section = self.feature_to_section.get(response_feature_name)
            feature_mapping = self.survey_mappings.get(section, {}).get(response_feature_name, {})
            question = feature_mapping.get('question', f"Please answer the following question about {description}:")
            response_text = mapped_value
        else:
            # If no response feature is available
            question = ""
            response_text = ""

        preamble = '\n'.join(lines)
        return preamble, question, response_text

    def map_value(self, feature_name: str, value) -> Tuple[str, str]:
        """
        Maps a single feature's value to its description and textual value.

        Returns:
        - Tuple[str, str]: (description, mapped_value)
        """
        # Use the actual feature name to find the section and mapping
        section = self.feature_to_section.get(feature_name)
        if not section:
            return feature_name, str(value)  # Feature not found in mappings

        feature_mapping = self.survey_mappings[section].get(feature_name)
        if not feature_mapping:
            return feature_name, str(value)  # Feature mapping not found

        # Get the generic feature name for the description
        generic_feature_name = self.generic_to_actual_features.get(feature_name, feature_name)
        description = feature_mapping.get('description', generic_feature_name)
        values_mapping = feature_mapping.get('values', {})

        if pd.isnull(value):
            return description, "Missing"

        # Convert value to string key and strip whitespace
        if isinstance(value, float) and value.is_integer():
            value_key = str(int(value))
        elif isinstance(value, (int, np.integer)):
            value_key = str(value)
        else:
            value_key = str(value).strip()

        # Try to get the mapped text
        mapped_text = values_mapping.get(value_key)
        if mapped_text is None:
            mapped_text = str(value)  # Use the original value

        return description, mapped_text


# ESS

In [None]:
directory = 'LLM-for-Social-Science-Research/mappings/2020/'

survey_mappings = {}

for filename in os.listdir(directory):
    if filename.endswith('.json'):
        section_name = os.path.splitext(filename)[0]

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            survey_mappings[section_name] = json.load(file)


ess = pd.read_csv('LLM-for-Social-Science-Research/ESS10.csv')

  ess = pd.read_csv('LLM-for-Social-Science-Research/ESS10.csv')


In [None]:
ess['cntry'].nunique()

22

In [None]:
#@title add country specific var dictionary for ess
country_specific_variables_2020 = {
            'religion_present': {
                'CH': 'rlgdnach',
                'FI': 'rlgdnafi',
                'GR': 'rlgdnagr',
                'HU': 'rlgdnhu',
                'IS': 'rlgdnais',
                'IE': 'rlgdnie',
                'LT': 'rlgdnlt',
                'ME': 'rlgdme',
                'NL': 'rlgdnanl',
                'MK': 'rlgdnmk',
                'NO': 'rlgdnno',
                'SK': 'rlgdnbsk',
                'GB': 'rlgdngb',
            },
            'religion_past': {
                'CH': 'rlgdeach',
                'FI': 'rlgdeafi',
                'GR': 'rlgdeagr',
                'HU': 'rlgdehu',
                'IS': 'rlgdeais',
                'IE': 'rlgdeie',
                'LT': 'rlgdelt',
                'ME': 'rlgdeme',
                'NL': 'rlgdeanl',
                'MK': 'rlgdemk',
                'NO': 'rlgdeno',
                'SK': 'rlgdebsk',
                'GB': 'rlgdegb',
            },
            'voted': {
                'BE': 'prtvtebe',
                'BG': 'prtvtebg',
                'CH': 'prtvthch',
                'HR': 'prtvbhr',
                'CZ': 'prtvtecz',
                'EE': 'prtvthee',
                'FI': 'prtvtefi',
                'FR': 'prtvfr',
                'GR': 'prtvtdgr',
                'HU': 'prtvtghu',
                'IS': 'prtvic',
                'IE': 'prtvie',
                'IT': 'prtvti',
                'LT': 'prtvlt',
                'ME': 'prtvme',
                'NL': 'prtvthnl',
                'MK': 'prtvtmk',
                'NO': 'prtvtbno',
                'PT': 'prtvtdpt',
                'SI': 'prtvtfsi',
                'SK': 'prtvtesk',
                'GB': 'prtvttdgb',
            },
            'party_affiliation': {
                'BE': 'prtclebe',
                'BG': 'prtclebg',
                'CH': 'prtclhch',
                'HR': 'prtclbhr',
                'CZ': 'prtclecz',
                'EE': 'prtclhee',
                'FI': 'prtclffi',
                'FR': 'prtclffr',
                'GR': 'prtcldgr',
                'HU': 'prtclhhu',
                'IS': 'prtcldis',
                'IE': 'prtclfie',
                'IT': 'prtcleit',
                'LT': 'prtclclt',
                'ME': 'prtclame',
                'NL': 'prtclgnl',
                'MK': 'prtclmk',
                'NO': 'prtclbno',
                'PT': 'prtclfpt',
                'SI': 'prtclfsi',
                'SK': 'prtclesk',
                'GB': 'prtcldgb',
            }
        }



country_specific_variables_2023 = {
    'religion_present': {
        'AT': 'rlgdnbat',
        'CH': 'rlgdnach',
        'DE': 'rlgdnade',
        'FI': 'rlgdnafi',
        'GR': 'rlgdnagr',
        'HU': 'rlgdnhu',
        'IS': 'rlgdnais',
        'IE': 'rlgdnie',
        'LT': 'rlgdnlt',
        'ME': 'rlgdme',
        'NL': 'rlgdnanl',
        'MK': 'rlgdnmk',
        'NO': 'rlgdnno',
        'SK': 'rlgdnask',
        'GB': 'rlgdngb'
    },
    'religion_past': {
        'AT': 'rlgdebat',
        'CH': 'rlgdeach',
        'DE': 'rlgdeade',
        'FI': 'rlgdeafi',
        'GR': 'rlgdeagr',
        'HU': 'rlgdehu',
        'IS': 'rlgdeais',
        'IE': 'rlgdeie',
        'LT': 'rlgdelt',
        'ME': 'rlgdeme',
        'NL': 'rlgdeanl',
        'MK': 'rlgdemk',
        'NO': 'rlgdeno',
        'SK': 'rlgdeask',
        'GB': 'rlgdegb'
    },
    'voted': {
        'AT': 'prtvtdat',
        'CH': 'prtvthch',
        'HR': 'prtvtchr',
        'FI': 'prtvtffi',
        'DE': 'prtvgde1',
        'HU': 'prtvthhu',
        'IE': 'prtvteie',
        'LT': 'prtvclt1',
        'NL': 'prtvtinl',
        'NO': 'prtvtcno',
        'SK': 'prtvtesk',
        'SI': 'prtvtgsi',
        'GB': 'prtvtdgb'
    },
    'party_affiliation': {
        'AT': 'prtcleat',
        'CH': 'prtclhch',
        'HR': 'prtclbhr',
        'FI': 'prtclgfi',
        'DE': 'prtclgde',
        'HU': 'prtclihu',
        'IE': 'prtclfie',
        'LT': 'prtclclt',
        'NL': 'prtclhnl',
        'NO': 'prtclcno',
        'SK': 'prtclesk',
        'SI': 'prtclgsi',
        'GB': 'prtcldgb'
    }
}


## Profiles 2023

In [None]:
random_subset = ess[ess['cntry']=="HR"].sample(n=100, random_state=42)

In [None]:
prof_generator = SurveyProfileGenerator(ess, survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=4, max_features=3,
                                        fixed_features=['cntry', 'gndr', 'agea', 'essround'], country_field='cntry',
                                        country_specific_variables=country_specific_variables_2023,
                                        random_state=42)

In [None]:
profiles = prof_generator.generate_profiles(10)

Duplicate profile encountered for respondent 82598.
Duplicate profile encountered for respondent 81567.
Duplicate profile encountered for respondent 85279.
Duplicate profile encountered for respondent 68286.
Duplicate profile encountered for respondent 54784.
Duplicate profile encountered for respondent 81172.
Duplicate profile encountered for respondent 63917.
Duplicate profile encountered for respondent 68012.
Duplicate profile encountered for respondent 72181.
Duplicate profile encountered for respondent 68287.
Duplicate profile encountered for respondent 81667.
Duplicate profile encountered for respondent 72157.
Duplicate profile encountered for respondent 81180.
Duplicate profile encountered for respondent 82373.
Duplicate profile encountered for respondent 74884.
Duplicate profile encountered for respondent 79089.
Duplicate profile encountered for respondent 83653.
Duplicate profile encountered for respondent 69572.
Duplicate profile encountered for respondent 83285.


In [None]:
len(profiles)

221900

In [None]:
profile_signatures = set()
for profile in profiles:
    feature_items = [
        (feature, profile[feature])
        for feature in profile
        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
    ]
    feature_items.sort()
    profile_signature = tuple(feature_items)
    assert profile_signature not in profile_signatures, "Duplicate profile found!"
    profile_signatures.add(profile_signature)

In [None]:
for profile in profiles[200000:200010]:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}")
    print("-" * 40)

Profile: 
Country: Slovenia
Gender: Male
Age of respondent, calculated: 62
Year: 2023
Would vote for [country] to remain member of European Union or leave: Remain member of the European Union
To what extent feel personal responsibility to reduce climate change: 5
How close to party: Quite close
How interested in politics: Quite interested
Trust in country's parliament: 5
News about politics and current affairs, watching, reading or listening, in minutes on a typical day: 120. 
Question: Is there a particular political party you feel closer to than all the other parties? 
Response: Yes
----------------------------------------
Profile: 
Country: Slovenia
Gender: Male
Age of respondent, calculated: 62
Year: 2023
Ever been divorced/had civil union dissolved: Yes
Domicile, respondent's description: A big city
Most people can be trusted or you can't be too careful: 5
Confident in own ability to participate in politics: Not at all confident
Able to take active role in political group: Not at 

In [None]:
ids = []
prof_descriptions = []
for profile in profiles:
    id = profile['respondent_id']
    preambule, question, response = prof_generator.profile_to_text(profile)
    prof_text = f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}"

    ids.append(id)
    prof_descriptions.append(prof_text)


In [None]:
pd.DataFrame({'id': ids, 'text': prof_descriptions})['id'].nunique()
#.to_csv('profiles.csv', index=False)

17095

In [None]:
ess['idno'].nunique()

17095

In [None]:
df = pd.DataFrame({'id': ids, 'text': prof_descriptions})
# df.head()
df.to_csv('profiles_2023.csv', index=False)

## Prifiles 2020

In [None]:
prof_generator = SurveyProfileGenerator(ess, survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=4, max_features=3,
                                        fixed_features=['cntry', 'gndr', 'agea', 'essround'], country_field='cntry',
                                        country_specific_variables=country_specific_variables_2020,
                                        random_state=42)

In [None]:
profiles = prof_generator.generate_profiles(10)

In [None]:
len(profiles)

376110

In [None]:
profile_signatures = set()
for profile in profiles:
    feature_items = [
        (feature, profile[feature])
        for feature in profile
        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
    ]
    feature_items.sort()
    profile_signature = tuple(feature_items)
    assert profile_signature not in profile_signatures, "Duplicate profile found!"
    profile_signatures.add(profile_signature)

In [None]:
for profile in profiles[200000:200100]:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}")
    print("-" * 40)

In [None]:
ids = []
prof_descriptions = []
for profile in profiles:
    id = profile['respondent_id']
    preambule, question, response = prof_generator.profile_to_text(profile)
    prof_text = f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}"

    ids.append(id)
    prof_descriptions.append(prof_text)


In [None]:
df = pd.DataFrame({'id': ids, 'text': prof_descriptions})
# print(df.shape)
# df.head()

df.to_csv('profiles_2020.csv', index=False)

# WVS

In [33]:
# WVS
directory = 'LLM-for-Social-Science-Research/mappings/WVS/updated_wvs_mappings.json'

survey_mappings = {}

with open(directory, 'r', encoding='utf-8') as file:
    survey_mappings = json.load(file)

In [7]:
wvs = pd.read_csv("/content/drive/MyDrive/Ox LLMs Model For Social Science/surveys/WVS_2017_22.csv")
wvs.shape

  wvs = pd.read_csv("/content/drive/MyDrive/Ox LLMs Model For Social Science/surveys/WVS_2017_22.csv")


(97220, 613)

In [9]:
wvs['D_INTERVIEW'].nunique()

96221

In [28]:
random_sample = wvs.sample(n=200, random_state=42)

In [34]:
prof_generator = SurveyProfileGenerator(random_sample, survey_mappings=survey_mappings,
                                        respondent_id='D_INTERVIEW', max_sections=4, max_features=3,
                                        fixed_features=['B_COUNTRY', 'Q260', 'Q262', 'A_YEAR'], country_field='B_COUNTRY',
                                        random_state=42)

In [35]:
profiles = prof_generator.generate_profiles(5)

In [36]:
profile_signatures = set()
for profile in profiles:
    feature_items = [
        (feature, profile[feature])
        for feature in profile
        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
    ]
    feature_items.sort()
    profile_signature = tuple(feature_items)
    assert profile_signature not in profile_signatures, "Duplicate profile found!"
    profile_signatures.add(profile_signature)

In [37]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}")
    print("-" * 40)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Confidence: Television: Quite a lot
Where are the headquarters of the International Monetary Fund (IMF) located?: Washington DC
Which of the following problems does the organization Amnesty International deal with?: Human rights
Immigration in your country: Leads to social conflict: Disagree
Immigration in your country: Strengthens cultural diversity: Agree
Immigration in your country: Increases unemployment: Disagree. 
Question: Would you say that having a system governed by religious law in which there are no political parties or elections is a very good, fairly good, fairly bad, or very bad way of governing this country? 
Response: Bad
----------------------------------------
Profile: 
ISO 3166-1 numeric country code: Indonesia
Sex: Female
Age: 30
A_YEAR: 2018
Satisfaction with financial situation of household: 8
Frequency you/family (last 12 month): Gone without a cash income: Never
Frequency you/family (last 12 month

In [43]:
ids = []
prof_descriptions = []
for profile in profiles:
    id = profile['respondent_id']
    preambule, question, response = prof_generator.profile_to_text(profile)
    prof_text = f"Profile: \n\n{preambule}. \n\nQuestion: {question} \n\nResponse: {response}"

    ids.append(id)
    prof_descriptions.append(prof_text)

In [44]:
df = pd.DataFrame({'id': ids, 'text': prof_descriptions})
# print(df.shape)
# df.head()

df.to_csv('profiles_wvs_test.csv', index=False)

In [46]:
df['id'].nunique()

199

# Next Steps

In [None]:
!pip install -q -U google-generativeai

In [None]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
model = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
def create_prompts(profiles: Dict[str, Any]):
    """
    """

    prompt_main = "Edit the text below to create a dialog. First part should prompt a model to take on a given personality from features given in the profile. Then, this model with the personality should be asked a question stated in the prompt. Then, the model should answer the qestion with the probided response."

    prompts = []
    for profile in profiles:
        preamble, question, response = prof_generator.profile_to_text(profile)
        prompt = f"{prompt_main}\n<<<Profile: {preamble}>>>. \n<<<Question: {question}>>>. \n<<<Response: {response}>>>"
        prompts.append(prompt)
    return prompts

In [None]:
len(profiles)

5

In [None]:
prompts = create_prompts(profiles)

In [None]:
outputs = [model.generate_content(prompt) for prompt in prompts]
text_outputs = [output.text for output in outputs]

In [None]:
for output in text_outputs:
    print(output)
    print("-" * 40)

In [None]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: {preambule}. \n\nQuestion: {question} \n\nResponse: {response}")
    print("-" * 40)

In [None]:
profiles[1]

{'respondent_id': 10053,
 'cntry': 'BE',
 'gndr': 2,
 'bctprd': 2,
 'pbldmna': 2,
 'pstplonl': 2,
 'lrscale': 5,
 'euftf': 5,
 'wkhct': 40,
 'uemp3m': 2,
 'hinctnta': 88,
 'response_feature': 5,
 'response_feature_name': 'stfeco'}

In [None]:
#@title Dealing with bad Mappings

def find_numeric_to_numeric_mappings(survey_mappings: dict) -> dict:
    """
    Identifies mappings with numeric-to-numeric key-value pairs in the nested survey_mappings dictionary.
    Returns a dictionary with the section and feature names for each problematic mapping.

    Parameters:
    -----------
    survey_mappings : dict
        The nested dictionary containing sections and features of survey mappings.

    Returns:
    --------
    dict
        A dictionary where each key is a section name and each value is a list of features that contain
        numeric-to-numeric mappings in their "values" field.
    """

    problematic_mappings = {}

    for section, features in survey_mappings.items():
        for feature, feature_data in features.items():
            # Check if "values" key exists in feature data
            if "values" in feature_data:
                # Flag to indicate if this feature has numeric-to-numeric mappings
                has_numeric_to_numeric = False

                for key, value in feature_data["values"].items():
                    # Check if both key and value are numeric (integer-like or string numeric)
                    if key.isdigit() and (value.isdigit() or isinstance(value, int)):
                        has_numeric_to_numeric = True
                        break

                if has_numeric_to_numeric:
                    if section not in problematic_mappings:
                        problematic_mappings[section] = []
                    problematic_mappings[section].append(feature)

    return problematic_mappings


In [None]:
problematic_mappings = find_numeric_to_numeric_mappings(survey_mappings)
problematic_mappings

{'relationship_parents_and_at_work': ['stfmjob',
  'mansupp',
  'teamfeel',
  'wrkextra'],
 'political_opinions': ['lrscale',
  'stflife',
  'stfeco',
  'stfgov',
  'stfdem',
  'stfedu',
  'stfhlth',
  'euftf',
  'imbgeco',
  'imueclt',
  'imwbcnt'],
 'internet_use_social_trust': ['ppltrst',
  'pplfair',
  'pplhlp',
  'trstprl',
  'trstlgl',
  'trstplc',
  'trstplt',
  'trstprt',
  'trstep',
  'trstun',
  'trstsci'],
 'well_being_emot_attachment': ['happy', 'inprdsc', 'atchctr', 'atcherp'],
 'religion': ['rlgrl'],
 'climate_change_eu': ['ccrdprs', 'testic34', 'testic35', 'testic36'],
 'understanding_democracy': ['fairelc',
  'dfprtal',
  'medcrgv',
  'rghmgpr',
  'votedir',
  'cttresa',
  'gptpel',
  'gvctzpv',
  'grdfinc',
  'viepol',
  'wpestop',
  'keydec',
  'fairelcc',
  'dfprtalc',
  'medcrgvc',
  'rghmgprc',
  'votedirc',
  'cttresac',
  'gptpelcc',
  'gvctzpvc',
  'grdfincc',
  'viepolc',
  'wpestopc',
  'keydecc',
  'chpldmi',
  'chpldmc',
  'stpldmi',
  'stpldmc',
  'accalaw'

In [None]:
ess.iloc[:, 0:15].head(2)

Unnamed: 0,name,essround,edition,proddate,idno,cntry,dweight,pspwght,pweight,anweight,nwspol,netusoft,netustm,ppltrst,pplfair
0,ESS10e03_2,10,3.2,02.11.2023,10038,BE,0.88222,0.972276,0.718075,0.698167,30,5,8,6,7
1,ESS10e03_2,10,3.2,02.11.2023,10053,BE,1.047643,0.888635,0.718075,0.638107,10,5,240,3,4
