# Set Up

In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
from typing import List, Dict, Tuple, Any

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!git clone https://github.com/antndlcrx/LLM-for-Social-Science-Research.git

Cloning into 'LLM-for-Social-Science-Research'...
remote: Enumerating objects: 412, done.[K
remote: Counting objects: 100% (173/173), done.[K
remote: Compressing objects: 100% (136/136), done.[K
remote: Total 412 (delta 103), reused 78 (delta 34), pack-reused 239 (from 1)[K
Receiving objects: 100% (412/412), 17.32 MiB | 12.82 MiB/s, done.
Resolving deltas: 100% (244/244), done.


In [4]:
class SurveyProfileGenerator:
    def __init__(self,
                 data: pd.DataFrame,
                 respondent_id: str,
                 survey_mappings: Dict[str, Dict[str, Any]],
                 max_sections: int = 3,
                 max_features: int = 3,
                 fixed_features: List[str] = None,
                 country_field: str = None,
                 country_specific_variables: Dict[str, Dict[str, Dict[str, str]]] = None,
                 random_state: int = None):
        """
        Initializes the SurveyProfileGenerator with survey mappings, maximum number of features per section,
        any fixed features that should always be included in the profiles, and optional country-specific variables.

        Parameters:
        - data (pd.DataFrame): The survey dataset.
        - respondent_id (str): The column name for respondent IDs.
        - survey_mappings (dict): Nested dictionary mapping of survey questions.
        - max_sections (int): Maximum number of sections to randomly select.
        - max_features (int): Maximum number of features to randomly select per section.
        - fixed_features (List[str], optional): List of feature names that are fixed and always included.
        - country_field (str, optional): The column name for country information. If None, country-specific variables are not adjusted.
        - country_specific_variables (dict, optional): Dictionary of country-specific variables. If None, no country-specific variables are used.
        - random_state (int, optional): Seed for random number generators to ensure reproducibility.
        """
        self.data = data
        self.respondent_id = respondent_id
        self.survey_mappings = survey_mappings
        self.max_sections = max_sections
        self.max_features = max_features
        self.fixed_features = fixed_features or []
        self.country_field = country_field
        self.country_specific_variables = country_specific_variables or {}

        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)

        # Build a mapping from feature names to their sections
        self.feature_to_section = {}
        for section, features in self.survey_mappings.items():
            for feature in features:
                self.feature_to_section[feature] = section

        # Build a mapping from generic feature names to actual variable names
        self.generic_to_actual_features = {}
        for generic_feature, country_vars in self.country_specific_variables.items():
            for country_code, actual_feature in country_vars.items():
                self.generic_to_actual_features[actual_feature] = generic_feature

        # Precompute adjusted features if country_field is not used
        if not self.country_field or not self.country_specific_variables:
            self.adjusted_features_cache = {
                section: list(features.keys())
                for section, features in self.survey_mappings.items()
            }
        else:
            self.adjusted_features_cache = None
            # Build reverse mapping from feature to countries
            self.feature_to_countries = {}
            for feature_type, country_vars in self.country_specific_variables.items():
                for country, feature in country_vars.items():
                    self.feature_to_countries.setdefault(feature, set()).add(country)

    def select_random_sections(self, available_sections: List[str]) -> List[str]:
        num_sections_to_select = min(self.max_sections, len(available_sections))
        return random.sample(available_sections, num_sections_to_select)

    def adjust_features_for_country(self, features: List[str], respondent_country: Any) -> List[str]:
        adjusted_features = []
        for feature in features:
            countries = self.feature_to_countries.get(feature)
            if countries:
                if respondent_country in countries:
                    adjusted_features.append(feature)
            else:
                adjusted_features.append(feature)
        return adjusted_features

    def select_features_in_sections(self, sections: List[str], respondent_country: Any) -> List[str]:
        selected_features = []
        for section in sections:
            if self.adjusted_features_cache:
                adjusted_features = self.adjusted_features_cache[section]
            else:
                features_in_section = list(self.survey_mappings[section].keys())
                adjusted_features = self.adjust_features_for_country(features_in_section, respondent_country)

            num_features_to_select = min(self.max_features, len(adjusted_features))
            if num_features_to_select > 0:
                selected_in_section = random.sample(adjusted_features, num_features_to_select)
                selected_features.extend(selected_in_section)
        return selected_features

    def filter_valid_features(self, features: List[str], respondent: pd.Series) -> List[str]:
        valid_features = []
        for feature in features:
            if feature in respondent:
                value = respondent[feature]
                if pd.isnull(value):
                    continue  # Skip features with NaN values

                # Map value to text
                section = self.feature_to_section.get(feature)
                if not section:
                    continue  # Skip if section is not found
                feature_mapping = self.survey_mappings.get(section, {}).get(feature)
                if not feature_mapping:
                    continue  # Skip if feature mapping is not found
                values_mapping = feature_mapping.get('values', {})

                # Convert value to match the keys in values_mapping
                value_key = str(int(value)) if isinstance(value, (int, float)) and not pd.isnull(value) else str(value)

                value_text = values_mapping.get(value_key, str(value))
                # Skip missing values
                skip_values = {"not applicable", "not asked", "Not asked  in this country",
                               "Not asked in survey", "-3", "-4", "-5", "-specific list of codes in Annex",
                               "List of codes in Annex", "Missing, Not available", "Not asked",
                               "No answer ", "Missing; Not available", "Not asked "}

                if value_text.strip() not in skip_values:
                    valid_features.append(feature)
        return valid_features

    def create_random_profile(self, respondent: pd.Series, available_sections: List[str]) -> Tuple[Dict[str, Any], List[str]]:
        profile = {'respondent_id': respondent[self.respondent_id]}

        # Get respondent's country if country_field is provided
        respondent_country = respondent.get(self.country_field) if self.country_field else None

        # Add fixed features
        for feature in self.fixed_features:
            # Check if feature is a country-specific variable
            if feature in self.country_specific_variables:
                # Get the country-specific variable name for this respondent
                country_vars = self.country_specific_variables[feature]
                actual_feature = country_vars.get(respondent_country)
                if actual_feature and actual_feature in respondent:
                    # Store using the actual variable name
                    profile[actual_feature] = respondent[actual_feature]
            else:
                # Regular feature
                if feature in respondent:
                    profile[feature] = respondent[feature]

        # Select random sections from available_sections
        random_sections = self.select_random_sections(available_sections)

        # Select features within these sections
        selected_features = self.select_features_in_sections(random_sections, respondent_country)

        # Remove any fixed features from selected features
        selected_features = [f for f in selected_features if f not in self.fixed_features]

        if not selected_features:
            # If no features are left after removing fixed features
            return profile, random_sections

        # Filter out features where the respondent's value is "Not applicable"
        filtered_features = self.filter_valid_features(selected_features, respondent)

        if not filtered_features:
            # If no features are left after filtering
            return profile, random_sections

        # Decide whether to pick response feature from filtered_features or from all possible features
        if random.random() < 0.5:
            # Option 1: Select response feature from filtered_features
            response_feature_pool = set(filtered_features)
        else:
            # Option 2: Select response feature from all available features
            if self.adjusted_features_cache:
                all_features = set()
                for features in self.adjusted_features_cache.values():
                    all_features.update(features)
            else:
                all_features = set()
                for section in self.survey_mappings:
                    features_in_section = list(self.survey_mappings[section].keys())
                    adjusted_features = self.adjust_features_for_country(features_in_section, respondent_country)
                    all_features.update(adjusted_features)

            # Exclude features already in filtered_features and fixed_features
            response_feature_pool = all_features - set(filtered_features) - set(self.fixed_features)

            # Filter out invalid features
            response_feature_pool = self.filter_valid_features(response_feature_pool, respondent)

            if not response_feature_pool:
                # If no valid response features are available, fallback to filtered_features
                response_feature_pool = set(filtered_features)

        if not response_feature_pool:
            # If no valid response features are available, return profile
            return profile, random_sections

        # Select the response feature
        response_feature = random.choice(list(response_feature_pool))

        # Remove the response feature from filtered_features if it's there
        filtered_features = [f for f in filtered_features if f != response_feature]

        # Add predictor features
        for feature in filtered_features:
            if feature in respondent:
                profile[feature] = respondent[feature]

        # Add the response feature
        if response_feature in respondent:
            profile['response_feature'] = respondent[response_feature]
            profile['response_feature_name'] = response_feature

        return profile, random_sections

    def generate_profiles(self, num_profiles_per_respondent: int) -> List[Dict[str, Any]]:
        profiles = []
        seen_profiles = set()
        for _, respondent in self.data.iterrows():
            attempts = 0
            profiles_generated = 0
            max_attempts = num_profiles_per_respondent * 10  # Adjust as needed

            # Initialize the pool of available sections for this respondent
            remaining_sections = list(self.survey_mappings.keys())

            while profiles_generated < num_profiles_per_respondent and attempts < max_attempts:
                # If all sections have been used, reset the pool
                if not remaining_sections:
                    remaining_sections = list(self.survey_mappings.keys())

                profile, sections_used = self.create_random_profile(respondent, remaining_sections)
                attempts += 1

                # Ensure the profile has a response feature and the response is not NaN
                if "response_feature" in profile and not pd.isnull(profile["response_feature"]):
                    # Create a signature for the profile
                    feature_items = [
                        (feature, profile[feature])
                        for feature in profile
                        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
                    ]
                    feature_items.sort()
                    profile_signature = tuple(feature_items)
                    if profile_signature not in seen_profiles:
                        seen_profiles.add(profile_signature)
                        profiles.append(profile)
                        profiles_generated += 1

                        # Remove the sections used in this profile from remaining_sections
                        for section in sections_used:
                            if section in remaining_sections:
                                remaining_sections.remove(section)
                    else:
                        print(f"Duplicate profile encountered for respondent {respondent[self.respondent_id]}.")
        return profiles

    def profile_to_text(self, profile: Dict[str, Any]) -> Tuple[str, str, str]:
        lines = []

        # Extract the response feature name and value
        response_feature_name = profile.get('response_feature_name')
        response_feature_value = profile.get('response_feature')

        # Iterate over predictor features
        for feature, value in profile.items():
            if feature in ['respondent_id', 'response_feature', 'response_feature_name']:
                continue  # Skip non-feature fields

            if pd.isnull(value):
                continue  # Skip features with NaN values

            description, mapped_value = self.map_value(feature, value)
            lines.append(f"{description}: {mapped_value}")

        # Get the question and response for the response feature
        if response_feature_name and response_feature_value is not None:
            description, mapped_value = self.map_value(response_feature_name, response_feature_value)

            # Retrieve the question from the mappings
            section = self.feature_to_section.get(response_feature_name)
            feature_mapping = self.survey_mappings.get(section, {}).get(response_feature_name, {})
            question = feature_mapping.get('question', f"Please answer the following question about {description}:")
            response_text = mapped_value
        else:
            # If no response feature is available
            question = ""
            response_text = ""

        preamble = '\n'.join(lines)
        return preamble, question, response_text

    def map_value(self, feature_name: str, value) -> Tuple[str, str]:
        """
        Maps a single feature's value to its description and textual value.

        Returns:
        - Tuple[str, str]: (description, mapped_value)
        """
        # Use the actual feature name to find the section and mapping
        section = self.feature_to_section.get(feature_name)
        if not section:
            return feature_name, str(value)  # Feature not found in mappings

        feature_mapping = self.survey_mappings[section].get(feature_name)
        if not feature_mapping:
            return feature_name, str(value)  # Feature mapping not found

        # Get the generic feature name for the description
        generic_feature_name = self.generic_to_actual_features.get(feature_name, feature_name)
        description = feature_mapping.get('description', generic_feature_name)
        values_mapping = feature_mapping.get('values', {})

        if pd.isnull(value):
            return description, "Missing"

        # Convert value to string key and strip whitespace
        if isinstance(value, float) and value.is_integer():
            value_key = str(int(value))
        elif isinstance(value, (int, np.integer)):
            value_key = str(value)
        else:
            value_key = str(value).strip()

        # Try to get the mapped text
        mapped_text = values_mapping.get(value_key)
        if mapped_text is None:
            mapped_text = str(value)  # Use the original value

        return description, mapped_text


# ESS

In [5]:
directory = 'LLM-for-Social-Science-Research/mappings/ESS/2023'

survey_mappings = {}

for filename in os.listdir(directory):
    if filename.endswith('.json'):
        section_name = os.path.splitext(filename)[0]

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            survey_mappings[section_name] = json.load(file)


ess2023 = pd.read_csv('LLM-for-Social-Science-Research/ESS11.csv')

  ess2023 = pd.read_csv('LLM-for-Social-Science-Research/ESS11.csv')


In [14]:
ess2020 = pd.read_csv('LLM-for-Social-Science-Research/ESS10.csv')
ess2020['cntry'].unique()

  ess2020 = pd.read_csv('LLM-for-Social-Science-Research/ESS10.csv')


array(['BE', 'BG', 'CH', 'CZ', 'EE', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU',
       'IE', 'IS', 'IT', 'LT', 'ME', 'MK', 'NL', 'NO', 'PT', 'SI', 'SK'],
      dtype=object)

In [13]:
ess2023['cntry'].unique()

array(['AT', 'CH', 'DE', 'FI', 'GB', 'HR', 'HU', 'IE', 'LT', 'NL', 'NO',
       'SI', 'SK'], dtype=object)

In [9]:
#@title add country specific var dictionary for ess
country_specific_variables_2020 = {
            'religion_present': {
                'CH': 'rlgdnach',
                'FI': 'rlgdnafi',
                'GR': 'rlgdnagr',
                'HU': 'rlgdnhu',
                'IS': 'rlgdnais',
                'IE': 'rlgdnie',
                'LT': 'rlgdnlt',
                'ME': 'rlgdme',
                'NL': 'rlgdnanl',
                'MK': 'rlgdnmk',
                'NO': 'rlgdnno',
                'SK': 'rlgdnbsk',
                'GB': 'rlgdngb',
            },
            'religion_past': {
                'CH': 'rlgdeach',
                'FI': 'rlgdeafi',
                'GR': 'rlgdeagr',
                'HU': 'rlgdehu',
                'IS': 'rlgdeais',
                'IE': 'rlgdeie',
                'LT': 'rlgdelt',
                'ME': 'rlgdeme',
                'NL': 'rlgdeanl',
                'MK': 'rlgdemk',
                'NO': 'rlgdeno',
                'SK': 'rlgdebsk',
                'GB': 'rlgdegb',
            },
            'voted': {
                'BE': 'prtvtebe',
                'BG': 'prtvtebg',
                'CH': 'prtvthch',
                'HR': 'prtvbhr',
                'CZ': 'prtvtecz',
                'EE': 'prtvthee',
                'FI': 'prtvtefi',
                'FR': 'prtvfr',
                'GR': 'prtvtdgr',
                'HU': 'prtvtghu',
                'IS': 'prtvic',
                'IE': 'prtvie',
                'IT': 'prtvti',
                'LT': 'prtvlt',
                'ME': 'prtvme',
                'NL': 'prtvthnl',
                'MK': 'prtvtmk',
                'NO': 'prtvtbno',
                'PT': 'prtvtdpt',
                'SI': 'prtvtfsi',
                'SK': 'prtvtesk',
                'GB': 'prtvttdgb',
            },
            'party_affiliation': {
                'BE': 'prtclebe',
                'BG': 'prtclebg',
                'CH': 'prtclhch',
                'HR': 'prtclbhr',
                'CZ': 'prtclecz',
                'EE': 'prtclhee',
                'FI': 'prtclffi',
                'FR': 'prtclffr',
                'GR': 'prtcldgr',
                'HU': 'prtclhhu',
                'IS': 'prtcldis',
                'IE': 'prtclfie',
                'IT': 'prtcleit',
                'LT': 'prtclclt',
                'ME': 'prtclame',
                'NL': 'prtclgnl',
                'MK': 'prtclmk',
                'NO': 'prtclbno',
                'PT': 'prtclfpt',
                'SI': 'prtclfsi',
                'SK': 'prtclesk',
                'GB': 'prtcldgb',
            }
        }



country_specific_variables_2023 = {
    'religion_present': {
        'AT': 'rlgdnbat',
        'CH': 'rlgdnach',
        'DE': 'rlgdnade',
        'FI': 'rlgdnafi',
        'GR': 'rlgdnagr',
        'HU': 'rlgdnhu',
        'IS': 'rlgdnais',
        'IE': 'rlgdnie',
        'LT': 'rlgdnlt',
        'ME': 'rlgdme',
        'NL': 'rlgdnanl',
        'MK': 'rlgdnmk',
        'NO': 'rlgdnno',
        'SK': 'rlgdnask',
        'GB': 'rlgdngb'
    },
    'religion_past': {
        'AT': 'rlgdebat',
        'CH': 'rlgdeach',
        'DE': 'rlgdeade',
        'FI': 'rlgdeafi',
        'GR': 'rlgdeagr',
        'HU': 'rlgdehu',
        'IS': 'rlgdeais',
        'IE': 'rlgdeie',
        'LT': 'rlgdelt',
        'ME': 'rlgdeme',
        'NL': 'rlgdeanl',
        'MK': 'rlgdemk',
        'NO': 'rlgdeno',
        'SK': 'rlgdeask',
        'GB': 'rlgdegb'
    },
    'voted': {
        'AT': 'prtvtdat',
        'CH': 'prtvthch',
        'HR': 'prtvtchr',
        'FI': 'prtvtffi',
        'DE': 'prtvgde1',
        'HU': 'prtvthhu',
        'IE': 'prtvteie',
        'LT': 'prtvclt1',
        'NL': 'prtvtinl',
        'NO': 'prtvtcno',
        'SK': 'prtvtesk',
        'SI': 'prtvtgsi',
        'GB': 'prtvtdgb'
    },
    'party_affiliation': {
        'AT': 'prtcleat',
        'CH': 'prtclhch',
        'HR': 'prtclbhr',
        'FI': 'prtclgfi',
        'DE': 'prtclgde',
        'HU': 'prtclihu',
        'IE': 'prtclfie',
        'LT': 'prtclclt',
        'NL': 'prtclhnl',
        'NO': 'prtclcno',
        'SK': 'prtclesk',
        'SI': 'prtclgsi',
        'GB': 'prtcldgb'
    }
}


## Profiles 2023

In [7]:
random_subset = ess2023[ess2023['cntry']=="DE"]
random_subset.shape

(2420, 558)

In [33]:
prof_generator_short = SurveyProfileGenerator(random_subset, survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=3, max_features=2,
                                        fixed_features=['cntry', 'gndr', 'agea', 'essround'], country_field='cntry',
                                        country_specific_variables=country_specific_variables_2023,
                                        random_state=42)


prof_generator_long = SurveyProfileGenerator(random_subset, survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=5, max_features=3,
                                        fixed_features=['cntry', 'gndr', 'agea', 'essround'], country_field='cntry',
                                        country_specific_variables=country_specific_variables_2023,
                                        random_state=42)

In [62]:
profiles_short = prof_generator_short.generate_profiles(25) # 3, 2

Duplicate profile encountered for respondent 50004.
Duplicate profile encountered for respondent 50004.
Duplicate profile encountered for respondent 50004.
Duplicate profile encountered for respondent 50059.
Duplicate profile encountered for respondent 50420.
Duplicate profile encountered for respondent 50420.
Duplicate profile encountered for respondent 50420.
Duplicate profile encountered for respondent 50562.
Duplicate profile encountered for respondent 50833.
Duplicate profile encountered for respondent 51226.
Duplicate profile encountered for respondent 51273.
Duplicate profile encountered for respondent 51277.
Duplicate profile encountered for respondent 51277.
Duplicate profile encountered for respondent 51389.
Duplicate profile encountered for respondent 51389.
Duplicate profile encountered for respondent 51426.
Duplicate profile encountered for respondent 51426.
Duplicate profile encountered for respondent 51426.
Duplicate profile encountered for respondent 51445.
Duplicate pr

In [63]:
profiles_long = prof_generator_long.generate_profiles(25)

In [36]:
print(len(profiles_short), len(profiles_long))

112985 121000


In [64]:
profiles = profiles_short + profiles_long
len(profiles)

119371

In [None]:
for profile in profiles[0:10]:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}")
    print("-" * 40)

In [13]:
def count_feature_occurrences(profiles, feature_name, feature_value=None):
    """
    Counts the occurrences of a specified feature value as a response feature
    and as a profile feature across all profiles.

    Args:
        profiles (list): A list of profiles (dictionaries).
        feature_name (str): The feature name to look for in the profiles.
        feature_value: The value of the feature to count.

    Returns:
        dict: A dictionary with counts for 'response_feature' and 'profile_feature'.
    """
    response_count = 0
    profile_count = 0

    for profile in profiles:
        # Count as a response feature
        if profile.get("response_feature_name") == feature_name and profile.get("response_feature") == feature_value:
            response_count += 1

        # Count as a profile feature
        if profile.get(feature_name) == feature_value:
            profile_count += 1

    return {
        "response_feature_count": response_count,
        "profile_feature_count": profile_count
    }


In [78]:
result = count_feature_occurrences(profiles, 'weasoff', 5)  # 1 never 5 always
result1 = count_feature_occurrences(unique_profiles, 'weasoff', 5)  # 1 never 5 always

print(result,"\n", result1)

{'response_feature_count': 20, 'profile_feature_count': 87} 
 {'response_feature_count': 33, 'profile_feature_count': 159}


In [79]:
unique_profiles = []
profile_signatures = set()

for profile in profiles:
    # Create a signature for each profile based on its features
    feature_items = [
        (feature, profile[feature])
        for feature in profile
        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
    ]
    feature_items.sort()  # Ensure a consistent order for comparison
    profile_signature = tuple(feature_items)

    # Check for duplicates
    if profile_signature not in profile_signatures:
        profile_signatures.add(profile_signature)  # Add the unique signature to the set
        unique_profiles.append(profile)  # Keep the unique profile


# profile_signatures = set()
# for profile in profiles:
#     feature_items = [
#         (feature, profile[feature])
#         for feature in profile
#         if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
#     ]
#     feature_items.sort()
#     profile_signature = tuple(feature_items)
#     assert profile_signature not in profile_signatures, "Duplicate profile found!"
#     profile_signatures.add(profile_signature)

In [80]:
random.shuffle(unique_profiles)

In [81]:
len(profiles) - len(unique_profiles)

14

In [84]:
len(unique_profiles)

119357

In [82]:
ids = []
prof_descriptions = []
for profile in unique_profiles:
    id = profile['respondent_id']
    preambule, question, response = prof_generator.profile_to_text(profile)
    prof_text = f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}"

    ids.append(id)
    prof_descriptions.append(prof_text)


In [59]:
pd.DataFrame({'id': ids, 'text': prof_descriptions})['id'].nunique()
#.to_csv('profiles.csv', index=False)

2420

In [None]:
ess['idno'].nunique()

17095

In [83]:
df = pd.DataFrame({'id': ids, 'text': prof_descriptions})
df.head()
df.to_csv('profiles_germany_short_2023.csv', index=False)

## Prifiles 2020

In [None]:
prof_generator = SurveyProfileGenerator(ess, survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=4, max_features=3,
                                        fixed_features=['cntry', 'gndr', 'agea', 'essround'], country_field='cntry',
                                        country_specific_variables=country_specific_variables_2020,
                                        random_state=42)

In [None]:
profiles = prof_generator.generate_profiles(10)

In [None]:
len(profiles)

376110

In [None]:
profile_signatures = set()
for profile in profiles:
    feature_items = [
        (feature, profile[feature])
        for feature in profile
        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
    ]
    feature_items.sort()
    profile_signature = tuple(feature_items)
    assert profile_signature not in profile_signatures, "Duplicate profile found!"
    profile_signatures.add(profile_signature)

In [None]:
for profile in profiles[200000:200100]:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}")
    print("-" * 40)

In [None]:
ids = []
prof_descriptions = []
for profile in profiles:
    id = profile['respondent_id']
    preambule, question, response = prof_generator.profile_to_text(profile)
    prof_text = f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}"

    ids.append(id)
    prof_descriptions.append(prof_text)


In [None]:
df = pd.DataFrame({'id': ids, 'text': prof_descriptions})
# print(df.shape)
# df.head()

df.to_csv('profiles_2020.csv', index=False)

# WVS

In [None]:
# WVS
directory = 'LLM-for-Social-Science-Research/mappings/WVS/updated_wvs_mappings.json'

survey_mappings = {}

with open(directory, 'r', encoding='utf-8') as file:
    survey_mappings = json.load(file)

In [None]:
wvs = pd.read_csv("/content/drive/MyDrive/Ox LLMs Model For Social Science/surveys/WVS_2017_22.csv")
wvs.shape

  wvs = pd.read_csv("/content/drive/MyDrive/Ox LLMs Model For Social Science/surveys/WVS_2017_22.csv")


(97220, 613)

In [None]:
wvs['D_INTERVIEW'].nunique()

96221

In [None]:
random_sample = wvs.sample(n=200, random_state=42)

In [None]:
prof_generator = SurveyProfileGenerator(random_sample, survey_mappings=survey_mappings,
                                        respondent_id='D_INTERVIEW', max_sections=5, max_features=3,
                                        fixed_features=['B_COUNTRY', 'Q260', 'Q262', 'A_YEAR'], country_field='B_COUNTRY',
                                        random_state=42)

In [None]:
profiles = prof_generator.generate_profiles(5)

In [None]:
profile_signatures = set()
for profile in profiles:
    feature_items = [
        (feature, profile[feature])
        for feature in profile
        if feature not in ['respondent_id', 'response_feature', 'response_feature_name']
    ]
    feature_items.sort()
    profile_signature = tuple(feature_items)
    assert profile_signature not in profile_signatures, "Duplicate profile found!"
    profile_signatures.add(profile_signature)

In [None]:
survey_mappings.keys()


dict_keys(['Demographic and Socioeconomic Variables', 'Social Values, Norms, Stereotypes', 'Happiness and Wellbeing', 'Social Capital, Trust and Organizational Membership', 'Economic Values', 'Perceptions of Corruption', 'Perceptions of Migration', 'Perceptions of Security', 'Index of Postmaterialism', 'Perceptions about Science and Technology', 'Religious Values', 'Ethical Values', 'Political Interest and Political Participation', 'Political Culture and Political Regimes', 'Political trust module'])

In [None]:
len(survey_mappings.keys())

15

In [None]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: \n{preambule}. \nQuestion: {question} \nResponse: {response}")
    print("-" * 40)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ISO 3166-1 numeric country code: Turkey
Sex: Female
Age: 56
A_YEAR: 2018
Frequency in your neighborhood: Racist behavior: Not frequently
Frequency in your neighborhood: Alcohol consumed in the streets: Not frequently
Immigration in your country: Strengthens cultural diversity: Disagree
Immigration in your country: Helps poor people establish new lives: Agree
Immigration in your country: Increases the crime rate: Disagree
Private vs state ownership of business (scale 1 for 'private should increase' to 10 for 'state/government should increase'): 5
Government´s vs individual´s responsibility (scale 1 for 'government's responsibility' to 10 'individual'): 5
Success: hard work vs luck (scale 1 for 'work brings better life' to 10 for 'success comes from luck and connections, not work'): 5
Sector of employment: Not applicable; Never had a job
Highest educational level: Respondent´s Spouse [ISCED 2011]: Primary education (ISCED 1

In [None]:
ids = []
prof_descriptions = []
for profile in profiles:
    id = profile['respondent_id']
    preambule, question, response = prof_generator.profile_to_text(profile)
    prof_text = f"Profile: \n\n{preambule}. \n\nQuestion: {question} \n\nResponse: {response}"

    ids.append(id)
    prof_descriptions.append(prof_text)

In [None]:
df = pd.DataFrame({'id': ids, 'text': prof_descriptions})
# print(df.shape)
# df.head()

df.to_csv('profiles_wvs_test.csv', index=False)

In [None]:
df['id'].nunique()

199

In [None]:
# profiles = pd.read_csv('profiles_wvs_test.csv')
results = pd.read_csv('wvs_test_2_results.csv')
results_two = pd.read_csv('wvs_test_2_few_shot_results.csv')

In [None]:
print(df['text'][111])

Profile: 

ISO 3166-1 numeric country code: Peru
Sex: Female
Age: 26
A_YEAR: 2018
Respondent immigrant?: I am born in this country
Father immigrant?: Not an immigrant
One of the bad effects of science is that it breaks down people’s ideas of right and wrong (scale 1 for 'disagree completely' to 10 for 'agree completely'): 3
It is not important for me to know about science in my daily life (scale 1 for 'disagree completely' to 10 for 'agree completely'): 2
Science and technology are making our lives healthier, easier, and more comfortable (scale 1 for 'disagree completely' to 10 for 'agree completely'): 2
Social activism: Contacting a government official: Would never do
Information source: Daily newspaper: Less than monthly
How often in country´s elections: Votes are counted fairly: Not often
Men make better political leaders than women do: Disagree
Important child qualities: Religious faith: Not so important
Problem if women have more income than husband: Disagree
Justifiable (scale 1 

In [None]:
results_two['response'][895]

'{\'prompt\': "Meet a 43-year-old Indonesian woman, a housewife residing in Java in 2018.  She views her leisure time as rather important and doesn\'t feel particularly close to the world stage. Her husband\'s education ended at the primary level. While she rates her satisfaction with the political system performance as a moderate 5, she feels that bribery and wealthy individuals influencing elections are not frequent occurrences, and that journalists tend to offer relatively fair election coverage.  She doesn\'t prioritize obedience as an important quality for children,  and considers a military-led political system to be \'fairly good\'. Please answer the following question from this perspective. Question: Do you consider independence to be an especially important quality for children to learn at home?", \'response\': \'Important\'}'

In [None]:
results['response'][0]

"{'prompt': 'Imagine a 31-year-old woman living in Ukraine in 2020.  She believes in hell and life after death, finding the meaning of religion in understanding life after death. Good manners are important to her in raising children. She is neutral about the increased emphasis on technology in the future and neither agrees nor disagrees with women earning more than their husbands. Her mother was not an immigrant. Her family spent some savings and borrowed money during the past year. She somewhat agrees that deciding on the right moral rules is difficult these days (rating it a 3 on a 1-10 scale). She strongly feels that avoiding public transport fares (2) and homosexuality (2) are never justifiable. She prefers private business ownership over state ownership (rating it an 8 on a 1-10 scale), believing hard work leads to a better life. She prefers larger income differences over income equality (rating it an 8 on a 1-10 scale).  Answer the following question from her perspective. Were yo

# Next Steps

In [None]:
!pip install -q -U google-generativeai

In [None]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
model = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
def create_prompts(profiles: Dict[str, Any]):
    """
    """

    prompt_main = "Edit the text below to create a dialog. First part should prompt a model to take on a given personality from features given in the profile. Then, this model with the personality should be asked a question stated in the prompt. Then, the model should answer the qestion with the probided response."

    prompts = []
    for profile in profiles:
        preamble, question, response = prof_generator.profile_to_text(profile)
        prompt = f"{prompt_main}\n<<<Profile: {preamble}>>>. \n<<<Question: {question}>>>. \n<<<Response: {response}>>>"
        prompts.append(prompt)
    return prompts

In [None]:
len(profiles)

5

In [None]:
prompts = create_prompts(profiles)

In [None]:
outputs = [model.generate_content(prompt) for prompt in prompts]
text_outputs = [output.text for output in outputs]

In [None]:
for output in text_outputs:
    print(output)
    print("-" * 40)

In [None]:
for profile in profiles:
    preambule, question, response = prof_generator.profile_to_text(profile)
    print(f"Profile: {preambule}. \n\nQuestion: {question} \n\nResponse: {response}")
    print("-" * 40)

In [None]:
profiles[1]

{'respondent_id': 10053,
 'cntry': 'BE',
 'gndr': 2,
 'bctprd': 2,
 'pbldmna': 2,
 'pstplonl': 2,
 'lrscale': 5,
 'euftf': 5,
 'wkhct': 40,
 'uemp3m': 2,
 'hinctnta': 88,
 'response_feature': 5,
 'response_feature_name': 'stfeco'}

In [None]:
#@title Dealing with bad Mappings

def find_numeric_to_numeric_mappings(survey_mappings: dict) -> dict:
    """
    Identifies mappings with numeric-to-numeric key-value pairs in the nested survey_mappings dictionary.
    Returns a dictionary with the section and feature names for each problematic mapping.

    Parameters:
    -----------
    survey_mappings : dict
        The nested dictionary containing sections and features of survey mappings.

    Returns:
    --------
    dict
        A dictionary where each key is a section name and each value is a list of features that contain
        numeric-to-numeric mappings in their "values" field.
    """

    problematic_mappings = {}

    for section, features in survey_mappings.items():
        for feature, feature_data in features.items():
            # Check if "values" key exists in feature data
            if "values" in feature_data:
                # Flag to indicate if this feature has numeric-to-numeric mappings
                has_numeric_to_numeric = False

                for key, value in feature_data["values"].items():
                    # Check if both key and value are numeric (integer-like or string numeric)
                    if key.isdigit() and (value.isdigit() or isinstance(value, int)):
                        has_numeric_to_numeric = True
                        break

                if has_numeric_to_numeric:
                    if section not in problematic_mappings:
                        problematic_mappings[section] = []
                    problematic_mappings[section].append(feature)

    return problematic_mappings


In [None]:
problematic_mappings = find_numeric_to_numeric_mappings(survey_mappings)
problematic_mappings

{'relationship_parents_and_at_work': ['stfmjob',
  'mansupp',
  'teamfeel',
  'wrkextra'],
 'political_opinions': ['lrscale',
  'stflife',
  'stfeco',
  'stfgov',
  'stfdem',
  'stfedu',
  'stfhlth',
  'euftf',
  'imbgeco',
  'imueclt',
  'imwbcnt'],
 'internet_use_social_trust': ['ppltrst',
  'pplfair',
  'pplhlp',
  'trstprl',
  'trstlgl',
  'trstplc',
  'trstplt',
  'trstprt',
  'trstep',
  'trstun',
  'trstsci'],
 'well_being_emot_attachment': ['happy', 'inprdsc', 'atchctr', 'atcherp'],
 'religion': ['rlgrl'],
 'climate_change_eu': ['ccrdprs', 'testic34', 'testic35', 'testic36'],
 'understanding_democracy': ['fairelc',
  'dfprtal',
  'medcrgv',
  'rghmgpr',
  'votedir',
  'cttresa',
  'gptpel',
  'gvctzpv',
  'grdfinc',
  'viepol',
  'wpestop',
  'keydec',
  'fairelcc',
  'dfprtalc',
  'medcrgvc',
  'rghmgprc',
  'votedirc',
  'cttresac',
  'gptpelcc',
  'gvctzpvc',
  'grdfincc',
  'viepolc',
  'wpestopc',
  'keydecc',
  'chpldmi',
  'chpldmc',
  'stpldmi',
  'stpldmc',
  'accalaw'

In [None]:
ess.iloc[:, 0:15].head(2)

Unnamed: 0,name,essround,edition,proddate,idno,cntry,dweight,pspwght,pweight,anweight,nwspol,netusoft,netustm,ppltrst,pplfair
0,ESS10e03_2,10,3.2,02.11.2023,10038,BE,0.88222,0.972276,0.718075,0.698167,30,5,8,6,7
1,ESS10e03_2,10,3.2,02.11.2023,10053,BE,1.047643,0.888635,0.718075,0.638107,10,5,240,3,4
