In [20]:
import pandas as pd
import numpy as np
import os
import json
import random
from typing import List, Dict, Tuple, Any

In [30]:
!git clone https://github.com/antndlcrx/LLM-for-Social-Science-Research.git

fatal: destination path 'LLM-for-Social-Science-Research' already exists and is not an empty directory.


In [32]:
directory = 'LLM-for-Social-Science-Research/mappings'

survey_mappings = {}

for filename in os.listdir(directory):
    if filename.endswith('.json'):
        section_name = os.path.splitext(filename)[0]

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            survey_mappings[section_name] = json.load(file)


ess = pd.read_csv('ESS10.csv')

  ess = pd.read_csv('ESS10.csv')


In [24]:
class SurveyProfileGenerator:
    def __init__(self,
                 data: pd.DataFrame,
                 respondent_id: str,
                 survey_mappings: Dict[str, Dict[str, Any]],
                 max_sections: int = 3,
                 max_features: int = 3,
                 fixed_features: List[str] = None):
        """
        Initializes the SurveyProfileGenerator with survey mappings, maximum number of features per section,
        and any fixed features that should always be included in the profiles.

        Parameters:
        - data (pd.DataFrame): The survey dataset.
        - respondent_id (str): The column name for respondent IDs.
        - survey_mappings (dict): Nested dictionary mapping of survey questions.
        - max_sections (int): Maximum number of sections to randomly select.
        - max_features (int): Maximum number of features to randomly select per section.
        - fixed_features (List[str]): List of feature names that are fixed and always included.
        """
        self.data = data
        self.respondent_id = respondent_id
        self.survey_mappings = survey_mappings
        self.max_sections = max_sections
        self.max_features = max_features
        self.fixed_features = fixed_features or []

        # Build a mapping from feature names to their sections
        self.feature_to_section = {
            feature: section
            for section, features in self.survey_mappings.items()
            for feature in features
        }

    def create_random_profile(self, respondent: pd.Series) -> Dict[str, Any]:
        """
        Creates a random profile for a single respondent.

        Parameters:
        - respondent (pd.Series): A row from the DataFrame representing a respondent.

        Returns:
        - profile (dict): A dictionary representing the respondent's profile.
        """
        profile = {'respondent_id': respondent[self.respondent_id]}

        # Add fixed features
        predictor_features = []
        for feature in self.fixed_features:
            if feature in respondent:
                profile[feature] = respondent[feature]
                predictor_features.append(feature)

        # Available sections excluding those already included via fixed features
        available_sections = list(set(self.survey_mappings.keys()) -
                                  set(self.feature_to_section.get(f) for f in self.fixed_features))

        # Randomly select sections
        num_sections_to_select = min(self.max_sections, len(available_sections))
        random_sections = random.sample(available_sections, num_sections_to_select)

        # Collect selected features
        selected_features = []
        for section in random_sections:
            features_in_section = list(self.survey_mappings[section].keys())
            num_features_to_select = min(self.max_features, len(features_in_section))
            selected_in_section = random.sample(features_in_section, num_features_to_select)
            selected_features.extend(selected_in_section)

        # Remove any fixed features from selected features
        selected_features = [f for f in selected_features if f not in self.fixed_features]

        if not selected_features:
            # If no features are left after removing fixed features
            return profile

        # Select one feature as the response feature
        response_feature = random.choice(selected_features)
        selected_features.remove(response_feature)

        # Add predictor features
        for feature in selected_features:
            if feature in respondent:
                profile[feature] = respondent[feature]
                predictor_features.append(feature)

        # Add the response feature
        if response_feature in respondent:
            profile['response_feature'] = respondent[response_feature]
            profile['response_feature_name'] = response_feature

        return profile

    def generate_profiles(self, num_profiles_per_respondent: int) -> List[Dict[str, Any]]:
        """
        Generates profiles for all respondents in the dataset.

        Parameters:
        - num_profiles_per_respondent (int): Number of profiles to generate per respondent.

        Returns:
        - profiles (List[dict]): A list of respondent profiles.
        """
        profiles = []
        for _, respondent in self.data.iterrows():
            for _ in range(num_profiles_per_respondent):
                profile = self.create_random_profile(respondent)
                profiles.append(profile)
        return profiles

    def profile_to_text(self, profile: Dict[str, Any]) -> Tuple[str, str]:
        """
        Converts a profile into text form, turning profile features into a text description
        and the response feature into an answer to an interview question.

        Parameters:
        - profile (dict): A respondent's profile.

        Returns:
        - preamble (str): The text description of the respondent.
        - response (str): The answer to the interview question.
        """
        lines = []

        # Extract the response feature name and value
        response_feature_name = profile.get('response_feature_name')
        response_feature_value = profile.get('response_feature')

        # Iterate over predictor features
        for feature, value in profile.items():
            if feature in ['respondent_id', 'response_feature', 'response_feature_name']:
                continue  # Skip non-feature fields

            section = self.feature_to_section.get(feature)
            if not section:
                continue  # Skip if section is not found

            feature_mapping = self.survey_mappings.get(section, {}).get(feature)
            if not feature_mapping:
                continue  # Skip if feature mapping is not found

            description = feature_mapping.get('description', feature)
            values_mapping = feature_mapping.get('values', {})
            value_text = values_mapping.get(str(value), str(value))

            lines.append(f"{description}: {value_text}")

        # Get the question and response for the response feature
        if response_feature_name and response_feature_value is not None:
            section = self.feature_to_section.get(response_feature_name)
            if section:
                feature_mapping = self.survey_mappings.get(section, {}).get(response_feature_name)
                if feature_mapping:
                    question = feature_mapping.get('question', f"Please answer the following question about {response_feature_name}:")
                    values_mapping = feature_mapping.get('values', {})
                    response_text = values_mapping.get(str(response_feature_value), str(response_feature_value))
                else:
                    question = f"Please answer the following question about {response_feature_name}:"
                    response_text = str(response_feature_value)
            else:
                question = f"Please answer the following question about {response_feature_name}:"
                response_text = str(response_feature_value)

            lines.append(question)
        else:
            # If no response feature is available
            response_text = ""

        preamble = '\n'.join(lines)
        return preamble, response_text


In [38]:
prof_generator = SurveyProfileGenerator(ess[0:3], survey_mappings=survey_mappings,
                                        respondent_id='idno', max_sections=3, max_features=3,
                                        fixed_features=['cntry', 'gndr'])

In [45]:
profiles = prof_generator.generate_profiles(1)

In [46]:
for profile in profiles:
    preamble, response = prof_generator.profile_to_text(profile)
    print("Preamble:")
    print(preamble)
    print("Response:")
    print(response)
    print("-" * 40)

Preamble:
Country: Belgium
Gender: Female
Donated to or participated in political party or pressure group last 12 months: No
Boycotted certain products last 12 months: No
Party voted for in last national election, Belgium: 66.0
Party voted for in last national election, Switzerland: nan
Respondent or household member victim of burglary/assault last 5 years: Don't know
How happy are you: 7
Take part in social activities compared to others of same age: Less than most
During the last 12 months, have you contacted a politician, government or local government official?
Response:
No
----------------------------------------
Preamble:
Country: Belgium
Gender: Female
How many people with whom you can discuss intimate and personal matters: 4-6
How emotionally attached to [country]: 3
Feeling of safety of walking alone in local area after dark: Safe
Language most often spoken at home: second mentioned: 000
Discrimination of respondent's group: disability: Not marked
Citizen of country: Yes
Volunt

## Next Steps

In [None]:
!pip install -q -U google-generativeai

In [None]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
model = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
prompt = "Create a short biographic description for the profile below."

In [None]:
model.generate_content()

In [None]:
#@title Dealing with bad Mappings

def find_numeric_to_numeric_mappings(survey_mappings: dict) -> dict:
    """
    Identifies mappings with numeric-to-numeric key-value pairs in the nested survey_mappings dictionary.
    Returns a dictionary with the section and feature names for each problematic mapping.

    Parameters:
    -----------
    survey_mappings : dict
        The nested dictionary containing sections and features of survey mappings.

    Returns:
    --------
    dict
        A dictionary where each key is a section name and each value is a list of features that contain
        numeric-to-numeric mappings in their "values" field.
    """

    problematic_mappings = {}

    for section, features in survey_mappings.items():
        for feature, feature_data in features.items():
            # Check if "values" key exists in feature data
            if "values" in feature_data:
                # Flag to indicate if this feature has numeric-to-numeric mappings
                has_numeric_to_numeric = False

                for key, value in feature_data["values"].items():
                    # Check if both key and value are numeric (integer-like or string numeric)
                    if key.isdigit() and (value.isdigit() or isinstance(value, int)):
                        has_numeric_to_numeric = True
                        break

                if has_numeric_to_numeric:
                    if section not in problematic_mappings:
                        problematic_mappings[section] = []
                    problematic_mappings[section].append(feature)

    return problematic_mappings


In [None]:
problematic_mappings = find_numeric_to_numeric_mappings(survey_mappings)
problematic_mappings

{'relationship_parents_and_at_work': ['stfmjob',
  'mansupp',
  'teamfeel',
  'wrkextra'],
 'political_opinions': ['lrscale',
  'stflife',
  'stfeco',
  'stfgov',
  'stfdem',
  'stfedu',
  'stfhlth',
  'euftf',
  'imbgeco',
  'imueclt',
  'imwbcnt'],
 'internet_use_social_trust': ['ppltrst',
  'pplfair',
  'pplhlp',
  'trstprl',
  'trstlgl',
  'trstplc',
  'trstplt',
  'trstprt',
  'trstep',
  'trstun',
  'trstsci'],
 'well_being_emot_attachment': ['happy', 'inprdsc', 'atchctr', 'atcherp'],
 'religion': ['rlgrl'],
 'climate_change_eu': ['ccrdprs', 'testic34', 'testic35', 'testic36'],
 'understanding_democracy': ['fairelc',
  'dfprtal',
  'medcrgv',
  'rghmgpr',
  'votedir',
  'cttresa',
  'gptpel',
  'gvctzpv',
  'grdfinc',
  'viepol',
  'wpestop',
  'keydec',
  'fairelcc',
  'dfprtalc',
  'medcrgvc',
  'rghmgprc',
  'votedirc',
  'cttresac',
  'gptpelcc',
  'gvctzpvc',
  'grdfincc',
  'viepolc',
  'wpestopc',
  'keydecc',
  'chpldmi',
  'chpldmc',
  'stpldmi',
  'stpldmc',
  'accalaw'

In [None]:
1+1

2