<a href="https://colab.research.google.com/github/antndlcrx/LLM-for-Social-Science-Research/blob/main/synth_sampling_default.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    PreTrainedModel,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
)
import torch
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import os
from typing import List, Dict, Tuple, Any
import json

In [2]:
!git clone https://github.com/antndlcrx/LLM-for-Social-Science-Research.git

Cloning into 'LLM-for-Social-Science-Research'...
remote: Enumerating objects: 171, done.[K
remote: Counting objects: 100% (171/171), done.[K
remote: Compressing objects: 100% (148/148), done.[K
remote: Total 171 (delta 99), reused 58 (delta 23), pack-reused 0 (from 0)[K
Receiving objects: 100% (171/171), 10.69 MiB | 8.34 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [3]:
from google.colab import userdata
HF_TOKEN = userdata.get("HF_LLAMA31")

In [4]:
# model_name = "meta-llama/Meta-Llama-3.1-8B"
model_name = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN,
                                          padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             trust_remote_code=True,
                                             torch_dtype=torch.float16,
                                             device_map='auto',
                                             token=HF_TOKEN,
                                             )

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]



In [6]:
directory = 'LLM-for-Social-Science-Research/mappings'

survey_mappings = {}

for filename in os.listdir(directory):
    if filename.endswith('.json'):
        section_name = os.path.splitext(filename)[0]

        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            survey_mappings[section_name] = json.load(file)

In [7]:
ess = pd.read_csv("LLM-for-Social-Science-Research/ESS10.csv")

  ess = pd.read_csv("LLM-for-Social-Science-Research/ESS10.csv")


In [8]:
class Mapper:
    def __init__(self, survey_mappings: Dict[str, Dict[str, Any]]):
        self.survey_mappings = survey_mappings
        # Build a mapping from feature names to their sections for quick lookup
        self.feature_to_section = {
            feature: section
            for section, features in self.survey_mappings.items()
            for feature in features
        }

    def map_value(self, feature_name: str, value):
        section = self.feature_to_section.get(feature_name)
        if not section:
            return str(value)  # Feature not found in mappings

        feature_mapping = self.survey_mappings[section].get(feature_name)
        if not feature_mapping:
            return str(value)  # Feature mapping not found

        values_mapping = feature_mapping.get('values', {})
        if pd.isnull(value):
            return "Missing"

        if isinstance(value, float) and value.is_integer():
            value_key = str(int(value))
        elif isinstance(value, (int, np.integer)):
            value_key = str(value)
        else:
            value_key = str(value)

        return values_mapping.get(value_key, str(value))

    def fill_prompt(self, respondent: pd.Series, prompt_template: str):
        placeholders = {}
        placeholder_pattern = re.compile(r"\{(\w+)\}")
        placeholder_names = placeholder_pattern.findall(prompt_template)

        for placeholder in placeholder_names:
            if placeholder in respondent:
                value = respondent[placeholder]
                if placeholder in ['agea']:
                    # Handle numeric fields separately
                    placeholders[placeholder] = str(int(value)) if pd.notnull(value) else f"unknown {placeholder}"
                else:
                    mapped_value = self.map_value(placeholder, value)
                    placeholders[placeholder] = mapped_value
            else:
                placeholders[placeholder] = "Unknown"

        filled_prompt = prompt_template.format(**placeholders)
        return filled_prompt


In [9]:
mapper = Mapper(survey_mappings)

In [10]:
prompt = """
Imagine you are a {agea}-year old {gndr} living in {cntry} in {essround}. Your highest education is {eisced} and
your feelings about your income are {hincfel}. On a scale from left (0) to right(10), you would put yourself at {lrscale}.

For the question that follows, please answer taking on this person's perspective.
Qusetions: What was the party you voted for in the last election?
Your answer:
"""

In [11]:
random_subset = ess.sample(1000, random_state=42)

In [12]:
# Generate prompts for all respondents
prompts = []
for idx, respondent in random_subset.iterrows():
    filled_prompt = mapper.fill_prompt(respondent, prompt)
    prompts.append(filled_prompt)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def make_prediction(prompt):
    '''
    '''
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    attention_mask = inputs['input_ids'].ne(tokenizer.pad_token_id).to(device)

    # Generate the output
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=attention_mask,  # Pass the attention mask
            max_new_tokens=32,
            temperature=1.0,
            do_sample=True,  # Enable sampling
            eos_token_id=tokenizer.eos_token_id,  # End generation at the end-of-sequence token
            pad_token_id=tokenizer.pad_token_id,  # Set padding token id
        )

    # Decode the generated sequence
    completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return completion

In [22]:
prediciton = make_prediction(prompts[5])
print('\n'+prediciton)



Imagine you are a 23-year old Male living in Slovakia in 2020. Your highest education is Bachelor’s or equivalent level and 
your feelings about your income are Coping on present income. On a scale from left (0) to right(10), you would put yourself at 3. 

For the question that follows, please answer taking on this person's perspective. 
Qusetions: What was the party you voted for in the last election? 
Your answer:
It was SMER, I voted for Robert Fico. I had thought about voting for SNS, but I thought that Robert Fico had a better chance


In [25]:
prediciton = make_prediction(prompts[15])
print('\n'+prediciton)



Imagine you are a 81-year old Female living in Finland in 2020. Your highest education is Post-secondary non-tertiary education and 
your feelings about your income are Coping on present income. On a scale from left (0) to right(10), you would put yourself at 6. 

For the question that follows, please answer taking on this person's perspective. 
Qusetions: What was the party you voted for in the last election? 
Your answer:
In the last election I voted for the Social Democrats. I voted for them because I have always believed in their values of equality and solidarity. They have always been


In [23]:
print(prompts[15])


Imagine you are a 81-year old Female living in Finland in 2020. Your highest education is Post-secondary non-tertiary education and 
your feelings about your income are Coping on present income. On a scale from left (0) to right(10), you would put yourself at 6. 

For the question that follows, please answer taking on this person's perspective. 
Qusetions: What was the party you voted for in the last election? 
Your answer:

