In [292]:
from sentence_transformers import SentenceTransformer
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
)
import json
import os

from tqdm.auto import tqdm
import torch
import pandas as pd

In [3]:
# Runtime settings are read from the environment; each one is None when its
# variable is not set.
ELASTIC_URL = os.environ.get("ELASTIC_URL_LOCAL")
# Name handed to SentenceTransformer when the embedding model is loaded.
MODEL_NAME = os.environ.get("MODEL_NAME")
INDEX_NAME = os.environ.get("INDEX_NAME")
# Passed as the `token` argument when the generation model and tokenizer are loaded.
HUGGINGFACE_API = os.environ.get("HUGGINGFACE_API")

# Summary of the Jupyter Notebook: 03_BUILD_GROUND_TRUTH_DATA.IPYNB

This notebook is designed to build ground truth data for a policy advisory system using machine learning models. Below is a summary of the key steps and components involved:

1. **Imports and Environment Setup**:
    - Essential libraries and modules are imported, including `sentence_transformers`, `transformers`, `json`, `os`, `tqdm`, `torch`, and `pandas`.
    - Environment variables supply the Elastic URL, the embedding model name, the index name, and the Hugging Face access token.

2. **Model Loading**:
    - A function `load_mode()` is defined to load a SentenceTransformer model using the model name specified in the environment variables.
    - The model is loaded and stored in the variable `model`.

3. **Data Loading**:
    - The source records (document chunks with `doc_id`, `page_num`, `chunk_id`, and `text` fields) are loaded from the JSON file `../data_output/data_to_test.json` into `ground_truth_data`.

4. **Prompt Building**:
    - A function `build_prompt(data)` is defined to create a prompt template for generating questions based on the provided data.

5. **Model Initialization for Text Generation**:
    - The notebook checks for CUDA availability and sets the device accordingly.
    - An `AutoModelForCausalLM` and an `AutoTokenizer` are loaded for `meta-llama/Llama-3.2-1B-Instruct`, authenticated with the Hugging Face access token.
    - A text generation pipeline is created using the loaded model and tokenizer.
    - A function `llm(prompt)` is defined to generate text based on the provided prompt using the text generation pipeline.

6. **Question Generation**:
    - As a trial run, the notebook builds a prompt for the first record only and generates questions using the `llm` function.
    - The raw model output is stored in a list `answer` and inspected to confirm that it parses as JSON.

7. **Result Processing**:
    - The notebook then iterates over all records, generates questions with `llm`, parses each JSON response, and stores it in a dictionary `result` keyed by chunk ID.
    - The final results are compiled into a list of tuples containing chunk IDs and questions.

8. **Dataframe Creation and Export**:
    - A pandas DataFrame is created from the final results.
    - The DataFrame is exported to a CSV file located at `../data_output/ground-truth-retrieval.csv`.

This notebook provides a comprehensive workflow for generating ground truth data for policy advisory systems using machine learning models.

In [6]:
def load_mode():
    """Instantiate the SentenceTransformer named by ``MODEL_NAME``.

    The model name is printed first so the cell output records which model
    was loaded.

    Returns:
        The ``SentenceTransformer`` constructed from ``MODEL_NAME``.
    """
    print(f"Loading model: {MODEL_NAME}")
    encoder = SentenceTransformer(MODEL_NAME)
    return encoder


model = load_mode()

Loading model: all-mpnet-base-v2


In [4]:
# Read the records that questions will be generated from (UTF-8 encoded JSON).
with open("../data_output/data_to_test.json", "r", encoding="utf-8") as source_file:
    ground_truth_data = json.loads(source_file.read())

ground_truth_data

[{'doc_id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M',
  'page_num': 1,
  'chunk_id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1',
  'text': "Research Article Cityphilia and cityphobia: A multi-scalar search for city love in Flanders Karima Kourtita,b,c,*, Bart Neutsd, Peter Nijkampa,b,c, Marie H. Wahlstr €ome aOpen University, Heerlen, the Netherlands bAlexandru Ioan Cuza University, Iasi, Romania cUniversity of Rijeka, Rijeka, Croatia dKU Leuven, Leuven, Belgium eKTH, Stockholm, Sweden ARTICLE INFO Keywords: Well-being Happiness City loveSocial cohesionCentral place systemsInter-urban attractivenessABSTRACT Cities, towns, and rural areas form a complex spatial system in ﬂuenced by governance, economic factors, and the perceptions of their residents. This paper introduces the concepts of 'cityphilia' and 'cityphobia' as metaphors for the spatial attraction and repulsion forces that shape local quality of life.

In [233]:
def build_prompt(data):
    """Build the question-generation prompt for one record.

    The prompt asks for up to three questions that can be answered from the
    record's text, returned as a JSON object with a ``questions`` list.

    Args:
        data: Mapping that holds the chunk text under the ``"text"`` key.

    Returns:
        The prompt with the record text filled in, stripped of leading and
        trailing whitespace.

    Raises:
        KeyError: If ``data`` has no ``"text"`` key.
    """
    # The JSON example must itself be valid JSON (a real list, no trailing
    # comma): the prompt asks for well-formed JSON and the reply is parsed
    # with json.loads, so the example should not demonstrate malformed output.
    # Doubled braces are literal braces for str.format.
    prompt_template = """
### INSTRUCTIONS:
You are simulating an advisory session for policymakers with the assistance of a housing policy expert. 
Using only the content provided in the record, generate up to 3 detailed and specific questions directed at a housing policy expert. 
Each question must be fully answerable based solely on the information within the record, and should not repeat phrases verbatim from it. 
Avoid introducing new information beyond what is provided.

If the record lacks sufficient information to generate a question, exclude it rather than creating filler questions.

Return the output in a well-formed JSON format without code blocks.
{{
    "questions": ["question1", "question2", "question3"]
}}

### RECORD:

text: {text}
""".strip()
    # Only the {text} field is substituted; braces inside the record text are
    # inserted verbatim because str.format does not re-parse its arguments.
    return prompt_template.format(text=data["text"]).strip()

In [219]:
# Run generation on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and weights are loaded from the same checkpoint name;
# HUGGINGFACE_API is passed as the access token for both.
tokenizer_generation = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    token=HUGGINGFACE_API,
)
model_generation = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    token=HUGGINGFACE_API,
    device_map=device,
    torch_dtype="auto",
    # NOTE(review): this permits repository-supplied code to run at load time — confirm it is needed.
    trust_remote_code=True,
)

# Bundle model and tokenizer into a single text-generation callable used by llm().
pipe_generation = pipeline(
    "text-generation",
    tokenizer=tokenizer_generation,
    model=model_generation,
)


def llm(prompt):
    """Generate a reply to ``prompt`` with the text-generation pipeline.

    The prompt is sent as a single user chat message. Generation runs with
    sampling disabled (``do_sample=False``) and is capped at 500 new tokens.

    Args:
        prompt: Text placed in the user message.

    Returns:
        The ``generated_text`` of the first pipeline output
        (``return_full_text=False``), stripped of surrounding whitespace.
    """
    chat = [{"role": "user", "content": prompt}]

    outputs = pipe_generation(
        chat,
        max_new_tokens=500,
        return_full_text=False,
        do_sample=False,
        # Reuse the tokenizer's EOS token id as the padding token id.
        pad_token_id=pipe_generation.tokenizer.eos_token_id,
    )

    return outputs[0]["generated_text"].strip()

In [183]:
# Look at the first record to see which fields each chunk carries.
ground_truth_data[0]

{'doc_id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M',
 'page_num': 1,
 'chunk_id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1',
 'text': "Research Article Cityphilia and cityphobia: A multi-scalar search for city love in Flanders Karima Kourtita,b,c,*, Bart Neutsd, Peter Nijkampa,b,c, Marie H. Wahlstr €ome aOpen University, Heerlen, the Netherlands bAlexandru Ioan Cuza University, Iasi, Romania cUniversity of Rijeka, Rijeka, Croatia dKU Leuven, Leuven, Belgium eKTH, Stockholm, Sweden ARTICLE INFO Keywords: Well-being Happiness City loveSocial cohesionCentral place systemsInter-urban attractivenessABSTRACT Cities, towns, and rural areas form a complex spatial system in ﬂuenced by governance, economic factors, and the perceptions of their residents. This paper introduces the concepts of 'cityphilia' and 'cityphobia' as metaphors for the spatial attraction and repulsion forces that shape local quality of life. It 

In [243]:
# Trial run: generate questions for the first record only and keep the raw reply.
answer = []
for record in tqdm(ground_truth_data[:1]):
    answer.append(llm(build_prompt(record)))

100%|██████████| 1/1 [00:05<00:00,  5.56s/it]


In [244]:
# Show the raw model reply for the trial record.
print(answer[0])

{
    "questions": [
        "How do you think the concept of cityphobia can be used to inform policy decisions aimed at reducing urban poverty and inequality, particularly in areas with high levels of social exclusion?",
        "What role do you believe the 'body' component of city love should play in the development of policies aimed at promoting physical and mental well-being in urban areas?",
        "How do you think the'soul' component of city love, encompassing social cohesion and community engagement, can be leveraged to enhance the resilience of urban communities in the face of economic and social challenges?"
    ]
}


In [271]:
# Check that the trial reply parses as JSON and has a "questions" entry.
json.loads(answer[0])["questions"]

['How do you think the concept of cityphobia can be used to inform policy decisions aimed at reducing urban poverty and inequality, particularly in areas with high levels of social exclusion?',
 "What role do you believe the 'body' component of city love should play in the development of policies aimed at promoting physical and mental well-being in urban areas?",
 "How do you think the'soul' component of city love, encompassing social cohesion and community engagement, can be leveraged to enhance the resilience of urban communities in the face of economic and social challenges?"]

In [251]:
# Parsed model replies, keyed by chunk id.
result = {}
# Raw replies that could not be parsed as JSON, keyed by chunk id, kept for inspection.
parse_failures = {}

for data in tqdm(ground_truth_data):
    chunk_id = data["chunk_id"]
    # Skip chunk ids already handled in this run. Generation runs with sampling
    # disabled, so retrying a chunk that failed to parse is not attempted.
    if chunk_id in result or chunk_id in parse_failures:
        continue

    prompt = build_prompt(data)
    questions_raw = llm(prompt)
    try:
        questions = json.loads(questions_raw)
    except json.JSONDecodeError:
        # A malformed reply should cost one chunk, not abort the whole run.
        parse_failures[chunk_id] = questions_raw
        continue
    result[chunk_id] = questions

if parse_failures:
    print(
        f"Could not parse model output for {len(parse_failures)} chunk(s): "
        f"{list(parse_failures)}"
    )

100%|██████████| 50/50 [04:06<00:00,  4.93s/it]


In [281]:
# Inspect the parsed replies collected per chunk id.
result

{'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1': {'questions': ['How do you think the concept of cityphobia can be used to inform policy decisions aimed at reducing urban poverty and inequality, particularly in areas with high levels of social exclusion?',
   "What role do you believe the 'body' component of city love should play in the development of policies aimed at promoting physical and mental well-being in urban areas?",
   "How do you think the'soul' component of city love, encompassing social cohesion and community engagement, can be leveraged to enhance the resilience of urban communities in the face of economic and social challenges?"]},
 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_2': {'questions': ["How do you respond to the argument that the 'city love' framework, which emphasizes the importance of central place systems in providing well-being services, may be overly simplistic or neglects the complexi

In [286]:
# Flatten the per-chunk replies into (chunk_id, question) pairs; a reply
# without a "questions" key contributes no pairs.
final_results = [
    (chunk_id, question)
    for chunk_id, content in result.items()
    for question in content.get("questions", [])
]

In [289]:
# Spot-check the first (chunk_id, question) pair.
final_results[0]

('Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1',
 'How do you think the concept of cityphobia can be used to inform policy decisions aimed at reducing urban poverty and inequality, particularly in areas with high levels of social exclusion?')

In [293]:
# Tabulate the pairs: "id" holds the chunk id, "question" the generated question.
df_results = pd.DataFrame(final_results, columns=["id", "question"])

In [294]:
# Write the table to CSV without the DataFrame index column.
df_results.to_csv("../data_output/ground-truth-retrieval.csv", index=False)