In [1]:
import pandas as pd 
import numpy as np 
import os 
import re
import sys 

In [2]:
results_dir = "../../../european-city-data/rag-sustainability/results/results-combined_prompts/"
folders = os.listdir(results_dir)

In [18]:
cities_df = pd.read_csv("../../../european-city-data/cities/worldcities.csv")
cities = list(cities_df['city'])
# cities_df.head()

In [7]:
eucities_df = pd.read_csv("../../../european-city-data/archive/city_abstracts_embeddings.csv")
eu_cities = list(cities_df['city'])

In [59]:
import spacy
from geopy.geocoders import Nominatim
nlp = spacy.load("en_core_web_sm")
tokenizer = nlp.tokenizer

# Download the required NLTK models (only needed once)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

def extract_city(text):
    # Define the regex pattern to find the city name right after "I recommend"
    pattern = r'I recommend\s+(?:visiting\s+)?(?:the city of\s+)?(\w+)'
    match = re.search(pattern, text)
    
    # If a match is found, return the captured group
    if match:
        return match.group(1)
    else:
        return None
    
def extract_first_list_item(text):
    # Define the regex pattern to find the first item in a numbered list
    pattern = r'\b1\.\s+([^0-9]+)(?=\s*\b[2-9]\.)(\w+)'
    
    # Search for the pattern in the text
    match = re.search(pattern, text, re.DOTALL)
    
    # If a match is found, return the captured group
    if match:
        first_item = match.group(1).strip()
        cleaned_item = re.sub(r'[^a-zA-Z\s]', '', first_item)
        cleaned_item = cleaned_item.strip().split(" ")[0]
        # return cleaned_item
        if cleaned_item in cities:
            return cleaned_item
    else:
        return None
    
def find_first_city(paragraph):
    # Create a regex pattern to match any city in the list
    city_pattern = r'\b(' + '|'.join(re.escape(city) for city in cities) + r')\b'
    
    # Search for the pattern in the paragraph
    match = re.search(city_pattern, paragraph, re.IGNORECASE)
    
    # If a match is found, return the matched city name
    if match:
        return match.group(1)
    else:
        return None

def find_all_cities(text):
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    ]

    seasons = {
        "spring": ["March", "April", "May"],
        "summer": ["June", "July", "August"],
        "fall": ["September", "October", "November"],
        "autumn": ["September", "October", "November"],
        "winter": ["December", "January", "February"]
    }

    # Process the passage with spaCy to perform NER
    doc = nlp(text)
    # print()
    tokens = [token.text for token in doc]
    # print(tokens)
    pattern = r'\b\w+(?:-\w+)*\b'
    words = re.findall(pattern, text)

    # print(words)

    rec_cities = []
    # print(cities)

    # Iterate through the recognized entities
    for entity in doc.ents:
        if entity.text in cities and entity.text not in rec_cities:
                if entity.text not in months:
                    rec_cities.append(entity.text)

    return rec_cities

print(find_all_cities("Today I am in Paris. But tomorrow I'm flying to Chicago, after which I'll go to Copenhagen, Denmark in August. I recommend Vitoria-Gasteiz because it is super cool"))

['Paris', 'Chicago', 'Copenhagen', 'Vitoria-Gasteiz']


In [60]:
results_dict = []

for model in folders:
    if ".csv" in model: 
        continue 

    for prompt in os.listdir(os.path.join(results_dir, model)):
        
        with open(os.path.join(results_dir, model, prompt, "response.txt")) as f:
            response = f.read()
        
        with open(os.path.join(results_dir, model, prompt, "response_sustainable.txt")) as f:
            response_sustainable = f.read()

        rec_cities = find_all_cities(response)

        rec_cities_sustainable = find_all_cities(response_sustainable)

        results_dict.append({
            'model': model, 
            'prompt_id': prompt,
            'rec_cities': rec_cities, 
            'response': response,
            'rec_cities_sustainable': rec_cities_sustainable,
            'response_sustainable': response_sustainable
        })
        
results_df = pd.DataFrame(results_dict)

In [61]:
results_df.head(5)

Unnamed: 0,model,prompt_id,rec_cities,response,rec_cities_sustainable,response_sustainable
0,llama3point1-instruct,prompt_17_gemini-ui,[Kaunas],I recommend Kaunas in Lithuania for your expe...,[Vitoria-Gasteiz],I recommend Vitoria-Gasteiz because it is a c...
1,llama3point1-instruct,prompt_27_gemini-1.5-pro-001,[Copenhagen],"I recommend Copenhagen, Denmark because it is...",[Copenhagen],I recommend Copenhagen and why I recommended ...
2,llama3point1-instruct,prompt_42_gpt-4o-mini,[Amsterdam],"I recommend Amsterdam, Netherlands. Amsterdam...","[Nalchik, Varna, Trabzon]","I recommend Nalchik, Russia because of its be..."
3,llama3point1-instruct,prompt_0_gpt-4o-mini,"[Kaunas, Greece, Strasbourg]","I recommend Kaunas, Lithuania. I recommend Ka...","[Kaunas, Oradea, Strasbourg, Greece, Varna, Si...","I recommend Kaunas, Lithuania because it is a..."
4,llama3point1-instruct,prompt_44_gemini-1.5-pro-001,[Kaunas],I recommend Kaunas for your medieval history ...,[Kaunas],"I recommend Kaunas, Lithuania because it has ..."


In [63]:
results_df.to_csv(os.path.join(results_dir, "recommended_cities.csv"), index=False)