In [17]:
import pandas as pd 
import numpy as np 
import os 
import re
import sys 

In [18]:
results_dir = "../results/results-06.08./"
folders = os.listdir(results_dir)

In [19]:
cities_df = pd.read_csv("../../../european-city-data/archive/city_abstracts_embeddings.csv")
cities = list(cities_df['city'])

In [20]:
cities

['Aalborg',
 'Adana',
 'Amsterdam',
 'Ancona',
 'Ankara',
 'Antalya',
 'Arad',
 'Arkhangelsk',
 'Astrakhan',
 'Baia Mare',
 'Baku',
 'Barcelona',
 'Bari',
 'Batman',
 'Belgrade',
 'Bergen',
 'Berlin',
 'Bologna',
 'Bordeaux',
 'Braga',
 'Bratislava',
 'Bremen',
 'Brest',
 'Brno',
 'Brussels',
 'Budapest',
 'Burgas',
 'Bursa',
 'Bydgoszcz',
 'Cagliari',
 'Cheboksary',
 'Chelyabinsk',
 'Cluj-Napoca',
 'Coimbra',
 'Copenhagen',
 'Cork',
 'Craiova',
 'Debrecen',
 'Denizli',
 'Dijon',
 'Donetsk',
 'Dresden',
 'Dublin',
 'Erfurt',
 'Erzincan',
 'Erzurum',
 'Gaziantep',
 'Geneva',
 'Hamburg',
 'Helsinki',
 'Innsbruck',
 'Ioannina',
 'Isparta',
 'Istanbul',
 'Ivano-Frankivsk',
 'Izmir',
 'Kahramanmaras',
 'Kaliningrad',
 'Kars',
 'Kaunas',
 'Kayseri',
 'Kazan',
 'Kharkiv',
 'Kiel',
 'Kirov',
 'Klagenfurt',
 'Konya',
 'Krasnodar',
 'Kutaisi',
 'Lille',
 'Ljubljana',
 'London',
 'Luxembourg',
 'Lviv',
 'Lyon',
 'Maastricht',
 'Madrid',
 'Magdeburg',
 'Malatya',
 'Milan',
 'Minsk',
 'Miskolc',
 '

In [21]:
"Kahramanmaras" in cities

True

In [25]:
def extract_city(text):
    # Define the regex pattern to find the city name right after "I recommend"
    pattern = r'I recommend\s+(?:visiting\s+)?(?:the city of\s+)?(\w+)'
    match = re.search(pattern, text)
    
    # If a match is found, return the captured group
    if match:
        return match.group(1)
    else:
        return None
    
def extract_first_list_item(text):
    # Define the regex pattern to find the first item in a numbered list
    pattern = r'\b1\.\s+([^0-9]+)(?=\s*\b[2-9]\.)(\w+)'
    
    # Search for the pattern in the text
    match = re.search(pattern, text, re.DOTALL)
    
    # If a match is found, return the captured group
    if match:
        first_item = match.group(1).strip()
        cleaned_item = re.sub(r'[^a-zA-Z\s]', '', first_item)
        cleaned_item = cleaned_item.strip().split(" ")[0]
        # return cleaned_item
        if cleaned_item in cities:
            return cleaned_item
    else:
        return None
    
def find_first_city(paragraph):
    # Create a regex pattern to match any city in the list
    city_pattern = r'\b(' + '|'.join(re.escape(city) for city in cities) + r')\b'
    
    # Search for the pattern in the paragraph
    match = re.search(city_pattern, paragraph, re.IGNORECASE)
    
    # If a match is found, return the matched city name
    if match:
        return match.group(1)
    else:
        return None
    

def find_all_cities(paragraph, cities):
    # Create a regex pattern to match any city in the list
    city_pattern = r'\b(' + '|'.join(re.escape(city) for city in cities) + r')\b'
    
    # Find all matches of the pattern in the paragraph
    matches = re.findall(city_pattern, paragraph, re.IGNORECASE)
    
    # Return the list of matched cities
    return matches

results_dict = []

for model in folders:
    if ".csv" in model: 
        continue 

    for prompt in os.listdir(os.path.join(results_dir, model)):
        
        with open(os.path.join(results_dir, model, prompt, "response.txt")) as f:
            response = f.read()
        
        with open(os.path.join(results_dir, model, prompt, "response_sustainable.txt")) as f:
            response_sustainable = f.read()

        city = extract_city(response)

        if city is None or city not in cities:
            # print(model, prompt)
            city = extract_first_list_item(response)

            if city is None or city not in cities:
                city = find_first_city(response)

        city_sustainable = extract_city(response_sustainable)

        if city_sustainable is None or city_sustainable not in cities:
            city_sustainable = extract_first_list_item(response_sustainable)

            if city_sustainable is None or city_sustainable not in cities:
                city_sustainable = find_first_city(response_sustainable)

        results_dict.append({
            'model': model, 
            'prompt_id': prompt,
            'rec_city': city, 
            'response': response,
            'rec_city_sustainable': city_sustainable,
            'response_sustainable': response_sustainable
        })
        
results_df = pd.DataFrame(results_dict)

In [26]:
results_df.head(5)

Unnamed: 0,model,prompt_id,rec_city,response,rec_city_sustainable,response_sustainable
0,llama3point1-instruct,44_chatgpt-3.5,Strasbourg,I recommend Strasbourg in June. I recommend S...,Strasbourg,"I recommend Strasbourg, France, in June. I re..."
1,llama3point1-instruct,41_chatgpt-3.5,Klagenfurt,I recommend Klagenfurt in July. Klagenfurt is...,Klagenfurt,I recommend Klagenfurt. I recommend Klagenfur...
2,llama3point1-instruct,17_chatgpt-3.5,Strasbourg,I recommend Strasbourg as the best place to v...,Vienna,I recommend Vienna for Christmas markets in D...
3,llama3point1-instruct,26_chatgpt-3.5,Innsbruck,I recommend Innsbruck and why you recommended...,Klagenfurt,"I recommend Klagenfurt, Austria in June. I re..."
4,llama3point1-instruct,19_chatgpt-3.5,Tbilisi,I recommend Tbilisi for thermal baths and spa...,Bursa,"I recommend Bursa, Turkey for thermal baths a..."


In [27]:
results_df.to_csv(os.path.join(results_dir, "recommended_cities.csv"))

In [44]:
"Venice" in cities

False