In [1]:
import pandas as pd 
import numpy as np 
import os 
import re
import sys 
import json
import nltk

In [None]:
# import nltk
!python -m nltk.downloader punkt

In [2]:
results_dir = "../../european-city-data/rag-sustainability/results/results-combined_prompts/"
folders = os.listdir(results_dir)

sar_results_dir = "../../european-city-data/rag-sustainability/results/results-combined_prompts_SAR/"
sar_folders = os.listdir(sar_results_dir)
sar_folders

['context_response_similarity_scores.csv',
 'gemma2',
 'llama3point1-instruct',
 'recommended_cities_sar.csv',
 'llm-judge',
 'mistral-instruct']

In [4]:
cities_df = pd.read_csv("../../european-city-data/cities/worldcities.csv")
cities = list(cities_df['city'])
# cities_df.head()

In [6]:
eucities_df = pd.read_csv("../../european-city-data/archive/city_abstracts_embeddings.csv")
eu_cities = list(cities_df['city'])

In [12]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /Users/ashmi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
def extract_city(text):
    # Define the regex pattern to find the city name right after "I recommend"
    pattern = r'I recommend\s+(?:visiting\s+)?(?:the city of\s+)?(\w+)'
    match = re.search(pattern, text)
    
    # If a match is found, return the captured group
    if match:
        return match.group(1)
    else:
        return None
    
def extract_first_list_item(text):
    # Define the regex pattern to find the first item in a numbered list
    pattern = r'\b1\.\s+([^0-9]+)(?=\s*\b[2-9]\.)(\w+)'
    
    # Search for the pattern in the text
    match = re.search(pattern, text, re.DOTALL)
    
    # If a match is found, return the captured group
    if match:
        first_item = match.group(1).strip()
        cleaned_item = re.sub(r'[^a-zA-Z\s]', '', first_item)
        cleaned_item = cleaned_item.strip().split(" ")[0]
        # return cleaned_item
        if cleaned_item in cities:
            return cleaned_item
    else:
        return None
    
def find_first_city(paragraph):
    # Create a regex pattern to match any city in the list
    city_pattern = r'\b(' + '|'.join(re.escape(city) for city in cities) + r')\b'
    
    # Search for the pattern in the paragraph
    match = re.search(city_pattern, paragraph, re.IGNORECASE)
    
    # If a match is found, return the matched city name
    if match:
        return match.group(1)
    else:
        return None

def find_all_cities(text):
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    ]

    seasons = {
        "spring": ["March", "April", "May"],
        "summer": ["June", "July", "August"],
        "fall": ["September", "October", "November"],
        "autumn": ["September", "October", "November"],
        "winter": ["December", "January", "February"]
    }

    # Process the passage with spaCy to perform NER
    doc = nlp(text)
    # print()
    tokens = [token.text for token in doc]
    # print(tokens)
    pattern = r'\b\w+(?:-\w+)*\b'
    words = re.findall(pattern, text)

    # print(words)

    rec_cities = []
    # print(cities)

    # Iterate through the recognized entities
    for entity in doc.ents:
        if entity.text in cities and entity.text not in rec_cities:
                if entity.text not in months:
                    rec_cities.append(entity.text)

    return rec_cities

In [8]:
import spacy
from geopy.geocoders import Nominatim
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
from nltk.tokenize import word_tokenize

nlp = spacy.load("en_core_web_sm")
capitalized_hyphenated_word_pattern = r'(?<!\w)([A-Z]+-[A-Z]+)(?!\w)'

# Use the default infixes but include our custom pattern for capitalized hyphenated words
infixes = nlp.Defaults.infixes + [capitalized_hyphenated_word_pattern]
infix_re = compile_infix_regex(infixes)
nlp.tokenizer = Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

# Download the required NLTK models (only needed once)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# print(find_all_cities("Today I am in Paris. But tomorrow I'm flying to Chicago, after which I'll go to Copenhagen, Denmark in August. I recommend Vitoria-Gasteiz because it is super cool"))

def find_cities_in_context(response, rec_cities):
    cities_from_context = []
    cities = [d['city'] for d in rec_cities]

    st_petersburg = "Saint_Petersburg"
    response_new = response.replace(f"Saint Petersburg", st_petersburg)
    
    # Tokenize the paragraph using NLTK
    paragraph_words = word_tokenize(response_new)

    paragraph_words = list(map(lambda x: x.replace(st_petersburg, 'Saint Petersburg'), paragraph_words))

    # if 'Vitoria-Gasteiz' in response and not 'Victoria-Gasteiz' in paragraph_words:
    #     paragraph_words 'Vitoria-Gasteiz')

    # Convert the word list to a set for faster lookup
    # word_set = set(cities)

    for word in paragraph_words: 
        if word in cities and word not in cities_from_context:
            cities_from_context.append(word)

    return cities_from_context

In [42]:
def create_result_csvs(folders, results_dir, sar=0):
    results_dict = []

    for model in folders:
        if ".csv" in model or model == "llm-judge" or model == ".DS_Store": 
            continue 
        for prompt in os.listdir(os.path.join(results_dir, model)):
            if not sar:
                with open(os.path.join(results_dir, model, prompt, "response.txt")) as f:
                    response = f.read()
                
                with open(os.path.join(results_dir, model, prompt, "cities.json")) as f:
                    ref_cities = json.load(f)
                
                # with open(os.path.join(results_dir, model, prompt, "response_sustainable.txt")) as f:
                #     response_sustainable = f.read()            
                
                # with open(os.path.join(results_dir, model, prompt, "cities_sustainable.json")) as f:
                #     ref_cities_sustainable = json.load(f)

                rec_cities = find_cities_in_context(response, ref_cities)
                # rec_cities_sustainable = find_cities_in_context(response_sustainable, ref_cities_sustainable)
                
                results_dict.append({
                    'model': model, 
                    'prompt_id': prompt,
                    'rec_cities': rec_cities, 
                    'response': response,
                    # 'rec_cities_sustainable': rec_cities_sustainable,
                    # 'response_sustainable': response_sustainable
                })
                print(f"Finished extracting cities for {prompt}, {model}")
            
            else:
                print("Extracting CSV for SAR")
                with open(os.path.join(results_dir, model, prompt, "response_sustainable.txt")) as f:
                    response_sustainable = f.read()            
                
                with open(os.path.join(results_dir, model, prompt, "cities_sustainable.json")) as f:
                    ref_cities_sustainable = json.load(f)

                rec_cities_sustainable = find_cities_in_context(response_sustainable, ref_cities_sustainable)
            
                results_dict.append({
                    'model': model, 
                    'prompt_id': prompt,
                    'rec_cities_sar': rec_cities_sustainable,
                    'response_sar': response_sustainable
                })
                print(f"Finished extracting cities for {prompt}, {model}")
            
    results_df = pd.DataFrame(results_dict)
    return results_df

In [16]:
results_df = create_result_csvs(sar_folders, sar_results_dir, sar=1)

Extracting CSV for SAR
Finished extracting cities for prompt_17_gemini-ui, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_27_gemini-1.5-pro-001, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_42_gpt-4o-mini, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_0_gpt-4o-mini, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_44_gemini-1.5-pro-001, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_49_gemini-1.5-pro-001, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_32_gpt-4o-mini, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_58_gemini-ui, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_22_gemini-ui, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_7_gemini-ui, gemma2
Extracting CSV for SAR
Finished extracting cities for prompt_1_gemini-1.5-pro-001, gemma2
Extracting CSV for SAR
Finished extracting cities for promp

In [17]:
results_df.head(10)

Unnamed: 0,model,prompt_id,rec_cities_sar,response_sar
0,gemma2,prompt_17_gemini-ui,"[Gaziantep, Konya]",The text you provided gives information about ...
1,gemma2,prompt_27_gemini-1.5-pro-001,"[Coimbra, Varna]",Let's break down how to structure this informa...
2,gemma2,prompt_42_gpt-4o-mini,[Amsterdam],The provided text outlines a list of events ha...
3,gemma2,prompt_0_gpt-4o-mini,[Strasbourg],Let's break down how to structure this informa...
4,gemma2,prompt_44_gemini-1.5-pro-001,[Prague],This data looks like a collection of tourist a...
5,gemma2,prompt_49_gemini-1.5-pro-001,[],It seems like you are providing a structured t...
6,gemma2,prompt_32_gpt-4o-mini,"[Kaunas, Malatya]",It seems you've provided two distinct pieces o...
7,gemma2,prompt_58_gemini-ui,"[Klagenfurt, Warsaw]",It seems like you've provided snippets of text...
8,gemma2,prompt_22_gemini-ui,[Erzurum],The text provides information about various op...
9,gemma2,prompt_7_gemini-ui,[Thessaloniki],This text provides a description of Kapani Mar...


In [19]:
len(results_df)

600

In [18]:
results_df.to_csv(os.path.join(sar_results_dir, "recommended_cities_sar_gemma.csv"), index=False)

In [43]:
results_baseline = create_result_csvs(folders, results_dir, sar=0)
results_baseline

Finished extracting cities for prompt_17_gemini-ui, gemma2
Finished extracting cities for prompt_27_gemini-1.5-pro-001, gemma2
Finished extracting cities for prompt_42_gpt-4o-mini, gemma2
Finished extracting cities for prompt_0_gpt-4o-mini, gemma2
Finished extracting cities for prompt_44_gemini-1.5-pro-001, gemma2
Finished extracting cities for prompt_49_gemini-1.5-pro-001, gemma2
Finished extracting cities for prompt_32_gpt-4o-mini, gemma2
Finished extracting cities for prompt_58_gemini-ui, gemma2
Finished extracting cities for prompt_22_gemini-ui, gemma2
Finished extracting cities for prompt_7_gemini-ui, gemma2
Finished extracting cities for prompt_1_gemini-1.5-pro-001, gemma2
Finished extracting cities for prompt_36_gemini-ui, gemma2
Finished extracting cities for prompt_54_gemini-1.5-pro-001, gemma2
Finished extracting cities for prompt_70_gpt-4o-mini, gemma2
Finished extracting cities for prompt_37_gemini-1.5-pro-001, gemma2
Finished extracting cities for prompt_15_gpt-4o-mini, ge

Unnamed: 0,model,prompt_id,rec_cities,response
0,gemma2,prompt_17_gemini-ui,"[Gaziantep, Konya]",It seems you've provided information about two...
1,gemma2,prompt_27_gemini-1.5-pro-001,"[Coimbra, Varna]",It looks like you've provided information abou...
2,gemma2,prompt_42_gpt-4o-mini,[Amsterdam],The provided text gives a list of events happe...
3,gemma2,prompt_0_gpt-4o-mini,[Strasbourg],It seems like you're providing text snippets a...
4,gemma2,prompt_44_gemini-1.5-pro-001,[Prague],It looks like you've provided information abou...
...,...,...,...,...
595,mistral-instruct,prompt_29_gpt-4o-mini,"[Innsbruck, Varna, Kaunas, Kayseri, Isparta, A...","1. I recommend recommending Innsbruck, Austria..."
596,mistral-instruct,prompt_44_gemini-ui,[],1. The capital of the Czech Republic is Prague...
597,mistral-instruct,prompt_59_gpt-4o-mini,"[Malatya, Varna, Mykolaiv, Munich, Arkhangelsk...","1. I recommend visiting Malatya, Turkey. The c..."
598,mistral-instruct,prompt_50_gemini-ui,[],1. Option 1:\n\n* {{eat\n| name\n| url\n| emai...


In [44]:
results_baseline

Unnamed: 0,model,prompt_id,rec_cities,response
0,gemma2,prompt_17_gemini-ui,"[Gaziantep, Konya]",It seems you've provided information about two...
1,gemma2,prompt_27_gemini-1.5-pro-001,"[Coimbra, Varna]",It looks like you've provided information abou...
2,gemma2,prompt_42_gpt-4o-mini,[Amsterdam],The provided text gives a list of events happe...
3,gemma2,prompt_0_gpt-4o-mini,[Strasbourg],It seems like you're providing text snippets a...
4,gemma2,prompt_44_gemini-1.5-pro-001,[Prague],It looks like you've provided information abou...
...,...,...,...,...
595,mistral-instruct,prompt_29_gpt-4o-mini,"[Innsbruck, Varna, Kaunas, Kayseri, Isparta, A...","1. I recommend recommending Innsbruck, Austria..."
596,mistral-instruct,prompt_44_gemini-ui,[],1. The capital of the Czech Republic is Prague...
597,mistral-instruct,prompt_59_gpt-4o-mini,"[Malatya, Varna, Mykolaiv, Munich, Arkhangelsk...","1. I recommend visiting Malatya, Turkey. The c..."
598,mistral-instruct,prompt_50_gemini-ui,[],1. Option 1:\n\n* {{eat\n| name\n| url\n| emai...


In [45]:
results_baseline.to_csv(os.path.join(results_dir, "recommended_cities_gemma.csv"), index=False)