In [1]:
# Load environment variables (notably `OPENAI_API_KEY`) from the local .env file
# through the dotenv IPython extension, so the key is never written in this notebook.
# `%reload_ext` (rather than `%load_ext`) keeps the cell safe to re-run.
%reload_ext dotenv
%dotenv

In [11]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl (11.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.1-cp311-cp311-macosx_14_0_arm64.whl (24.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.8/24.8 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Inst

In [21]:
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, Markdown
import pandas as pd
import numpy as np

In [3]:
# Shared API client used by `get_embedding`. No key is passed explicitly, so the
# SDK presumably reads `OPENAI_API_KEY` from the environment populated by the
# dotenv cell -- NOTE(review): confirm against the installed openai version.
openai_client = OpenAI()

In [7]:
def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request. Defaults to
            ``"text-embedding-3-large"``, the model this notebook was
            previously hard-wired to, so existing one-argument calls behave
            exactly as before.

    Returns:
        The ``embedding`` attribute of the first (and, for a single input,
        only) item in the API response.

    Note:
        Every call issues a request through the module-level
        ``openai_client``; results are not cached, so embedding the same
        text twice costs two requests.
    """
    response = openai_client.embeddings.create(input=text, model=model)
    # A single input was sent, so its result sits at index 0.
    return response.data[0].embedding

In [8]:
# A handful of place names at different granularities (region, country, city),
# each embedded once and stored under its own text.
text_input = ["East Asia", "India", "China", "Shanghai", "Bengaluru"]
embedded_text = dict(zip(text_input, map(get_embedding, text_input)))

In [15]:
def compute_cosine_similarity(embedding1, embedding2):
    """Cosine similarity between two embedding vectors, returned as a scalar.

    Each input is converted to a single-row matrix because
    ``cosine_similarity`` is called on 2D arrays; the lone entry of the
    resulting 1x1 matrix is what gets returned.
    """
    row_a, row_b = (
        np.array(vector).reshape(1, -1) for vector in (embedding1, embedding2)
    )
    return cosine_similarity(row_a, row_b)[0][0]

In [16]:
# Every ordered (row, column) combination of the input texts, in row-major order
text_pairs = [
    (row_text, col_text) for row_text in text_input for col_text in text_input
]

# One similarity score per ordered pair, aligned with `text_pairs`
similarities = [
    compute_cosine_similarity(embedded_text[pair[0]], embedded_text[pair[1]])
    for pair in text_pairs
]

# Fold the flat score list into a square table labelled by the texts themselves
similarity_matrix = pd.DataFrame(
    np.reshape(similarities, (len(text_input), len(text_input))),
    index=text_input,
    columns=text_input,
)

# Last expression of the cell, so the table is rendered as its output
similarity_matrix

Unnamed: 0,East Asia,India,China,Shanghai,Bengaluru
East Asia,1.0,0.261944,0.429677,0.387365,0.176355
India,0.261944,1.0,0.497503,0.275422,0.334764
China,0.429677,0.497503,1.0,0.475599,0.220031
Shanghai,0.387365,0.275422,0.475599,1.0,0.348633
Bengaluru,0.176355,0.334764,0.220031,0.348633,1.0


In [18]:
# Reference geography for the experiment: each region name maps to a list of
# five country records, and each record holds the country's name plus three of
# its cities. Later cells embed every name that appears here.
# NOTE(review): South Asia lists "Bangalore", whereas the earlier five-text
# example embeds the same city as "Bengaluru" -- the two spellings are embedded
# independently.
regions = {
    "East Asia": [
        {"country": "China", "cities": ["Beijing", "Shanghai", "Guangzhou"]},
        {"country": "Japan", "cities": ["Tokyo", "Osaka", "Kyoto"]},
        {"country": "South Korea", "cities": ["Seoul", "Busan", "Incheon"]},
        {"country": "Taiwan", "cities": ["Taipei", "Kaohsiung", "Taichung"]},
        {"country": "Mongolia", "cities": ["Ulaanbaatar", "Erdenet", "Darkhan"]},
    ],
    "North Europe": [
        {"country": "Sweden", "cities": ["Stockholm", "Gothenburg", "Malmö"]},
        {"country": "Norway", "cities": ["Oslo", "Bergen", "Trondheim"]},
        {"country": "Denmark", "cities": ["Copenhagen", "Aarhus", "Odense"]},
        {"country": "Finland", "cities": ["Helsinki", "Espoo", "Tampere"]},
        {"country": "Iceland", "cities": ["Reykjavik", "Akureyri", "Keflavik"]},
    ],
    "Central Europe": [
        {"country": "Germany", "cities": ["Berlin", "Munich", "Frankfurt"]},
        {"country": "Poland", "cities": ["Warsaw", "Kraków", "Gdańsk"]},
        {"country": "Czech Republic", "cities": ["Prague", "Brno", "Ostrava"]},
        {"country": "Austria", "cities": ["Vienna", "Salzburg", "Graz"]},
        {"country": "Hungary", "cities": ["Budapest", "Debrecen", "Szeged"]},
    ],
    "South Asia": [
        {"country": "India", "cities": ["Mumbai", "Delhi", "Bangalore"]},
        {"country": "Pakistan", "cities": ["Karachi", "Lahore", "Islamabad"]},
        {"country": "Bangladesh", "cities": ["Dhaka", "Chittagong", "Khulna"]},
        {"country": "Sri Lanka", "cities": ["Colombo", "Kandy", "Galle"]},
        {"country": "Nepal", "cities": ["Kathmandu", "Pokhara", "Biratnagar"]},
    ],
    "North America": [
        {"country": "United States", "cities": ["New York", "Los Angeles", "Chicago"]},
        {"country": "Canada", "cities": ["Toronto", "Vancouver", "Montreal"]},
        {"country": "Mexico", "cities": ["Mexico City", "Guadalajara", "Monterrey"]},
        {"country": "Cuba", "cities": ["Havana", "Santiago de Cuba", "Camagüey"]},
        {"country": "Jamaica", "cities": ["Kingston", "Montego Bay", "Ocho Rios"]},
    ],
    "South America": [
        {"country": "Brazil", "cities": ["São Paulo", "Rio de Janeiro", "Brasília"]},
        {"country": "Argentina", "cities": ["Buenos Aires", "Córdoba", "Rosario"]},
        {"country": "Colombia", "cities": ["Bogotá", "Medellín", "Cali"]},
        {"country": "Chile", "cities": ["Santiago", "Valparaíso", "Concepción"]},
        {"country": "Peru", "cities": ["Lima", "Cusco", "Arequipa"]},
    ],
    "Middle East": [
        {"country": "Saudi Arabia", "cities": ["Riyadh", "Jeddah", "Mecca"]},
        {"country": "Iran", "cities": ["Tehran", "Mashhad", "Isfahan"]},
        {"country": "United Arab Emirates", "cities": ["Dubai", "Abu Dhabi", "Sharjah"]},
        {"country": "Turkey", "cities": ["Istanbul", "Ankara", "Izmir"]},
        {"country": "Israel", "cities": ["Tel Aviv", "Jerusalem", "Haifa"]},
    ],
    "Oceania": [
        {"country": "Australia", "cities": ["Sydney", "Melbourne", "Brisbane"]},
        {"country": "New Zealand", "cities": ["Auckland", "Wellington", "Christchurch"]},
        {"country": "Fiji", "cities": ["Suva", "Nadi", "Lautoka"]},
        {"country": "Papua New Guinea", "cities": ["Port Moresby", "Lae", "Madang"]},
        {"country": "Samoa", "cities": ["Apia", "Salelologa", "Vaitele"]},
    ],
}

In [19]:
# --- Embeddings ---------------------------------------------------------------
# Region labels get their own lookup. Countries and cities of a region share one
# flat {name: embedding} dict per region: countries are inserted first, then the
# cities of each country in listing order.
region_embeddings = {name: get_embedding(name) for name in regions}

country_city_embeddings = {}
for region_name, country_records in regions.items():
    country_city_embeddings[region_name] = {
        record['country']: get_embedding(record['country'])
        for record in country_records
    }
for region_name, country_records in regions.items():
    for record in country_records:
        country_city_embeddings[region_name].update(
            (city_name, get_embedding(city_name)) for city_name in record['cities']
        )

# --- Similarity statistics ------------------------------------------------------
# For each region label, score it against every country/city and summarise the
# scores separately for the region's own entities ("within") and for entities
# belonging to any other region ("cross").
results = []

for anchor_region, anchor_vector in region_embeddings.items():
    # Scores are bucketed by "does this entity belong to the anchor region?"
    scores_by_membership = {True: [], False: []}

    for member_region in region_embeddings:
        for entity_vector in country_city_embeddings[member_region].values():
            scores_by_membership[anchor_region == member_region].append(
                compute_cosine_similarity(anchor_vector, entity_vector)
            )

    within_scores = scores_by_membership[True]
    cross_scores = scores_by_membership[False]

    results.append({
        'Region': anchor_region,
        'Within Min': min(within_scores),
        'Within Max': max(within_scores),
        'Within Avg': np.mean(within_scores),
        'Cross Min': min(cross_scores),
        'Cross Max': max(cross_scores),
        'Cross Avg': np.mean(cross_scores),
    })

# One summary row per region; shown as the cell's output
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Region,Within Min,Within Max,Within Avg,Cross Min,Cross Max,Cross Avg
0,East Asia,0.212199,0.488542,0.34223,0.102015,0.329652,0.220913
1,North Europe,0.226223,0.464575,0.326745,0.105077,0.375097,0.202494
2,Central Europe,0.279002,0.500755,0.401016,0.094494,0.329361,0.21198
3,South Asia,0.232874,0.477685,0.337454,0.064327,0.4051,0.191135
4,North America,0.17453,0.502212,0.302561,0.122017,0.34554,0.215424
5,South America,0.24853,0.517853,0.376381,0.089403,0.400432,0.23618
6,Middle East,0.226662,0.422387,0.325096,0.098782,0.354406,0.217162
7,Oceania,0.247796,0.438993,0.325121,0.112279,0.325671,0.218347


In [23]:
# An "outlier" here is an entity whose similarity to a region label falls inside
# the range normally occupied by the opposite group:
#   * within_region_outliers: entities from OTHER regions whose score lies in
#     this region's [Within Min, Within Max] range (they score like members);
#   * cross_region_outliers: this region's OWN entities whose score lies in its
#     [Cross Min, Cross Max] range (they score like non-members).
# Each outlier is stored as (entity's region, entity name, similarity).
outliers = {}

# Region -> summary-statistics row, so each bound is looked up by label instead
# of re-filtering the whole frame with a boolean mask every time.
region_stats = results_df.set_index('Region')

# Region -> set of its country names, built once; used to label an entity as
# "Country" or "City" without rebuilding a list per entity.
country_names = {
    region_name: {record['country'] for record in records}
    for region_name, records in regions.items()
}

for region in results_df['Region']:
    bounds = region_stats.loc[region]
    region_vector = region_embeddings[region]

    # Entities of other regions that land inside this region's "within" range
    member_like = []
    for other_region, entities in country_city_embeddings.items():
        if other_region == region:
            continue
        for entity, embedding in entities.items():
            similarity = compute_cosine_similarity(region_vector, embedding)
            if bounds['Within Min'] <= similarity <= bounds['Within Max']:
                member_like.append((other_region, entity, similarity))

    # Entities of this region that land inside its "cross" range
    outsider_like = []
    for entity, embedding in country_city_embeddings[region].items():
        similarity = compute_cosine_similarity(region_vector, embedding)
        if bounds['Cross Min'] <= similarity <= bounds['Cross Max']:
            outsider_like.append((region, entity, similarity))

    outliers[region] = {
        # Strongest foreign look-alikes first
        'within_region_outliers': sorted(member_like, key=lambda item: item[2], reverse=True),
        # Weakest own members first
        'cross_region_outliers': sorted(outsider_like, key=lambda item: item[2]),
    }

# Display the results using markdown
for region, data in outliers.items():
    bounds = region_stats.loc[region]

    display(Markdown(f"### Region: {region}"))
    display(Markdown("**Within Region Similarities**"))
    display(Markdown(f"- Min: {bounds['Within Min']}\n- Max: {bounds['Within Max']}"))

    display(Markdown("**Within Region Outliers**"))
    for other_region, entity, similarity in data['within_region_outliers']:
        entity_type = "Country" if entity in country_names[other_region] else "City"
        display(Markdown(f"- {entity} ({entity_type}, {other_region}): {similarity}"))

    display(Markdown("**Cross Region Similarities**"))
    display(Markdown(f"- Min: {bounds['Cross Min']}\n- Max: {bounds['Cross Max']}"))

    display(Markdown("**Cross Region Outliers**"))
    # A distinct name is used for the tuple's region so the outer loop variable
    # `region` is not rebound while its iteration is still in progress.
    for home_region, entity, similarity in data['cross_region_outliers']:
        entity_type = "Country" if entity in country_names[home_region] else "City"
        display(Markdown(f"- {entity} ({entity_type}, {home_region}): {similarity}"))

### Region: East Asia

**Within Region Similarities**

- Min: 0.21219891515454117
- Max: 0.4885424068136953

**Within Region Outliers**

- United States (Country, North America): 0.32965166485938724

- Australia (Country, Oceania): 0.31566999067050616

- Sri Lanka (Country, South Asia): 0.30984158340607326

- New Zealand (Country, Oceania): 0.30771585065477336

- Lae (City, Oceania): 0.301528214252636

- Samoa (Country, Oceania): 0.29874417167363915

- Nepal (Country, South Asia): 0.2967048433956967

- Auckland (City, Oceania): 0.2954343726841165

- Los Angeles (City, North America): 0.2942366220059414

- Czech Republic (Country, Central Europe): 0.2935291339461724

- Israel (Country, Middle East): 0.29294339063874336

- Vancouver (City, North America): 0.2908564552746291

- Cuba (Country, North America): 0.2906646662306178

- Saudi Arabia (Country, Middle East): 0.29034850475924334

- Germany (Country, Central Europe): 0.2877355136794945

- Chile (Country, South America): 0.2854669583683148

- Papua New Guinea (Country, Oceania): 0.2845687454423753

- Denmark (Country, North Europe): 0.2827263770168143

- Poland (Country, Central Europe): 0.28211704789376957

- Austria (Country, Central Europe): 0.27889411848588347

- Canada (Country, North America): 0.2788883499959996

- New York (City, North America): 0.27858019019469205

- Chicago (City, North America): 0.2766994163072215

- Prague (City, Central Europe): 0.2751451629611531

- Norway (Country, North Europe): 0.27273997177802045

- Bangladesh (Country, South Asia): 0.27169994739831305

- Iceland (Country, North Europe): 0.2712863412537282

- Madang (City, Oceania): 0.27059314267194834

- Apia (City, Oceania): 0.26744521217852346

- Cali (City, South America): 0.2670393548132291

- Brazil (Country, South America): 0.26271478503327084

- Finland (Country, North Europe): 0.262429199322772

- India (Country, South Asia): 0.26181421056105914

- Aarhus (City, North Europe): 0.2593920282462078

- Sydney (City, Oceania): 0.2577685152312823

- Berlin (City, Central Europe): 0.2573992369409819

- Iran (Country, Middle East): 0.25314743546164886

- Hungary (Country, Central Europe): 0.2530730764262113

- Jerusalem (City, Middle East): 0.25156086227440827

- Peru (Country, South America): 0.2503591498552836

- Toronto (City, North America): 0.24915496779113794

- Copenhagen (City, North Europe): 0.2487389309988905

- Tel Aviv (City, Middle East): 0.2485841338163394

- Melbourne (City, Oceania): 0.2443543458596223

- Jamaica (Country, North America): 0.24331089500965725

- Frankfurt (City, Central Europe): 0.24130515901158725

- Buenos Aires (City, South America): 0.23958714356114424

- Fiji (Country, Oceania): 0.23933266818760457

- Vienna (City, Central Europe): 0.23767480865588225

- Helsinki (City, North Europe): 0.23744454723416675

- United Arab Emirates (Country, Middle East): 0.23694774591743675

- Mecca (City, Middle East): 0.2352413066902392

- Monterrey (City, North America): 0.23466892217997876

- Pakistan (Country, South Asia): 0.23284683952326313

- Mexico City (City, North America): 0.23118923577365758

- Haifa (City, Middle East): 0.2282823682800521

- Colombia (Country, South America): 0.2272757430287573

- Espoo (City, North Europe): 0.22680381495481175

- Akureyri (City, North Europe): 0.22622086936100255

- Sweden (Country, North Europe): 0.2251925794572788

- Warsaw (City, Central Europe): 0.22484009637169622

- Brno (City, Central Europe): 0.2228293550820661

- Dhaka (City, South Asia): 0.22268774620707585

- Ostrava (City, Central Europe): 0.222083058407129

- Kathmandu (City, South Asia): 0.21825036446390006

- Port Moresby (City, Oceania): 0.2167072693079405

- Istanbul (City, Middle East): 0.21651779992411507

- Khulna (City, South Asia): 0.21571903799080117

- Riyadh (City, Middle East): 0.21540410609991603

- Turkey (Country, Middle East): 0.21414148558358745

- Mashhad (City, Middle East): 0.21404024953216408

- Munich (City, Central Europe): 0.21366492166928852

- Abu Dhabi (City, Middle East): 0.21307527700182582

- Christchurch (City, Oceania): 0.21296086950298487

**Cross Region Similarities**

- Min: 0.10201505910568548
- Max: 0.32965166485938724

**Cross Region Outliers**

- Darkhan (City, East Asia): 0.21219891515454117

- Erdenet (City, East Asia): 0.23408265140413145

- Ulaanbaatar (City, East Asia): 0.2374785355845517

- Incheon (City, East Asia): 0.27898500204235915

- Guangzhou (City, East Asia): 0.28345634010652476

- Kaohsiung (City, East Asia): 0.2849317853777814

- Taichung (City, East Asia): 0.29738711320097566

- Busan (City, East Asia): 0.30445433754153284

### Region: North Europe

**Within Region Similarities**

- Min: 0.22622341986602498
- Max: 0.46457548559536915

**Within Region Outliers**

- Germany (Country, Central Europe): 0.375096877997427

- Poland (Country, Central Europe): 0.33877370901439385

- New Zealand (Country, Oceania): 0.33419876183907077

- United States (Country, North America): 0.31003111057897476

- Czech Republic (Country, Central Europe): 0.30998449459025673

- Prague (City, Central Europe): 0.30578457584818747

- Gdańsk (City, Central Europe): 0.3012259876331519

- South Korea (Country, East Asia): 0.2928541889724628

- Vienna (City, Central Europe): 0.2874739800003242

- Austria (Country, Central Europe): 0.28655879953781993

- Frankfurt (City, Central Europe): 0.28305027798458593

- New York (City, North America): 0.28288407364362167

- Hungary (Country, Central Europe): 0.2808161269252666

- Munich (City, Central Europe): 0.2797260597146484

- Berlin (City, Central Europe): 0.2756401972266981

- Canada (Country, North America): 0.2731414781687541

- Australia (Country, Oceania): 0.26125229307919107

- Auckland (City, Oceania): 0.26079213382887356

- Mongolia (Country, East Asia): 0.25928851169848866

- Brazil (Country, South America): 0.25482644392485015

- Budapest (City, Central Europe): 0.24927540070510573

- China (Country, East Asia): 0.24757140257550597

- Japan (Country, East Asia): 0.2451539543732028

- Debrecen (City, Central Europe): 0.2438771803769691

- Brno (City, Central Europe): 0.24361423277747285

- Nepal (Country, South Asia): 0.24338646045412052

- Galle (City, South Asia): 0.24166839515104255

- Chile (Country, South America): 0.24125334854750408

- Israel (Country, Middle East): 0.2403350105209104

- Istanbul (City, Middle East): 0.23922645187108132

- Shanghai (City, East Asia): 0.23903905923376217

- Seoul (City, East Asia): 0.23867656611727894

- United Arab Emirates (Country, Middle East): 0.23651062169318904

- India (Country, South Asia): 0.2355886371397351

- Vancouver (City, North America): 0.23346097387272954

- Erdenet (City, East Asia): 0.23183201935122133

- Warsaw (City, Central Europe): 0.23089186516190152

- Cuba (Country, North America): 0.23000333830130315

- Melbourne (City, Oceania): 0.22868384813148473

- Christchurch (City, Oceania): 0.22755050273712424

**Cross Region Similarities**

- Min: 0.10507734424679988
- Max: 0.375096877997427

**Cross Region Outliers**

- Tampere (City, North Europe): 0.22622341986602498

- Odense (City, North Europe): 0.24609256167251942

- Keflavik (City, North Europe): 0.26258685465302845

- Akureyri (City, North Europe): 0.26890971493458804

- Reykjavik (City, North Europe): 0.2715602134921276

- Espoo (City, North Europe): 0.27457513758318297

- Malmö (City, North Europe): 0.28977540637712507

- Trondheim (City, North Europe): 0.2936972547207335

- Oslo (City, North Europe): 0.29839101601309115

- Aarhus (City, North Europe): 0.30390558986698146

- Bergen (City, North Europe): 0.31258302086743667

- Stockholm (City, North Europe): 0.3256551691122932

- Gothenburg (City, North Europe): 0.3287537089675537

- Sweden (Country, North Europe): 0.3483704257140118

- Helsinki (City, North Europe): 0.35526601064160046

### Region: Central Europe

**Within Region Similarities**

- Min: 0.27900198217284355
- Max: 0.500754725012794

**Within Region Outliers**

- Denmark (Country, North Europe): 0.32936096034120244

- Copenhagen (City, North Europe): 0.31904014826430327

- Chicago (City, North America): 0.3022405838572949

- Helsinki (City, North Europe): 0.30021195056657624

- Sweden (Country, North Europe): 0.2950651353956599

- Mongolia (Country, East Asia): 0.2927415713387179

- Iceland (Country, North Europe): 0.2927277064443219

- Istanbul (City, Middle East): 0.2902466529047544

- Mexico City (City, North America): 0.28955003828050807

- Cuba (Country, North America): 0.2849567148899518

- Norway (Country, North Europe): 0.28040029834579505

- Taichung (City, East Asia): 0.28021278606313893

- United States (Country, North America): 0.27964625434100565

**Cross Region Similarities**

- Min: 0.09449364426821844
- Max: 0.32936096034120244

**Cross Region Outliers**

- Gdańsk (City, Central Europe): 0.27900198217284355

### Region: South Asia

**Within Region Similarities**

- Min: 0.23287445140620333
- Max: 0.47768517242461495

**Within Region Outliers**

- South Korea (Country, East Asia): 0.40509969950885605

- Saudi Arabia (Country, Middle East): 0.3258648927525003

- Seoul (City, East Asia): 0.29140573356908617

- Los Angeles (City, North America): 0.2855910668607431

- United States (Country, North America): 0.2764329479735852

- Australia (Country, Oceania): 0.2763371408791808

- New Zealand (Country, Oceania): 0.2723259067327069

- Fiji (Country, Oceania): 0.2693464120062987

- Lae (City, Oceania): 0.26789003407955486

- Nadi (City, Oceania): 0.2656956110306327

- Papua New Guinea (Country, Oceania): 0.26044425159841944

- Samoa (Country, Oceania): 0.259636104548635

- Peru (Country, South America): 0.2592916142418658

- Mongolia (Country, East Asia): 0.2564112597946637

- Chile (Country, South America): 0.2541182022881615

- Colombia (Country, South America): 0.25103687561356114

- Japan (Country, East Asia): 0.24966767712622978

- Taiwan (Country, East Asia): 0.2496018716144242

- Salzburg (City, Central Europe): 0.24738306685208605

- Busan (City, East Asia): 0.2463084368995349

- Austria (Country, Central Europe): 0.2378875232015235

- Santiago (City, South America): 0.23546957747009545

- Christchurch (City, Oceania): 0.23494714587455492

- Argentina (Country, South America): 0.23393149072222383

- Buenos Aires (City, South America): 0.23343771161577268

**Cross Region Similarities**

- Min: 0.06432689113707567
- Max: 0.40509969950885605

**Cross Region Outliers**

- Biratnagar (City, South Asia): 0.23287445140620333

- Galle (City, South Asia): 0.2620109786466004

- Bangalore (City, South Asia): 0.26605016235075735

- Colombo (City, South Asia): 0.27064105260419163

- Kandy (City, South Asia): 0.2788699320588339

- Pokhara (City, South Asia): 0.2965840860864276

- Khulna (City, South Asia): 0.2974742380453429

- Mumbai (City, South Asia): 0.3016255783903067

- Delhi (City, South Asia): 0.32164200138927534

- Chittagong (City, South Asia): 0.3272813985588143

- Islamabad (City, South Asia): 0.32940289995054717

- Lahore (City, South Asia): 0.33099782952485357

- Kathmandu (City, South Asia): 0.33110880816076105

- Karachi (City, South Asia): 0.33385030862827836

- Dhaka (City, South Asia): 0.3576857686800011

- India (Country, South Asia): 0.40487624126514676

### Region: North America

**Within Region Similarities**

- Min: 0.17453002007513707
- Max: 0.5022116157163325

**Within Region Outliers**

- Australia (Country, Oceania): 0.3455396286200373

- New Zealand (Country, Oceania): 0.33448063522015536

- Norway (Country, North Europe): 0.33415709427923185

- South Korea (Country, East Asia): 0.313869497679798

- Iceland (Country, North Europe): 0.3035839336187274

- Cali (City, South America): 0.3032130746582097

- Japan (Country, East Asia): 0.2945679367198765

- Colombia (Country, South America): 0.2921414797930715

- Taiwan (Country, East Asia): 0.2882357859596123

- Saudi Arabia (Country, Middle East): 0.2845678392898433

- Auckland (City, Oceania): 0.28401387838259357

- Chile (Country, South America): 0.2814118818229817

- Germany (Country, Central Europe): 0.2778253605426311

- India (Country, South Asia): 0.2737694911115212

- Mongolia (Country, East Asia): 0.2734237938647738

- Samoa (Country, Oceania): 0.2730456323943439

- Nepal (Country, South Asia): 0.2725195327691506

- Brazil (Country, South America): 0.2682428945859319

- Sydney (City, Oceania): 0.26581275059766635

- Tokyo (City, East Asia): 0.26552814802709085

- China (Country, East Asia): 0.25803340034205

- Austria (Country, Central Europe): 0.25716336093834186

- Peru (Country, South America): 0.2566069039536192

- United Arab Emirates (Country, Middle East): 0.253945961067538

- Israel (Country, Middle East): 0.2537062425809561

- Czech Republic (Country, Central Europe): 0.2512973812872741

- Finland (Country, North Europe): 0.25122089551847016

- Poland (Country, Central Europe): 0.2504012041990284

- Denmark (Country, North Europe): 0.2486167345710132

- Shanghai (City, East Asia): 0.2485921865489156

- Akureyri (City, North Europe): 0.24742243540542827

- Frankfurt (City, Central Europe): 0.24343955853592472

- Santiago (City, South America): 0.24323643331357578

- Jerusalem (City, Middle East): 0.24307273093611587

- Tel Aviv (City, Middle East): 0.24294265523147895

- Buenos Aires (City, South America): 0.242471907925343

- Seoul (City, East Asia): 0.24230009557128102

- Argentina (Country, South America): 0.24229251300988958

- Oslo (City, North Europe): 0.24224494273649447

- Mecca (City, Middle East): 0.24046674223281525

- Abu Dhabi (City, Middle East): 0.2400905620076696

- Osaka (City, East Asia): 0.238372329747368

- Papua New Guinea (Country, Oceania): 0.23708327025885098

- Sri Lanka (Country, South Asia): 0.23580174389701736

- Apia (City, Oceania): 0.2345779203017942

- Vienna (City, Central Europe): 0.234215062314583

- Copenhagen (City, North Europe): 0.23161503158750926

- Melbourne (City, Oceania): 0.23091334292716814

- Trondheim (City, North Europe): 0.22949120478600898

- Prague (City, Central Europe): 0.22896187646535893

- Aarhus (City, North Europe): 0.2288803757012163

- Fiji (Country, Oceania): 0.22886521092496548

- Kyoto (City, East Asia): 0.22693924126662823

- Taipei (City, East Asia): 0.22673059279781543

- Jeddah (City, Middle East): 0.22449680103608208

- Medellín (City, South America): 0.2233774102656187

- Lae (City, Oceania): 0.2229187230117553

- Lima (City, South America): 0.2207274398414405

- Istanbul (City, Middle East): 0.2198404058296614

- Riyadh (City, Middle East): 0.2186247535103628

- Lahore (City, South Asia): 0.21850611425982658

- Guangzhou (City, East Asia): 0.21622962951828262

- Dubai (City, Middle East): 0.21523106426828686

- Reykjavik (City, North Europe): 0.21486250395054146

- Brno (City, Central Europe): 0.21299390408659113

- Kathmandu (City, South Asia): 0.21278977667107082

- Sweden (Country, North Europe): 0.2125017710881113

- Incheon (City, East Asia): 0.2120940672996973

- Bergen (City, North Europe): 0.21197704554777613

- Port Moresby (City, Oceania): 0.21077044150911606

- Munich (City, Central Europe): 0.21011871087918754

- Gothenburg (City, North Europe): 0.20938817164532514

- Helsinki (City, North Europe): 0.2093104413266892

- Brisbane (City, Oceania): 0.20902719753967913

- Cusco (City, South America): 0.20797196760977538

- Malmö (City, North Europe): 0.2069087738203334

- Busan (City, East Asia): 0.2065819342133225

- Nadi (City, Oceania): 0.20630491243267068

- Hungary (Country, Central Europe): 0.20585841404965322

- Turkey (Country, Middle East): 0.20518791001358425

- Haifa (City, Middle East): 0.20461510702687852

- Ostrava (City, Central Europe): 0.20269322801859105

- Keflavik (City, North Europe): 0.2026123265863996

- Beijing (City, East Asia): 0.20257955934746658

- Stockholm (City, North Europe): 0.20164822592583753

- Pokhara (City, South Asia): 0.20159407158212994

- Erdenet (City, East Asia): 0.2004106029436773

- Biratnagar (City, South Asia): 0.1997036916969212

- Berlin (City, Central Europe): 0.19885957480744287

- São Paulo (City, South America): 0.1986085555038683

- Rio de Janeiro (City, South America): 0.19834039775029083

- Budapest (City, Central Europe): 0.1978229746493441

- Arequipa (City, South America): 0.19766079197297676

- Islamabad (City, South Asia): 0.19641499248679695

- Delhi (City, South Asia): 0.1959499540407072

- Brasília (City, South America): 0.1919880385376893

- Bangalore (City, South Asia): 0.19006965935230608

- Christchurch (City, Oceania): 0.18951927769215646

- Bangladesh (Country, South Asia): 0.18836255563591653

- Karachi (City, South Asia): 0.18832629963924802

- Gdańsk (City, Central Europe): 0.18772276733396026

- Pakistan (Country, South Asia): 0.18706969875320084

- Sharjah (City, Middle East): 0.1866095236652301

- Bogotá (City, South America): 0.1848005436652051

- Kandy (City, South Asia): 0.1837398854356289

- Dhaka (City, South Asia): 0.18120421168164336

- Kraków (City, Central Europe): 0.18039433625128012

- Ulaanbaatar (City, East Asia): 0.17996983942545638

- Iran (Country, Middle East): 0.178804385052407

- Debrecen (City, Central Europe): 0.1786731802792318

- Madang (City, Oceania): 0.1777207049323629

- Warsaw (City, Central Europe): 0.17741103425277038

- Mumbai (City, South Asia): 0.17564618137255708

- Khulna (City, South Asia): 0.17556635946291557

- Suva (City, Oceania): 0.17525525169291903

**Cross Region Similarities**

- Min: 0.12201719056140248
- Max: 0.3455396286200373

**Cross Region Outliers**

- Camagüey (City, North America): 0.17453002007513707

- Kingston (City, North America): 0.20177174207455417

- Ocho Rios (City, North America): 0.20695151072793694

- Santiago de Cuba (City, North America): 0.21558410728011862

- Guadalajara (City, North America): 0.22713908087078621

- Montego Bay (City, North America): 0.23829229776841265

- Havana (City, North America): 0.25548106703820433

- Monterrey (City, North America): 0.2888126791022056

- Toronto (City, North America): 0.2988750855786464

- Mexico (Country, North America): 0.30155193898845445

- Chicago (City, North America): 0.30185028395996355

- Mexico City (City, North America): 0.3172880301338772

- Vancouver (City, North America): 0.3200365146120743

- Jamaica (Country, North America): 0.32374683449633196

- Cuba (Country, North America): 0.3271730860268709

### Region: South America

**Within Region Similarities**

- Min: 0.24852978775464307
- Max: 0.5178527675925418

**Within Region Outliers**

- South Korea (Country, East Asia): 0.4004323951207516

- Cuba (Country, North America): 0.39317767057823844

- Australia (Country, Oceania): 0.3683079958342209

- United States (Country, North America): 0.35655519936959257

- Saudi Arabia (Country, Middle East): 0.34345419609621075

- New Zealand (Country, Oceania): 0.3418158091204546

- Sri Lanka (Country, South Asia): 0.33801008425130197

- Mexico City (City, North America): 0.3233595372918787

- Santiago de Cuba (City, North America): 0.3221771252613169

- Seoul (City, East Asia): 0.3185714647088478

- Samoa (Country, Oceania): 0.31839638596304953

- Nepal (Country, South Asia): 0.3155347863764812

- Havana (City, North America): 0.3129644317601966

- Jamaica (Country, North America): 0.31097232091901417

- Melbourne (City, Oceania): 0.30326811068368853

- China (Country, East Asia): 0.30284227845819206

- India (Country, South Asia): 0.3020239850008031

- Los Angeles (City, North America): 0.30190250076260744

- Mongolia (Country, East Asia): 0.29969161473921435

- Austria (Country, Central Europe): 0.2990238157048407

- Mexico (Country, North America): 0.2956835171353871

- Norway (Country, North Europe): 0.29492920140075796

- Iceland (Country, North Europe): 0.29367908402123744

- Bangladesh (Country, South Asia): 0.2923526184506429

- Brisbane (City, Oceania): 0.2894925795510875

- Colombo (City, South Asia): 0.28795890513049427

- Canada (Country, North America): 0.2848497441463651

- New York (City, North America): 0.28278119639238875

- Czech Republic (Country, Central Europe): 0.2806850134146108

- Sweden (Country, North Europe): 0.2795809524219143

- Papua New Guinea (Country, Oceania): 0.27908271798141104

- Sydney (City, Oceania): 0.2776647821416008

- Chicago (City, North America): 0.27726860850435553

- Japan (Country, East Asia): 0.27429851937644334

- Germany (Country, Central Europe): 0.2734031635317458

- Salzburg (City, Central Europe): 0.2718311432222028

- Montreal (City, North America): 0.2678638888708842

- Taiwan (Country, East Asia): 0.26548435799773495

- Kathmandu (City, South Asia): 0.2591528210744318

- Poland (Country, Central Europe): 0.25759072401774763

- Shanghai (City, East Asia): 0.25757650340889005

- Fiji (Country, Oceania): 0.257544312730604

- Hungary (Country, Central Europe): 0.25605425985470764

- Wellington (City, Oceania): 0.2547843725984398

- Prague (City, Central Europe): 0.2546589663389936

- Vancouver (City, North America): 0.25423434457858096

- Iran (Country, Middle East): 0.25298291764428704

- Frankfurt (City, Central Europe): 0.2515090461127133

- Finland (Country, North Europe): 0.25042560073436676

- Monterrey (City, North America): 0.2502257359603051

- Christchurch (City, Oceania): 0.2500518420135894

- Israel (Country, Middle East): 0.2498722338243463

**Cross Region Similarities**

- Min: 0.0894030383374275
- Max: 0.4004323951207516

**Cross Region Outliers**

- Córdoba (City, South America): 0.24852978775464307

- Concepción (City, South America): 0.254031932759949

- Valparaíso (City, South America): 0.289643380117209

- Brasília (City, South America): 0.2966079438926155

- Rosario (City, South America): 0.3045614973330507

- São Paulo (City, South America): 0.3233041437855124

- Bogotá (City, South America): 0.3303932796560025

- Cali (City, South America): 0.3314003626547515

- Arequipa (City, South America): 0.3585308281909465

- Medellín (City, South America): 0.3750793059653481

- Rio de Janeiro (City, South America): 0.3767702878022109

- Cusco (City, South America): 0.38370443515067265

- Lima (City, South America): 0.3939946965879552

### Region: Middle East

**Within Region Similarities**

- Min: 0.22666153424026225
- Max: 0.4223871759523936

**Within Region Outliers**

- United States (Country, North America): 0.35440594255057134

- Mongolia (Country, East Asia): 0.3333332199486573

- South Korea (Country, East Asia): 0.3271508546306888

- Munich (City, Central Europe): 0.3228269514326717

- Taiwan (Country, East Asia): 0.31370947214040557

- Japan (Country, East Asia): 0.3107518002444227

- Melbourne (City, Oceania): 0.3081768380113453

- Beijing (City, East Asia): 0.30344253759207845

- China (Country, East Asia): 0.30225366124570163

- Cuba (Country, North America): 0.3001465568875573

- Sri Lanka (Country, South Asia): 0.2972494886603472

- Seoul (City, East Asia): 0.2950806215330003

- New York (City, North America): 0.2948807996063538

- New Zealand (Country, Oceania): 0.2896024665405197

- Germany (Country, Central Europe): 0.28706716889936423

- Nepal (Country, South Asia): 0.28480324947445745

- Peru (Country, South America): 0.28210188277137493

- Vienna (City, Central Europe): 0.28105091205843885

- Czech Republic (Country, Central Europe): 0.2794246194661817

- Tokyo (City, East Asia): 0.2769607389699804

- Bangladesh (Country, South Asia): 0.2748321421023483

- Chile (Country, South America): 0.27307073832463047

- Erdenet (City, East Asia): 0.27217326899008976

- Prague (City, Central Europe): 0.27133319422134694

- Los Angeles (City, North America): 0.2709400864501079

- Shanghai (City, East Asia): 0.2691454672713893

- Norway (Country, North Europe): 0.26695172633759456

- Poland (Country, Central Europe): 0.2627612677619951

- Montreal (City, North America): 0.26249514677697294

- Denmark (Country, North Europe): 0.26138617865752567

- Frankfurt (City, Central Europe): 0.260669489131875

- Pakistan (Country, South Asia): 0.25817558372173594

- Mexico City (City, North America): 0.25656625531658483

- Australia (Country, Oceania): 0.25582164680462394

- Taipei (City, East Asia): 0.25543856458674236

- Hungary (Country, Central Europe): 0.2546967942588369

- Medellín (City, South America): 0.25467295011314833

- Delhi (City, South Asia): 0.2541496860508713

- Austria (Country, Central Europe): 0.2525567277647457

- India (Country, South Asia): 0.2512443235574734

- Monterrey (City, North America): 0.2502329498217001

- Mexico (Country, North America): 0.2495403023963053

- Jamaica (Country, North America): 0.2493211966585552

- Oslo (City, North Europe): 0.24898064967932915

- Berlin (City, Central Europe): 0.24827553121808965

- Islamabad (City, South Asia): 0.24643318458172106

- Budapest (City, Central Europe): 0.2448702582459119

- Mumbai (City, South Asia): 0.24427092393568173

- Auckland (City, Oceania): 0.24338969107151626

- Helsinki (City, North Europe): 0.2433771968351901

- Brazil (Country, South America): 0.23867314051468058

- Canada (Country, North America): 0.23713726874578872

- Colombia (Country, South America): 0.23685964487554995

- Chicago (City, North America): 0.23627953540722668

- Lae (City, Oceania): 0.2353194830292178

- Lahore (City, South Asia): 0.2348559387322336

- Brno (City, Central Europe): 0.23480690883651983

- Warsaw (City, Central Europe): 0.23467997174444288

- Iceland (Country, North Europe): 0.23390296634217436

- Dhaka (City, South Asia): 0.23370368246272316

- Cali (City, South America): 0.2321897694024294

- Kyoto (City, East Asia): 0.2311518886257115

- Guangzhou (City, East Asia): 0.22997864150982722

- Copenhagen (City, North Europe): 0.22834703628368788

- Madang (City, Oceania): 0.22828318466771913

- Malmö (City, North Europe): 0.22780678999815984

- Espoo (City, North Europe): 0.2267874191710323

**Cross Region Similarities**

- Min: 0.09878151057733608
- Max: 0.35440594255057134

**Cross Region Outliers**

- Sharjah (City, Middle East): 0.22666153424026225

- Ankara (City, Middle East): 0.23172736436852592

- Izmir (City, Middle East): 0.25618355813801325

- Isfahan (City, Middle East): 0.2660152284758042

- Turkey (Country, Middle East): 0.2837418086342237

- Mashhad (City, Middle East): 0.29383685640552576

- Jeddah (City, Middle East): 0.2959592020769426

- Haifa (City, Middle East): 0.29724586459587954

- Istanbul (City, Middle East): 0.3071858215899733

- Riyadh (City, Middle East): 0.3139134156024126

- Iran (Country, Middle East): 0.32477806040149404

- Abu Dhabi (City, Middle East): 0.3271193462370698

- Tel Aviv (City, Middle East): 0.3351275507030942

### Region: Oceania

**Within Region Similarities**

- Min: 0.24779579505915533
- Max: 0.43899283645213627

**Within Region Outliers**

- Ocho Rios (City, North America): 0.32567137784756317

- Jamaica (Country, North America): 0.32143899151504884

- Taiwan (Country, East Asia): 0.32051180201887297

- Osaka (City, East Asia): 0.31822219194344575

- Iceland (Country, North Europe): 0.3150542969656118

- United States (Country, North America): 0.307470004853196

- Japan (Country, East Asia): 0.29943930440041633

- Tokyo (City, East Asia): 0.29640334997150475

- Mongolia (Country, East Asia): 0.29282828769901004

- Sri Lanka (Country, South Asia): 0.28966496807530706

- Chile (Country, South America): 0.28940406270264296

- South Korea (Country, East Asia): 0.2882817268842771

- Peru (Country, South America): 0.2874192302862629

- Rio de Janeiro (City, South America): 0.28678258758621755

- Finland (Country, North Europe): 0.2863266517383049

- Cuba (Country, North America): 0.2858557631700279

- Oslo (City, North Europe): 0.28454053616051395

- Ostrava (City, Central Europe): 0.2745550770571821

- Austria (Country, Central Europe): 0.2737140550822904

- Poland (Country, Central Europe): 0.27173335786723873

- Canada (Country, North America): 0.2714496425978251

- Nepal (Country, South Asia): 0.2706369594019516

- Israel (Country, Middle East): 0.26694210987414074

- Vienna (City, Central Europe): 0.2660954502319379

- Lima (City, South America): 0.26565872517752154

- Abu Dhabi (City, Middle East): 0.2644118231619972

- Brazil (Country, South America): 0.2620241765472188

- Cali (City, South America): 0.26081598769596614

- Norway (Country, North Europe): 0.25904601875862593

- New York (City, North America): 0.25902758541900034

- Helsinki (City, North Europe): 0.2585693115957302

- Colombia (Country, South America): 0.2568148051216303

- Copenhagen (City, North Europe): 0.2566665807162828

- Havana (City, North America): 0.25487510312097583

- India (Country, South Asia): 0.25461500781568647

- Vancouver (City, North America): 0.25450470369017864

- Bangladesh (Country, South Asia): 0.2542973314103667

- Seoul (City, East Asia): 0.2537944446865615

- Los Angeles (City, North America): 0.25196477268785994

- Taipei (City, East Asia): 0.2495249164411039

- Pokhara (City, South Asia): 0.2478201818636645

**Cross Region Similarities**

- Min: 0.11227929258788191
- Max: 0.32567137784756317

**Cross Region Outliers**

- Salelologa (City, Oceania): 0.24779579505915533

- Vaitele (City, Oceania): 0.24956954432214223

- Christchurch (City, Oceania): 0.25712765934518245

- Lautoka (City, Oceania): 0.26230380177100976

- Madang (City, Oceania): 0.2650529330369275

- Brisbane (City, Oceania): 0.2757155701984191

- Suva (City, Oceania): 0.28891360799341725

- Lae (City, Oceania): 0.28973499373349554

- Wellington (City, Oceania): 0.2942762007185111

- Port Moresby (City, Oceania): 0.2995619655424417

- Melbourne (City, Oceania): 0.30549113251755045

- Nadi (City, Oceania): 0.3076291870869684

In [26]:
def compare_text_similarities(text1, text2, text3):
    """Print how similar ``text1`` is to ``text2`` and to ``text3``.

    Each text is embedded with the notebook's ``get_embedding`` helper, then
    ``text1`` is scored against the other two with
    ``compute_cosine_similarity``.

    Args:
        text1: Anchor text that both comparisons are made against.
        text2: First text to compare with the anchor.
        text3: Second text to compare with the anchor.

    Returns:
        None. The two scores are only printed, on a single line.
    """
    # Embeddings are requested in argument order: text1, text2, text3.
    anchor_vec, second_vec, third_vec = [
        get_embedding(text) for text in (text1, text2, text3)
    ]

    score_vs_text2 = compute_cosine_similarity(anchor_vec, second_vec)
    score_vs_text3 = compute_cosine_similarity(anchor_vec, third_vec)

    print(f"'{text1}'<->'{text2}': {score_vs_text2} VS '{text1}' and '{text3}': {score_vs_text3}")

# Baseline: compare the bare location names, with "East Asia" as the anchor text.
compare_text_similarities("East Asia", "United States", "Guangzhou")

'East Asia'<->'United States': 0.3296542393589786 VS 'East Asia' and 'Guangzhou': 0.28358280332550345


In [27]:
# Wrap each location in a full sentence about a person's location before comparing.
compare_text_similarities("The person's location is in East Asia", "The person's location is in United States", "The person's location is in Guangzhou")

'The person's location is in East Asia'<->'The person's location is in United States': 0.6469525178947896 VS 'The person's location is in East Asia' and 'The person's location is in Guangzhou': 0.6573887071614923


In [28]:
# Prefix each location with a short "Location: " label before comparing.
compare_text_similarities("Location: East Asia", "Location: United States", "Location: Guangzhou")

'Location: East Asia'<->'Location: United States': 0.5006624339907891 VS 'Location: East Asia' and 'Location: Guangzhou': 0.4854374665789587


In [29]:
# Wrap each location in a sentence about a person's work place before comparing.
compare_text_similarities("Work place location of the person is in East Asia", "Work place location of the person is in United States", "Work place location of the person is in Guangzhou")

'Work place location of the person is in East Asia'<->'Work place location of the person is in United States': 0.6982256164892127 VS 'Work place location of the person is in East Asia' and 'Work place location of the person is in Guangzhou': 0.7197743840743416


In [31]:
def call_llm(system_prompt, message):
    """Run one chat completion and return the reply text.

    The request goes through the notebook-level ``openai_client`` to the
    ``gpt-4o-mini`` model with exactly two messages: the system prompt and the
    user message, each sent as a single text content part.

    Args:
        system_prompt: Text sent with the ``system`` role.
        message: Text sent with the ``user`` role.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    def _text_message(role, text):
        # Both messages share the same single-text-part structure.
        return {"role": role, "content": [{"type": "text", "text": text}]}

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            _text_message("system", system_prompt),
            _text_message("user", message),
        ],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [43]:
# System prompt that compare_text_similarities_llm passes to call_llm; it tells the
# model how to expand a bare location name before the expansion is embedded.
# NOTE: the leading spaces on the continuation lines sit inside the triple-quoted
# literal, so they are part of the prompt text that is sent.
system_prompt="""Please expand the given location into a hierarchical, concise and searchable description.
                 The location should include its geographic containment. Include the city, country, and region for cities, or the country and region for countries.
                 Focus on maintaining semantic clarity for embedding purposes.
                 Highlight specific relationships and omit unrelated details.
                 Avoid overly detailed descriptions."""

In [44]:
def compare_text_similarities_llm(text1, text2, text3):
    """Compare ``text1`` with ``text2`` and ``text3`` using LLM-expanded text.

    Each input is first expanded by ``call_llm`` with the notebook-level
    ``system_prompt`` and the expansion is printed. The expansions (not the
    original strings) are then embedded with ``get_embedding`` and scored
    with ``compute_cosine_similarity``.

    Args:
        text1: Anchor text that both comparisons are made against.
        text2: First text to compare with the anchor.
        text3: Second text to compare with the anchor.

    Returns:
        None. The expansions and the two scores are only printed; the final
        line labels the scores with the original, unexpanded texts.
    """
    expansions = []
    for original in (text1, text2, text3):
        expanded = call_llm(system_prompt, original)
        print(f"### Original Text: {original}\n--- Begin Expanded Text ---\n{expanded}\n--- End Expanded Text ---\n")
        expansions.append(expanded)

    # All three expansions are produced before any embedding is requested.
    anchor_vec, *other_vecs = [get_embedding(expanded) for expanded in expansions]
    score_vs_text2, score_vs_text3 = [
        compute_cosine_similarity(anchor_vec, vec) for vec in other_vecs
    ]

    print(f"'{text1}'<->'{text2}': {score_vs_text2} VS '{text1}' and '{text3}': {score_vs_text3}")

# Compare the three locations using their LLM-expanded descriptions instead of the raw names.
compare_text_similarities_llm("East Asia", "United States", "Guangzhou")

### Original Text: East Asia
--- Begin Expanded Text ---
**East Asia**  
- **Region**: Asia  
  - **Countries**:  
    - **China**  
      - **Major Cities**: Beijing, Shanghai, Guangzhou  
    - **Japan**  
      - **Major Cities**: Tokyo, Osaka, Yokohama  
    - **South Korea**  
      - **Major Cities**: Seoul, Busan, Incheon  
    - **Mongolia**  
      - **Capital City**: Ulaanbaatar  
    - **Taiwan**  
      - **Capital City**: Taipei  
  - **Geographic Containment**: Borders the Pacific Ocean to the east, the Arctic Ocean to the north, and Southeast Asia to the south.
--- End Expanded Text ---

### Original Text: United States
--- Begin Expanded Text ---
- Country: United States
  - Region: North America
    - Continent: North America
--- End Expanded Text ---

### Original Text: Guangzhou
--- Begin Expanded Text ---
- **Location**: Guangzhou  
  - **City**: Guangzhou  
  - **Region**: Guangdong Province  
  - **Country**: China  
--- End Expanded Text ---

'East Asia'<->'Unite