# Compute Embeddings for the WikiVoyage Data

In [20]:
import pandas as pd 
import numpy as np 
import re
import json
import os 
from sentence_transformers import SentenceTransformer
import pickle

### Computing embeddings for Wikivoyage Documents

In [21]:
wikivoyage_dir = "../../european-city-data/data-sources/wikivoyage/"
embeddings_dir = wikivoyage_dir + "/embeddings/"
data_dir = wikivoyage_dir + "cleaned/"
len(os.listdir(wikivoyage_dir))

6

In [22]:
cities = pd.read_csv("../city_abstracts_embeddings.csv")
cities.head()

Unnamed: 0,city,country,lat,lng,population,abstract,combined,embedding
0,Aalborg,Denmark,57.05,9.9167,143598.0,"Aalborg is the largest city in North Jutland, ...","city: Aalborg, country: Denmark, population: 1...","[-0.0032697843853384256, 0.007246419321745634,..."
1,Adana,Turkey,37.0,35.3213,1765981.0,Adana is a city on the Cilician Plains of cent...,"city: Adana, country: Turkey, population: 1765...","[-0.020492911338806152, 0.0039081997238099575,..."
2,Amsterdam,Netherlands,52.3728,4.8936,1459402.0,Amsterdam is the capital of the Netherlands. I...,"city: Amsterdam, country: Netherlands, populat...","[0.01612328179180622, -0.0028123168740421534, ..."
3,Ancona,Italy,43.6169,13.5167,100924.0,Ancona is the capital of the Italian region ca...,"city: Ancona, country: Italy, population: 1009...","[0.0012721708044409752, 0.0070460038259625435,..."
4,Ankara,Turkey,39.93,32.85,5503985.0,"Ankara is the capital of Turkey, central withi...","city: Ankara, country: Turkey, population: 550...","[0.006992552895098925, -0.0026167023461312056,..."


In [23]:
def compute_wikivoyage_doc_embeddings():
    wikivoyage_embeddings = []
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    vector_dimension = model.get_sentence_embedding_dimension()

    for index, row in cities.iterrows():
        with open(data_dir + row['city'] + ".txt") as file:
            text = file.readlines()
            print("Computing embeddings for {0}".format(row['city']))
            embeddings = model.encode(text, show_progress_bar=True)

            file_name = embeddings_dir + row['city'] + ".pkl"

            with open(file_name, "wb") as fOut:
                pickle.dump({'text': text, 'embeddings': embeddings},fOut)

# compute_wikivoyage_doc_embeddings()

#         wikivoyage_embeddings.append({
#             'city': row['city'],
#             'country': row['country'],
#             'embedding': embeddings.tolist()
#         })
#         print("Added embeddings for {0}".format(row['city']))

### Computing Embeddings for Wikivoyage Listings

In [29]:
listings_dir = "../../european-city-data/data-sources/wikivoyage/listings/"

listings = pd.read_csv(listings_dir + "wikivoyage-listings-cleaned.csv")
listings.head()

Unnamed: 0,city,type,title,description
0,Aalborg,see,Aalborg Akvavit,Distillery.
1,Aalborg,see,Aalborghus Castle,A castle and garden from the 16th century.
2,Aalborg,see,Aalborg Tower,No description found.
3,Aalborg,see,Aalborg Townhall,Built in 1759.
4,Aalborg,see,Aalborg Zoo,"Cosy zoo with many ""classic"" zoo animals such ..."


In [36]:
def find_country(city):
    return cities[cities['city'] == city]['country'].values[0]

listings['country'] = listings['city'].apply(find_country)

In [40]:
combined = []
for i, row in listings.iterrows():
    combined.append(f"city: {row['city']}, country: {row['country']}, type: {row['type']}, title: {row['title']}, description: {row['description']}")

listings['combined'] = combined
listings.head()

Unnamed: 0,city,type,title,description,country,combined
0,Aalborg,see,Aalborg Akvavit,Distillery.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
1,Aalborg,see,Aalborghus Castle,A castle and garden from the 16th century.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
2,Aalborg,see,Aalborg Tower,No description found.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
3,Aalborg,see,Aalborg Townhall,Built in 1759.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
4,Aalborg,see,Aalborg Zoo,"Cosy zoo with many ""classic"" zoo animals such ...",Denmark,"city: Aalborg, country: Denmark, type: see, ti..."


In [49]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
vector_dimension = model.get_sentence_embedding_dimension()

In [51]:
listings

Unnamed: 0,city,type,title,description,country,combined
0,Aalborg,see,Aalborg Akvavit,Distillery.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
1,Aalborg,see,Aalborghus Castle,A castle and garden from the 16th century.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
2,Aalborg,see,Aalborg Tower,No description found.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
3,Aalborg,see,Aalborg Townhall,Built in 1759.,Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
4,Aalborg,see,Aalborg Zoo,"Cosy zoo with many ""classic"" zoo animals such ...",Denmark,"city: Aalborg, country: Denmark, type: see, ti..."
...,...,...,...,...,...,...
7163,Rivne,sleep,Marlen Hotel,Wi-Fi. 10 room / 21 bed.,Ukraine,"city: Rivne, country: Ukraine, type: sleep, ti..."
7164,Rivne,sleep,Feniks Hotel,No description found.,Ukraine,"city: Rivne, country: Ukraine, type: sleep, ti..."
7165,Rivne,sleep,Ukraine,70 r / 125 beds.,Ukraine,"city: Rivne, country: Ukraine, type: sleep, ti..."
7166,Rivne,sleep,Myr Hotel,18 rooms. / 28 beds.,Ukraine,"city: Rivne, country: Ukraine, type: sleep, ti..."


In [54]:
wv_embeddings = "../../european-city-data/data-sources/wikivoyage/listings/"

def compute_wv_listings_embeddings(df):
    # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    # vector_dimension = model.get_sentence_embedding_dimension()
    
    print("Computing embeddings")
    embeddings = []
    for i, row in df.iterrows():
        emb = model.encode(row['combined'], show_progress_bar=True).tolist()
        embeddings.append(emb)

    print("Finished computing embeddings for wikivoyage listings. Saving file...")
    df['embeddings'] = embeddings
    df.to_csv(wv_embeddings + "wikivoyage-listings-embeddings.csv")
    print("Finished saving file.")

compute_wv_listings_embeddings(listings)

Computing embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 54.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 63.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 62.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 53.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 54.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 62.50it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 54.05it/s]
Batches: 1

Finished computing embeddings for wikivoyage listings. Saving file...
Finished saving file.


In [8]:
# wikivoyage = pd.DataFrame(wikivoyage_embeddings)

In [41]:
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# vector_dimension = model.get_sentence_embedding_dimension()

In [8]:
# from qdrant_client import QdrantClient

# client = QdrantClient(
#     url="https://cb8144d3-3bc4-4b48-9b8f-0d5162835b62.us-east4-0.gcp.cloud.qdrant.io:6333", 
#     api_key="Kxe4k8QDskHicKFj7DVMkLSHW1A7jApqX1nCN09jm1WPH9ycBVFO8Q",
# )

In [42]:
# from qdrant_client.models import VectorParams, Distance

# client.delete_collection(collection_name="wikivoyage")

# client.recreate_collection(
#    collection_name="wikivoyage",
#    vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE, on_disk=True),
# )

  client.recreate_collection(


True

In [43]:
# from qdrant_client.models import PointStruct # Import the PointStruct to store the vector and payload
# from tqdm import tqdm # Library to show the progress bar 

# # Populate collection with vectors using tqdm to show progress
# def insert_wikivoyage_listings():
#     for k, v in tqdm(listings.iterrows(), desc="Upserting articles", total=len(listings)):
#         try:
#             client.upsert(
#                 collection_name='wikivoyage',
#                 points=[
#                     PointStruct(
#                         id=k,
#                         vector=model.encode(v['combined'], show_progress_bar=True).tolist(),
#                         payload={
#                             'city': v['city'], 
#                             'country': v['country'],
#                             'type': v['type'],
#                             'title': v['title']
#                         }
#                     )
#                 ]
#             )
#         except Exception as e:
#             print(f"Failed to upsert row {k}: {v}")
#             print(f"Exception: {e}")

# # insert_wikivoyage_listings()

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.55it/s]3:04,  2.12s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]3:27,  1.45s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.09it/s]6:55,  1.57s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]9:26,  1.08s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.60it/s]6:26,  1.23s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s]1:05,  1.18s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.27it/s]1:16,  1.02s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s]7:52,  1.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]7:07,  1.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]27:43,  1.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.95it/s]24:53,  1.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]03:46,  1.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.38it/s]10:00,  1.70it/s]
Batches: 100

Failed to upsert row 1657: city                                                    Debrecen
type                                                         see
title                                         Millenium fountain
description                                No description found.
country                                                  Hungary
combined       city: Debrecen, country: Hungary, type: see, t...
Name: 1657, dtype: object
Exception: Server disconnected without sending a response.


Batches: 100%|██████████| 1/1 [00:00<00:00, 59.89it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.67it/s]27:06,  3.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.39it/s]23:23,  3.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.76it/s]20:39,  4.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.57it/s]18:30,  4.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.28it/s]16:53,  5.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.77it/s]15:49,  5.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.44it/s]17:38,  5.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.49it/s]16:06,  5.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.22it/s]15:04,  6.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.02it/s]30:01,  3.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.60it/s]30:49,  2.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.07it/s]26:01,  3.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.21it/s]21:46,  4.21it/s]
Batches: 

Failed to upsert row 6538: city                                                    Valencia
type                                                       drink
title                                    Horchatería Els Sariers
description    A very large and popular horchatería with arti...
country                                                    Spain
combined       city: Valencia, country: Spain, type: drink, t...
Name: 6538, dtype: object
Exception: The read operation timed out


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.63it/s]15:53,  1.52s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.70it/s]11:41,  1.12s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.12it/s]08:35,  1.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.44it/s]06:26,  1.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.57it/s]04:56,  2.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.34it/s]04:10,  2.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.67it/s]03:22,  3.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.72it/s]02:46,  3.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.94it/s]02:21,  4.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.03it/s]02:05,  4.95it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.93it/s]01:54,  5.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.05it/s]01:49,  5.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.39it/s]01:41,  6.06it/s]
Batches: 

Failed to upsert row 6983: city                                                      Zurich
type                                                         eat
title                                                  Gambrinus
description    is a typical Swiss restaurant with good food a...
country                                              Switzerland
combined       city: Zurich, country: Switzerland, type: eat,...
Name: 6983, dtype: object
Exception: The read operation timed out


Batches: 100%|██████████| 1/1 [00:00<00:00, 15.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.10it/s]08:09,  2.68s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.51it/s]05:52,  1.94s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.39it/s]04:15,  1.41s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.98it/s]03:15,  1.09s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.18it/s]03:12,  1.07s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.02it/s]03:21,  1.13s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.04it/s]02:59,  1.02s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.97it/s]02:15,  1.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.37it/s]01:45,  1.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.01it/s]01:35,  1.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.48it/s]01:22,  2.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.64it/s]01:07,  2.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.03it/s]02:16,  1.26it/s]
Batches: 

In [45]:
# import qdrant
def query_qdrant(query, vector_name='title', top_k=20):

    # Creates embedding vector from user query
    embedding = model.encode(query).tolist()
    
    query_results = client.search(
        collection_name='wikivoyage',
        query_vector=(
            embedding
        ),
        limit=top_k, 
        query_filter=None
    )
    
    return query_results

In [48]:
# query_results = query_qdrant('Museums in Europe')
# for i, article in enumerate(query_results):
#     print(f'{i + 1}. {article.payload["title"]}, City: {article.payload["city"]} (Score: {round(article.score, 3)})')

1. Museo Europeo degli Studenti, City: Bologna (Score: 0.64)
2. Staatliches Museum für Naturkunde Stuttgart, City: Stuttgart (Score: 0.637)
3. Musée gallo-romain de Fourvière, City: Lyon (Score: 0.631)
4. Musée Alsacien, City: Strasbourg (Score: 0.631)
5. Fine Arts Museum, City: Vitoria-Gasteiz (Score: 0.612)
6. Musée d'Art contemporain, City: Lyon (Score: 0.607)
7. Albertinum Museum, City: Dresden (Score: 0.601)
8. Musée du Cinema/Filmmuseum, City: Brussels (Score: 0.599)
9. Museu de la impremta i les Arts Gràfiques, City: Valencia (Score: 0.597)
10. National Museum of History, City: Sofia (Score: 0.595)
11. Thyssen-Bornemisza Museum of Art, City: Madrid (Score: 0.59)
12. Leonhardi Museum, City: Dresden (Score: 0.584)
13. Bibliotheca Wittockiana, City: Brussels (Score: 0.579)
14. Uzhhorod Castle, City: Uzhhorod (Score: 0.579)
15. Museum of Natural History, City: Lille (Score: 0.578)
16. Long Night of the Museums, City: Stuttgart (Score: 0.578)
17. Ottó Herman Museum, Arts museum, City