<a href="https://colab.research.google.com/github/aurioldegbelo/sis2025/blob/main/2025_SIS_Demo_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Topics: Relevance Ranking
* tf-idf
* BM25
* Sentence embeddings
* Cosine similarity
* Maximal Marginal Relevance
* Spatial Similarity Computation (Area of overlap, Hausdorff distance, Earth mover's distance)

# Part 1: Sparse retrieval | tf-idf

In [108]:
# Toy corpus: three short sentences, each naming a different city.
sentences = [
    "This is a sentence about a city in Germany called Münster",
    "This sentence is also about a city in Germany. It is called Berlin.",
    "Third city, but not in Germany. It is called Paris",
]
sentence1, sentence2, sentence3 = sentences

# Row labels ("sentence1", "sentence2", ...) for the tables built from the corpus.
sentences_names = [f"sentence{position}" for position in range(1, len(sentences) + 1)]

In [51]:
# document-term matrix: one row per sentence, one column per vocabulary term

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the corpus and count how often each term occurs in each sentence.
count_vectorizer = CountVectorizer()
count_vector = count_vectorizer.fit_transform(sentences)

# Dense, labelled view of the sparse count matrix (rows: sentences, columns: terms).
count_df = pd.DataFrame(
    count_vector.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
    index=sentences_names,
)
count_df

Unnamed: 0,about,also,berlin,but,called,city,germany,in,is,it,münster,not,paris,sentence,third,this
sentence1,1,0,0,0,1,1,1,1,1,0,1,0,0,1,0,1
sentence2,1,1,1,0,1,1,1,1,2,1,0,0,0,1,0,1
sentence3,0,0,0,1,1,1,1,1,1,1,0,1,1,0,1,0


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# norm=None and smooth_idf=False switch off normalization and idf smoothing, so every
# cell of the table is the plain product tf * (ln(n/df) + 1) that is recomputed by hand
# in the next cell.
tfidf_vectorizer = TfidfVectorizer(
    input="content",
    norm=None,
    smooth_idf=False,
)
tfidf_vector = tfidf_vectorizer.fit_transform(sentences)

# Dense, labelled view of the sparse tf-idf matrix (rows: sentences, columns: terms).
tfidf_df = pd.DataFrame(
    tfidf_vector.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=sentences_names,
)
tfidf_df

Unnamed: 0,about,also,berlin,but,called,city,germany,in,is,it,münster,not,paris,sentence,third,this
sentence1,1.405465,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,2.098612,0.0,0.0,1.405465,0.0,1.405465
sentence2,1.405465,2.098612,2.098612,0.0,1.0,1.0,1.0,1.0,2.0,1.405465,0.0,0.0,0.0,1.405465,0.0,1.405465
sentence3,0.0,0.0,0.0,2.098612,1.0,1.0,1.0,1.0,1.0,1.405465,0.0,2.098612,2.098612,0.0,2.098612,0.0


In [11]:
import math

n = 3  # number of documents (sentences) in the corpus

# Worked examples read off the count table above: (term, sentence, tf, df).
# tf = occurrences of the term in that sentence, df = number of sentences containing it.
# All three are evaluated (previously only the last one was); the last entry is the
# one whose result is displayed at the end of the cell.
worked_examples = [
    ("is", "sentence2", 2, 3),      # twice in sentence 2, present in all three sentences
    ("it", "sentence2", 1, 2),      # present in sentences 2 and 3 only
    ("berlin", "sentence2", 1, 1),  # present in sentence 2 only
]

for term, sentence_name, tf, df in worked_examples:
    idf = math.log(n/df) + 1 # math.log returns the natural logarithm
    tf_idf = tf * idf
    print(f"{term!r} in {sentence_name}: tf={tf}, df={df}, idf={idf:.6f}, tf-idf={tf_idf:.6f}")

# tf-idf of the last example ("berlin" in sentence 2); compare with the table above
tf_idf

2.09861228866811

Computing similarity scores for an input sentence

In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a tf-idf model (constructed without arguments, i.e. library defaults) on the corpus.
# docs_tfidf is a sparse matrix with one row per sentence (shape (3, 16) in the output below).
vectorizer = TfidfVectorizer()
docs_tfidf = vectorizer.fit_transform(sentences)

def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    Score a query against every document of the corpus.

    vectorizer: TfIdfVectorizer model, already fitted on the corpus
    docs_tfidf: tfidf vectors for all docs (one row per doc)
    query: query text

    return: tuple (query_tfidf, cosineSimilarities), i.e. the tf-idf vector of the
            query and a flat array holding one cosine similarity per doc
    """
    # Project the query into the vector space of the fitted vectorizer.
    query_tfidf = vectorizer.transform([query])
    # One row (the query) against all docs, flattened to a 1-D array.
    pairwise_scores = cosine_similarity(query_tfidf, docs_tfidf)
    return query_tfidf, pairwise_scores.flatten()

# print() lists the stored (row, column) -> weight entries of the sparse matrix
print(docs_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 30 stored elements and shape (3, 16)>
  Coords	Values
  (0, 15)	0.35934131343820924
  (0, 8)	0.27906058578109566
  (0, 13)	0.35934131343820924
  (0, 0)	0.35934131343820924
  (0, 5)	0.27906058578109566
  (0, 7)	0.27906058578109566
  (0, 6)	0.27906058578109566
  (0, 4)	0.27906058578109566
  (0, 10)	0.4724906440206689
  (1, 15)	0.2853355149492719
  (1, 8)	0.44317696278241336
  (1, 13)	0.2853355149492719
  (1, 0)	0.2853355149492719
  (1, 5)	0.22158848139120668
  (1, 7)	0.22158848139120668
  (1, 6)	0.22158848139120668
  (1, 4)	0.22158848139120668
  (1, 1)	0.37518191251208155
  (1, 9)	0.2853355149492719
  (1, 2)	0.37518191251208155
  (2, 8)	0.2348873451103372
  (2, 5)	0.2348873451103372
  (2, 7)	0.2348873451103372
  (2, 6)	0.2348873451103372
  (2, 4)	0.2348873451103372
  (2, 9)	0.30246022334437567
  (2, 14)	0.3976988461227783
  (2, 3)	0.3976988461227783
  (2, 11)	0.3976988461227783
  (2, 12)	0.3976988461227783


In [110]:
# Score a natural-language query against the three sentences;
# the displayed array holds one cosine similarity per sentence, in corpus order.
query = "Tell me something about Münster, Germany. I know nothing about it"

query_tfidf, similarities = get_tf_idf_query_similarity(vectorizer, docs_tfidf, query)
similarities

array([0.57488947, 0.37968183, 0.17906684])

In [111]:
# Same words as the previous query, in a different order. The scores come out identical
# (compare the two outputs): this bag-of-words representation ignores word order.
modified_query = "about it me about something Münster, Germany. Tell I know nothing"

query_tfidf, similarities = get_tf_idf_query_similarity(vectorizer, docs_tfidf, modified_query)
similarities

array([0.57488947, 0.37968183, 0.17906684])

# Part 2: Sparse retrieval | BM25

In [15]:
# https://pypi.org/project/rank-bm25/
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install --quiet rank_bm25

In [64]:
# Toy corpus for the BM25 part: three short sentences, each naming a different city.
sentences = [
    "This is a sentence about a city in Germany called Münster",
    "This sentence is also about a city in Germany. It is called Berlin.",
    "Third city, but not in Germany. It is called Paris",
]
sentence1, sentence2, sentence3 = sentences

# Labels ("sentence1", "sentence2", ...) matching the corpus order.
sentences_names = [f"sentence{position}" for position in range(1, len(sentences) + 1)]

In [112]:
from rank_bm25 import BM25Okapi

# Plain whitespace tokenization: case and punctuation are kept, so 'Germany.' and
# 'Germany' (or 'city,' and 'city') end up as different tokens - see the printed corpus.
tokenized_corpus = [doc.split(" ") for doc in sentences]

# Build the BM25 index over the tokenized sentences.
bm25 = BM25Okapi(tokenized_corpus)
print(tokenized_corpus)

[['This', 'is', 'a', 'sentence', 'about', 'a', 'city', 'in', 'Germany', 'called', 'Münster'], ['This', 'sentence', 'is', 'also', 'about', 'a', 'city', 'in', 'Germany.', 'It', 'is', 'called', 'Berlin.'], ['Third', 'city,', 'but', 'not', 'in', 'Germany.', 'It', 'is', 'called', 'Paris']]


In [113]:
# The query is tokenized exactly like the corpus (split on single spaces).
# Note: the query token 'Münster,' (with the comma) is not equal to the corpus token 'Münster'.
query = "Tell me something about Münster, Germany. I know nothing about it"
tokenized_query = query.split(" ")

# One BM25 score per sentence, in corpus order (negative for this tiny corpus, see output).
doc_scores = bm25.get_scores(tokenized_query)
doc_scores

array([-0.12843885, -0.17830856, -0.06691186])

In [114]:
# Same tokens as the previous query in a different order; the scores are unchanged
# (compare the two outputs), i.e. the token order of the query does not matter here.
modified_query = "about it me about something Münster, Germany. Tell I know nothing"

tokenized_query_modified = modified_query.split(" ")

doc_scores = bm25.get_scores(tokenized_query_modified)
doc_scores

array([-0.12843885, -0.17830856, -0.06691186])

# Part 3: Dense retrieval | sentence embeddings

Embeddings with Sentence Transformer

In [20]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install -qU sentence-transformers

In [21]:
# The sentences to encode
# The sentences to encode
sentences = [
    "This is a sentence about a city in Germany called Münster",
    "This sentence is also about a city in Germany. It is called Berlin.",
    "Third city, but not in Germany. It is called Paris",
]
sentence1, sentence2, sentence3 = sentences

# The queries to encode: a regular question, the same words shuffled,
# and a query that does not name any city.
query, modified_query, surprise_query = (
    "Tell me something about Münster, Germany. I know nothing about it",
    "about it me about something Münster, Germany. Tell I know nothing",
    "This is related to a place outside the country",
)

In [22]:
# https://sbert.net/docs/sentence_transformer/pretrained_models.html
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
# 1. Load a pretrained Sentence Transformer model (alternative: all-MiniLM-L6-v2)
model = SentenceTransformer("all-mpnet-base-v2")

# 2. Embed the corpus; the printed shape is (number of sentences, embedding size),
#    (3, 768) in the recorded output
embeddings = model.encode(sentences)
print(embeddings.shape)

# 3. Embed each query and compare it with all sentence embeddings
#    (each printed tensor has one row per sentence)
query_embedding = model.encode(query)
similarities1 = model.similarity(embeddings, query_embedding)
print(similarities1)

modified_query_embedding = model.encode(modified_query)
similarities2 = model.similarity(embeddings, modified_query_embedding)
print(similarities2)

surprise_query_embedding = model.encode(surprise_query)
similarities3 = model.similarity(embeddings, surprise_query_embedding)
print(similarities3)

# 4. How close are the original and the shuffled query to each other?
similarities4 = model.similarity(query_embedding, modified_query_embedding)
print(similarities4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(3, 768)
tensor([[0.7714],
        [0.5006],
        [0.4581]])
tensor([[0.6956],
        [0.5296],
        [0.4308]])
tensor([[0.4664],
        [0.5410],
        [0.3139]])
tensor([[0.8056]])


Embeddings with OpenAI

In [23]:
import getpass
import os

# Prompt for the key (getpass does not echo the input) unless the environment already has one.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ··········


In [24]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install -q openai

In [25]:
# The sentences to encode
# The sentences to encode
sentences = [
    "This is a sentence about a city in Germany called Münster",
    "This sentence is also about a city in Germany. It is called Berlin.",
    "Third city, but not in Germany. It is called Paris",
]
sentence1, sentence2, sentence3 = sentences

# The queries to encode: a regular question, the same words shuffled,
# and a query that does not name any city.
query, modified_query, surprise_query = (
    "Tell me something about Münster, Germany. I know nothing about it",
    "about it me about something Münster, Germany. Tell I know nothing",
    "This is related to a place outside the country",
)

In [116]:
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# API client used by create_openai_embeddings below. It is created without arguments, so it
# presumably reads the OPENAI_API_KEY environment variable set earlier - TODO confirm.
client = OpenAI()

# 1. Calculate embeddings using the OpenAI model

def create_openai_embeddings (input_text):
    """
    Request embeddings for `input_text` from the OpenAI embeddings endpoint.

    input_text: the text to embed; the notebook passes both a single string
                and a list of strings
    return: the unmodified API response (the vectors are read from
            response.data[i].embedding by the callers)
    """
    return client.embeddings.create(
        model="text-embedding-3-small",  # text-embedding-3-small or text-embedding-3-large
        input=input_text,
    )

# Embed the three sentences with a single request and keep only the raw vectors.
response = create_openai_embeddings(sentences)
embeddings = [item.embedding for item in response.data]

# Each query is embedded on its own request, so its vector is the first entry of .data.
query_embedding = create_openai_embeddings(query).data[0].embedding

# print the shape of an embedding as an example
print(np.asarray(query_embedding).shape)

# 2. Calculate the embedding similarities (one row per sentence in each printed array)
similarities1 = cosine_similarity(embeddings, [query_embedding])
print(similarities1)

modified_query_embedding = create_openai_embeddings(modified_query).data[0].embedding
similarities2 = cosine_similarity(embeddings, [modified_query_embedding])
print(similarities2)

surprise_query_embedding = create_openai_embeddings(surprise_query).data[0].embedding
similarities3 = cosine_similarity(embeddings, [surprise_query_embedding])
print(similarities3)

# original query vs. its shuffled variant
similarities4 = cosine_similarity([query_embedding], [modified_query_embedding])
print(similarities4)

(1536,)
[[0.72432119]
 [0.44355494]
 [0.41201002]]
[[0.6476365 ]
 [0.4668958 ]
 [0.39826448]]
[[0.32303414]
 [0.33480205]
 [0.29981994]]
[[0.86804574]]


# Part 4: Cosine Similarity

In [119]:
import math

X = [[0, 5, 0]]
Y = [[2, 7, 0]] #
Z = [[7, 7, 7]] #  [5, 0, 0] the length of the vector does not matter, test for example: [1, 1, 1], [2, 2, 2]

from sklearn.metrics.pairwise import cosine_similarity


def report_similarity(label, a, b):
    """
    Print the cosine similarity of two single-row matrices and the angle between them.

    label: prefix of the printed line, e.g. "XY"
    a, b: 2-D lists/arrays with one row each
    """
    similarity = cosine_similarity(a, b).flatten()  # computed once, used for both outputs
    # Floating-point rounding can push the cosine of (anti)parallel vectors marginally
    # outside [-1, 1], in which case math.acos raises ValueError; clamp before inverting.
    cosine = min(1.0, max(-1.0, float(similarity[0])))
    print(f"{label} similarity:", similarity, "angle:", math.degrees(math.acos(cosine)))


report_similarity("XY", X, Y)
report_similarity("YZ", Y, Z)
report_similarity("XZ", X, Z)

XY similarity: [0.96152395] angle: 15.945395900922858
YZ similarity: [0.71374643] angle: 44.45944170194541
XZ similarity: [0.57735027] angle: 54.735610317245346


# Part 5: Maximal Marginal Relevance

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install --quiet langchain langchain_openai langchain_community chromadb

In [46]:
# The sentences to encode
# The sentences to encode
sentences = [
    "This is a sentence about a city in Germany called Münster",
    "This sentence is also about a city in Germany. It is called Berlin.",
    "Third city, but not in Germany. It is called Paris",
]
sentence1, sentence2, sentence3 = sentences

# The queries to encode: a regular question, the same words shuffled,
# and a query that does not name any city.
query, modified_query, surprise_query = (
    "Tell me something about Münster, Germany. I know nothing about it",
    "about it me about something Münster, Germany. Tell I know nothing",
    "This is related to a place outside the country",
)

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the sentences into smaller chunks
# chunk_size=5 is deliberately tiny: words are cut into fragments such as 'sent' / 'ence'
# (see the output), which yields many near-duplicate chunks for the retrieval demo below.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5, chunk_overlap=0)

# One Document per chunk, across all three sentences.
chunks = text_splitter.create_documents(sentences)

print (f"The sentences have been split into {len(chunks)} chunks")
chunks

The sentences have been split into 47 chunks


[Document(metadata={}, page_content='This'),
 Document(metadata={}, page_content='is a'),
 Document(metadata={}, page_content='sent'),
 Document(metadata={}, page_content='ence'),
 Document(metadata={}, page_content='abou'),
 Document(metadata={}, page_content='t'),
 Document(metadata={}, page_content='a'),
 Document(metadata={}, page_content='city'),
 Document(metadata={}, page_content='in'),
 Document(metadata={}, page_content='Germ'),
 Document(metadata={}, page_content='any'),
 Document(metadata={}, page_content='call'),
 Document(metadata={}, page_content='ed'),
 Document(metadata={}, page_content='Müns'),
 Document(metadata={}, page_content='ter'),
 Document(metadata={}, page_content='This'),
 Document(metadata={}, page_content='sent'),
 Document(metadata={}, page_content='ence'),
 Document(metadata={}, page_content='is'),
 Document(metadata={}, page_content='also'),
 Document(metadata={}, page_content='abou'),
 Document(metadata={}, page_content='t'),
 Document(metadata={}, page

In [48]:
import getpass
import os

# Prompt for the key (getpass does not echo the input) unless the environment already has one.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [32]:
from langchain_openai import OpenAIEmbeddings
# Chroma is provided by the langchain_community package (installed at the top of this part);
# the former `langchain.vectorstores` path is a deprecated re-export that newer
# LangChain releases no longer provide.
from langchain_community.vectorstores import Chroma

# Embed every chunk with OpenAI embeddings and index them in a Chroma collection.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=chunks, embedding=embedding, collection_name = "col")

# Two retrievers over the same store, both returning k = 8 chunks:
# plain nearest-neighbour similarity vs. Maximal Marginal Relevance (MMR).
retriever_simple = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 8})
retriever_mmr = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 8})

In [49]:
# Plain similarity search: the 8 returned chunks contain repeats
# (e.g. 'ence' twice and 'in' three times in the output below).
relevant_docs = retriever_simple.invoke("ent")
relevant_docs

[Document(metadata={}, page_content='ence'),
 Document(metadata={}, page_content='ence'),
 Document(metadata={}, page_content='ter'),
 Document(metadata={}, page_content='sent'),
 Document(metadata={}, page_content='sent'),
 Document(metadata={}, page_content='in'),
 Document(metadata={}, page_content='in'),
 Document(metadata={}, page_content='in')]

In [50]:
# Same query with the MMR retriever: the 8 returned chunks are all different
# (compare with the output of the similarity retriever above).
mmr_relevant_docs = retriever_mmr.invoke("ent")
mmr_relevant_docs

[Document(metadata={}, page_content='ence'),
 Document(metadata={}, page_content='ter'),
 Document(metadata={}, page_content='sent'),
 Document(metadata={}, page_content='in'),
 Document(metadata={}, page_content='ed'),
 Document(metadata={}, page_content='t'),
 Document(metadata={}, page_content='abou'),
 Document(metadata={}, page_content='It')]

# Part 6: Spatial Similarity Computation

## Retrieve and visualize the data

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install --quiet turfpy

In [106]:
import requests
from turfpy.transformation import circle

url = "https://raw.githubusercontent.com/aurioldegbelo/sis2025/refs/heads/main/vector_data/geo1_example.json"

# Download the example features. The timeout prevents the cell from hanging forever and
# raise_for_status() turns an HTTP error into an immediate, explicit exception instead of
# a confusing JSON/KeyError further down.
geo_response = requests.get(url, timeout=30)
geo_response.raise_for_status()
data = geo_response.json()

# The five features are addressed by their position in the file's feature list.
geo1, denkpause, king_kebab, krimphove, ifgi = data["features"][:5]

print(geo1)
print(denkpause)
print(king_kebab)
print(krimphove)
print(ifgi)

# Query footprint: a 100 m "circle" around ifgi approximated with 10 steps.
q_footprint = circle(ifgi, radius=100, steps=10, units='m') # circle(ifgi, radius=100, units='m') generates a perfect circle
q_footprint['properties'] = ifgi['properties']  # keep the name/id of the query feature

# Everything except the query feature itself is a retrieval target.
target_footprints = [geo1, denkpause, king_kebab, krimphove]

print(q_footprint)
print(target_footprints)

{'type': 'Feature', 'properties': {'Name': 'GEO1', 'id': 0}, 'geometry': {'coordinates': [[[7.595410964969801, 51.96985640112052], [7.595072152923166, 51.96894953936396], [7.596296431749778, 51.96883201435048], [7.596259418668183, 51.96931965343859], [7.596276501629234, 51.96956522507139], [7.596225252747445, 51.969682748162], [7.596020257222932, 51.9697827302507], [7.595410964969801, 51.96985640112052]]], 'type': 'Polygon'}}
{'type': 'Feature', 'properties': {'Name': 'Bistro Denkpause', 'id': 1}, 'geometry': {'coordinates': [[[7.59442844476078, 51.96876957052007], [7.594415375105655, 51.968695759584904], [7.5945373585494735, 51.96869039151193], [7.594480723379263, 51.96832670308601], [7.594731225092858, 51.968313282792536], [7.594837960605048, 51.96874675624372], [7.59442844476078, 51.96876957052007]]], 'type': 'Polygon'}}
{'type': 'Feature', 'properties': {'Name': 'King Kebab', 'id': 2}, 'geometry': {'coordinates': [[[7.596912214932445, 51.969501466701416], [7.596895026350126, 51.969

In [None]:
# https://sustainability-gis.readthedocs.io/en/latest/lessons/L1/intro-to-python-geostack.html
import shapely # geometric operations
import folium # rendering and visualization on the map

# Overview map of all example features, centred on the study area.
# zoom_start is a zoom level, so it is passed as a number (it used to be the string "17").
# NOTE(review): the variable name `map` shadows Python's built-in map(); it is kept only
# so that existing references to it keep working.
map = folium.Map(location=[51.969,  7.595], zoom_start=17)

# Clicking a feature shows its "Name" property.
popup = folium.GeoJsonPopup(fields=["Name"])

folium.GeoJson(data, name="examples", popup = popup).add_to(map)
folium.LayerControl().add_to(map)

map

## Area of overlap

In [38]:
# turfpy is also installed at the start of Part 6; repeated so this section can be run on its own.
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install --quiet turfpy

In [121]:
# What are the boundaries of the ifgi building?
from turfpy.measurement import area
from turfpy.transformation import intersect, circle, union
from geojson import FeatureCollection
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler


def area_of_overlaps(q_footprint, target_footprints):
    """
    Compare a query footprint with every target footprint by overlapping area.

    q_footprint: GeoJSON-like feature of the query; its 'properties' are copied into each row
    target_footprints: iterable of GeoJSON-like features to compare against

    return: DataFrame with one row per target and the columns 'qf', 'tf',
            'area_of_overlap' (area of the intersection) and
            'intersection_over_union' (overlap area divided by the area of the union).
            Targets that do not intersect the query get 0 for both measures.
    """
    columns = ['qf', 'tf', 'area_of_overlap', 'intersection_over_union']
    rows = []
    for target in target_footprints:
        # intersect() gives None for footprints that do not overlap; compute it only once
        overlap = intersect([q_footprint, target])
        if overlap is None:
            # No overlap: both measures are 0. Previously `aou` was undefined here when the
            # first target was disjoint, and a stale value from an earlier target otherwise.
            aoo, iou = 0.0, 0.0
        else:
            aoo = area(overlap)
            aou = area(union(FeatureCollection([q_footprint, target])))
            iou = aoo / aou if aou else 0.0
        rows.append({'qf': q_footprint['properties'],
                     'tf': target['properties'],
                     'area_of_overlap': aoo,
                     'intersection_over_union': iou})
    # explicit columns keep the schema stable even when there are no targets
    return pd.DataFrame(rows, columns=columns)

df = area_of_overlaps(q_footprint, target_footprints)

# Brief intro to normalization in general at https://www.codecademy.com/article/normalization
# Normalization using MinMaxScaler https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.MinMaxScaler.html
x_array = np.array(df['area_of_overlap'])
print("Array for area of overlaps: ", x_array)

# MinMaxScaler expects 2-D input (samples x features), hence the temporary extra axis.
scaler = MinMaxScaler()
expanded_array = x_array[:, np.newaxis]
normalized_arr = scaler.fit_transform(expanded_array)

# Back to one value per row: the largest overlap gets 1, the smallest gets 0.
df['normalized_score'] = np.squeeze(normalized_arr)
df

Array for area of overlaps:  [7227.35873855    0.          100.28670279    0.        ]


Unnamed: 0,qf,tf,area_of_overlap,intersection_over_union,normalized_score
0,"{'Name': 'ifgi', 'id': 4}","{'Name': 'GEO1', 'id': 0}",7227.358739,0.229924,1.0
1,"{'Name': 'ifgi', 'id': 4}","{'Name': 'Bistro Denkpause', 'id': 1}",0.0,0.0,0.0
2,"{'Name': 'ifgi', 'id': 4}","{'Name': 'King Kebab', 'id': 2}",100.286703,0.00319,0.013876
3,"{'Name': 'ifgi', 'id': 4}","{'Name': 'Krimphove', 'id': 3}",0.0,0.0,0.0


In [None]:
from geojson import Point, Feature
from ipyleaflet import Map, GeoJSON
from turfpy.transformation import circle

# Interactive map with the query footprint (green) drawn together with the raw features.
center = [51.96954,  7.595]
m = Map(center=center, zoom=17)

qf_layer = GeoJSON(data=q_footprint, style={"color": "green"})
target_layer = GeoJSON(data=data)

# query footprint first, then the raw data layer
for layer in (qf_layer, target_layer):
    m.add_layer(layer)

m

## Hausdorff distance

In [123]:
from turfpy.measurement import area, bbox, bbox_polygon
from geojson import Polygon
from turfpy.transformation import intersect
from shapely import hausdorff_distance
from shapely import polygons

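# Hausdorff distance in two variants: on the bounding boxes of the footprints (coarse approximation) and directly on the raw polygons; shapely's hausdorff_distance does not require both geometries to have the same number of vertices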
def compute_hd_distance_on_bbox (q_footprint, target_footprint):
  """
  Hausdorff distance between the bounding boxes of two footprints.

  q_footprint, target_footprint: GeoJSON-like features with 'geometry' and 'properties'

  return: tuple (hausdorff distance of the two bounding-box polygons,
          area of overlap of the two bounding boxes, 0 if they do not intersect)
  """

  def _bbox_feature(footprint):
    # bounding box (coordinates) -> bounding box (polygon), labelled with the footprint's properties
    box = bbox_polygon(bbox(footprint))
    box.properties = footprint['properties']
    return box

  bbox_q = _bbox_feature(q_footprint)
  bbox_t = _bbox_feature(target_footprint)

  # first ring of each bounding-box polygon (its corner points) -> shapely polygon
  p1 = polygons((bbox_q['geometry'].coordinates)[0])
  p2 = polygons((bbox_t['geometry'].coordinates)[0])

  # area of overlap of the two boxes, returned as well for comparison
  overlap = intersect([bbox_q, bbox_t])
  aoo = 0 if overlap is None else area(overlap)

  return hausdorff_distance(p1, p2), aoo

def compute_hd_distance (q_footprint, target_footprint):
  """
  Hausdorff distance between the raw polygons of two footprints.

  q_footprint, target_footprint: GeoJSON-like polygon features. Both plain dicts (as
      loaded from JSON) and geojson objects are accepted, because the coordinates are
      read with item access only.

  return: tuple (hausdorff distance of the two polygons,
          area of overlap of the two footprints, 0 if they do not intersect)
  """
  # first ring of each polygon; item access instead of `.coordinates`, which raised
  # AttributeError for plain-dict features
  u = q_footprint['geometry']['coordinates'][0]
  v = target_footprint['geometry']['coordinates'][0]

  p1 = polygons(u)
  p2 = polygons(v)

  # area of overlap, returned as well for comparison (intersection computed once)
  overlap = intersect([q_footprint, target_footprint])
  aoo = area(overlap) if overlap is not None else 0

  return hausdorff_distance(p1, p2), aoo

In [124]:
from turfpy.measurement import area
from turfpy.transformation import intersect, circle
from scipy.spatial.distance import directed_hausdorff
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import pandas as pd

def hausdorff_distances_on_bbox(q_footprint, target_footprints):
    """
    Bounding-box Hausdorff distance between the query and every target footprint.

    q_footprint: query feature; its 'properties' are copied into each row
    target_footprints: iterable of target features

    return: DataFrame with one row per target and the columns
            'qf', 'tf', 'hausdorff_distance' and 'area_of_overlap'
    """
    def _row(target):
        distance, overlap = compute_hd_distance_on_bbox(q_footprint, target)
        return {'qf': q_footprint['properties'],
                'tf': target['properties'],
                'hausdorff_distance': distance,
                'area_of_overlap': overlap}

    return pd.DataFrame([_row(target) for target in target_footprints])

def hausdorff_distances(q_footprint, target_footprints):
    """
    Raw-polygon Hausdorff distance between the query and every target footprint.

    q_footprint: query feature; its 'properties' are copied into each row
    target_footprints: iterable of target features

    return: DataFrame with one row per target and the columns
            'qf', 'tf', 'hausdorff_distance' and 'area_of_overlap'
    """
    def _row(target):
        distance, overlap = compute_hd_distance(q_footprint, target)
        return {'qf': q_footprint['properties'],
                'tf': target['properties'],
                'hausdorff_distance': distance,
                'area_of_overlap': overlap}

    return pd.DataFrame([_row(target) for target in target_footprints])


df2 = hausdorff_distances_on_bbox(q_footprint, target_footprints)

x_array2 = np.array(df2['hausdorff_distance'])
print("Array for hausdorff distances: ", x_array2)

# A small distance means a high similarity. The distances are first min-max scaled to
# d' in [0, 1] and then turned into a score with 1 - d' (smallest distance -> 1,
# largest -> 0). Unlike scaling to (-1, 0) and negating, this cannot produce "-0.0".
scaler = MinMaxScaler()
expanded_array2 = np.expand_dims(x_array2, axis=1) # we need to add a dimension to be able to use MinMaxScaler
normalized_arr2 = scaler.fit_transform(expanded_array2)

# drop the extra dimension again so that a plain 1-D column is stored
df2['normalized_score'] = 1 - np.squeeze(normalized_arr2, axis=1)
df2

Array for hausdorff distances:  [0.0011812  0.00299732 0.00268801 0.0031066 ]


Unnamed: 0,qf,tf,hausdorff_distance,area_of_overlap,normalized_score
0,"{'Name': 'ifgi', 'id': 4}","{'Name': 'GEO1', 'id': 0}",0.001181,9568.922329,1.0
1,"{'Name': 'ifgi', 'id': 4}","{'Name': 'Bistro Denkpause', 'id': 1}",0.002997,251.896751,0.056755
2,"{'Name': 'ifgi', 'id': 4}","{'Name': 'King Kebab', 'id': 2}",0.002688,178.830352,0.217406
3,"{'Name': 'ifgi', 'id': 4}","{'Name': 'Krimphove', 'id': 3}",0.003107,0.0,-0.0


In [None]:
from geojson import Point, Feature
from ipyleaflet import Map, GeoJSON, LayersControl
from turfpy.transformation import circle

def _bbox_layer(layer_name, footprint, color):
    """Map layer showing the bounding box of `footprint` in the given color."""
    return GeoJSON(name=layer_name, data=bbox_polygon(bbox(footprint)), style={"color": color})


center = [51.96954, 7.595]
m = Map(center=center, zoom=17)

# query bounding box in green, target bounding boxes in red, raw features as their own layer
qbbox_layer = _bbox_layer("query bbox", q_footprint, "green")
target_bbox_layer = GeoJSON(name = "raw data layer", data=data)

geo1_bbox_layer = _bbox_layer("geo1 bbox", geo1, "red")
denkpause_bbox_layer = _bbox_layer("denkpause bbox", denkpause, "red")
king_kebab_bbox_layer = _bbox_layer("king kebab bbox ", king_kebab, "red")
krimphove_bbox_layer = _bbox_layer("krimphove bbox", krimphove, "red")

# layer switcher in the top-right corner
control = LayersControl(position="topright")
m.add_control(control)

for layer in (
    target_bbox_layer,
    qbbox_layer,
    geo1_bbox_layer,
    denkpause_bbox_layer,
    king_kebab_bbox_layer,
    krimphove_bbox_layer,
):
    m.add_layer(layer)

m

## Earth Mover's distance

In [86]:
from scipy.stats import wasserstein_distance_nd

#https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance_nd.html#scipy.stats.wasserstein_distance_nd
def compute_wasserstein_distance (q_footprint, target_footprint):

  u = (q_footprint['geometry'].coordinates)[0]
  v = (target_footprint['geometry']['coordinates'])[0]

  return wasserstein_distance_nd(u, v)

def earth_mover_distances(q_footprint, target_footprints):
    """
    Earth mover's distance between the query and every target footprint.

    q_footprint: query feature; its 'properties' are copied into each row
    target_footprints: iterable of target features

    return: DataFrame with one row per target and the columns 'qf', 'tf',
            'wasserstein_distance' and 'area_of_overlap' (0 when the footprints
            do not intersect; included for comparison)
    """
    def _row(target):
        wd = compute_wasserstein_distance(q_footprint, target)
        overlap = intersect([q_footprint, target])
        aoo = 0 if overlap is None else area(overlap)
        return {'qf': q_footprint['properties'],
                'tf': target['properties'],
                'wasserstein_distance': wd,
                'area_of_overlap':aoo}

    return pd.DataFrame([_row(target) for target in target_footprints])


df3 = earth_mover_distances(q_footprint, target_footprints)

# normalization
x3_array = np.array(df3['wasserstein_distance'])
print("Array for earth mover's distances: ", x3_array)

# A small distance means a high similarity. The distances are first min-max scaled to
# d' in [0, 1] and then turned into a score with 1 - d' (smallest distance -> 1,
# largest -> 0). Unlike scaling to (-1, 0) and negating, this cannot produce "-0.0".
scaler = MinMaxScaler()
expanded_array3 = np.expand_dims(x3_array, axis=1) # we need to add a dimension to be able to use MinMaxScaler
normalized_arr3 = scaler.fit_transform(expanded_array3)

# drop the extra dimension again so that a plain 1-D column is stored
df3['normalized_score'] = 1 - np.squeeze(normalized_arr3, axis=1)
df3

Array for earth mover's distances:  [0.00073008 0.00179195 0.00140664 0.00178281]


Unnamed: 0,qf,tf,wasserstein_distance,area_of_overlap,normalized_score
0,"{'Name': 'ifgi', 'id': 4}","{'Name': 'GEO1', 'id': 0}",0.00073,7227.358739,1.0
1,"{'Name': 'ifgi', 'id': 4}","{'Name': 'Bistro Denkpause', 'id': 1}",0.001792,0.0,-0.0
2,"{'Name': 'ifgi', 'id': 4}","{'Name': 'King Kebab', 'id': 2}",0.001407,100.286703,0.362861
3,"{'Name': 'ifgi', 'id': 4}","{'Name': 'Krimphove', 'id': 3}",0.001783,0.0,0.008611


# Project work

* Exercise 01: pick a method for topical similarity computation
* Exercise 02: pick a method for spatial similarity computation