In [1]:
import pandas as pd
from globals import BASE_DIR, wikimedia_headers
import json
import geopandas as gpd
from shapely.geometry import Point
import folium
import requests
from branca.element import Element
from folium.plugins import TreeLayerControl
from bs4 import BeautifulSoup


available_datasets = ["gowalla"]
top_k_eval = 10

coordinates=(18.0686, 59.3290)


In [2]:
def get_wikimedia_images_at_point(lat, lon, headers=wikimedia_headers):
    url = (
        "https://commons.wikimedia.org/w/api.php"
        "?format=json"
        "&action=query"
        "&generator=geosearch"
        "&ggsprimary=all"
        "&ggsnamespace=6"
        "&ggsradius=200"
        "&ggscoord=" + str(lat) + "|" + str(lon) +
        "&ggslimit=10"
        "&prop=imageinfo"
        "&iiprop=extmetadata|url"
        "&iilimit=10"
        "&iiurlwidth=1200"
        "&iiurlheight=1200"
    )

    images = []
    call = requests.get(url, headers=headers)

    if call.status_code == 200:
        response = call.json()
        if 'query' in response and 'pages' in response['query']:
            for page in response['query']['pages']:
                page_data = response['query']['pages'][page]
                image_info = page_data['imageinfo'][0]
                extmeta = image_info.get('extmetadata', {})

                license_name = extmeta.get('LicenseShortName', {}).get('value', '')
                license_url = extmeta.get('LicenseUrl', {}).get('value', '')
                author = extmeta.get('Artist', {}).get('value', '')
                credit = extmeta.get('Credit', {}).get('value', '')
                description = extmeta.get('ImageDescription', {}).get('value', '')

                images.append({
                    'id': page_data['pageid'],
                    'title': page_data.get('title', ''),
                    'photo_url_thumb': image_info.get('thumburl', ''),
                    'photo_url_medium': '',  # Optional field for larger sizes
                    'description': description,
                    'author': author,
                    'credit': credit,
                    'license': license_name,
                    'license_url': license_url,
                    'info_url': image_info.get('descriptionurl', ''),
                    'info_raw': image_info,
                    'source': 'wikimedia'
                })
    else:
        print('Ein Fehler ist aufgetreten beim Abruf der Wikimedia-Bilder.')

    return images, call


In [3]:
def unstack_recommendations(df):
    unstacked_df = df.explode(["item_id:token"]).reset_index(drop=True)
    return unstacked_df

In [4]:
def top_k_to_df(recommender_dir, top_k_eval=top_k_eval):
    with open(recommender_dir) as f:
        data = json.load(f)

    base_recommendations = []

    for user, items in data.items():
        for item in items:
            base_recommendations.append({
                "user_id:token": user,
                "item_id:token": item
            })


    base_df = pd.DataFrame(base_recommendations)
    base_df = unstack_recommendations(base_df)

    df = base_df.groupby('user_id:token').head(top_k_eval)

    return df

In [5]:
def group_user_events(user_events, user_groups):
    stats = {}
    for group in user_groups.keys():
        stats[group] = {}
        user_events_group = user_events.copy()
        user_events_group = user_events_group.loc[user_events_group['user_id:token'].isin(user_groups[group])]
        user_dist = user_events_group["user_id:token"].value_counts()
        stats[group]["num_users"] = user_dist.shape[0]
        stats[group]["mean_checkins"] = user_events_group["user_id:token"].value_counts().mean()
        stats[group]["min_checkins"] = user_events_group["user_id:token"].value_counts().min()
        stats[group]["max_checkins"] = user_events_group["user_id:token"].value_counts().max()
        item_dist = user_events_group["item_id:token"].value_counts()
        stats[group]["num_items"] = item_dist.shape[0]
        stats[group]["sparsity"] = 1 - len(user_events_group) / (len(user_events_group["user_id:token"].unique()) * len(user_events_group["item_id:token"].unique()))

    return stats



In [6]:
full_stats = {}
for dataset in available_datasets:
    # Stays the same across all models 
    train_data = pd.read_csv(f"{BASE_DIR}{dataset}_dataset/processed_data_recbole/{dataset}_sample.train.inter", sep="\t")
    test_data = pd.read_csv(f"{BASE_DIR}{dataset}_dataset/processed_data_recbole/{dataset}_sample.test.inter", sep="\t")
    valid_data = pd.read_csv(f"{BASE_DIR}{dataset}_dataset/processed_data_recbole/{dataset}_sample.valid.inter", sep="\t")

    train_data = pd.concat([train_data, valid_data])
    user_group_dir = f"{BASE_DIR}{dataset}_dataset/{dataset}_user_id_popularity.json"
    with open(user_group_dir) as f:
        user_groups = json.load(f)

    poi_data = pd.read_csv(f"{BASE_DIR}{dataset}_dataset/processed_data_capri/poiCoos.txt", sep="\t", header=None, names=["item_id:token", "lat:float", "lon:float"])
    poi_data["item_id:token"] = poi_data["item_id:token"].astype(str) + "_x"

    all_user_ids = (
    set(user_groups["high"])
    | set(user_groups["medium"])
    | set(user_groups["low"])
    )

    value_counts = train_data["item_id:token"].value_counts().reset_index()
    value_counts.columns = ["item_id:token", "count"]
    value_counts["item_pop"] = value_counts["count"] / len(value_counts)
    train_data = train_data.merge(
        value_counts[["item_id:token", "item_pop"]],
        on="item_id:token",
        how="left",
    )

    user_events = pd.concat([train_data, test_data])

#### Turn the POI data into a Geodataframe and perform filtering

In [7]:
def geolocation_poi_filter(poi_data, coordinates=(18.0680, 59.3293) , radius=20000, train_data=train_data):

    gdf = gpd.GeoDataFrame(
        poi_data,
        geometry=gpd.points_from_xy(poi_data["lon:float"], poi_data["lat:float"]),
        crs="EPSG:4326"  # WGS84
    )

    gdf = gdf.to_crs("EPSG:3857")
    stockholm_point = gpd.GeoSeries(
        [Point(coordinates)], crs="EPSG:4326"
    ).to_crs("EPSG:3857")
    buffer = stockholm_point.buffer(radius)  # 20 km buffer default

    filtered_gdf = gdf[gdf.geometry.within(buffer.unary_union)]
    filtered_gdf = filtered_gdf.to_crs("EPSG:4326")

    relevant_items = filtered_gdf["item_id:token"].unique().tolist()

    relevant_train_data = train_data[train_data["item_id:token"].isin(relevant_items)]

    relevant_users = relevant_train_data["user_id:token"].unique().tolist()

    filtered_gdf = pd.merge(filtered_gdf, relevant_train_data[["item_id:token", "item_pop"]], on="item_id:token", how="left")
    filtered_gdf = filtered_gdf.drop_duplicates(subset=["item_id:token"])

    return filtered_gdf, relevant_train_data, relevant_users



    



In [8]:


filtered_gdf, relevant_train_data, relevant_users = geolocation_poi_filter(poi_data, coordinates, train_data=train_data)

  filtered_gdf = gdf[gdf.geometry.within(buffer.unary_union)]


In [9]:
filtered_gdf.sort_values("item_pop", ascending=False).head(10)

Unnamed: 0,item_id:token,lat:float,lon:float,geometry,item_pop
241,1151_x,59.330158,18.058079,POINT (18.05808 59.33016),0.020765
771,1204_x,59.320201,18.071169,POINT (18.07117 59.3202),0.00855
1161,2001_x,59.332243,18.06192,POINT (18.06192 59.33224),0.006786
2547,962_x,59.323196,18.067038,POINT (18.06704 59.3232),0.006786
62,1127_x,59.330935,18.059249,POINT (18.05925 59.33093),0.006379
2853,1352_x,59.299108,18.08076,POINT (18.08076 59.29911),0.006107
11,1096_x,59.334317,18.062704,POINT (18.0627 59.33432),0.005565
2279,6123_x,59.323278,18.097186,POINT (18.09719 59.32328),0.00475
602,919_x,59.330825,18.07178,POINT (18.07178 59.33083),0.004615
193,1674_x,59.333096,18.069044,POINT (18.06904 59.3331),0.004479


# User Groups

### Stockholm Sample
(Gowalla + coordinates=(18.0686, 59.3293) & radius=20000)
* user g1 = 481_x
* user g2 = 1023_x
* user g3 = 130_x

* BPR: gowalla_sample-BPR-Dec-19-2024_20-23-30/baseline
* BPR CP: gowalla_sample-BPR-Dec-19-2024_20-23-30/cp
* LORE: gowalla_sample-contextpoi-LORE-Sep-16-2024_09-00-00/baseline

In [10]:
# training data for the 3 users

train_g1 = relevant_train_data.loc[relevant_train_data["user_id:token"] == "481_x"]
train_g2 = relevant_train_data.loc[relevant_train_data["user_id:token"] == "1023_x"]
train_g3 = relevant_train_data.loc[relevant_train_data["user_id:token"] == "130_x"]

In [11]:
# baseline recommendations for the 3 users
baseline = top_k_to_df(f"{BASE_DIR}{dataset}_dataset/recommendations/{dataset}_sample-BPR-Dec-19-2024_20-23-30/baseline/top_k_recommendations.json")
cp = top_k_to_df(f"{BASE_DIR}{dataset}_dataset/recommendations/{dataset}_sample-BPR-Dec-19-2024_20-23-30/cp/top_k_recommendations.json")
context = top_k_to_df(f"{BASE_DIR}{dataset}_dataset/recommendations/{dataset}_sample-contextpoi-LORE-Sep-16-2024_09-00-00/baseline/top_k_recommendations.json")


In [12]:
user_samples = {"HighPop": "481_x", "MedPop": "1023_x", "LowPop": "130_x"}
for key, item in user_samples.items():
    print(key)

HighPop
MedPop
LowPop


In [13]:
relevant_train_data

Unnamed: 0,user_id:token,item_id:token,checkin_count:float,item_pop
418,1016_x,1125_x,1,0.001357
419,1016_x,1652_x,1,0.001764
420,1016_x,1222_x,2,0.001629
421,1016_x,919_x,1,0.004615
422,1016_x,1207_x,1,0.000814
...,...,...,...,...
42330,99_x,4200_x,1,0.000679
42331,99_x,1685_x,1,0.003393
42339,99_x,5134_x,1,0.000407
42340,99_x,1818_x,1,0.000271


In [14]:
def parse_author(author_html):
    soup = BeautifulSoup(author_html, "html.parser")
    a_tag = soup.find("a")
    if a_tag:
        name = a_tag.text.strip()
        url = "https:" + a_tag["href"]
        return name, url
    return None, None

In [15]:
def data_prep_visualization(filtered_gdf, inters, user_samples):
    data_list = []
    
    for user_group, user_id in user_samples.items():
        user_inters = inters.loc[inters["user_id:token"] == user_id]
        relevant_items = filtered_gdf.loc[filtered_gdf["item_id:token"].isin(user_inters["item_id:token"])]
        relevant_items = relevant_items.reset_index(drop=True)

        photo_urls_list = []  # Store image info as a list of lists
        
        for _, row in relevant_items.iterrows():
            lat, lon = row.geometry.y, row.geometry.x  # lat, lon
            images, _ = get_wikimedia_images_at_point(lat, lon, wikimedia_headers)
            
            photo_infos = []
            for image in images:
                author_name, author_url = parse_author(image.get("author", ""))
                photo_infos.append({
                    "photo_url_thumb": image.get("photo_url_thumb"),
                    "author_name": author_name,
                    "author_url": author_url,
                    "credit": BeautifulSoup(image.get("credit", ""), "html.parser").text.strip(),
                    "license": image.get("license"),
                    "license_url": image.get("license_url"),
                })
            photo_urls_list.append(photo_infos)

        relevant_items["photo_infos"] = photo_urls_list
        data_list.append(relevant_items)

    return data_list


In [16]:
training_data_list = data_prep_visualization(filtered_gdf, relevant_train_data, user_samples)


In [17]:
training_data_list[0]["photo_infos"].values.tolist()

[[{'photo_url_thumb': 'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c9/Stockholms_centralstation_ljuskrona.JPG/1200px-Stockholms_centralstation_ljuskrona.JPG',
   'author_name': 'Jssfrk',
   'author_url': 'https://commons.wikimedia.org/wiki/User:Jssfrk',
   'credit': 'Own work',
   'license': 'CC BY-SA 3.0',
   'license_url': 'https://creativecommons.org/licenses/by-sa/3.0'},
  {'photo_url_thumb': 'https://upload.wikimedia.org/wikipedia/commons/thumb/7/7b/Stockholms_centralstation_interi%C3%B6r.JPG/1200px-Stockholms_centralstation_interi%C3%B6r.JPG',
   'author_name': 'Jssfrk',
   'author_url': 'https://commons.wikimedia.org/wiki/User:Jssfrk',
   'credit': 'Own work',
   'license': 'CC BY-SA 3.0',
   'license_url': 'https://creativecommons.org/licenses/by-sa/3.0'},
  {'photo_url_thumb': 'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a0/Stockholm_Centralstation-lockers.jpg/1200px-Stockholm_Centralstation-lockers.jpg',
   'author_name': 'Alexey M.',
   'author_url': 'h

In [18]:
baseline_list = data_prep_visualization(filtered_gdf, baseline, user_samples)
cp_list = data_prep_visualization(filtered_gdf, cp, user_samples)
#upd_list = data_prep_visualization(filtered_gdf, upd, user_samples)
context_list = data_prep_visualization(filtered_gdf, context, user_samples)

In [19]:
training_data_list[0]["photo_infos"].values.tolist()

[[{'photo_url_thumb': 'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c9/Stockholms_centralstation_ljuskrona.JPG/1200px-Stockholms_centralstation_ljuskrona.JPG',
   'author_name': 'Jssfrk',
   'author_url': 'https://commons.wikimedia.org/wiki/User:Jssfrk',
   'credit': 'Own work',
   'license': 'CC BY-SA 3.0',
   'license_url': 'https://creativecommons.org/licenses/by-sa/3.0'},
  {'photo_url_thumb': 'https://upload.wikimedia.org/wikipedia/commons/thumb/7/7b/Stockholms_centralstation_interi%C3%B6r.JPG/1200px-Stockholms_centralstation_interi%C3%B6r.JPG',
   'author_name': 'Jssfrk',
   'author_url': 'https://commons.wikimedia.org/wiki/User:Jssfrk',
   'credit': 'Own work',
   'license': 'CC BY-SA 3.0',
   'license_url': 'https://creativecommons.org/licenses/by-sa/3.0'},
  {'photo_url_thumb': 'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a0/Stockholm_Centralstation-lockers.jpg/1200px-Stockholm_Centralstation-lockers.jpg',
   'author_name': 'Alexey M.',
   'author_url': 'h

In [20]:
m = folium.Map(location=[coordinates[1], coordinates[0]], zoom_start=12)

def create_layer_structure(group_key, group_name, group_color, data_lists, data_labels, data_icons):
    group_layer = {"label": group_key, "select_all_checkbox": True, "children": []}
    
    for data_list, data_label, data_icon in zip(data_lists, data_labels, data_icons):
        sublayer = folium.FeatureGroup(name=f"{data_label} ({group_name})")
        

        for df in data_list:  
            for _, row in df.iterrows():
                item_pop_percentage = f"{row['item_pop'] * 100:.4f}%"

   
                photo_infos = row.get("photo_infos", [])
                images_html = ""

                if photo_infos and isinstance(photo_infos, list) and len(photo_infos) > 0:
                    for idx, photo_info in enumerate(photo_infos):
                        img_url = photo_info.get("photo_url_thumb", "")
                        license = photo_info.get("license", "Unknown")
                        license_url = photo_info.get("license_url", "#")
                        author_name = photo_info.get("author_name", "Unknown")
                        author_url = photo_info.get("author_url", "#")

                        active_class = "active" if idx == 0 else ""
                        caption_html = f"""
                            <div class="text-center" style="font-size: 0.75rem; margin-top: 4px;">
                                <a href="{photo_info.get('photo_url_thumb', '#')}" target="_blank">View Photo</a><br>
                                © <a href="{author_url}" target="_blank">{author_name}</a>,
                                <a href="{license_url}" target="_blank">{license}</a> (no changes made)
                            </div>
                        """

                        images_html += f"""
                            <div class="carousel-item {active_class}">
                                <img src="{img_url}" class="d-block w-100" style="max-height: 200px; object-fit: contain;" alt="POI Image">
                                {caption_html}
                            </div>
                        """

                    carousel_html = f"""
                        <div id="carousel-{row['item_id:token']}" class="carousel slide" data-bs-ride="carousel" style="max-height: 250px; overflow: hidden;">
                            <div class="carousel-inner">
                                {images_html}
                            </div>
                            <a class="carousel-control-prev" href="#carousel-{row['item_id:token']}" role="button" data-bs-slide="prev">
                                <span class="carousel-control-prev-icon" aria-hidden="true"></span>
                                <span class="visually-hidden">Previous</span>
                            </a>
                            <a class="carousel-control-next" href="#carousel-{row['item_id:token']}" role="button" data-bs-slide="next">
                                <span class="carousel-control-next-icon" aria-hidden="true"></span>
                                <span class="visually-hidden">Next</span>
                            </a>
                        </div>
                    """
                else:
                    carousel_html = "<p><i>No images available</i></p>"

                popup_content = f"""
                    <b>User ID:</b> {user_samples[group_key]} <br>
                    <b>POI ID:</b> {row['item_id:token']}<br>
                    <b>POI Popularity:</b> {item_pop_percentage}<br>
                    <b>Data Type:</b> {data_label}<br>
                    <b>User Group:</b> {group_key} <br>
                    {carousel_html}
                """

                folium.Marker(
                    location=(row.geometry.y, row.geometry.x),
                    popup=folium.Popup(popup_content, max_width=300),
                    icon=folium.Icon(color=group_color, icon=data_icon, prefix="glyphicon"),
                ).add_to(sublayer)
        
        group_layer["children"].append({"label": data_label, "layer": sublayer})
        sublayer.add_to(m)
    
    return group_layer


user_samples = {"HighPop": "481_x", "MedPop": "1023_x", "LowPop": "130_x"}
color_scheme = ["darkpurple", "blue", "green"]
data_lists = [training_data_list, baseline_list, cp_list, context_list]
data_labels = ["Training Data", "Baseline", "Calibrated Popularity", "Context-Aware"]
data_icons = ["glyphicon glyphicon-cog", "glyphicon glyphicon-star-empty", "glyphicon-repeat", "glyphicon-eye-open"]

overlay_tree = {"label": "User Groups", "select_all_checkbox": "Select/Deselect All", "children": []}

for group_idx, (group_key, group_name, group_color) in enumerate(zip(user_samples.keys(), user_samples.values(), color_scheme)):

    data_lists_for_group = [
        [training_data_list[group_idx]],  # Training data for the current group
        [baseline_list[group_idx]],      # Baseline recommendations for the current group
        [cp_list[group_idx]],            # CP recommendations for the current group
        [context_list[group_idx]],       # Context-aware recommendations for the current group
    ]
    

    overlay_tree["children"].append(
        create_layer_structure(group_key, group_name, group_color, data_lists_for_group, data_labels, data_icons)
    )

# Add 20 km buffer as a circle (adjust coordinates as needed)
folium.Circle(
    location=coordinates,
    radius=20000,
    color="red",
    fill=False,
    fill_opacity=0.2,
).add_to(m)

# Add TreeLayerControl for filtering layers
TreeLayerControl(overlay_tree=overlay_tree).add_to(m)

legend_html = """
<div style="position: fixed; bottom: 50px; left: 50px; width: 220px; height: auto;
            background-color: white; border: 2px solid grey; z-index: 9999; font-size: 14px;
            padding: 10px; box-shadow: 2px 2px 10px rgba(0,0,0,0.2);">
    
    <b>About This Tool</b> <br>
    <button class="btn btn-info btn-sm" type="button" data-bs-toggle="collapse" data-bs-target="#aboutTool">More Info</button>
    <div id="aboutTool" class="collapse" style="margin-top: 5px;">
        This demo is part of a master's thesis on popularity bias mitigation in POI recommender systems. 
        We chose three users from the Gowalla dataset around Stockhom and visualize the places they visited in the past & the recommendations they received from different algorithms.
    </div>
    
    <hr>
    
    <b><i class="glyphicon glyphicon-user"></i> User Groups</b> <br>
    <button class="btn btn-secondary btn-sm" type="button" data-bs-toggle="collapse" data-bs-target="#userGroups">Learn More</button>
    <div id="userGroups" class="collapse" style="margin-top: 5px;">
        Users are divided into three groups based on check-in popularity:<br>
        <b style="color: green;"><i class="glyphicon glyphicon-user"></i> LowPop</b> <br>
        <b style="color: blue;"><i class="glyphicon glyphicon-user"></i> MedPop</b> <br>
        <b style="color: purple;"><i class="glyphicon glyphicon-user"></i> HighPop</b> <br>
        
    </div>
    
    <hr>
    
    <b><i class="glyphicon glyphicon-cog"></i> Training Data</b> <br>
    <button class="btn btn-secondary btn-sm" type="button" data-bs-toggle="collapse" data-bs-target="#trainingData">Learn More</button>
    <div id="trainingData" class="collapse" style="margin-top: 5px;">
        Locations that a user has visited in the past, used to calculate future recommendations.
    </div>
    
    <hr>
    
    <b><i class="glyphicon glyphicon-signal"></i> Recommendation Algorithms</b> <br>
    <button class="btn btn-secondary btn-sm" type="button" data-bs-toggle="collapse" data-bs-target="#algorithms">Learn More</button>
    <div id="algorithms" class="collapse" style="margin-top: 5px;">
        <b><i class="glyphicon glyphicon-star-empty"></i> Baseline:</b> Standard model (BPR). Recommends many popular places <br>
        <b><i class="glyphicon glyphicon-repeat"></i> Calibrated Popularity:</b> Adjusts recommendations to match user’s past preferences. <br>
        <b><i class="glyphicon glyphicon-eye-open"></i> Context-Aware POI Recommender:</b> Uses LORE algorithm to factor in time, geography, and visit sequences. Recommends many niche places.
    </div>
</div>

"""

legend = Element(legend_html)
m.get_root().html.add_child(legend)

m.save("poi_visualization_demo_tool.html")

m
