In [None]:
import json

def load_json(file_name):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Prints a status line in every case. Returns None when the file does not
    exist or does not contain valid JSON.
    """
    parsed = None
    try:
        with open(file_name, mode='r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print("Error: The file does not exist.")
    except json.JSONDecodeError:
        print("Error: The file is not a valid JSON.")
    else:
        # Only reached when both opening and decoding succeeded.
        print("Data loaded successfully from", file_name)
    return parsed

In [None]:
# Load the item catalogue. load_json prints a message and returns None on failure,
# so stop here with a clear error instead of failing later on `None["items"]`.
data = load_json("./socialmaps-items.json")
if data is None:
    raise RuntimeError("Could not load ./socialmaps-items.json; see the message printed above.")
items = data["items"]

# Index the items by id. The assertion catches duplicate ids, which would
# otherwise silently overwrite each other in the dictionary.
ITEMS_BY_ID = {item['id']: item for item in items}
assert len(items) == len(ITEMS_BY_ID), "duplicate item ids in ./socialmaps-items.json"
len(ITEMS_BY_ID)

In [None]:
def find_unique_tags(item_list):
    """Return the set of distinct tags found across all items.

    Items without a "tags" key, or whose "tags" value is not a list, are ignored.
    """
    tag_lists = (
        entry["tags"]
        for entry in item_list
        if "tags" in entry and isinstance(entry["tags"], list)
    )
    return {tag for tags in tag_lists for tag in tags}

# Collect every distinct tag used by the loaded items and print the set for inspection.
unique_tags = find_unique_tags(items)

print(unique_tags)



In [None]:
def find_unique_primarytopics(item_list):
    """Return the set of distinct "primaryTopic" values found across all items.

    Items without a "primaryTopic" key, or whose value is not a string, are ignored.
    """
    return {
        entry["primaryTopic"]
        for entry in item_list
        if "primaryTopic" in entry and isinstance(entry["primaryTopic"], str)
    }

# Collect every distinct primary topic used by the loaded items and print the set.
unique_primarytopics = find_unique_primarytopics(items)
print(unique_primarytopics)


In [None]:
import os
import re
import json


## The code below aggregates all JSON exports from:
# search: https://socialmap-berlin.de/report/search/YYYY-MM
# browse: https://socialmap-berlin.de/report/item/YYYY-MM
# (subsequent processing will correctly interpret search and browsing behavior)

# Only files in the working directory named "goaccess-<digits>.json" are picked up.
pattern = re.compile(r'^goaccess-\d+\.json$')

json_objects = []

# os.listdir() returns names in arbitrary order; sort them so json_objects
# (which is also indexed by position further down) is the same on every run.
files_in_dir = sorted(os.listdir())

for filename in files_in_dir:
    if pattern.match(filename):
        # Decode as UTF-8 explicitly, as load_json does, instead of relying on
        # the platform's default encoding.
        with open(filename, 'r', encoding='utf-8') as file:
            json_content = json.load(file)
        print(filename)
        json_objects.append(json_content)


len(json_objects)

In [None]:
from dateutil import parser

class LocaleParserInfo(parser.parserinfo):
    """dateutil parser info whose month names cover German and English spellings.

    Each tuple in MONTHS lists the accepted names for one month, in calendar order.
    """

    MONTHS = [
        ('Jan', 'Januar', 'January', 'Jänner'),
        ('Feb', 'Februar', 'February'),
        ('Mrz', 'März', 'March', 'Mar', 'Mär'),
        ('Apr', 'April'),
        ('Mai', 'May'),
        ('Jun', 'Juni', 'June'),
        ('Jul', 'Juli', 'July'),
        ('Aug', 'August'),
        ('Sep', 'September'),
        ('Okt', 'Oktober', 'October', 'Oct', 'Okt'),
        ('Nov', 'November'),
        ('Dez', 'Dezember', 'Dec', 'December'),
    ]

def process_requesturls_file(json_obj_, cumulative_values_):
    """Tally the hits of one report export by user journey.

    Request paths starting with "/search/" count as searches (keyed by the
    search term), paths starting with "/item/" count as browsing (keyed by the
    item id). Item ids starting with "new_" count as suggestions and are only
    added to the summary.

    Args:
        json_obj_: decoded export. Reads json_obj_["general"]["start_date"] and
            json_obj_["requests"]["data"], where every request carries "data"
            (the path) and "hits"["count"].
        cumulative_values_: dict with "search" and "browse" sub-dicts. Updated
            in place with this file's hits so totals accumulate across files.

    Returns:
        (key, values, summary): key is the start month formatted "YYYY-MM";
        values maps search term or item id to the hits in this file; summary
        holds the total hits for "search", "browse" and "suggest".
    """
    start_date = parser.parse(
        json_obj_["general"]["start_date"], dayfirst=True, parserinfo=LocaleParserInfo()
    )

    key = start_date.strftime("%Y-%m")
    values = dict()
    summary = dict(search=0, browse=0, suggest=0)
    for r in json_obj_["requests"]["data"]:
        n_hits = int(r["hits"]["count"])
        path = r["data"]

        if path.startswith("/search/"):
            searchterm = path[len("/search/"):]

            values[searchterm] = values.get(searchterm, 0) + n_hits
            summary["search"] += n_hits
            cumulative_values_["search"][searchterm] = cumulative_values_["search"].get(searchterm, 0) + n_hits

        elif path.startswith("/item/"):
            item = path[len("/item/"):]

            if item.startswith("new_"):
                summary["suggest"] += n_hits
            else:
                values[item] = values.get(item, 0) + n_hits
                summary["browse"] += n_hits
                cumulative_values_["browse"][item] = cumulative_values_["browse"].get(item, 0) + n_hits

        else:
            # Unrecognised path: report it and carry on. Bailing out here would
            # leave cumulative_values_ half-updated for this file and hand the
            # caller a result it cannot subscript.
            print("??", r)
    return key, values, summary

# Totals across all exports, filled in by process_requesturls_file.
cumulative_values = {"search": {}, "browse": {}}

vega_summary_data = []

for jo in json_objects:
    timeframe, values, summary = process_requesturls_file(jo, cumulative_values)

    # One row per export: labelled "search" when it has any search hits,
    # otherwise "browse" with its browse hits.
    if summary["search"] > 0:
        journey, journey_hits = "search", summary["search"]
    else:
        journey, journey_hits = "browse", summary["browse"]
    vega_summary_data.append({"month": timeframe, "user_journey": journey, "hits": journey_hits})

    # Exports with browse hits also get a row for suggestions.
    if summary["browse"] > 0:
        vega_summary_data.append({"month": timeframe, "user_journey": "suggest", "hits": summary["suggest"]})
    print(timeframe, summary)


In [None]:
# Spot check: print this id if it appears among the browsed item ids.
probe_id = "9749276938bd792e"
if probe_id in cumulative_values["browse"]:
    print(probe_id)

In [None]:
import altair as alt
import pandas as pd

# Hits per month, one line per user journey.
summary_frame = pd.DataFrame(vega_summary_data)
alt.Chart(summary_frame).mark_line().encode(
    x=alt.X('month:T'),
    y=alt.Y('hits'),
    color=alt.Color('user_journey'),
).properties(width=600)

In [None]:

# Distribution of monthly hits for each user journey.
journey_frame = pd.DataFrame(vega_summary_data)
alt.Chart(journey_frame).mark_boxplot().encode(
    x=alt.X('user_journey'),
    y=alt.Y('hits'),
    color=alt.Color('user_journey'),
).properties(width=200)

In [None]:
# Histogram of cumulative browse hits per catalogue entry.
browse_totals = cumulative_values["browse"]

# note that not all hits correspond to active entries
item_hits = [
    dict(value=browse_totals[item_id])
    for item_id in ITEMS_BY_ID
    if item_id in browse_totals
]
# Entries that never show up in the browse totals.
zero = len(ITEMS_BY_ID) - len(item_hits)

share_without_hits = round(zero/len(ITEMS_BY_ID) * 100)
histogram_title = "From 2022-09 until 2024-03, only entries with at least 1 hit. {}% entries not represented (no hits) of {} total".format(
    share_without_hits, len(ITEMS_BY_ID))

chart = alt.Chart(pd.DataFrame(item_hits), width=800, title=histogram_title).mark_bar().encode(
    alt.X('value', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', title='Number of entries with this many hits')
)
chart

In [None]:
websites = dict()  # website URL -> primary topic of the last entry seen with that URL
websites_w_mx_topics = set()  # URLs whose entries disagree on the primary topic

entries_with_website = 0

# website or title duplicates
for item_id, item in ITEMS_BY_ID.items():
    if not item["website"]:
        continue
    entries_with_website += 1
    if item["website"] in websites and websites[item["website"]] != item["primaryTopic"]:
        #print(item["website"], "has at least 2 diff topics:", item["primaryTopic"], websites[item["website"]])
        websites_w_mx_topics.add(item["website"])
    websites[item["website"]] = item["primaryTopic"]

if entries_with_website:
    # The messages carry a "%" sign, so scale the ratios to percentages
    # (the histogram titles in this notebook do the same).
    print("{}% of unique websites in entries having websites".format(
        round(len(websites) / entries_with_website * 100, 2)))
    # (original note: few have multiple)
    print("{}% of entries having websites that have different primary topic in the two different entries".format(
        round(len(websites_w_mx_topics) / len(websites) * 100, 2)))
else:
    # Nothing to divide by: no entry carries a website.
    print("No entries have a website.")

In [None]:
# Pool the browse hits of entries that share a website URL; entries without a
# website are keyed by their own id.
hits_deduped = dict()
browse_totals = cumulative_values["browse"]
for item_id, item in ITEMS_BY_ID.items():
    key = item["website"] or item_id
    hits_deduped[key] = hits_deduped.get(key, 0) + browse_totals.get(item_id, 0)

# Split the pooled keys into those with hits (charted) and those without (counted).
item_hits = []
zero = 0
for website, hits in hits_deduped.items():
    if not hits:
        zero += 1
        continue
    item_hits.append({"value": hits})

share_without_hits = round(zero/len(hits_deduped) * 100)
histogram_title = "From 2022-09 until 2024-03; deduplicated based on website URL. {}% entries not represented (no hits) of {} total".format(
    share_without_hits, len(hits_deduped))

chart = alt.Chart(pd.DataFrame(item_hits), width=800, title=histogram_title).mark_bar().encode(
    alt.X('value', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', title='Number of entries with this many hits')
)
chart

In [None]:
# hits by primary topic
hits_by_topic = dict()
notopic = 0
for item_id, item in ITEMS_BY_ID.items():
    primary = item["primaryTopic"]
    if not primary:
        # Entries without a primary topic are only counted, not aggregated.
        notopic += 1
        continue
    topic_stats = hits_by_topic.setdefault(primary, dict(
        hits=0, entries=0, entries_nonprimary=0
    ))
    topic_stats["hits"] += cumulative_values["browse"].get(item_id, 0)
    topic_stats["entries"] += 1 # primary

print("NO TOPIC IN %s ENTRIES"%notopic)

# For every topic, count the entries that have a different primary topic but
# carry at least one tag containing the topic name as a substring.
for topic, v in hits_by_topic.items():
    for item_id, item in ITEMS_BY_ID.items():
        if item["primaryTopic"] == topic:
            # Already counted as a primary entry of this topic.
            continue
        for tag in item["tags"]:
            # The substring match below is case-sensitive, so both sides must be lower-case.
            assert tag == tag.lower()
            assert topic == topic.lower()
            if topic in tag:
                v["entries_nonprimary"] += 1
                # Count each entry at most once per topic.
                break
    
    print("Topic: %s Primary: %s Non-primary: %s"%(topic, v["entries"], v["entries_nonprimary"]))

# One chart row per topic: hits plus total, primary and non-primary entry counts.
vega_data = [
    {
        "topic": topic_name,
        "hits": stats["hits"],
        "entries": stats["entries"] + stats["entries_nonprimary"],
        "entries_primary": stats["entries"],
        "entries_nonprimary": stats["entries_nonprimary"],
    }
    for topic_name, stats in hits_by_topic.items()
]
    

# Two stacked bar charts per primary topic: browse hits, and number of entries.
hits_chart = alt.Chart(
    pd.DataFrame([dict(topic=k, hits=v["hits"]) for k, v in hits_by_topic.items()]),
    width=800, title="Topics by current visibility",
).mark_bar().encode(
    x=alt.X('topic:N', title='Primary Topic', sort='-y'),
    y=alt.Y('hits:Q', title='Hits')
)
# This chart plots entry counts, so name the field and the axis accordingly.
entries_chart = alt.Chart(
    pd.DataFrame([dict(topic=k, entries=v["entries"]) for k, v in hits_by_topic.items()]),
    width=800, title="Topics by current availability of organizations",
).mark_bar().encode(
    x=alt.X('topic:N', title='Primary Topic', sort='-y'),
    y=alt.Y('entries:Q', title='Entries')
)
# Only a cell's last expression is rendered automatically and this is not the
# last one, so show it explicitly (display is provided by the notebook kernel).
display(alt.vconcat(hits_chart, entries_chart))


# Shared base chart: one x position per topic.
base = alt.Chart(
    pd.DataFrame(vega_data),
    title="Current availability and visibility of entries by topic",
).encode(
    x=alt.X('topic:N', sort='-y')
)
# Hits are drawn as red squares on top of each grey bar variant.
line = base.mark_square(color='red').encode(y='hits:Q')
bar, bar2, bar3 = (
    base.mark_bar(color='lightgray').encode(y=field)
    for field in ('entries:Q', 'entries_nonprimary:Q', 'entries_primary:Q')
)

# Side by side: all entries, non-primary entries, primary entries; each panel
# keeps independent y scales for bars and hits.
alt.hconcat(*[
    alt.layer(bars, line).resolve_scale(y='independent')
    for bars in (bar, bar2, bar3)
])

In [None]:
# Sort by opportunity for visibility - ie, where hits less than availability

# Topics need at least this many entries (primary + non-primary) to be listed.
MIN_ENTRIES_FOR_RANKING = 100

for k, v in hits_by_topic.items():
    total_entries = v["entries"] + v["entries_nonprimary"]
    # A topic with entries but no hits at all would divide by zero; treat it
    # as having unbounded opportunity so it sorts first.
    v["opportunity"] = total_entries / v["hits"] if v["hits"] else float("inf")

for topic, v in sorted(hits_by_topic.items(), key=lambda item: item[1]["opportunity"], reverse=True):
    # having minimum 100 entries
    if v["entries"] + v["entries_nonprimary"] >= MIN_ENTRIES_FOR_RANKING:
        print(topic)

In [None]:
"""
arts
sports
recreation
health
kindergarden
education
self_help
volunteer_work
"""

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np


def pca_(tags_list, n_components): # expects 1 flat list of strings
    """Return the share of variance captured by the leading principal components.

    Every string in tags_list is treated as one document. Documents are turned
    into token counts, each token column is standardised, and PCA is fitted on
    the result.

    Args:
        tags_list: flat list of strings, one per document.
        n_components: number of principal components requested. An integer
            larger than min(n_documents, n_tokens) is capped to that value,
            which is the most PCA can extract.

    Returns:
        The sum of the explained-variance ratios of the fitted components, or
        0.0 when there are fewer than two documents (no variance to explain).
    """
    if len(tags_list) < 2:
        return 0.0

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(tags_list)

    X_normalized = StandardScaler().fit_transform(X.toarray())

    # PCA rejects an integer n_components above min(n_samples, n_features),
    # which happens for topics with only a handful of entries.
    if isinstance(n_components, int):
        n_components = min(n_components, *X_normalized.shape)

    pca = PCA(n_components=n_components)
    pca.fit(X_normalized)

    return sum(pca.explained_variance_ratio_)

In [None]:
# 1. breaking up topics that have high opportunity
# 2. and then cluster the rest - the bags of words are per topic
for topic, v in hits_by_topic.items():
    # An entry belongs to the topic when it is its primary topic or when any of
    # its tags contains the topic name; its bag of words is its joined tags.
    tags_list = [
        " ".join(item["tags"])
        for item in ITEMS_BY_ID.values()
        if item["primaryTopic"] == topic or any(topic in tag for tag in item["tags"])
    ]
    print(topic, pca_(tags_list, 10))

In [None]:
# Share of all search hits whose term contains "hous" or "wohn".
# (Alternative filter tried earlier: "kinder" / "kita".)
TERM_MARKERS = ("hous", "wohn")

tot = 0
filt = 0
for term, hits in cumulative_values["search"].items():
    tot += hits
    lowered = term.lower()
    if any(marker in lowered for marker in TERM_MARKERS):
        filt += hits

filt/tot

In [None]:
# Compare, per first letter of the title, the share of catalogue entries with
# the share of browse hits.
data_by_letter = dict()


def _first_letter(title):
    """Return the first character of the lower-cased, stripped title ('' if blank)."""
    return title.lower().strip()[:1]


def _share(part, total):
    """Return part / total, or 0.0 when total is zero."""
    return part / total if total else 0.0


for itemid, hits in cumulative_values["browse"].items():
    item = ITEMS_BY_ID.get(itemid)
    if not item:
        # Hit on an id that is not in the catalogue.
        continue
    key = _first_letter(item["title"])
    if not key:
        continue
    data_by_letter.setdefault(key, dict(hits=0, entries=0))["hits"] += hits

for item in ITEMS_BY_ID.values():
    key = _first_letter(item["title"])
    if not key:
        continue
    # Each item counts as exactly one entry (not as the leftover `hits` value
    # of the loop above).
    data_by_letter.setdefault(key, dict(hits=0, entries=0))["entries"] += 1

tot_hits = sum(v["hits"] for v in data_by_letter.values())
tot_entries = sum(v["entries"] for v in data_by_letter.values())

vega_letter_data = []
for k, v in data_by_letter.items():
    # Both measures are proportions of their respective totals.
    vega_letter_data.append(dict(
        letter=k,
        measure="entries (proportion of total)",
        count=_share(v["entries"], tot_entries)
    ))
    vega_letter_data.append(dict(
        letter=k,
        measure="hits (proportion of total)",
        count=_share(v["hits"], tot_hits)
    ))

# Spot check for the letter "a"; fall back to zeros if no title starts with it.
letter_a = data_by_letter.get("a", dict(hits=0, entries=0))
print(letter_a["hits"])
print(_share(letter_a["hits"], tot_hits))
print(_share(letter_a["entries"], tot_entries))

alt.Chart(pd.DataFrame(vega_letter_data),
                  width=800, title="Alphabetic sort strongly affects visibility").mark_line().encode(
    x=alt.X('letter:O'),
    y=alt.Y('count:Q'),
    color='measure'
)

In [None]:
# Dimensionality reduction based on categories - not quite right?

In [None]:
import re

# Build one text per item for clustering, plus a parallel list of item ids.
texts_to_cluster = []
item_ids = []
for item in ITEMS_BY_ID.values():
    brief = item["brief"]
    parts = [
        brief.get("de", ""),
        # Drop the "[DeepL:] " marker from the English brief.
        brief.get("en", "").replace("[DeepL:] ", ""),
        item["title"],
        # Join the item's own tags (not the characters of the word "tags").
        " ".join(item["tags"]),
        item["primaryTopic"] if item["primaryTopic"] else "",
    ]
    # Space-separate every part so the title does not fuse with the first tag,
    # then strip all digit runs from the combined text.
    texts_to_cluster.append(re.sub(r'\d+', '', " ".join(parts)))
    item_ids.append(item["id"])

len(texts_to_cluster)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

from nltk.corpus import stopwords
# Stop words: scikit-learn's English list combined with NLTK's German list.
german_stop_words = stopwords.words('german')
combined_stop_words = list(text.ENGLISH_STOP_WORDS.union(german_stop_words))
print("STOP WORDS COMBO:", len(combined_stop_words))

# TF-IDF over all item texts; this vectorizer stays tied to X.
vectorizer = TfidfVectorizer(stop_words=combined_stop_words)
X = vectorizer.fit_transform(texts_to_cluster)

num_clusters = 15
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

clusters = kmeans.fit_predict(X)

cluster_combos = []
for i in range(num_clusters):
    cluster_indices = np.where(clusters == i)[0]
    cluster_texts = [texts_to_cluster[idx] for idx in cluster_indices]
    cluster_combos.append(" ".join(cluster_texts))

    if len(cluster_texts) == 0:
        # Nothing to label; fitting a vectorizer on no documents would raise.
        continue

    # Label the cluster with a vectorizer of its own. Re-fitting the shared
    # `vectorizer` here would replace the vocabulary that X was built with.
    label_vectorizer = TfidfVectorizer(stop_words=combined_stop_words)
    X_cluster = label_vectorizer.fit_transform(cluster_texts)

    feature_names = label_vectorizer.get_feature_names_out()

    average_tfidf_scores = np.mean(X_cluster.toarray(), axis=0)

    # Highest average TF-IDF first; the top 7 words form the label.
    sorted_indices = np.argsort(average_tfidf_scores)[::-1]

    top_words_indices = sorted_indices[:7]
    top_words = [feature_names[idx] for idx in top_words_indices]

    label = " ".join(top_words)

    print(i + 1,",",len(cluster_indices),",", label)
    # Store the 1-based cluster id and its label on every item of the cluster.
    for idx in cluster_indices:
        ITEMS_BY_ID[item_ids[idx]]["Cluster_ID"] = i+1
        ITEMS_BY_ID[item_ids[idx]]["Cluster_Words"] = label

In [None]:
# Comma-separated listing of every item with its cluster assignment; commas in
# titles are replaced by spaces so they do not add columns.
print("id , title , Cluster_ID, Cluster_Words")
for item in ITEMS_BY_ID.values():
    row = [item["id"], item["title"].replace(",", " "), item["Cluster_ID"], item["Cluster_Words"]]
    print(*row, sep=" , ", end=" ,\n")

In [None]:
# Hits per month for individual catalogue entries, one row per (month, item).
# NOTE(review): the original cell stopped mid-statement (`if ITEMS_BY_ID`) and
# could not be parsed; the row layout below is reconstructed from the variable
# names - confirm it matches the intended chart.
vega_values_data = []

for jo in json_objects:
    # Use a throwaway accumulator: these exports were already added to
    # cumulative_values when the summary was built, and adding them a second
    # time would double every cumulative total.
    scratch_totals = dict(search=dict(), browse=dict())
    timeframe, values, summary = process_requesturls_file(jo, scratch_totals)
    if values is None:
        # Guard against a None result for an export that could not be processed.
        continue
    for item_id, hits in values.items():
        # `values` also holds search terms; keep only ids of known catalogue entries.
        if not ITEMS_BY_ID.get(item_id):
            continue
        vega_values_data.append(dict(month=timeframe, item_id=item_id, hits=hits))
    print(timeframe, summary)

len(vega_values_data)


In [None]:
# hits by primary topic
hits_by_topic = dict()
notopic = 0
for item_id, item in ITEMS_BY_ID.items():
    primary = item["primaryTopic"]
    if not primary:
        # Entries without a primary topic are only counted, not aggregated.
        notopic += 1
        continue
    topic_stats = hits_by_topic.setdefault(primary, dict(hits=0, entries=0))
    topic_stats["hits"] += cumulative_values["browse"].get(item_id, 0)
    topic_stats["entries"] += 1

# Long format: two rows per topic, "hits" first and "entries" second.
vega_data = []
for k, v in hits_by_topic.items():
    vega_data.extend(
        dict(topic=k, count=v[measure], measure=measure)
        for measure in ("hits", "entries")
    )

print("NO TOPIC IN %s ENTRIES"%notopic)

# One bar-chart row per measure, topics ordered by descending value.
chart = alt.Chart(
    pd.DataFrame(vega_data),
    width=800,
    title="Topics by current availability and visibility",
).mark_bar().encode(
    x=alt.X('topic:N', title='Primary Topic', sort='-y'),
    y=alt.Y('count:Q'),
    row='measure'
)
chart

In [None]:
import altair as alt
from vega_datasets import data

# Layered bar + line example on the vega_datasets "wheat" data set.
# NOTE(review): `data` here is the vega_datasets loader imported at the top of
# this cell; that import rebinds the `data` name assigned from load_json in the
# loading cell. `base`, `bar` and `line` likewise rebind names used by the
# topic charts.
source = data.wheat()

# Shared x encoding for both layers.
base = alt.Chart(source).encode(x='year:O')

bar = base.mark_bar().encode(y='wheat:Q')

line =  base.mark_line(color='red').encode(
    y='wages:Q'
)

# `+` layers the two charts; the layered chart is the cell's displayed output.
(bar + line).properties(width=600)

In [None]:
# Inspect the second loaded export: its date range, then hits, visitors and
# search term for every search request.
second_export = json_objects[1]
for date_field in ("start_date", "end_date"):
    print(second_export["general"][date_field])

for r in second_export["requests"]["data"]:
    if not r["data"].startswith("/search/"):
        continue
    searchterm = r["data"][len("/search/"):]
    print(r["hits"]["count"], r["visitors"]["count"], searchterm)

In [None]:
import json


# Write the items (with their cluster fields) and the cumulative hit totals to
# JSON files, indented by 4 spaces.
for out_path, payload in (
    ("ClusteredItems.json", ITEMS_BY_ID),
    ("CumulativeHitsData.json", cumulative_values),
):
    with open(out_path, 'w') as json_file:
        json.dump(payload, json_file, indent=4)