# Top2vec - search

Explore a top2vec model based on data.gouv.fr's catalog.

## Model training instructions

**warning** `learn_top2vec.py` needs a python 3.10 env to run (some libraries not compatible with python > 3.10 ATM).

You need to install [qsv](https://qsv.dathere.com) in order to preprocess the CSV catalog file.

```shell
pip install top2vec[sentence_transformers]
# download the latest catalog
make download
# create a cleaned version of the catalog
make filter
# train the model
python learn_top2vec.py train-model
```

In [83]:
from top2vec import Top2Vec

In [84]:
# Name of the embedding model; it is used to build the saved model's file name.
embedding_model = "distiluse-base-multilingual-cased"
model_file = "top2vec_" + embedding_model + ".bin"
# Load the previously saved Top2Vec model from disk.
model = Top2Vec.load(model_file)

In [85]:
# Display the number of topics reported by the loaded model.
model.get_num_topics()

541

## Use keywords.yml for search_topics

Tokenize keywords.yml for every theme and apply search_topics for keywords that are in the vector space.

Get a list of interesting topics for every theme, filter them by common topics (too common, not precise enough) and manual pruning. Use `score = topic_score_in_query * doc_score_in_topic` to normalize scores across topics for a given theme. Keep the common topics apart because they might be interesting overall, even if they're not specific to a theme.

Export a list of documents for every preferred topic(s) for every theme.

In [86]:
from top2vec.Top2Vec import default_tokenizer

In [87]:
from utils import load_raw_keywords, load_kw_file

In [88]:
# For every theme of the keywords file, tokenize each tag with top2vec's
# default tokenizer: theme -> list of token lists (one token list per tag).
tokenized_tags = {
    theme: [default_tokenizer(tag) for tag in tags]
    for theme, tags in load_kw_file().items()
}
tokenized_tags

{'Gestion des eaux': [['eau'],
  ['stress', 'hydrique'],
  ['trame', 'bleue'],
  ['eau', 'de', 'pluie'],
  ['eaux', 'pluviales'],
  ['eaux', 'de', 'surface'],
  ['eaux', 'souterraines']],
 'Qualité des sols': [['sol'],
  ['poussiere'],
  ['sediment'],
  ['sous', 'sol'],
  ['permeabilite', 'du', 'sol'],
  ['permeabilite', 'du', 'sous', 'sol'],
  ['geologie'],
  ['nappe']],
 'Aménagement et occupation du sol': [['sol'],
  ['occupation'],
  ['usage'],
  ['amiante'],
  []],
 'Champs électromagnétiques': [['rayonnements', 'ionisants'],
  ['rayonnements', 'non', 'ionisants'],
  ['rayonnements', 'optiques'],
  ['nanoparticules', 'de', 'carbone'],
  ['champs'],
  ['emetteur', 'de', 'radiofrequences'],
  ['haute', 'frequence'],
  ['antennes', 'relais'],
  ['antennes', 'de', 'telediffusion'],
  ['radars'],
  ['lignes', 'haut', 'tension'],
  ['lignes', 'tres', 'haute', 'tension'],
  ['basse', 'frequence']],
 "Qualité de l'air": [['air'],
  ['monoxyde', 'de', 'carbone'],
  ['oxyde', 'azote'],
  ['

In [89]:
def top_topics_for_theme(theme, num_topics=5):
    """Search the model topics closest to the keywords of ``theme``.

    The tokenized tags of the theme (read from ``tokenized_tags``) are
    flattened into single words, and only the words present in
    ``model.vocab`` are kept. Expressions are therefore split
    (eg "trame bleue" becomes "trame", "bleue"); keeping them together
    may be worth trying.

    Returns the ``(topic_nums, topic_scores)`` pair produced by
    ``model.search_topics`` for at most ``num_topics`` topics, or an empty
    list (after printing a notice) when no word of the theme is in the
    vocabulary.
    """
    known_words = []
    for tokens in tokenized_tags[theme]:
        known_words.extend(word for word in tokens if word in model.vocab)

    if not known_words:
        print(f"Not tag in keyword space for {theme}")
        return []

    # topic words and word scores are returned too, but are not needed here
    _, _, topic_scores, topic_nums = model.search_topics(
        keywords=known_words, num_topics=num_topics
    )
    return topic_nums, topic_scores

In [90]:
# Result of top_topics_for_theme for every theme, keyed by theme name.
topics = {}
for theme in tokenized_tags:
    topics[theme] = top_topics_for_theme(theme)
topics

{'Gestion des eaux': (array([ 20, 316, 163,  89, 426]),
  array([0.57170225, 0.45813242, 0.32250349, 0.23468896, 0.22629029])),
 'Qualité des sols': (array([ 20, 316,  84,  89, 452]),
  array([0.57867994, 0.48643996, 0.21198946, 0.19671947, 0.14767214])),
 'Aménagement et occupation du sol': (array([316,  20,  89,  84, 126]),
  array([0.56150682, 0.51652438, 0.22628652, 0.20274832, 0.19166021])),
 'Champs électromagnétiques': (array([ 20, 316,  89, 334,  84]),
  array([0.61146689, 0.55114131, 0.2792194 , 0.1552014 , 0.15339591])),
 "Qualité de l'air": (array([ 20, 316,  89, 326,  84]),
  array([0.62625113, 0.56429671, 0.28609323, 0.18781121, 0.1741331 ])),
 'Environnement sonore': (array([ 20, 316,  89, 334, 322]),
  array([0.54333522, 0.47539531, 0.24858894, 0.16891128, 0.15872146])),
 'Luminosité': (array([316,  20,  89, 334, 404]),
  array([0.5681867 , 0.5299371 , 0.2608235 , 0.22165282, 0.19437381])),
 'Température': (array([ 20, 316,  89, 334,  84]),
  array([0.60286174, 0.5044360

In [92]:
def get_topic_words(topic_num):
    """Return the entry of ``model.topic_words`` for topic number ``topic_num``."""
    return model.topic_words[topic_num]
    
# compute list of topics present in all the themes
# Themes without any usable keyword have an empty (falsy) entry in `topics`
# and are ignored, as are themes whose topic array is empty.
topic_sets = [set(t[0]) for t in topics.values() if t and len(t[0])]
# set.intersection() needs at least one operand: when no theme produced any
# topic there is simply no common topic.
# Sorted so that the printed review below is stable from one run to the next.
topics_intersection = sorted(set.intersection(*topic_sets)) if topic_sets else []

for ti in topics_intersection:
    print(ti, get_topic_words(ti))

20 ['cadastrale' 'aufilduboamp' 'cadastrales' 'paysdelaloire' 'cadastral'
 'dicopub' 'defavorisees' 'trottinettes' 'decibels' 'decoupees' 'rasters'
 'allocataires' 'pluenligne' 'municipaux' 'nappes' 'auvergne' 'cadastraux'
 'geograndest' 'decouverte' 'dalles' 'idarticle' 'navettes' 'matieres'
 'decoupages' 'epargne' 'relevees' 'baccalaureat' 'ruissellement'
 'demarches' 'fonciere' 'visent' 'allocataire' 'enquetes' 'canevas'
 'millesimes' 'cumulees' 'foncier' 'pornic' 'raster' 'sismicite'
 'menagers' 'menages' 'cavites' 'cotes' 'parcelles' 'amortissements'
 'estuaires' 'emetteurs' 'cotieres' 'rattachement']
316 ['entreprises' 'filiales' 'entreprise' 'entrepreneurs' 'agences'
 'landesamt' 'agence' 'societes' 'carrieres' 'paysdelaloire' 'agenceore'
 'agen' 'commercants' 'affaires' 'prestataires' 'emetteurs' 'instaure'
 'livraisons' 'etablies' 'fournitures' 'detenues' 'deploiement'
 'renseignes' 'crowdsourcing' 'municipaux' 'caducite' 'instituees'
 'delegataires' 'covoiturage' 'entites' 'c

**Warning: some topics might be manually excluded below. Their ids might change if the model is re-trained.**

In [93]:
# Topics that must not be kept for any theme: the ones shared by all the
# themes, plus optional manual exclusions (commented-out ids below).
excluded_topics = [
    *topics_intersection,
    # uninteresting topic (description about description)
    #182,
    # uninteresting topic (manual review GD4H)
    #373,
]

# theme -> list of (topic number, topic score) pairs that are not excluded
best_topics_for_themes = {}

for theme, theme_topics_with_score in topics.items():
    print(theme)
    kept_topics = []
    # an empty entry means no keyword of the theme was usable: keep nothing
    if theme_topics_with_score:
        candidate_nums, candidate_scores = theme_topics_with_score
        for num, score in zip(candidate_nums, candidate_scores):
            if num in excluded_topics:
                continue
            # show the topic words to ease manual review
            print(num, get_topic_words(num))
            kept_topics.append((num, score))
    best_topics_for_themes[theme] = kept_topics

best_topics_for_themes

Gestion des eaux
163 ['hydrographiques' 'hydrographique' 'hydrologiques' 'fontaines'
 'hydrogeologique' 'hydro' 'fontaine' 'irrigation' 'water' 'eau' 'eaux'
 'hydrographie' 'hydrauliques' 'hydraulique' 'eaufrance' 'inondations'
 'aquatique' 'aquatiques' 'inondation' 'piscines' 'inondables' 'basin'
 'reservoirs' 'fluvial' 'reservoir' 'canalisations' 'barrages' 'piscine'
 'inondable' 'pluviales' 'humide' 'bassins' 'humides' 'fluides' 'bassin'
 'barrage' 'rivieres' 'submersions' 'submersion' 'ecoulements'
 'submersibles' 'fleuve' 'riveraines' 'precipitations' 'riviere'
 'riverains' 'flow' 'toilettes' 'ecoulement' 'baie']
89 ['recharges' 'rechargeables' 'recharge' 'surcharge' 'paysdelaloire'
 'telecharge' 'telecharger' 'telechargement' 'uploads' 'chargee'
 'remboursement' 'charges' 'recalcules' 'telechargees' 'charge'
 'redecoupage' 'aufilduboamp' 'defibrillateurs' 'reccordon'
 'telechargeables' 'relance' 'reprenant' 'recuperees' 'reperage'
 'restitution' 'ecoulements' 'recuperer' 'repondr

{'Gestion des eaux': [(163, 0.322503487732561),
  (89, 0.23468896089922756),
  (426, 0.2262902864044138)],
 'Qualité des sols': [(84, 0.21198946079690018),
  (89, 0.1967194746091282),
  (452, 0.14767213856451827)],
 'Aménagement et occupation du sol': [(89, 0.22628651539795225),
  (84, 0.20274832065432397),
  (126, 0.1916602111371217)],
 'Champs électromagnétiques': [(89, 0.2792194038181166),
  (334, 0.15520139876038555),
  (84, 0.15339591459215368)],
 "Qualité de l'air": [(89, 0.2860932255908034),
  (326, 0.1878112130927055),
  (84, 0.17413310451202377)],
 'Environnement sonore': [(89, 0.24858893900990658),
  (334, 0.1689112807769975),
  (322, 0.15872146408200807)],
 'Luminosité': [(89, 0.2608235019066737),
  (334, 0.22165281557177757),
  (404, 0.1943738103328947)],
 'Température': [(89, 0.2421814931082065),
  (334, 0.13176001217752933),
  (84, 0.13147361357996518)],
 'Risques alimentaires': [(430, 0.2517551481651512),
  (132, 0.24138495937708865),
  (192, 0.24084333266134939)],
 'Bio

In [94]:
import csv
from datetime import datetime
from pathlib import Path
from slugify import slugify

# One timestamped directory per run, so successive exports do not overwrite
# each other.
output_path = Path("output") / datetime.now().strftime('%Y%m%d-%H%M%S')
# parents=True: the top-level "output" directory may not exist yet
output_path.mkdir(parents=True, exist_ok=True)

for theme, topics_with_score in best_topics_for_themes.items():
    print(theme)
    # (document id, normalized score, topic number) for all the theme's topics
    theme_docs_with_scores = []
    for topic_num, topic_score in topics_with_score:
        # ask for as many documents as model.topic_sizes reports for this topic
        doc_scores, doc_ids = model.search_documents_by_topic(
            topic_num, model.topic_sizes[topic_num], return_documents=False
        )
        # account for topic match score to query * doc match score to topic
        normalized_score = doc_scores * topic_score
        theme_docs_with_scores.extend(
            (doc_id, score, topic_num)
            for doc_id, score in zip(doc_ids, normalized_score)
        )
    # sort by normalized score after merge
    theme_docs_with_scores.sort(key=lambda x: x[1], reverse=True)
    # export to CSV
    output_file = f"top2vec-search-output_{slugify(theme)}.csv"
    # newline="" lets the csv module control line endings itself, as its
    # documentation requires (avoids spurious blank rows on some platforms)
    with (output_path / output_file).open("w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["slug_or_id", "score", "topic"])
        writer.writeheader()
        writer.writerows(
            {"slug_or_id": doc_id, "score": score, "topic": topic}
            for doc_id, score, topic in theme_docs_with_scores
        )

Gestion des eaux
Qualité des sols
Aménagement et occupation du sol
Champs électromagnétiques
Qualité de l'air
Environnement sonore
Luminosité
Température
Risques alimentaires
Biodiversité
Autre


## Next step

You now have the raw output files in `output/{timestamp}/`. The next step would be to use the following command to map the datasets to existing gd4h organizations and produce filtered outputs in `outputs-filtered`:

```bash
python filter_outputs.py map_gd4h
```