# Top2vec - search

Explore a top2vec model based on data.gouv.fr's catalog.

## Model training instructions

**Warning:** `learn_top2vec.py` needs a Python 3.10 environment to run (some libraries are not compatible with Python > 3.10 at the moment).

You need to install [qsv](https://qsv.dathere.com) in order to preprocess the CSV catalog file.

```shell
pip install top2vec[sentence_transformers]
# download the latest catalog
make download
# create a cleaned version of the catalog
make filter
# train the model
python learn_top2vec.py train-model
```

In [68]:
from top2vec import Top2Vec

In [69]:
# Name of the embedding model; it is only used here to build the model file name.
embedding_model = "distiluse-base-multilingual-cased"
# File holding the trained model (presumably produced by `learn_top2vec.py train-model` — confirm).
model_file = f"top2vec_{embedding_model}.bin"
# Load the trained Top2Vec model; every cell below queries this `model` object.
model = Top2Vec.load(model_file)

In [70]:
# Display the number of topics held by the loaded model.
model.get_num_topics()

541

## Use keywords.yml for search_topics

Tokenize keywords.yml for every theme and apply search_topics for keywords that are in the vector space.

Get a list of interesting topics for every theme, filter them by common topics (too common, not precise enough) and manual pruning. Use `score = topic_score_in_query * doc_score_in_topic` to normalize scores across topics for a given theme. Keep the common topics apart because they might be interesting overall, even if they're not specific to a theme.

Export a list of documents for the preferred topics of every theme.

In [71]:
from top2vec.Top2Vec import default_tokenizer

In [72]:
from utils import load_raw_keywords, load_kw_file

In [73]:
# For every theme of the keywords file, tokenize each of its tags with
# top2vec's default tokenizer: {theme: [[token, ...], ...]}.
tokenized_tags = {
    theme: [default_tokenizer(tag) for tag in tags]
    for theme, tags in load_kw_file().items()
}
tokenized_tags

{'Gestion des eaux': [['eau'], ['stress', 'hydrique'], ['trame', 'bleue']],
 'Qualité des sols': [['sol'], ['poussiere'], ['sediment']],
 'Aménagement et occupation du sol': [['sol'],
  ['occupation'],
  ['usage'],
  ['amiante']],
 'Champs électromagnétiques': [['rayonnements', 'ionisants'],
  ['rayonnements', 'non', 'ionisants'],
  ['rayonnements', 'optiques'],
  ['nanoparticules', 'de', 'carbone'],
  ['champs']],
 "Qualité de l'air": [['air'], ['monoxyde', 'de', 'carbone']],
 'Environnement sonore': [['bruit'],
  ['nuisance', 'sonore'],
  ['ambiance', 'thermique'],
  ['vibration']],
 'Luminosité': [['eclairage', 'public']],
 'Température': [['canicule'],
  ['ilot', 'de', 'chaleur'],
  ['ilot', 'de', 'fraicheur'],
  ['temperature']],
 'Risques alimentaires': [['aliment'], ['conservateur']],
 'Biodiversité': [['faune'],
  ['flore'],
  ['vegetal'],
  ['vegetaux'],
  ['insecte'],
  ['reservoir', 'biologique'],
  ['bacterie'],
  ['champignon', 'microscopique'],
  ['trame', 'verte'],
  ['v

In [74]:
def top_topics_for_theme(theme, num_topics=5):
    """Return the topics that best match the keywords of ``theme``.

    The tokenized tags of the theme (read from the module-level
    ``tokenized_tags``) are flattened into single tokens, and only the tokens
    found in ``model.vocab`` are passed to ``model.search_topics``.

    Args:
        theme: key of ``tokenized_tags``.
        num_topics: number of topics requested from ``model.search_topics``.

    Returns:
        A ``(topic_nums, topic_scores)`` tuple taken from the result of
        ``model.search_topics``, or an empty list when none of the theme's
        tokens is in the model vocabulary (a message is printed in that case).
    """
    # Build the lookup set once per call rather than testing membership against
    # model.vocab for every token (model.vocab is presumably a plain sequence — confirm).
    vocab = set(model.vocab)
    # flatten tokenized tags - maybe keep expressions together (eg "trame bleue" instead of "trame", "bleue")
    tags = [t for sublist in tokenized_tags[theme] for t in sublist if t in vocab]
    if not tags:
        print(f"Not tag in keyword space for {theme}")
        return []
    # Only topic numbers and topic scores are needed; the first two returned
    # values (topic words and word scores) are discarded.
    _, _, topic_scores, topic_nums = model.search_topics(
        keywords=tags, num_topics=num_topics
    )
    return topic_nums, topic_scores

In [75]:
# Map every theme to the result of top_topics_for_theme (an empty list when no
# keyword of the theme is in the vocabulary). Iterating a dict yields its keys,
# so no explicit .keys() call is needed.
topics = {theme: top_topics_for_theme(theme) for theme in tokenized_tags}
topics

Not tag in keyword space for Risques alimentaires


{'Gestion des eaux': (array([ 20, 316,  89, 163, 426]),
  array([0.54194508, 0.4177227 , 0.20196472, 0.19771077, 0.13278842])),
 'Qualité des sols': (array([ 20, 316,  84,  89, 452]),
  array([0.49099797, 0.40614742, 0.21147323, 0.18543416, 0.13105288])),
 'Aménagement et occupation du sol': (array([316,  20,  89,  84, 126]),
  array([0.56150682, 0.51652438, 0.22628652, 0.20274832, 0.19166021])),
 'Champs électromagnétiques': (array([ 20, 316,  89,  84, 452]),
  array([0.5562695 , 0.50407138, 0.19869777, 0.17012875, 0.14657172])),
 "Qualité de l'air": (array([ 20, 316,  89, 442, 326]),
  array([0.56656545, 0.50421575, 0.24073483, 0.17666679, 0.15243493])),
 'Environnement sonore': (array([ 20, 316,  89, 334, 322]),
  array([0.51910887, 0.44274878, 0.24457613, 0.15587113, 0.15249304])),
 'Luminosité': (array([316,  20, 349, 108,  89]),
  array([0.53738952, 0.48034521, 0.37904333, 0.33550778, 0.30238005])),
 'Température': (array([ 20, 316,  89,  84, 240]),
  array([0.60876428, 0.5013180

In [77]:
def get_topic_words(topic_num):
    """Look up the entry of ``model.topic_words`` for topic ``topic_num``."""
    words = model.topic_words[topic_num]
    return words
    
# compute list of topics present in all the themes
# Themes whose value in `topics` is empty (no keyword in the vocabulary) are skipped.
topics_lists = [t[0] for t in topics.values() if t]
# Empty topic arrays are ignored uniformly, so a single empty array cannot
# wipe out the whole intersection.
topic_sets = [set(topic_nums) for topic_nums in topics_lists if len(topic_nums)]
# Guard: set.intersection needs at least one operand, and indexing the first
# list would raise IndexError when no theme produced any topic.
topics_intersection = list(set.intersection(*topic_sets)) if topic_sets else []

for ti in topics_intersection:
    print(ti, get_topic_words(ti))

89 ['recharges' 'rechargeables' 'recharge' 'surcharge' 'paysdelaloire'
 'telecharge' 'telecharger' 'telechargement' 'uploads' 'chargee'
 'remboursement' 'charges' 'recalcules' 'telechargees' 'charge'
 'redecoupage' 'aufilduboamp' 'defibrillateurs' 'reccordon'
 'telechargeables' 'relance' 'reprenant' 'recuperees' 'reperage'
 'restitution' 'ecoulements' 'recuperer' 'repondre' 'renouvelables'
 'remunerations' 'recouvrement' 'reponses' 'telechargeable' 'livraisons'
 'indemnites' 'reduite' 'reutilisation' 'remuneration' 'reparties'
 'recues' 'reperer' 'reclassement' 'auvergne' 'livraison' 'ecoulement'
 'recale' 'bdparcellaire' 'repertoriees' 'rattachement' 'repondant']
20 ['cadastrale' 'aufilduboamp' 'cadastrales' 'paysdelaloire' 'cadastral'
 'dicopub' 'defavorisees' 'trottinettes' 'decibels' 'decoupees' 'rasters'
 'allocataires' 'pluenligne' 'municipaux' 'nappes' 'auvergne' 'cadastraux'
 'geograndest' 'decouverte' 'dalles' 'idarticle' 'navettes' 'matieres'
 'decoupages' 'epargne' 'relevees

**Warning: some topics have been manually excluded below. Their ids might change if the model is re-trained.**

In [81]:
# Topics ignored for every theme: the ones shared by all themes, plus manually
# reviewed ids (currently commented out).
excluded_topics = [
    *topics_intersection,
    # uninteresting topic (description about description)
    #182,
    # uninteresting topic (manual review GD4H)
    #373,
]

# {theme: [(topic_num, topic_score), ...]} restricted to non-excluded topics
best_topics_for_themes = {}

for theme, theme_topics_with_score in topics.items():
    print(theme)
    kept_topics = []
    # a theme without any topic carries an empty value: keep an empty list for it
    if theme_topics_with_score:
        topic_nums, scores = theme_topics_with_score
        for topic_num, score in zip(topic_nums, scores):
            if topic_num in excluded_topics:
                continue
            kept_topics.append((topic_num, score))
        # show the words of every topic kept for this theme
        for topic_num, _ in kept_topics:
            print(topic_num, get_topic_words(topic_num))
    best_topics_for_themes[theme] = kept_topics

best_topics_for_themes

Gestion des eaux
163 ['hydrographiques' 'hydrographique' 'hydrologiques' 'fontaines'
 'hydrogeologique' 'hydro' 'fontaine' 'irrigation' 'water' 'eau' 'eaux'
 'hydrographie' 'hydrauliques' 'hydraulique' 'eaufrance' 'inondations'
 'aquatique' 'aquatiques' 'inondation' 'piscines' 'inondables' 'basin'
 'reservoirs' 'fluvial' 'reservoir' 'canalisations' 'barrages' 'piscine'
 'inondable' 'pluviales' 'humide' 'bassins' 'humides' 'fluides' 'bassin'
 'barrage' 'rivieres' 'submersions' 'submersion' 'ecoulements'
 'submersibles' 'fleuve' 'riveraines' 'precipitations' 'riviere'
 'riverains' 'flow' 'toilettes' 'ecoulement' 'baie']
426 ['hydrographique' 'hydrographiques' 'eaux' 'hydrologiques' 'water' 'eau'
 'hydrogeologique' 'irrigation' 'hydro' 'hydrographie' 'aquatique'
 'fontaines' 'eaufrance' 'aquatiques' 'fontaine' 'inondations'
 'inondation' 'piscines' 'basin' 'hydrauliques' 'hydraulique' 'reservoir'
 'inondables' 'reservoirs' 'humide' 'spot' 'bassin' 'barrages' 'bassins'
 'fluvial' 'piscine'

{'Gestion des eaux': [(163, 0.1977107676465379), (426, 0.1327884210067234)],
 'Qualité des sols': [(84, 0.2114732283376602), (452, 0.13105288015020358)],
 'Aménagement et occupation du sol': [(84, 0.20274832065432397),
  (126, 0.1916602111371217)],
 'Champs électromagnétiques': [(84, 0.17012875036887068),
  (452, 0.14657172183565664)],
 "Qualité de l'air": [(442, 0.1766667883827392), (326, 0.1524349284951491)],
 'Environnement sonore': [(334, 0.15587112702559344),
  (322, 0.15249303640182527)],
 'Luminosité': [(349, 0.3790433334865524), (108, 0.33550778038614837)],
 'Température': [(84, 0.1376205353558292), (240, 0.1286579575578883)],
 'Risques alimentaires': [],
 'Biodiversité': [(84, 0.20305048795915248), (452, 0.16923843252177817)],
 'Autre': [(84, 0.20738411187344769), (326, 0.19075210358577177)]}

In [82]:
import csv
from datetime import datetime
from pathlib import Path
from slugify import slugify

# One timestamped directory per run, so successive exports do not overwrite each other.
output_path = Path("output") / datetime.now().strftime('%Y%m%d-%H%M%S')
# parents=True also creates the top-level "output" directory when it does not
# exist yet (otherwise mkdir raises FileNotFoundError on a first run).
output_path.mkdir(parents=True, exist_ok=True)

# Write one CSV file per theme, listing the documents of its selected topics.
for theme, topics_with_score in best_topics_for_themes.items():
    print(theme)
    # (doc_id, normalized_score, topic_num) tuples, merged over all topics of the theme
    theme_docs_with_scores = []
    for topic_num, topic_score in topics_with_score:
        # model.topic_sizes[topic_num] is passed as the number of documents to
        # return (presumably the full size of the topic — confirm).
        doc_scores, doc_ids = model.search_documents_by_topic(
            topic_num, model.topic_sizes[topic_num], return_documents=False
        )
        # account for topic match score to query * doc match score to topic
        normalized_score = doc_scores * topic_score
        theme_docs_with_scores.extend(
            (doc_id, score, topic_num)
            for doc_id, score in zip(doc_ids, normalized_score)
        )
    # sort by normalized score after merge
    theme_docs_with_scores.sort(key=lambda x: x[1], reverse=True)
    # export to CSV
    output_file = f"top2vec-search-output_{slugify(theme)}.csv"
    # newline="" lets the csv module control line endings itself (avoids blank
    # rows on Windows); the encoding is explicit so the output does not depend
    # on the platform default.
    with (output_path / output_file).open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["slug_or_id", "score", "topic"])
        writer.writeheader()
        writer.writerows(
            {"slug_or_id": doc_id, "score": score, "topic": topic}
            for doc_id, score, topic in theme_docs_with_scores
        )

Gestion des eaux
Qualité des sols
Aménagement et occupation du sol
Champs électromagnétiques
Qualité de l'air
Environnement sonore
Luminosité
Température
Risques alimentaires
Biodiversité
Autre


## Next step

You now have the raw output files in `output/{timestamp}/`. The next step is to run the following command to map the datasets to existing gd4h organizations and produce filtered outputs in `outputs-filtered`:

```bash
python filter_outputs.py map_gd4h
```