<a href="https://colab.research.google.com/github/YinterestingProjects/human-wildlife-interactions/blob/main/notebooks/Translation_and_Topic_Modeling/BERTopic_labels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Generating labels from BERTopic Results


### Step 1: install libraries

In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install pandas==1.4.2 joblib==1.2.0 bertopic==0.14.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas==1.4.2
  Downloading pandas-1.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
Collecting bertopic==0.14.1
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.6 MB/s[0m et

In [3]:
import pickle
import json
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): presumably intended as the seed for stochastic steps (e.g. UMAP);
# no cell visible in this notebook references it — confirm whether it is still needed.
RANDOM_SEED = 42

### Step 2: load pre-trained model and data

In [4]:
# Only needed when the data lives on Google Drive (Colab runtime):
# mounts the drive so its files are reachable from the notebook.
from google.colab import drive
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [5]:
# enter the path for the cloned repo, without a trailing slash
# (later cells build paths as f'{directory}/data/...' and f'{directory}/models/...');
# leaving it empty makes those paths resolve from the filesystem root
directory = ''

In [26]:
# Locations of the pickled inputs inside the cloned repo
corpus_fp = f'{directory}/data/processed/topic_modeling_corpus.pkl'
index_lookup_fp = f'{directory}/data/processed/index_lookup.pkl'
yt8M_lookup_fp = f'{directory}/data/processed/yt8M_lookup.pkl'

# Corpus of documents fed to the topic model.
# NOTE(review): read_pickle unpickles arbitrary objects — only load files you trust.
docs = pd.read_pickle(corpus_fp)

# Lookups used to map a document position back to its video key
index_lookup = pd.read_pickle(index_lookup_fp)
yt8M_lookup = pd.read_pickle(yt8M_lookup_fp)

# `fp` ends up pointing at the last file read, exactly as before
fp = yt8M_lookup_fp

In [None]:
# get model
# A pre-trained bertopic_guided model is available at https://drive.google.com/file/d/1-2j8lVnBHrxBxXnqjcy_LPGFeRmar0X-/view?usp=share_link
# Please download the model into the models folder of the cloned repo.
model_fp = f'{directory}/models/bertopic_guided'
topic_model = BERTopic.load(model_fp)

# Assign each document to a topic of the already-fitted model.
# `transform` only predicts; `fit_transform` would re-train the model on `docs`,
# which can renumber the topics and invalidate the hand-picked hunting topic ids
# used in Step 3.
# NOTE(review): assumes the model was saved together with its embedding model — confirm.
topics, probs = topic_model.transform(docs)

### Step 3: create label dictionary

In [27]:
# Topic ids treated as hunting-related, chosen from the review in BERTopic_visualizations
hunting_topics = [0, 12, 17, 18, 28]

# Pair each document's assigned topic with its probability; list position == document index
video_tp_lookup = list(zip(topics, probs))

# Document index -> binary label (1 when the assigned topic is a hunting topic, else 0)
hunting_lookup = {}
for doc_idx, (assigned_topic, _prob) in enumerate(video_tp_lookup):
    hunting_lookup[doc_idx] = 1 if assigned_topic in hunting_topics else 0

# Re-key the labels by yt8M id: document index -> row in yt8M_lookup -> 'yt8M_id' value.
# If two documents resolve to the same yt8M id, the later one wins.
updated_hunting_lookup = {
    yt8M_lookup.loc[int(index_lookup[doc_idx]), 'yt8M_id']: label
    for doc_idx, label in hunting_lookup.items()
}



In [None]:
# save the dictionary
with open(f'{directory}/data/processed/hunting_dict.json', 'w') as file:
     file.write(json.dumps(updated_hunting_lookup))

--------------------------------------
