<a href="https://colab.research.google.com/github/YinterestingProjects/human-wildlife-interactions/blob/main/notebooks/Translation_and_Topic_Modeling/BERTopics_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modeling via BERTopics 

### Step 1: install libraries

In [9]:
! pip install pandas==1.4.2 joblib==1.2.0 bertopic==0.14.1 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import pickle
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

RANDOM_SEED = 42

### Step 2: load data

In [11]:
# run this if you are using GoogleDrive as storage
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# enter the path for the cloned repo 
directory = ''

In [13]:
# get corpus
fp = f'{directory}/data/processed/topic_modeling_corpus.pkl'
docs = pd.read_pickle(fp)

### Step 3: modeling with BERTopics 

In [14]:
# destinate path for saved the model 
model_name = ''
model_fp = f'{directory}/models/{model_name}'

# custom BERTopics components
umap_model = UMAP(random_state=RANDOM_SEED)
tf_vectorizer = CountVectorizer(lowercase = True, # capitalization
                                  ngram_range = (2,3), # include 2-word phrases
                                  min_df=1,  # occur in absolute count of doc
                                  max_df=0.90,   # % of docs
                                  stop_words='english') # default English stopwords
  
# define seed topics of interest
charismatic_creatures = ['lion', 'tiger', 'elephant', 'leopard', 'big cat', 'puma']
national_park = ['safari', 'bush walk', 'wildlife photography', 'national park']
bird_watching = ['bird watching', 'migration pattern', 'observe', 'binocular', 'telescope', 'bird call']
conservation = ['conservation', 'wildlife', 'biodiversity', 'nature', 'scenic', 'sightings', 'ecosystem', 'extinction', 'cam','footage'] 
emotions = ['awesome', 'inspiring', 'amazing', 'miraculous', 'spiritual']
photography = ['nikon', 'canon', 'powershot', 'hd', '1080p']
conflict = ['heartbreaking', 'harm', 'kill', 'hunt']
illegal_trade = ['poaching', 'illegal', 'trade', 'sale']
hunting_related_terms = ['trophy', 'rifle', 'trap', 'game']
cute_animals = ['baby', 'cub', 'pup', 'friendly', 'cute', 'squirrel', 'love', 'rescue']
exotic_pets = ['exotic pets', 'rare', 'unusual', 'domesticated']
aquarium = ['fish', 'aquatic', 'sea world', 'shark', 'jellyfish', 'otter']

seed_topic_list = [charismatic_creatures, national_park, bird_watching, conservation, emotions, photography, 
                     conflict, illegal_trade, hunting_related_terms, cute_animals, exotic_pets, aquarium]
  
# initiate and fit BERTopics
topic_model = BERTopic(umap_model=umap_model, vectorizer_model=tf_vectorizer, 
                         seed_topic_list=seed_topic_list, calculate_probabilities=True )

topics, probs = topic_model.fit_transform(docs)

# save model 
topic_model.save(model_fp)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

--------------------------------------
