# Create text representations using BERTopic
* Load the train/validate/test datasets
* Compute topic distributions with BERTopic

https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html

In [1]:
import ast
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from bertopic import BERTopic

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory (relative to the notebook's working directory) that the dataset
# CSVs are read from and the pickled model/matrices are written to.
DATA_PATH = '../data'

In [3]:
def _parse_authors(cell):
    """Convert the stringified ``authors_parsed`` CSV cell back into a list of str.

    The column holds the repr of a Python list with one string per author,
    e.g. ``"['Bognár, Zs., ', 'Sódor, Á., ', 'Clark, I. R., ']"``.

    The cell is parsed as a Python literal first, so names that contain an
    apostrophe (which repr writes with double quotes, e.g. ``"O'Brien, J., "``)
    are split correctly and keep their apostrophe. Anything that does not
    evaluate to a non-empty list of strings is handled by the original
    split-based logic, so such cells produce exactly what they produced before.
    """
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if (
        isinstance(parsed, (list, tuple))
        and parsed
        and all(isinstance(author, str) for author in parsed)
    ):
        return list(parsed)

    # Legacy fallback (also used for empty lists, which keep yielding ['']).
    return [entry.replace("'", '').strip("[]") for entry in cell.split("', '")]


def load_df(name, data_path=None):
    """Read one dataset CSV into a DataFrame.

    Parameters
    ----------
    name : str
        File name of the CSV, e.g. ``'arxiv_train.csv'``.
    data_path : str, optional
        Directory containing the file. Defaults to the notebook-level
        ``DATA_PATH`` when omitted.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the first column used as the index and the
        ``authors_parsed`` column converted to lists of str (one per author).
    """
    base_dir = DATA_PATH if data_path is None else data_path
    return pd.read_csv(
        os.path.join(base_dir, name),
        index_col=0,
        converters={"authors_parsed": _parse_authors},
    )

# Load the three dataset splits in train / validate / test order.
train_df, validate_df, test_df = map(
    load_df,
    ('arxiv_train.csv', 'arxiv_validate.csv', 'arxiv_test.csv'),
)

## Scikit-Learn Embeddings
Create a text representation by passing scikit-learn embeddings (TF-IDF followed by truncated SVD) to BERTopic

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

Fit the model on the train dataset

In [5]:
%%time
%env TOKENIZERS_PARALLELISM=false

pipe = make_pipeline(
    TfidfVectorizer(),
    TruncatedSVD(100)  # dimensionality reduction, like PCA
)
topic_model_BERT_scikit = BERTopic(embedding_model=pipe).fit(train_df['abstract'])

env: TOKENIZERS_PARALLELISM=false
CPU times: user 4min 10s, sys: 1min 38s, total: 5min 48s
Wall time: 48.9 s


Compute a soft topic distribution, where each document can have multiple topics

In [12]:
def topic_distr(docs, topic_model):
    """Return the per-document topic distributions for ``docs``.

    Calls ``topic_model.approximate_distribution(docs)``, which returns a pair,
    and hands back only the first element; the second element is discarded.
    """
    doc_topic_matrix, _discarded = topic_model.approximate_distribution(docs)
    return doc_topic_matrix

In [13]:
%%time

train_topic_distr = topic_distr(train_df['abstract'], topic_model_BERT_scikit)
validate_topic_distr = topic_distr(validate_df['abstract'], topic_model_BERT_scikit)
test_topic_distr = topic_distr(test_df['abstract'], topic_model_BERT_scikit)

CPU times: user 1min 22s, sys: 15.8 s, total: 1min 38s
Wall time: 1min 38s


In [14]:
# Report the (documents x topics) shape of each split's distribution matrix.
print("\n".join(
    f"The topic distribution matrix for the {split} dataset has {matrix.shape[0]} "
    f"distributions (one for each document) for {matrix.shape[1]} topics."
    for split, matrix in (
        ("train", train_topic_distr),
        ("validate", validate_topic_distr),
        ("test", test_topic_distr),
    )
))

The topic distribution matrix for the train dataset has 45265 distributions (one for each document) for 284 topics.
The topic distribution matrix for the validate dataset has 22632 distributions (one for each document) for 284 topics.
The topic distribution matrix for the test dataset has 22633 distributions (one for each document) for 284 topics.


In [15]:
# Examples:

# Display the topic distribution for document 0 in validate dataset
# validate_topic_distr[0]

# Display info for topics
# topic_model_BERT_scikit.get_topic_info()

# Display keywords for topic
# topic_model_BERT_scikit.get_topic(177)

### Save the topic model and distribution matrices
The model and the matrices are too large for Git, so they are saved as local pickle files.

In [16]:
# Persist the fitted model and the three distribution matrices under DATA_PATH,
# one pickle file per object.
for pickle_name, payload in (
    ('topic_model_BERT_scikit.pickle', topic_model_BERT_scikit),
    ('train_topic_distr_BERT_scikit.pickle', train_topic_distr),
    ('validate_topic_distr_BERT_scikit.pickle', validate_topic_distr),
    ('test_topic_distr_BERT_scikit.pickle', test_topic_distr),
):
    with open(os.path.join(DATA_PATH, pickle_name), 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)
