In [None]:
# ========================================
# 0. Install requirements
# ========================================
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -r requirements.txt
# Note: if the installation is run from this notebook (rather than from a terminal),
# restart the kernel after this step so the newly installed packages are picked up.

In [None]:
# ========================================
# 1. Imports
# ========================================
import os
import pandas as pd
from dotenv import load_dotenv
from datasets import load_dataset
from distutils.util import strtobool
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import faiss
import pickle
import numpy as np
from huggingface_hub import HfFolder
from sklearn.preprocessing import normalize
from tqdm.notebook import tqdm
from s3_utils import upload_to_s3, load_pickle_from_s3, SAVE_DIR, S3_BUCKET, S3_PREFIX
import shutil
# Register tqdm with pandas so that `progress_apply`-style methods show a progress bar.
tqdm.pandas()

In [None]:
# ========================================
# 2. Config
# ========================================

# Load and set variables
import torch

# Read settings from a local .env file into os.environ (if such a file exists).
load_dotenv()

# Model: name of the pretrained SentenceTransformer used for the embeddings
MODEL_NAME = "all-MiniLM-L6-v2"

# Hugging Face
# These lookups raise KeyError immediately when a required variable is missing.
HF_USERNAME = os.environ["HF_USERNAME"]
HF_MODEL_REPO = f"{HF_USERNAME}/hupd-patent-topic-model"
HF_TOKEN = os.environ["HF_TOKEN"]
# NOTE(review): `huggingface_hub` and `datasets` are imported before this assignment;
# if they resolve HF_HOME at import time, this setting does not affect them — confirm.
os.environ["HF_HOME"] = "/tmp/hf"
HfFolder.save_token(HF_TOKEN)

# Dataset
# Normalise the value so that e.g. "sample" or " FULL " are accepted as well.
DATASET_TYPE = os.environ["DATASET_TYPE"].strip().upper()

# Filing-date windows for the train/validation splits of each dataset type.
if DATASET_TYPE == "SAMPLE":
    START_DATE_TRAIN = '2016-01-01'
    END_DATE_TRAIN = '2016-01-21'
    START_DATE_VAL = '2016-01-22'
    END_DATE_VAL = '2016-01-31'
elif DATASET_TYPE == "FULL":
    START_DATE_TRAIN = '2013-01-01'
    END_DATE_TRAIN = '2017-11-30'
    START_DATE_VAL = '2017-12-01'
    END_DATE_VAL = '2017-12-31'
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(
        f"Unsupported DATASET_TYPE {DATASET_TYPE!r}; expected 'SAMPLE' or 'FULL'."
    )

HUPD_DATA_FILE = "https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather"
METADATA_FILE =  "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather"

# Create output dir
os.makedirs(SAVE_DIR, exist_ok=True)

# Use GPU device for better performance
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Whether to upload files to AWS S3 (strtobool yields 1 for true-like and 0 for false-like strings)
UPLOAD_TO_AWS_S3 = strtobool(os.environ["UPLOAD_TO_AWS_S3"])

In [None]:
# ========================================
# 3. Load HUPD dataset and metadata
# 4. Create dataset dataframes
# ========================================

# Note:
# - The dataset is based on the HUPD patent corpus.
# - Loading the full dataset (2013–2017) can take ~ 1 hour.
#   To save time, a cached pickle is loaded from S3 if available.
# - Train/val splits are not for ML training;
#   they exist only for compatibility with Hugging Face's `load_dataset` API
#   and the HUPD dataset loader. The entire dataset (2013-2017) is used for BERTopic training in this notebook.
# - The final output is a merged dataframe (`df_full`) with both
#   patent text (title + abstract) and metadata.
# - NOTE(review): the cached `df_full.pkl` is looked up by name only, so a cache built
#   with a different DATASET_TYPE would be reused as-is — confirm this is intended.

df_full_path = f"{SAVE_DIR}/df_full.pkl"

# Try the S3 cache first. Only ordinary errors trigger the rebuild; a KeyboardInterrupt
# during the download is no longer swallowed.
try:
    df_full = load_pickle_from_s3("df_full.pkl")
except Exception as cache_error:
    print(f"Could not load df_full.pkl from S3 ({cache_error!r}); rebuilding it from HUPD.")
    df_full = None

if df_full is None:
    # Loader configuration name and extra keyword arguments per dataset type.
    if DATASET_TYPE == "SAMPLE":
        hupd_config_name = 'sample'
        hupd_extra_kwargs = {}
    elif DATASET_TYPE == "FULL":
        hupd_config_name = 'all'
        hupd_extra_kwargs = {"force_extract": True}
    else:
        raise ValueError(
            f"Unsupported DATASET_TYPE {DATASET_TYPE!r}; expected 'SAMPLE' or 'FULL'."
        )

    dataset_dict = load_dataset(
        path="hupd_modified.py",
        name=hupd_config_name,
        data_files=HUPD_DATA_FILE,
        icpr_label=None,
        train_filing_start_date=START_DATE_TRAIN,
        train_filing_end_date=END_DATE_TRAIN,
        val_filing_start_date=START_DATE_VAL,
        val_filing_end_date=END_DATE_VAL,
        trust_remote_code=True,
        cache_dir="/tmp/hupd_tmp_cache",
        **hupd_extra_kwargs,
    )

    # Keep only the text columns and convert both splits to pandas
    text_columns = ["patent_number", "title", "abstract"]
    df_train = dataset_dict["train"].select_columns(text_columns).to_pandas()
    df_val = dataset_dict["validation"].select_columns(text_columns).to_pandas()

    # The dataset object is no longer needed
    del dataset_dict

    # The loader's `patent_number` column is used as the merge key `application_number`.
    # The column is moved to the end to keep the original column order.
    key_rename = {"patent_number": "application_number"}
    text_column_order = ["title", "abstract", "application_number"]
    df_train = df_train.rename(columns=key_rename)[text_column_order]
    df_val = df_val.rename(columns=key_rename)[text_column_order]

    # Create metadata dataframe (same pattern: S3 cache first, rebuild on failure)
    metadata_path = f"{SAVE_DIR}/df_metadata.pkl"

    try:
        df_metadata = load_pickle_from_s3("df_metadata.pkl")
    except Exception as cache_error:
        print(f"Could not load df_metadata.pkl from S3 ({cache_error!r}); downloading the metadata file.")
        df_metadata = None

    if df_metadata is None:
        metadata_columns = ["patent_number", "application_number", "date_application_published", "patent_issue_date", "main_cpc_label", "cpc_labels", "main_ipcr_label", "ipcr_labels", "filing_date", "decision"]
        # Drop rows without a patent number, then keep the selected columns.
        df_metadata = (
            pd.read_feather(METADATA_FILE)
            .dropna(subset=['patent_number'])
            .loc[:, metadata_columns]
        )
        # Close the file before it is uploaded.
        with open(metadata_path, "wb") as metadata_file:
            pickle.dump(df_metadata, metadata_file)
        # Respect the upload switch from the config cell.
        if UPLOAD_TO_AWS_S3:
            upload_to_s3(metadata_path, S3_BUCKET, S3_PREFIX)

    # Create full dataframe: patents from both splits that also have a metadata row
    df_full = pd.merge(pd.concat([df_train, df_val]), df_metadata, on='application_number', how='inner')

    # Combine the patent title and abstract into a single text field for each entry
    df_full["text"] = df_full["title"].fillna('') + ". " + df_full["abstract"].fillna('')

    # Save df_full locally; upload to S3 only when uploads are enabled
    df_full.to_pickle(df_full_path)
    if UPLOAD_TO_AWS_S3:
        upload_to_s3(df_full_path, S3_BUCKET, S3_PREFIX)

In [None]:
# ========================================
# 5. Create docs
# ========================================

# One document per row of df_full: the "text" column (title and abstract joined into
# a single string), collected into a plain Python list.
docs = df_full["text"].to_list()

In [None]:
# ========================================
# 6. Generate embeddings
# ========================================

# Instantiate the pretrained SentenceTransformer named by MODEL_NAME (all-MiniLM-L6-v2)
# on the selected device and encode every document in `docs`.
# Encoding time: The full dataset (2013-2017) with a GPU can take ~ 1 hour, depending on GPU

embedding_model = SentenceTransformer(MODEL_NAME, device=device)
embeddings = embedding_model.encode(
    docs,
    batch_size=64,
    show_progress_bar=True,
)

In [None]:
# ========================================
# 7. Train BERTopic
# ========================================

# Training time: The full dataset (2013-2017) with a GPU can take ~ 1 hour, depending on GPU
# Results are not reproducible as written: no seed is fixed anywhere in this cell.
# To get reproducible topics, pass BERTopic a UMAP model created with a fixed
# `random_state` (via its `umap_model` argument).

topic_model = BERTopic(
    embedding_model=embedding_model,
    verbose=True,
)
# The precomputed embeddings are passed in so the documents are not encoded again.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
# ========================================
# 8. Save BERTopic model, dataframes, pkl
# ========================================

# save full model (pickle serialization)
topic_model.save(f"{SAVE_DIR}/bertopic_model")

# save model (pytorch serialization)
# For non-pickle serialization, `save_embedding_model` is expected to be a string
# pointing to the embedding model, not the model object itself; passing the object
# would leave the saved config without any embedding-model reference.
bertopic_model_dir = f"{SAVE_DIR}/bertopic_model_dir"
topic_model.save(
    bertopic_model_dir,
    serialization="pytorch",
    save_ctfidf=True,
    save_embedding_model=MODEL_NAME,
)

# Zip the saved model directory (creates bertopic_model_dir.zip next to it)
shutil.make_archive(bertopic_model_dir, 'zip', bertopic_model_dir)

# save dataframe, now including the topic assignment and probability of each document
df_full["topic"] = topics
df_full["probs"] = probs
df_full.to_pickle(f"{SAVE_DIR}/df_full.pkl")

# Build patent number to index mapping (position of each patent within df_full;
# if a patent number occurs more than once, the last position wins)
patent_to_idx = {pn: idx for idx, pn in enumerate(df_full["patent_number"])}
with open(f"{SAVE_DIR}/patent_to_idx.pkl", "wb") as f:
    pickle.dump(patent_to_idx, f)

In [None]:
# ==============================================
# 9. Build FAISS index w normalized embeddings
# ==============================================

# Normalize each embedding row, so that the inner product used by the index
# below equals cosine similarity
embeddings_normalized = normalize(embeddings, axis=1)

# Delete the raw embeddings to free memory.
# Note: this cell cannot be re-run without re-running the embedding cell first.
del embeddings

# Save normalized embeddings
np.save(f"{SAVE_DIR}/embeddings_normalized.npy", embeddings_normalized)

# Build FAISS index (exact inner-product search over vectors of dimension d)
d = embeddings_normalized.shape[1]
index = faiss.IndexFlatIP(d)
# Add vectors to index as float32; copy=False avoids duplicating the whole matrix
# in memory when it already is float32.
index.add(embeddings_normalized.astype(np.float32, copy=False))
faiss.write_index(index, f"{SAVE_DIR}/patent_faiss_normalized_embeddings.index")

In [None]:
# ========================================
# 10. Create Dataframes for Streamlit Dashboard
# ========================================

def _topic_words_from_name(topic_name):
    """Turn a BERTopic name such as "3_word1_word2" into "word1, word2" (drops the leading id)."""
    return ", ".join(topic_name.split("_")[1:])


# Label used in place of the keywords for documents without a topic (topic id -1)
NO_TOPIC_LABEL = "no_topic_found"

# Attach the topic name to every patent
df_topics = topic_model.get_topic_info()
df = pd.merge(df_full, df_topics[["Topic", "Name"]], left_on="topic", right_on="Topic", how="left")
df["topic_words"] = df["Name"].apply(_topic_words_from_name)
df["year"] = df["filing_date"].dt.year
df["topic_id"] = df["topic"]

# Number of patents per filing year and topic
df_topics_by_year = df.groupby(["year", "topic_words", "topic_id"]).size().reset_index(name="count")
df_topics_by_year.loc[df_topics_by_year["topic_id"] == -1, "topic_words"] = NO_TOPIC_LABEL

# Number of patents per topic over the whole dataset
df_topics_count = df_topics.loc[:, ["Name", "Count", "Topic"]].rename(
    columns={"Name": "topic_words", "Count": "count", "Topic": "topic_id"}
)
df_topics_count["topic_words"] = df_topics_count["topic_words"].apply(_topic_words_from_name)
df_topics_count.loc[df_topics_count["topic_id"] == -1, "topic_words"] = NO_TOPIC_LABEL

# Save dataframes locally
df_topics_by_year.to_pickle(f"{SAVE_DIR}/df_topics_by_year.pkl")
df_topics_count.to_pickle(f"{SAVE_DIR}/df_topics_count.pkl")

In [None]:
# ========================================
# 11. Upload all saved files to S3
# ========================================

# Nothing is uploaded unless the UPLOAD_TO_AWS_S3 switch from the config cell is truthy.
if UPLOAD_TO_AWS_S3:

    # Names of the artifacts written to SAVE_DIR by the previous cells
    artifact_names = (
        "bertopic_model",
        "bertopic_model_dir.zip",
        "df_full.pkl",
        "patent_to_idx.pkl",
        "embeddings_normalized.npy",
        "patent_faiss_normalized_embeddings.index",
        "df_topics_count.pkl",
        "df_topics_by_year.pkl",
    )
    files_to_upload = [f"{SAVE_DIR}/{artifact_name}" for artifact_name in artifact_names]

    # Upload the files one by one, in the order listed above
    for local_path in files_to_upload:
        upload_to_s3(local_path, S3_BUCKET, S3_PREFIX)