# Create embeddings for questions using MPNet and BERTOverflow on Google Colab

Google Colab offers GPUs for free (although with a few restrictions). For small batches of text, the free tier is more than sufficient.

To use the GPUs provided by Google Colab, you have to change the runtime type by going to `Runtime > Change runtime type`.

This notebook is meant to be used with Google Drive. To use it, copy the corpus files for each dataset into Google Drive using the same directory structure (i.e., put the files in directories like `data/{dataset_name}/corpus/`).

In [1]:
!pip install sentence_transformers

import pandas as pd
from scipy.sparse import save_npz, csr_matrix
from pathlib import Path

# Root folder that holds your data; edit this to match your setup.
# If you copied the data/ directory to the root folder of your Drive,
# the value should be `drive/MyDrive/data`.
path_to_data = "drive/MyDrive/data"

# Short alias for the parquet reader.
read = pd.read_parquet

def make_dir(path):
    """Create the directory `path` together with any missing parents.

    Nothing happens (and no error is raised) if the directory already
    exists. `path` must provide a `pathlib.Path`-style `mkdir` method.
    """
    path.mkdir(exist_ok=True, parents=True)
    
def website_dir_path(ds):
    """Return the directory of dataset `ds` inside the data root."""
    data_root = Path(path_to_data)
    return data_root / ds

def corpus_dir_path(ds):
    """Return the `corpus` subdirectory of dataset `ds`."""
    dataset_dir = website_dir_path(ds)
    return dataset_dir / 'corpus'

def embeddings_dir_path(ds):
    """Return the `embeddings` subdirectory of dataset `ds`."""
    dataset_dir = website_dir_path(ds)
    return dataset_dir / 'embeddings'

def corpus_path(ds, tokenized=True):
    """Return the path of the corpus parquet file of dataset `ds`.

    With a truthy `tokenized` the tokenized corpus file is selected,
    otherwise the plain corpus file.
    """
    file_name = 'corpus_tokenized.parquet' if tokenized else 'corpus.parquet'
    return corpus_dir_path(ds) / file_name
    
def embedding_dir_path(ds, m):
    """Return the directory holding the embeddings of model `m` for dataset `ds`."""
    all_embeddings_dir = embeddings_dir_path(ds)
    return all_embeddings_dir / m

def embedding_path(ds, m, c):
    """Return the `.npz` file path for column `c` embedded with model `m`.

    The file is named `{c}.{m}.npz` and sits in the model's embedding
    directory for dataset `ds`.
    """
    model_dir = embedding_dir_path(ds, m)
    return model_dir / f'{c}.{m}.npz'

# Names of the Stack Overflow samples: so_samples/sample_0 .. so_samples/sample_4
so_samples = ["so_samples/sample_" + str(i) for i in range(5)]

gamedev_datasets = ["gamedev_se", "gamedev_so"]

# Every dataset to process: the gamedev datasets first, then the samples.
datasets = [*gamedev_datasets, *so_samples]

# Corpus columns for which embeddings are computed.
text_columns = [
    "title", "body", "tags",
    "title_body", "title_body_tags", "title_body_tags_answer",
]

def bertoverflow_model():
    """Build a SentenceTransformer on top of the "jeniya/BERTOverflow" model.

    The transformer is followed by a default-configured Pooling module
    sized to the transformer's word-embedding dimension.
    """
    # Imported here so the package is only needed when a model is built.
    from sentence_transformers import SentenceTransformer, models

    word_embedder = models.Transformer("jeniya/BERTOverflow")
    embedding_dim = word_embedder.get_word_embedding_dimension()
    pooler = models.Pooling(embedding_dim)
    return SentenceTransformer(modules=[word_embedder, pooler])

def mpnet_model():
    """Load the 'paraphrase-mpnet-base-v2' SentenceTransformer model."""
    # Imported here so the package is only needed when a model is built.
    from sentence_transformers import SentenceTransformer

    model_name = 'paraphrase-mpnet-base-v2'
    return SentenceTransformer(model_name)

def get_bertoverflow_embeddings(ds, cols, use_gpu=True):
    """Compute BERTOverflow embeddings for a dataset and save them.

    Reads the untokenized corpus of `ds`, encodes every column listed in
    `cols` with the BERTOverflow model and writes one `.npz` file per
    column (as a scipy CSR matrix) into the dataset's `bertoverflow`
    embedding directory, which is created if missing.

    Args:
        ds: Dataset name, as understood by the path helpers.
        cols: Names of the corpus columns to embed.
        use_gpu: When False, encoding is forced onto the CPU; when True,
            `device=None` is passed and device selection is left to
            the library.
    """
    print("- Computing BERTOverflow embeddings")

    feature_name = "bertoverflow"

    # Use this notebook's path helpers: no `paths` object is defined
    # here, so the former `paths.*` calls raised NameError.
    corpus = read(corpus_path(ds, tokenized=False))

    model = bertoverflow_model()

    make_dir(embedding_dir_path(ds, feature_name))

    device = None if use_gpu else "cpu"

    for c in cols:
        print(f"-- Computing {c} embeddings with BERTOverflow for {ds}.")
        # Hand the encoder a plain list so that any positional indexing
        # it performs cannot be affected by the DataFrame's index labels.
        texts = corpus[c].tolist()
        emb = model.encode(texts, device=device, show_progress_bar=True)

        emb_save_path = embedding_path(ds, feature_name, c)
        save_npz(emb_save_path, csr_matrix(emb))


def get_mpnet_embeddings(ds, cols, use_gpu=True):
    """Compute MPNet embeddings for a dataset and save them.

    Reads the untokenized corpus of `ds`, encodes every column listed in
    `cols` with the MPNet model and writes one `.npz` file per column
    (as a scipy CSR matrix) into the dataset's `mpnet` embedding
    directory, which is created if missing.

    Args:
        ds: Dataset name, as understood by the path helpers.
        cols: Names of the corpus columns to embed.
        use_gpu: When False, encoding is forced onto the CPU; when True,
            `device=None` is passed and device selection is left to
            the library.
    """
    print("- Computing MPNet embeddings")
    feature_name = "mpnet"

    corpus = read(corpus_path(ds, tokenized=False))

    model = mpnet_model()

    make_dir(embedding_dir_path(ds, feature_name))

    device = None if use_gpu else "cpu"

    for c in cols:
        print(f"-- Computing {c} embeddings with MPNet for {ds}.")
        # Hand the encoder a plain list so that any positional indexing
        # it performs cannot be affected by the DataFrame's index labels.
        texts = corpus[c].tolist()
        emb = model.encode(texts, device=device, show_progress_bar=True)

        emb_save_path = embedding_path(ds, feature_name, c)
        save_npz(emb_save_path, csr_matrix(emb))

# Embed every text column of every dataset, MPNet first and then
# BERTOverflow.
for ds in datasets:
    get_mpnet_embeddings(ds, cols=text_columns)
    get_bertoverflow_embeddings(ds, cols=text_columns)

Defaulting to user installation because normal site-packages is not writeable


NameError: name 'text_cols' is not defined