In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

# 1. Process scrapped data

In [2]:
import logging
import pickle
import re
from pathlib import Path

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
tqdm.pandas()

from chat.constants import EMBEDDING_MODEL_REPO_NAME, PROCESSED_DATA_FILE

# note: this path is different from chat.constants, because now we are executing notebook from a different folder
DATA_FOLDER = Path('./tmp/data')

# 1. Process the html into texts

In [3]:
def clean_text(text):
    # Remove leading and trailing whitespace
    text = text.strip()
    
    # Replace sequences of spaces and carriage returns surrounded by newlines with a single newline
    text = re.sub(r'(\n)[\s\r]+(\n)', '\n', text)
    
    # Replace sequences of multiple newlines with a single newline
    text = re.sub(r'\n+', '\n', text)
    
    # Replace sequences of spaces (not surrounded by newlines) with a single space
    text = re.sub(r' +', ' ', text)
    
    return text

def parse_html(x):
    try:
        soup = BeautifulSoup(x, "html.parser")
        return soup.get_text()
    except:
        return None
    
def process_scraped_data(input_path, output_folder, remove_string_prefix=None, remove_string_suffix=None):
    # output_path = Path(output_path)
    output_path = output_folder / PROCESSED_DATA_FILE
    if not output_path.exists():        
        logging.info("Loaded scraped data")
        res = pickle.load(open(input_path, "rb"))
        logging.info("Convert to pandas")
        df_urls = pd.DataFrame([
            {
                "url": v["metadata"]["url"], 
                "timestamp": v["metadata"]["timestamp"], 
                "content": v["content"],
                "key": v["key"]
            } for v in res.values()]).sort_values("timestamp")
        
        logging.info("Remove urls")
        mask = np.ones(len(df_urls), dtype=bool)
        if remove_string_prefix is not None:
            logging.info(f"Removing urls with prefix: {', '.join(remove_string_prefix)}")
            mask &= df_urls.url.apply(lambda x: not any(x.startswith(s) for s in remove_string_prefix)).values
        if remove_string_suffix is not None:
            logging.info(f"Removing urls with suffix: {', '.join(remove_string_suffix)}")
            mask &= df_urls.url.apply(lambda x: not any(x.endswith(s) for s in remove_string_suffix)).values    
        df_data = df_urls[mask].reset_index(drop=True).copy()

        logging.info("Parse html")
        df_data["text"] = df_data["content"].progress_apply(parse_html)
        df_data.dropna(subset=["text"], inplace=True)

        logging.info("Clean text")
        df_data["text_cleaned"] = df_data.text.progress_apply(clean_text)
        
        logging.info("Save to parquet")
        df_data.reset_index(drop=True).to_parquet(output_path)
        
    return pd.read_parquet(output_path)
    

In [4]:
input_path = DATA_FOLDER / "scraped_data.pkl"
df_data = process_scraped_data(input_path, DATA_FOLDER)

In [5]:
df_data[["url", "timestamp", "content", "text", "text_cleaned"]].iloc[0]

url             https://helpdesk.ugent.be/security/veilig-werk...
timestamp                              2023-10-22T22:22:12.916451
content         \n<!doctype html>\n<html lang="nl">\n<head>\n ...
text            \n\n\n\n\nVeilig werken met IT aan de UGent (m...
text_cleaned    Veilig werken met IT aan de UGent (medewerkers...
Name: 0, dtype: object

# 2. transform data into chunks

In [6]:
from dataclasses import dataclass
from typing import List, Callable
from transformers import AutoTokenizer

from chat.constants import CHUNK_OVERLAP, CHUNKED_FILE

In [7]:
@dataclass(frozen=True)
class Tokenizer:
    chunk_overlap: int
    tokens_per_chunk: int
    decode: Callable[[List[int]], str]
    encode: Callable[[str], List[int]]

def split_text_on_tokens(*, text: str, tokenizer: Tokenizer):
    """Split incoming text and return chunks using tokenizer."""
    splits: List[str] = []
    input_ids = tokenizer.encode(text)
    start_idx = 0
    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
    chunk_ids = input_ids[start_idx:cur_idx]
    while start_idx < len(input_ids):
        splits.append(tokenizer.decode(chunk_ids))
        start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
        cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
        chunk_ids = input_ids[start_idx:cur_idx]
    return splits

def chunk_text(df_data, repo_name, output_folder):
    output_file_path = output_folder / CHUNKED_FILE
    if not output_file_path.exists():
        model_tokenizer = AutoTokenizer.from_pretrained(repo_name)
        
        chunk_size = model_tokenizer.model_max_length
            
        tokenizer = Tokenizer(chunk_overlap=CHUNK_OVERLAP, tokens_per_chunk=chunk_size,decode=lambda x: model_tokenizer.decode(x, skip_special_tokens=True), encode=model_tokenizer.encode)

        df_data["text_chunked"] = df_data.text_cleaned.progress_apply(lambda x: split_text_on_tokens(text=x, tokenizer=tokenizer))
        df_data[['key', 'text_chunked']].to_parquet(output_file_path)    
    df_data = pd.read_parquet(output_file_path)
    return df_data

In [8]:
df_data_chunked = chunk_text(df_data, EMBEDDING_MODEL_REPO_NAME, DATA_FOLDER)

In [9]:
df_data_chunked.iloc[0]

key                              4d36516f1ec9bbee4576cc75195f926b
text_chunked    [Veilig werken met IT aan de UGent (medewerker...
Name: 0, dtype: object

# 3. embed chunks

In [10]:
from itertools import islice

import torch
from transformers import AutoModel

from chat.constants import EMBEDDING_PROMPT, EMBEDDING_MODEL_DEVICE, CHUNKED_EMB_FILE

In [11]:
@torch.inference_mode()
def get_embedding(model, tokenizer, text, pooling="mean"):
    def average_pool(last_hidden_states, attention_mask):
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return (last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]).cpu().numpy()
    
    batch_dict = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt').to(model.device)
    outputs = model(**batch_dict)

    if pooling == "mean":
        return average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    elif pooling == "last":
        return outputs.last_hidden_state[:, -1, :]
    else:
        raise NotImplementedError(f"{pooling} pooling not implemented")

def embed_texts(text, model, tokenizer, pooling, batch_size=128):
    def chunk(it, size):
        iterator = iter(it)
        while chunk := list(islice(iterator, size)):
            yield chunk

    embeddings = []    
    for batch in tqdm(chunk(text, batch_size), total=len(text) // batch_size +1):            
        emb = get_embedding(model, tokenizer, batch, pooling=pooling)
        embeddings.append(emb)
    return list(np.vstack(embeddings).astype(float))

def embed_chunks(repo_name, output_folder):
    output_file_path = output_folder / CHUNKED_EMB_FILE
    if not output_file_path.exists():
        assert df_data["key"].nunique() == len(df_data)
        df_chunks_exploded = df_data.explode('text_chunked').reset_index(drop=True).reset_index().rename(columns={'index': 'chunk_id'})

        
        tokenizer = AutoTokenizer.from_pretrained(repo_name)
        model = AutoModel.from_pretrained(repo_name)
        model.eval()
        model.cuda(EMBEDDING_MODEL_DEVICE)

        # add prompt
        texts = df_chunks_exploded.text_chunked.progress_apply(lambda x: EMBEDDING_PROMPT.format(text=x)).values
        df_chunks_exploded["emb"] = embed_texts(texts, model, tokenizer, pooling="mean")
        df_chunks_exploded.to_parquet(output_file_path)

    df_chunk_embs = pd.read_parquet(output_file_path)
    return df_chunk_embs

df_chunk_embs = embed_chunks(EMBEDDING_MODEL_REPO_NAME, DATA_FOLDER)

# 4. Sanity check

In [12]:
from chat.constants import RETRIEVAL_TOP_K
from chat.utils import get_corpus, get_corpus_embeddings, dense_retrieval, get_documents

## 4.1 similarity between embedded chunks

In [13]:
doc_id = 0

# get query embedding
query_emb = df_chunk_embs.iloc[doc_id]["emb"]

# get corpus embeddings
corpus_embs = np.vstack(df_chunk_embs["emb"].values)
corpus_embs = corpus_embs / np.linalg.norm(corpus_embs, axis=1, keepdims=True)

# compute cosine similarity
rec_ids = np.argsort(np.dot(corpus_embs, query_emb))[-RETRIEVAL_TOP_K:][::-1]

# get retrieved texts
df_chunk_embs.iloc[rec_ids]["text_chunked"]

0      Veilig werken met IT aan de UGent (medewerkers...
6      Veilig werken met IT aan de UGent (medewerkers...
982    Working safely with IT at the UGent (students)...
972    Working safely with IT at the UGent (staff) ZO...
3      slagen gegevens versleuteld worden (bv. met Bi...
Name: text_chunked, dtype: object

## 4.2 Manually Check User Query

In [14]:
logging.info("Loading corpus")
df_chunk_embs = get_corpus(DATA_FOLDER)

logging.info("Loading embeddings")    
chunk_embs = get_corpus_embeddings(df_chunk_embs)

INFO:root:Loading corpus


INFO:root:Loading embeddings


In [15]:
EMBEDDING_TOKENIZER = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_REPO_NAME)
EMBEDDING_MODEL = AutoModel.from_pretrained(EMBEDDING_MODEL_REPO_NAME)
EMBEDDING_MODEL.eval()
_ = EMBEDDING_MODEL.cuda(EMBEDDING_MODEL_DEVICE)

In [16]:
user_msg = "How can I set up email on MacOS?"
dense_retrieval_results = dense_retrieval(user_msg, chunk_embs, EMBEDDING_MODEL, EMBEDDING_TOKENIZER, RETRIEVAL_TOP_K)
candidate_keys, candidate_docs = get_documents(dense_retrieval_results, df_chunk_embs)

In [17]:
candidate_docs

array(['E-mail - Set up on macOS for Exchange Online (Office 365) ZOEK MENU Search In het Nederlands UGentNetE-mailAccount & passwordAthena & softwareStorageHelpMe DICT Helpdesk HomeE-mailMacos E-mail - Set up on macOS On this page Mail Microsoft Outlook On macOS (Monterey - version 12.x.x) you can add an e-mail account in the standard e-mail program Mail or in Microsoft Outlook (part of Office 365). Mail Choose "System Preferences" from the Apple menu Go to "Internet Accounts" Choose "Microsoft Exchange" in the right panel Enter your name and UGent e-mail address and choose "Sign In" Choose "Sign In" again when asked if you want to sign in to your Exchange account using Microsoft Enter your password on the UGent login screen Approve the sign-in request with your 2nd factor Select the apps you want to use with your account and choose "Done" The Exchange account is now available in Mail Microsoft Outlook Choose "Preferences..." from the Outlook menu Go to "Accounts" Add a new account us