In [None]:
import json
from bs4 import BeautifulSoup

In [None]:
def convert_html_to_text(html_string):
    """Convert an HTML fragment to plain text without blank lines.

    Args:
        html_string: HTML markup, parsed with BeautifulSoup's 'html.parser'.

    Returns:
        The text content with surrounding whitespace stripped, carriage
        returns removed and every run of consecutive newlines collapsed to a
        single newline.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    text = soup.get_text().strip().replace('\r', '')
    # A single str.replace('\n\n', '\n') pass only halves each run of newlines
    # (four newlines become two), which leaves blank lines inside a post. Those
    # would collide with the "\n\n" separator used between posts when the text
    # is assembled and split. Dropping empty segments collapses runs of any
    # length.
    return '\n'.join(line for line in text.split('\n') if line)

First we load the JSON file with the HTML contents of the forum conversations

In [None]:
# Read the scraped forum dump (UTF-8 encoded JSON) into `data`.
file_path = '../../scraper/forum_posts.json'
with open(file_path, mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())
# data[0]

The entire JSON payload is too large to be processed in memory, so we keep just the first 100 forum topics.

In [None]:
# Keep only the first 100 forum topics for the rest of the notebook.
data = data[:100]

Convert the HTML inside each topic dictionary to plain text and strip leading and trailing whitespace.

In [None]:
# Replace the HTML of every topic title and every post body with plain text
# (in place), then show the first cleaned topic as the cell output.
for topic in data:
    topic['title'] = convert_html_to_text(topic['title'])
    for entry in topic['posts']:
        entry['post'] = convert_html_to_text(entry['post'])
data[0]

Write the forum topics as continuous-text conversations for chunking.

In [None]:
# Assemble one plain-text block per topic: a "Thema:" header line followed by
# every post as "<poster>: <text>", each terminated by a blank line.
topic_blocks = []
for item in data:
    parts = ["Thema:" + item["title"] + "\n"]
    for post in item['posts']:
        # Posts without a poster are attributed to 'Unknown'.
        poster = 'Unknown' if post['poster'] is None else post['poster']
        parts.append(poster + ": " + post['post'] + "\n\n")
    topic_blocks.append("".join(parts))
# Consecutive topics are separated by an additional "\n\n".
output_text = "\n\n".join(topic_blocks)
print(output_text)

Write the continuous text to a file.

In [None]:
# Write the assembled conversations to disk. The context manager closes the
# file even if the write fails. The encoding is set explicitly so the German
# text (umlauts, ß) is stored as UTF-8 regardless of the platform default,
# matching the encoding used to read the source JSON.
with open("forum_posts.txt", "w", encoding="utf-8") as f:
    f.write(output_text)

Split the text into chunks with the tokenizer from the embedding model.

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the embedding model; it is passed to SentenceTransformer here and
# reused below for the tokenizer and the LangChain embedding wrapper.
EMBEDDING_MODEL = 'aari1995/German_Semantic_STS_V2'
model = SentenceTransformer(EMBEDDING_MODEL)
# The model's maximum sequence length serves as the chunk size when splitting.
chunk_size = model.get_max_seq_length()

Use the tokenizer from the model

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the embedding model so that chunk lengths
# are measured with the same tokenizer the model is named after.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)

def token_length_function(text_input):
    """Count the tokens of ``text_input`` using the module-level ``tokenizer``.

    Special tokens are excluded (``add_special_tokens=False``), so the result
    covers only the tokens produced for the text itself.
    """
    token_ids = tokenizer.encode(text_input, add_special_tokens=False)
    return len(token_ids)

Code for a more specialized tokenizer for the German language (SoMaJo); currently commented out.

In [None]:
# from somajo import SoMaJo
# from itertools import chain
# tokenizer = SoMaJo("de_CMC", split_camel_case=True)
# sentences = tokenizer.tokenize_text(["""Ca. 90min mit newmotion geladen, weil ich mit Maingau/EinfachStromLaden keine Verbindung über die App bekam. 
# Säule hat keinen RFID-Leser usw.
# 2. Buchse seit 2 Tagen mit Kommunalfahrzeug/EWV blockiert. """])
# # for sentence in sentences:
# #     for token in sentence:
# #         print(token.text)
# #     print()
# len(list(x.text for x in chain.from_iterable(sentences)))

Split the text first at forum-topic boundaries (`\n\n\n`), then at individual posts (`\n\n`), and finally at regular newlines (`\n`). Use `token_length_function` to keep each chunk within the model's `chunk_size`.

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# `chunk_size` is the model's maximum sequence length, which has to hold the
# special tokens added at encode time as well (e.g. [CLS]/[SEP] for BERT-style
# tokenizers). `token_length_function` counts tokens WITHOUT them, so a chunk
# of exactly `chunk_size` tokens would be truncated when embedded. Reserve
# that room up front.
max_chunk_tokens = chunk_size - tokenizer.num_special_tokens_to_add(pair=False)

# Split at topic boundaries first, then posts, then single newlines, measuring
# length in tokens rather than characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_chunk_tokens,
    chunk_overlap=50,
    length_function=token_length_function,
    separators=["\n\n\n", "\n\n", "\n"],
)
chunks = splitter.split_text(output_text)
# for i, chunk in enumerate(chunks):
#     print(f"Chunk #{i} with length {len(chunk)} characters and {token_length_function(chunk)} tokens: \n{chunk}\n\n\n")

In [None]:
# Distribution of chunk token sizes
# import matplotlib.pyplot as plt
# plt.hist([token_length_function(chunk) for chunk in chunks])

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Options for the LangChain wrapper: run on CPU and normalize the embeddings.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)

# LangChain-compatible wrapper around the same embedding model; it is handed to
# the vector store below.
embedding_model_wrapper = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Embed all chunks with the SentenceTransformer instance, using the same
# normalization setting as the wrapper.
embedding_output = model.encode(chunks, show_progress_bar=True, **encode_kwargs)

In [None]:
# Report how many embeddings were produced and the length of each one.
num_embeddings = len(embedding_output)
embedding_size = len(embedding_output[0])
print(f"There are {num_embeddings} embeddings of size {embedding_size}")

In [None]:
from langchain_community.vectorstores import FAISS

def print_matches(matches):
    """Print every match as its score followed by the matched text.

    Args:
        matches: Iterable of indexable pairs whose first element exposes a
            ``page_content`` attribute and whose second element is the score.

    NOTE(review): the label says "similarity", but LangChain's FAISS
    ``similarity_search_with_score`` reportedly returns a distance by default
    (lower = closer) — confirm the wording is intended.
    """
    for match in matches:
        document, score = match[0], match[1]
        print(f"\n\nMatch with similarity {score}:\n{document.page_content}")

# Materialise the (text, embedding) pairs as a list. A bare zip() is a one-shot
# iterator that is exhausted after its first consumer, so inspecting or reusing
# `text_embedding_pairs` in another cell would silently yield nothing.
text_embedding_pairs = list(zip(chunks, embedding_output))
# Build the FAISS index from the precomputed embeddings; the wrapper is used to
# embed incoming queries.
vector_store = FAISS.from_embeddings(text_embedding_pairs, embedding_model_wrapper)
# Save the index locally under "forum_index" for later reuse.
vector_store.save_local("forum_index")
answer1 = vector_store.similarity_search_with_score("Kind wacht stündlich auf")
print_matches(answer1)

In [None]:
# Example query 2: look up the closest chunks and print them with their scores.
query2 = "Was sind Hexenstunden?"
answer2 = vector_store.similarity_search_with_score(query2)
print_matches(answer2)

In [None]:
# Example query 3: look up the closest chunks and print them with their scores.
query3 = "Wer kümmert sich um das Kind nachts?"
answer3 = vector_store.similarity_search_with_score(query3)
print_matches(answer3)

In [None]:
# Example query 4: look up the closest chunks and print them with their scores.
query4 = "Wann darf man mit Kind ins Tropical Island?"
answer4 = vector_store.similarity_search_with_score(query4)
print_matches(answer4)