In [1]:
import pandas as pd
import re

In [20]:
fairy_tales = pd.read_csv("../data/fairy_tales.csv")

chunks = []

for title, text in zip(fairy_tales["title"], fairy_tales["text"]):
    sentences = [s.strip().lower() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

    for sentence in sentences:
        chunks.append(sentence)
                
# sentences_per_chunk = 3
# overlap_sentences = 1

# for title, text in zip(fairy_tales["title"], fairy_tales["text"]):
#     sentences = [s.strip().lower() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

#     i = 0
#     while i < len(sentences):
#         chunk = " ".join(sentences[i:i + sentences_per_chunk])
#         chunks.append(f"[{title.lower()}] {chunk}")
#         i += sentences_per_chunk - overlap_sentences

print(f"chunks: {len(chunks)}")

chunks: 905


In [3]:
preprocessor_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/2"

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

  from pkg_resources import parse_version


In [5]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(preprocessor_url)
encoder_inputs = preprocessor(text_input)

In [6]:
encoder = hub.KerasLayer(encoder_url, trainable=True)
outputs = encoder(encoder_inputs)

pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

In [21]:
embedding_model = tf.keras.Model(text_input, pooled_output)
embeddings = embedding_model(tf.constant(chunks))
print(f"shape: {embeddings.shape}")

shape: (905, 128)


In [8]:
import numpy as np

In [22]:
chunk_embeddings = tf.math.l2_normalize(embeddings, axis=1)

def answer_question(question, chunks, embedding_model, chunk_embeddings, top_k=3):
    q = tf.math.l2_normalize(embedding_model(tf.constant([question])), axis=1)
    
    sims = tf.matmul(q, chunk_embeddings, transpose_b=True)[0]
    
    top = tf.argsort(sims, direction="DESCENDING")[:top_k]
    return [(chunks[i], float(sims[i])) for i in top.numpy()]

In [23]:
question = "Мысық нені төгеді?"
results = answer_question(question, chunks, embedding_model, embeddings, top_k=5)

for chunk, similarity in results:
    print(f"Relevance: {similarity:.4f} \nChunk: {chunk}\n")

Relevance: 8.1032 
Chunk: үйіңде не бар?

Relevance: 8.0903 
Chunk: қайтейін амал бар ма?

Relevance: 8.0699 
Chunk: тұлпарға жер алыс па?

Relevance: 8.0530 
Chunk: қалай болады?

Relevance: 8.0515 
Chunk: тұлпарға жол алыс па?

