#### Text Extraction & Preprocessing

In [None]:
import pymupdf
import re

In [None]:
path = "../data/paper.pdf"
out_path = "../data/paper_extract.txt"

# Stream the PDF one page at a time into a binary output file, so the
# encoding of the extract is fixed explicitly (UTF-8) instead of depending
# on the platform default.
with pymupdf.open(path) as doc, open(out_path, "wb") as out:
  for page in doc:
    # First turn every run of newlines into a single space, then squeeze any
    # remaining run of two or more whitespace characters down to one space.
    flattened = re.sub(r'\n+', ' ', page.get_text())
    flattened = re.sub(r'\s{2,}', ' ', flattened)
    out.write(flattened.encode("utf8"))

#### Text Splitting

In [15]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split on the splitter's default separators into chunks of at most 600
# characters (measured with len), with 100 characters shared between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=600,
  chunk_overlap=100,
  length_function=len,
  is_separator_regex=False,
)

# The extract was written as UTF-8 bytes, so it must be decoded as UTF-8 here.
# Relying on the platform default encoding corrupts (or fails on) non-ASCII
# characters on systems whose locale is not UTF-8.
with open(out_path, encoding="utf8") as file:
    text = file.read()

# The file is closed by now; chunking only needs the in-memory string.
texts = text_splitter.create_documents([text])
text_chunks = [doc.page_content for doc in texts]

#### Generate Embeddings

In [16]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by its checkpoint name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed all text chunks in a single call, showing a progress bar while the
# batches are processed.
embeddings = model.encode(
    text_chunks,
    show_progress_bar=True,
)

Batches: 100%|██████████| 3/3 [00:00<00:00,  5.98it/s]


#### Vector Store

In [24]:
import faiss

# The second axis of the embedding matrix is the vector dimensionality the
# index has to be created with.
d = embeddings.shape[1]

# Flat (exhaustive) index using L2 distance; every chunk embedding is added
# to it as-is.
index = faiss.IndexFlatL2(d)
index.add(embeddings)

print(f"Total sentences indexed: {index.ntotal}")

Total sentences indexed: 83


#### Similarity Search

In [25]:
query_sentence = "General Idea of the document"
# The query is wrapped in a list so it is encoded as a batch of one.
query_embedding = model.encode([query_sentence])

# Number of nearest neighbours to retrieve.
k = 2
distances, indices = index.search(query_embedding, k)

print(f"Query: {query_sentence}")

print("Most similar sentences:")
# Row 0 holds the results for the single query submitted above.
for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
    # FAISS pads the result with label -1 when the index holds fewer than k
    # vectors. A negative label would otherwise wrap around and silently
    # print the last chunk as if it were a match, so skip those slots.
    if idx < 0:
        continue
    print(f"{rank}: {text_chunks[idx]} (Distance: {dist})")

Query: General Idea of the document
Most similar sentences:
1: The Turkish Online Journal of Educational Technology, 10(3), 203-214. [1] It should be noted that the reason for choosing this sample was for convenience since they were accessible to the researcher (Dörnyei, 2007, p. 98-99). [2] It should be noted that in order to ward off potential misunderstandings and to allow the participants to freely elaborate upon their answers, the interviews were conducted in Polish. [3] Both here and throughout the remainder of the paper, the excerpts are translations of the students’ responses by the present author. (Distance: 1.4060250520706177)
2: (2001), the notion of autonomy was introduced and popularized in 1981 by Henri Holec in his seminal report for the Council of Europe entitled Autonomy in Foreign Language Learning in which the researcher defined autonomy in the context of language learning as “the ability to take charge of one’s own learning” (Holec, 1981, p. 3). Holec’s idea of auto