In [7]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m31.4/31.4 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [10]:
import os
import re
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline


In [22]:
def load_whatsapp_chat(path: str) -> list:
    """Read a WhatsApp chat export and return one string per message.

    A message starts on a line shaped like
    ``[DD/MM/YYYY, HH:MM:SS] Sender: text``. Every following line that does
    not look like a new message header is treated as a continuation of the
    current message and re-attached with a newline.

    Args:
        path: Path to the exported chat text file (UTF-8, with or without BOM).

    Returns:
        List of message strings in file order, header included. Lines that
        appear before the first recognised header are returned together as
        the first element. An empty file yields an empty list.
    """
    # Exports may put an invisible direction mark (U+200E / U+200F) in front
    # of the opening bracket on some lines; without the optional prefix such
    # lines would be glued onto the previous message.
    start_re = re.compile(
        r'^[\u200e\u200f]*\[\d{1,2}/\d{1,2}/\d{4}, \d{2}:\d{2}:\d{2}\] [^:]+: '
    )

    # "utf-8-sig" drops a leading byte-order mark if present, so it neither
    # ends up inside the first message nor hides its header from the regex.
    with open(path, "r", encoding="utf-8-sig") as f:
        lines = f.read().splitlines()

    messages = []
    current = None  # lines of the message currently being assembled
    for line in lines:
        if current is None or start_re.match(line):
            # A new message begins: flush the one collected so far.
            if current is not None:
                messages.append("\n".join(current))
            current = [line]
        else:
            current.append(line)
    if current is not None:
        messages.append("\n".join(current))
    return messages

messages = load_whatsapp_chat('_chat.txt')
# Show how many messages were parsed, then a small sample of them.
# NOTE(review): the sample is real chat content; clear outputs before sharing.
print(len(messages))
print(messages[:5])

827
['[01/01/2020, 13:02:54] Caroline Sis: \u200eMessages and calls are end-to-end encrypted. Only people in this chat can read, listen to, or share them.', '[01/01/2020, 13:02:54] Caroline Sis: Happy new year baby ‚ù§‚ù§‚ù§', '[01/01/2020, 13:03:04] Caroline Sis: I‚Äôm trying to see if anyone recorded it', '[01/01/2020, 13:32:41] Anat Lorman: Happy new year my love üòç', '[20/01/2020, 11:51:20] Anat Lorman: Happy birthdayüíïüíïüíïüíïüíïüíïüíï']


In [23]:
def chunk_messages(messages: list, chunk_size: int = 5) -> list:
    """Group consecutive messages into fixed-size text chunks.

    Args:
        messages: Message strings, in the order they should be grouped.
        chunk_size: Number of messages per chunk; must be at least 1.

    Returns:
        List of chunks, each the messages of one group joined by a blank
        line. The last chunk holds the remainder and may be shorter. An
        empty ``messages`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every
    # message; zero would fail with an unrelated-looking range() error.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")
    return [
        "\n\n".join(messages[start:start + chunk_size])
        for start in range(0, len(messages), chunk_size)
    ]

# Preview only. A separate name keeps this cell from overwriting the global
# `chunks` list that the FAISS index is built from further down, so re-running
# it cannot desynchronise the index and the text it points to.
preview_chunks = chunk_messages(messages, chunk_size=4)
print(len(preview_chunks))
print(preview_chunks[0])

207
[01/01/2020, 13:02:54] Caroline Sis: ‚ÄéMessages and calls are end-to-end encrypted. Only people in this chat can read, listen to, or share them.

[01/01/2020, 13:02:54] Caroline Sis: Happy new year baby ‚ù§‚ù§‚ù§

[01/01/2020, 13:03:04] Caroline Sis: I‚Äôm trying to see if anyone recorded it

[01/01/2020, 13:32:41] Anat Lorman: Happy new year my love üòç


In [24]:
def create_embeddings(
    texts: list,
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
) -> "tuple[SentenceTransformer, np.ndarray]":
    """Load a sentence-embedding model and encode ``texts`` with it.

    Args:
        texts: Strings to embed.
        model_name: Name of the SentenceTransformer model to load. The model
            is loaded on every call.

    Returns:
        A ``(embedder, embeddings)`` pair: the loaded model, so the caller can
        encode queries with the same model, and the encoded texts as a NumPy
        array (``convert_to_numpy=True``). Downstream code reads
        ``embeddings.shape[1]``, i.e. expects one row per text.
    """
    embedder = SentenceTransformer(model_name)
    embs = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    return embedder, embs

def build_faiss_flatl2_index(index_vectors: np.ndarray) -> faiss.Index:
    """Create a flat L2 FAISS index and fill it with the given vectors.

    Args:
        index_vectors: 2-D array of vectors; its second axis gives the
            dimensionality the index is created with.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    flat_index = faiss.IndexFlatL2(index_vectors.shape[1])
    flat_index.add(index_vectors)
    return flat_index

def faiss_search(query_vectors: np.ndarray, index: faiss.Index, k: int = 3):
    """Run a k-nearest-neighbour search on ``index``.

    Args:
        query_vectors: Query vectors passed straight to ``index.search``.
        index: Index to search.
        k: Number of neighbours requested per query.

    Returns:
        ``(distances, indices)`` exactly as produced by ``index.search``.
    """
    nearest_distances, nearest_ids = index.search(query_vectors, k)
    return (nearest_distances, nearest_ids)

# Build the retrieval corpus: chunk the chat, embed every chunk, index the vectors.
chunks = chunk_messages(messages, chunk_size=5)
embedder, embs = create_embeddings(chunks)
index = build_faiss_flatl2_index(embs)
# Retrieval maps FAISS result ids back to positions in `chunks`, so the two
# must always describe the same set of chunks.
assert index.ntotal == len(chunks), "FAISS index and chunks list are out of sync"


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [25]:
def retrieve_messages(query: str, embedder: SentenceTransformer, index: faiss.Index, chunks: list, k: int = 3):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Free-text query; encoded with ``embedder``.
        embedder: Model used to embed the query.
        index: Index searched through ``faiss_search``.
        chunks: Chunk texts; result ids from the index are used as positions
            in this list.
        k: Number of neighbours to request.

    Returns:
        List of matching chunk texts in the order the index returned them;
        result slots holding ``-1`` are left out.
    """
    query_vec = embedder.encode([query], convert_to_numpy=True)
    _distances, neighbour_ids = faiss_search(query_vec, index, k)

    # Only one query was sent, so only the first row of ids is relevant.
    hits = []
    for chunk_id in neighbour_ids[0]:
        if chunk_id == -1:
            # -1 marks an empty result slot, not a real position.
            continue
        hits.append(chunks[chunk_id])
    return hits

# Demo query: fetch the three chunks closest to a sample search phrase.
retrieved = retrieve_messages(
    query='Charlotte tests',
    embedder=embedder,
    index=index,
    chunks=chunks,
    k=3,
)
print(retrieved)


['[26/09/2020, 23:37:44] Caroline Sis: Charlotte is the same she changed subject and stuf\n\n[26/09/2020, 23:37:48] Anat Lorman: The city is Haifa where we had the last trip before your bus dropped us when you were visiting\n\n[26/09/2020, 23:37:57] Caroline Sis: Ohhh okay\n\n[26/09/2020, 23:38:01] Anat Lorman: What did she decide to study?\n\n[26/09/2020, 23:38:05] Caroline Sis: Chemistry', '[06/03/2020, 18:20:51] Anat Lorman: So how is everybody? Charlotte and her exams\n\n[06/03/2020, 18:22:20] Caroline Sis: Awww ‚ù§ we miss you too , we were talking about you the other day ?\n\n[06/03/2020, 18:22:29] Caroline Sis: Everyone‚Äôs good\n\n[06/03/2020, 18:22:39] Caroline Sis: She has mock exams next week so she‚Äôs a bit stressed\n\n[06/03/2020, 18:27:35] Anat Lorman: Good luck!!!!!!!!!', '[15/10/2020, 20:21:53] Anat Lorman: Charlotte‚Äôs bday is on the 22 right?\n\n[15/10/2020, 20:24:50] Caroline Sis: Yess\n\n[15/10/2020, 20:36:41] Anat Lorman: Ok \nAt first I thought it was yesterday\