In [1]:
!pip install datasets faiss-cpu==1.7.4 chromadb==0.4.22 sentence-transformers==2.3.1

Collecting faiss-cpu==1.7.4
  Downloading faiss-cpu-1.7.4.tar.gz (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting chromadb==0.4.22
  Downloading chromadb-0.4.22-py3-none-any.whl.metadata (7.3 kB)
Collecting sentence-transformers==2.3.1
  Downloading sentence_transformers-2.3.1-py3-none-any.whl.metadata (11 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb==0.4.22)
  Downloading chroma-hnswlib-0.7.3.tar.gz (31 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting posthog>=2.4.0 (fr

In [None]:
from datasets import load_dataset

# Hugging Face Hub repositories used to build the retrieval corpus.
QNA_REPO_ID = "sadeem-ai/arabic-qna"
NEWS_REPO_ID = "arbml/SANAD"

# QnA records: their passages become documents, their questions become queries.
qna_dataset = load_dataset(path=QNA_REPO_ID)

# News articles: used as additional documents in the index.
news_dataset = load_dataset(path=NEWS_REPO_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ar-qna-train-data-hf.csv: 0.00B [00:00, ?B/s]

ar-qna-test-data-hf.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1030 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

In [None]:
# Minimum number of characters an article's "Text" must have to be kept.
MIN_NEWS_CHARS = 100


def _is_long_enough(example):
    """Return True when the record's "Text" field has at least MIN_NEWS_CHARS characters."""
    return len(example["Text"]) >= MIN_NEWS_CHARS


news_dataset = news_dataset.filter(_is_long_enough)
news_dataset

In [None]:
# Shuffle with a fixed seed so the slice of news articles taken when building
# the corpus is the same on every run.
news_dataset = news_dataset.shuffle(seed=42)

In [None]:
def _has_answer(example):
    """Return True for records whose "has_answer" field compares equal to True."""
    return example["has_answer"] == True  # noqa: E712 - equality (not identity) is intentional


# Drop QnA records that are flagged as having no answer.
qna_dataset = qna_dataset.filter(_has_answer)

In [None]:
# Upper bound on the number of news articles appended after the QnA passages.
N_NEWS_DOCS = 30_000

# Corpus layout: QnA passages first, then up to N_NEWS_DOCS news articles.
# list(...) keeps the concatenation valid whether the installed `datasets`
# release returns a plain list or a lazy column object for column access.
doc_texts = (
    list(qna_dataset["train"]["text"])
    + list(news_dataset["train"][:N_NEWS_DOCS]["Text"])
)

# doc_questions[i] comes from the same QnA record as doc_texts[i]; the
# accuracy cells rely on this positional alignment.
doc_questions = list(qna_dataset["train"]["question"])

In [None]:
# Total number of documents (QnA passages + news articles) to be indexed.
len(doc_texts)

In [None]:
# One metadata record per document, in the same order as doc_texts.
# QnA passages keep the source/title of the record they came from.
metadata = [
    {"source": rec["source"], "title": rec["title"]}
    for rec in qna_dataset["train"]
]

# The news articles that follow carry no source/title, so pad with empty
# values. The count is derived from doc_texts instead of being hard-coded, so
# both lists stay aligned even when fewer than 30,000 articles were available.
n_news_docs = len(doc_texts) - len(metadata)
metadata += [{"source": "", "title": ""} for _ in range(n_news_docs)]

In [None]:
# Every document needs exactly one metadata record: the vector stores are fed
# both lists in parallel, so fail early if they drifted apart.
assert len(metadata) == len(doc_texts), (
    f"metadata ({len(metadata)}) and doc_texts ({len(doc_texts)}) are misaligned"
)
len(metadata)

In [None]:
# String ids "0", "1", ... — one per entry of doc_texts, in corpus order.
docs_ids = list(map(str, range(len(doc_texts))))

In [None]:
# One id per document, so this matches len(doc_texts).
len(docs_ids)

In [None]:
# Spot-check one position: the question, id and metadata stored at index 49.
doc_questions[49], docs_ids[49], metadata[49]

## Text to Vectors

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Embedding model and the matching vector size (`dim` sizes the FAISS index
# and the query reshapes further down, so keep the two in sync).
model_id = "sentence-transformers/distiluse-base-multilingual-cased-v2"
dim = 512

# model_id = "asafaya/bert-large-arabic"
# dim = 1024

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer(model_id, device=device)

In [None]:
# Embed every document; these vectors are loaded into both ChromaDB and FAISS.
encoded_docs = model.encode(doc_texts, show_progress_bar=True)

In [None]:
# Embed the QnA questions with the same model; they are the queries used in
# the speed and accuracy comparisons.
encoded_questions = model.encode(doc_questions, show_progress_bar=True)

## Vector Databases

### ChromaDB

In [None]:
import chromadb

# Persistent Chroma client rooted at ./chromadb-ar-docs (relative to the working directory).
chroma_client = chromadb.PersistentClient(path="./chromadb-ar-docs")

In [None]:
# get_or_create_collection keeps this cell re-runnable: the persistent client
# keeps the collection on disk, and create_collection raises when a collection
# with the same name already exists.
# "hnsw:space": "cosine" makes the collection rank by cosine distance.
collection = chroma_client.get_or_create_collection(
    name="ar_docs_34k",
    metadata={"hnsw:space": "cosine"}
)

In [None]:
# Chroma rejects a single add() that exceeds the client's maximum batch size,
# so insert the corpus in fixed-size slices instead of one ~34k-record call.
# NOTE(review): 5_000 is a conservative choice — confirm against the limit
# reported by the installed Chroma build.
ADD_BATCH_SIZE = 5_000

for start in range(0, len(docs_ids), ADD_BATCH_SIZE):
    end = start + ADD_BATCH_SIZE
    collection.add(
        documents=doc_texts[start:end],
        # Plain nested lists, matching how query embeddings are passed below.
        embeddings=encoded_docs[start:end].tolist(),
        metadatas=metadata[start:end],
        ids=docs_ids[start:end]
    )

In [None]:
## Search: embed one example question and fetch its nearest documents from ChromaDB
question = "ما السبب في صغر الأسنان بالمقارنة مع حجم الفكين؟"
# A single string is encoded here, so the embedding is passed as one flat list.
question_embed = model.encode(question)

# Ask for the 3 closest documents to the question embedding.
results = collection.query(
    query_embeddings=question_embed.tolist(),
    n_results=3
)

print(results)

### FAISS

In [None]:
import faiss
import numpy as np
from copy import deepcopy

In [None]:
# normalize_L2 rewrites its argument in place, so normalise an explicit copy
# and leave encoded_docs untouched. np.array copies by default; float32 and
# C order are requested explicitly because that is the layout handed to FAISS.
norm_encoded_docs = np.array(encoded_docs, dtype=np.float32, order="C")
faiss.normalize_L2(norm_encoded_docs)

In [None]:
# Exact (flat) inner-product index wrapped in an ID map so results come back
# with our own document ids. On L2-normalised vectors the inner product is the
# cosine similarity.
faiss_index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))

# add_with_ids needs a 1-D int64 array of ids; docs_ids holds strings (the
# form ChromaDB wants), so convert them before adding.
faiss_ids = np.array([int(doc_id) for doc_id in docs_ids], dtype=np.int64)

faiss_index.add_with_ids(norm_encoded_docs, faiss_ids)

In [None]:
# Same example question as in the ChromaDB section, now searched with FAISS.
question = "ما السبب في صغر الأسنان بالمقارنة مع حجم الفكين؟"
# Encode a one-element list so the embedding comes back as a 2-D batch
# (presumably shape (1, dim) — the layout normalize_L2/search are given here).
question_embed = model.encode([question])

# Normalise in place so the inner-product search matches the normalised index vectors.
faiss.normalize_L2(question_embed)

# Top-3 neighbours; elsewhere in this notebook the result is unpacked as (scores, ids).
results = faiss_index.search(question_embed, 3)

In [None]:
# Show the raw FAISS search result for the example question.
print(results)

In [None]:
## Save: persist the FAISS index and the document payload next to each other
import os
import pickle

# open(..., "wb") does not create missing parent directories, so make sure the
# target folder exists before writing into it.
os.makedirs("./faiss-ar-docs", exist_ok=True)

# NOTE(review): faiss.write_index / faiss.read_index is FAISS's own
# serialisation path; pickle is kept here so the Load cell keeps working.
with open("./faiss-ar-docs/index.pickle", "wb") as handle:
    pickle.dump(faiss_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

# FAISS stores only vectors and ids, so texts, ids and metadata are saved separately.
with open("./faiss-ar-docs/data.pickle", "wb") as handle:
    pickle.dump({
        "data": doc_texts,
        "docs_ids": docs_ids,
        "metadata": metadata
    }, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## Load: read back the two pickle files written by the Save cell
import pickle

# NOTE: pickle.load can execute arbitrary code — only load files this notebook
# produced itself, never pickles from an untrusted source.
with open("./faiss-ar-docs/index.pickle", "rb") as handle:
    loaded_faiss_index = pickle.load(handle)

# Dict with the keys "data", "docs_ids" and "metadata" as written by the Save cell.
with open("./faiss-ar-docs/data.pickle", "rb") as handle:
    loaded_faiss_data = pickle.load(handle)

## Let's Compare

### Retrieving Speed

In [None]:
import time

**Dim: 512**

---------
`ChromaDB`: 4037 -
12.698187112

`FAISS`: 4037 -
2.043501231999997

---------
**Dim: 1024**

`ChromaDB`: 4037
15.938859471999997

`FAISS`: 4037
4.1828746040000055


In [None]:
# Time one top-3 ChromaDB lookup per question embedding
# (encoded_questions holds one row per entry of doc_questions).
t0 = time.process_time()

for ques in encoded_questions:
    results = collection.query(query_embeddings=ques.tolist(), n_results=3)

# Report the number of queries and the elapsed process (CPU) time in seconds.
print("ChromaDB:", len(doc_questions))
print(time.process_time() - t0)

In [None]:
# normalize_L2 works in place and reshape() hands it a view, so normalising
# rows of encoded_questions directly would silently rewrite that shared array.
# Work on a private copy, taken before the clock starts.
query_vectors = encoded_questions.copy()

t0 = time.process_time()

for i in range(len(doc_questions)):

    # FAISS expects a 2-D (n_queries, dim) batch, here a batch of one.
    ques = query_vectors[i].reshape(1, dim)

    faiss.normalize_L2(ques)

    results = faiss_index.search(ques, 3)


# Report the number of queries and the elapsed process (CPU) time in seconds.
print("FAISS:", len(doc_questions))
print(time.process_time() - t0)

### Accuracy

`ChromaDB`

```
Model ID: sentence-transformers/distiluse-base-multilingual-cased-v2
----
Valid: 1201
Valid%: 0.2974981421847907
----
Similar: 864
Similar%: 0.21402031211295516
----
InValid: 1972
InValid%: 0.48848154570225416
----


Model ID: asafaya/bert-large-arabic
----
Valid: 586
Valid%: 0.14515729502105523
----
Similar: 427
Similar%: 0.10577161258360168
----
InValid: 3024
InValid%: 0.7490710923953431
----

```

`FAISS`

```
Model ID: sentence-transformers/distiluse-base-multilingual-cased-v2
----
Valid: 1374
Valid%: 0.3403517463462968
----
Similar: 947
Similar%: 0.23458013376269507
----
InValid: 1716
InValid%: 0.4250681198910082
----

Model ID: asafaya/bert-large-arabic
----
Valid: 703
Valid%: 0.17413921228635126
----
Similar: 518
Similar%: 0.12831310378994304
----
InValid: 2816
InValid%: 0.6975476839237057
----
```

In [None]:
# Raw top-3 ChromaDB result for every question embedding, kept in question
# order (encoded_questions holds one row per entry of doc_questions).
chroma_results = [
    collection.query(query_embeddings=question_vec.tolist(), n_results=3)
    for question_vec in encoded_questions
]

In [None]:
# Grade ChromaDB's best hit for every question:
#   valid   - the hit is the document at the question's own position
#   similar - a different document whose metadata "source" matches
#   invalid - anything else
n_questions = len(doc_questions)

chroma_insights = dict.fromkeys(("valid", "similar", "invalid"), 0)

for q_idx in range(n_questions):
    expected_id = docs_ids[q_idx]
    top_hit_id = chroma_results[q_idx]["ids"][0][0]

    expected_source = metadata[q_idx]["source"]
    top_hit_source = metadata[int(top_hit_id)]["source"]

    if str(expected_id) == str(top_hit_id):
        outcome = "valid"
    elif expected_source == top_hit_source:
        outcome = "similar"
    else:
        outcome = "invalid"

    chroma_insights[outcome] += 1

# Shares are stored as fractions of all questions (0..1), not multiplied by 100.
chroma_insights["valid_percentage"] = chroma_insights["valid"] / n_questions
chroma_insights["similar_percentage"] = chroma_insights["similar"] / n_questions
chroma_insights["invalid_percentage"] = chroma_insights["invalid"] / n_questions

print("Model ID:", model_id)
print("----")
for count_label, count_key, share_label, share_key in (
    ("Valid:", "valid", "Valid%:", "valid_percentage"),
    ("Similar:", "similar", "Similar%:", "similar_percentage"),
    ("InValid:", "invalid", "InValid%:", "invalid_percentage"),
):
    print(count_label, chroma_insights[count_key])
    print(share_label, chroma_insights[share_key])
    print("----")

In [None]:
# normalize_L2 works in place; normalising reshaped rows of encoded_questions
# would rewrite that shared array through the view. Normalise a private
# float32 copy once up front instead, then search it row by row.
norm_questions = np.array(encoded_questions, dtype=np.float32, order="C")
faiss.normalize_L2(norm_questions)

faiss_results = []

for i in range(len(doc_questions)):

    # Slice (not index) so the query keeps its 2-D (1, dim) batch shape.
    scores, ids = faiss_index.search(norm_questions[i:i + 1], 3)

    faiss_results.append({
        "scores": scores,
        "ids": ids
    })


In [None]:
# Grade FAISS's best hit for every question:
#   valid   - the hit is the document at the question's own position
#   similar - a different document whose metadata "source" matches
#   invalid - anything else
n_questions = len(doc_questions)

faiss_insights = dict.fromkeys(("valid", "similar", "invalid"), 0)

for q_idx in range(n_questions):
    expected_id = docs_ids[q_idx]
    top_hit_id = faiss_results[q_idx]["ids"][0][0]

    expected_source = metadata[q_idx]["source"]
    top_hit_source = metadata[int(top_hit_id)]["source"]

    if str(expected_id) == str(top_hit_id):
        outcome = "valid"
    elif expected_source == top_hit_source:
        outcome = "similar"
    else:
        outcome = "invalid"

    faiss_insights[outcome] += 1

# Shares are stored as fractions of all questions (0..1), not multiplied by 100.
faiss_insights["valid_percentage"] = faiss_insights["valid"] / n_questions
faiss_insights["similar_percentage"] = faiss_insights["similar"] / n_questions
faiss_insights["invalid_percentage"] = faiss_insights["invalid"] / n_questions

print("Model ID:", model_id)
print("----")
for count_label, count_key, share_label, share_key in (
    ("Valid:", "valid", "Valid%:", "valid_percentage"),
    ("Similar:", "similar", "Similar%:", "similar_percentage"),
    ("InValid:", "invalid", "InValid%:", "invalid_percentage"),
):
    print(count_label, faiss_insights[count_key])
    print(share_label, faiss_insights[share_key])
    print("----")