In [4]:
import json

#create the docs_raw variable from the .json file
with open('documents.json','rt') as f_in:
    docs_raw = json.load(f_in)

documents = []


#siccome elastic search vuole tutti i documenti su un livello, mettiamo ad ogni documento il suo corso
#prima i documenti erano suddivisi in sezione, in base al corso
#guarda le differenze tra docs_raw e documents
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [None]:
documents

In [None]:
pip install sentence_transformers==2.7.0

In [7]:
from sentence_transformers import SentenceTransformer

In [9]:
#this model has 768 length
model = SentenceTransformer("all-mpnet-base-v2")

In [None]:
model.encode("this is a simple sentence")

In [15]:
operations=[]

for doc in documents:
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

In [18]:
pip install elasticsearch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [22]:
#initiate the elastic search connection
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')
es_client.info()


ObjectApiResponse({'name': 'd28c359d8359', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'pVV6eBd3R_CG5STfsQChqw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [24]:
#creating a mapping (aka index_settings). It defines how a document is stored and indexed
index_settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                "course": {"type": "keyword"},
                #tipo -> dense_vector, dimensione -> 768, la metrica di similarità che andrà ad usare
                "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
            }
        }
    }


In [25]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [26]:
#add documents into index
import tqdm
#dont run this cell several times, or (idk why) some doc duplicated. If so, try to delete the index ->
#es_client.indices.delete(index=index_name)
from tqdm.auto import tqdm
for doc in tqdm(documents):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)
    
    

  0%|          | 0/948 [00:00<?, ?it/s]

In [36]:
#create a query

search_term = "windows or mac?"
vector_search_term = model.encode(search_term)

In [38]:
#il campo "field" serve per dire al VDB di andare a cercare nel suddetto campo
#il campo "k" serve per cercare i k vettori più vicini alla query
#il campo "num_candidates" serve per indicare in quanti documenti andrà fatta la ricerca

query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [40]:
#let's search into VDB
#il campo "knn" serve per indicare 
#il campo "source" serve per indicare quali campi si vuole nella risposta

res = es_client.search(index=index_name, knn=query, source=["text","section","question","course"])
res['hits']['hits']

[{'_index': 'course-questions',
  '_id': 'dxMy1pEBAs_YWaRzhNRK',
  '_score': 0.7147919,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': 'ihMy1pEBAs_YWaRz2df9',
  '_score': 0.61347336,
  '_source': {'question': 'WSL instructions',
   'course': 'mlops-zoomcamp',
   'section': 'Module 1: Introduction',
   'text': 'If you wish to use WSL on your windows machine, here are the setup instructions:\nCommand: Sudo apt install wget\nGet Anaconda download address here. wget <download address>\nTurn on Docker Desktop WFree Download | AnacondaSL2\nCommand: git clone <github repository address>\nVSCODE on WSL\nJupyter: pip3 install jupyter\nAdded by Gregory Morris (gwm1980@gmail.com)\nAll in all softwares a

In [45]:
#per fare una corretta ricerca semantica, dobbiamo andare a trasformare la nostra query in un vettore
#quando facciamo una ricerca normale(non semantica), i risultati avranno uno score compreso tra 0 e 1


knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k":5,
    "num_candidates": 10000
}

#il campo "explain" serve per avere più informazioni su come lo score è calcolato
#si può creare una propria scoring function
response = es_client.search(
    index=index_name,
    query={
        "match":{
            "course": "data-engineering-zoomcamp"
        },
    },
    knn=knn_query,
    source=["text","section","question","course"],
    size=5,
    explain=True
)

response['hits']['hits']

[{'_shard': '[course-questions][0]',
  '_node': 'jSbKn3OlSBaaJsNAgpI0DQ',
  '_index': 'course-questions',
  '_id': 'dxMy1pEBAs_YWaRzhNRK',
  '_score': 1.4937059,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'},
  '_explanation': {'value': 1.4937059,
   'description': 'sum of:',
   'details': [{'value': 0.7147919,
     'description': 'within top k documents',
     'details': []},
    {'value': 0.778914,
     'description': 'weight(course:data-engineering-zoomcamp in 35) [PerFieldSimilarity], result of:',
     'details': [{'value': 0.778914,
       'description': 'score(freq=1.0), computed as boost * idf * tf from:',
       'details': [{'value': 2.2, 'description': 'boost', 'details': []},
        {'value': 0.778914,
         'descriptio

ObjectApiResponse({'took': 20, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 438, 'relation': 'eq'}, 'max_score': 1.4937059, 'hits': [{'_index': 'course-questions', '_id': 'dxMy1pEBAs_YWaRzhNRK', '_score': 1.4937059, '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully', 'section': 'General course-related questions', 'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?', 'course': 'data-engineering-zoomcamp', 'text_vector': [-0.026965461671352386, -0.000626126304268837, -0.01662949100136757, 0.05285150930285454, 0.05476527288556099, -0.03133990615606308, 0.029942581430077553, -0.04808562621474266, 0.04467551037669182, 0.005839474033564329, 0.016233040019869804, 0.012001154012978077, -0.031222281977534294, 0.016600528731942177, -0.04886901378631592, -0.06496307998895645, 0.046434223651885986, -0.009297756478190422, -0.0642

In [35]:
#cella per vedere quanti documenti sono presenti nel VDB

search_query = {
    "size": 10000,
    "query": {
        "match_all": {}
    }
        }
    

response = es_client.search(index=index_name, body=search_query)
len(response['hits']['hits'])

948