In [80]:
import pandas as pd
import json
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
# from openai import OpenAI
# client = OpenAI()
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

# Ingestion

In [81]:
with open('../data/arsonor_chunks_300_50.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [72]:
es = Elasticsearch("http://localhost:9200")

In [42]:
# index_name = "arsonor_chunks_300"

In [51]:
# if es.indices.exists(index=index_name):
#     es.indices.delete(index=index_name)

In [44]:
# Create index if not already created
# if not es.indices.exists(index=index_name):
#     es.indices.create(index=index_name, body={
#         "mappings": {
#             "properties": {
#                 "article_id": {"type": "keyword"},
#                 "title": {"type": "text"},
#                 "url": {"type": "keyword"},
#                 "category": {"type": "keyword"},
#                 "tags": {"type": "text"},
#                 "chunk_id": {"type": "keyword"},
#                 "chunk_text": {"type": "text"},
#                 "embedding": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
#             }
#         }
#     })

In [73]:
# Load CamemBERT model for French content
camembert_tokenizer = AutoTokenizer.from_pretrained("camembert-base")
camembert_model = AutoModel.from_pretrained("camembert-base")

# Load mBERT model for technical fine-tuning
mbert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
mbert_model = AutoModel.from_pretrained("bert-base-multilingual-cased")

# Load paraphrase-multilingual-mpnet-base-v2 for semantic search
paraphrase_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')



In [82]:
def generate_embedding(text, embedding_model):
    if embedding_model == "camembert":
        inputs = camembert_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        outputs = camembert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
    
    elif embedding_model == "mbert":
        inputs = mbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        outputs = mbert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
    
    elif embedding_model == "paraphrase":
        embeddings = paraphrase_model.encode(text)
    
    else:
        raise ValueError("Unsupported model type. Choose 'camembert', 'mbert', or 'paraphrase'.")
    
    return embeddings

In [98]:
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)



Test indexing

In [99]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "article_id": {"type": "keyword"},
            "title": {"type": "text"},
            "url": {"type": "keyword"},
            "category": {"type": "keyword"},
            "tags": {"type": "text"},
            "chunk_id": {"type": "keyword"},
            "chunk_text": {"type": "text"},
            "chunk_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "title_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "tags_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "chunk_title_tag_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "chunks_300_mpnet"

es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'chunks_300_mpnet'})

In [100]:
for doc in tqdm(documents):
    chunk = doc['chunk_text']
    title = doc['title']
    tags = doc['tags']
    ctt = chunk + ' ' + title + ' ' + tags

    doc['chunk_vector'] = model.encode(chunk)
    doc['title_vector'] = model.encode(title)
    doc['tags_vector'] = model.encode(tags)
    doc['chunk_title_tag_vector'] = model.encode(ctt)

  0%|          | 0/572 [00:00<?, ?it/s]

In [101]:
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  0%|          | 0/572 [00:00<?, ?it/s]

Fin Test indexing

In [45]:
# def prepare_documents_for_indexing(docs, embedding_model):
#     for doc in docs:
#         embedding_vector = generate_embedding(doc['chunk_text'], embedding_model)
        
#         yield {
#             "_index": index_name,
#             "_id": doc['chunk_id'],
#             "_source": {
#                 "article_id": doc['article_id'],
#                 "title": doc['title'],
#                 "url": doc['url'],
#                 "category": doc['category'],
#                 "tags": doc['tags'],
#                 "chunk_id": doc['chunk_id'],
#                 "chunk_text": doc['chunk_text'],
#                 "embedding": embedding_vector
#             }
#         }

In [56]:
# Index the documents in bulk
# bulk(es, tqdm(prepare_documents_for_indexing(documents, 'paraphrase')))

# RAG flow

knn vector search:

In [17]:
def elastic_search_knn(query, embedding_model, category=None):
    vector = generate_embedding(query, embedding_model)
    
    # If category is provided, add the filter condition
    filter_conditions = []
    if category:
        filter_conditions.append({"term": {"category": category}})

    search_query = {
        "size": 10,
        "_source": ["chunk_id", "title", "tags", "chunk_text", "url"],
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["title", "tags", "chunk_text"],
                        "type": "best_fields"
                    }
                },
                "filter": filter_conditions
            }
        },
        "knn": {
            "field": "embedding",
            "query_vector": vector.tolist(),
            "k": 10,
            "num_candidates": 10000
        }
    }

    es_results = es.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in es_results["hits"]["hits"]]


In [77]:
def elastic_search_knn_test(field, vector, category):
    
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "category": category
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["chunk_id", "title", "tags", "chunk_text", "url", "category"]
    }

    es_results = es.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in es_results["hits"]["hits"]]


hybrid search with script_score query that combines text search with vector similarity:

In [207]:
def elastic_search_hybrid(query, embedding_model, category=None):
    vector = generate_embedding(query, embedding_model)
    
    # If category is provided, add the filter condition
    filter_conditions = []
    if category:
        filter_conditions.append({"term": {"category": category}})
    
    # Hybrid search query with BM25 and kNN (semantic search)
    search_query = {
        "size": 10,
        "_source": ["chunk_id", "title", "tags", "chunk_text", "url"],
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "multi_match": {
                                    "query": query,
                                    "fields": ["title", "tags", "chunk_text"],
                                    "type": "best_fields"
                                }
                            }
                        ],
                        "filter": filter_conditions
                    }
                },
                "functions": [
                    {
                        "script_score": {
                            "script": {
                                # Combines BM25 score and kNN similarity score
                                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1 + _score",
                                "params": {
                                    "query_vector": vector.tolist()
                                }
                            }
                        }
                    }
                ],
                "boost_mode": "replace"  # Replace BM25 score with the combined score
            }
        }
    }

    es_results = es.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in es_results["hits"]["hits"]]


### Prompt

In [20]:
prompt_template = """
You're an audio engineer and sound designer instructor for beginners.
You're particularly specialized in audio home-studio set-up, computer music production and audio post-production in general (editing, mixing and mastering). 
Answer the QUESTION based on the CONTEXT from our arsonor knowledge database (articles).
Use only the facts from the CONTEXT when answering the QUESTION.
Finally, recommend the top 3 Arsonor articles that are the best to read for answering this question.
For each recommended article, include both its title and URL.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
ARTICLE: {title}
KEYWORDS: {tags}
CONTENT: {chunk_text}
""".strip()


In [21]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context += entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

### Test prompt

In [24]:
query = 'Comment obtenir une musique de haute qualité au même niveau sonore que les autres?'
search_results = elastic_search_knn_test(query, 'paraphrase', 'LA POST-PROD')
prompt = build_prompt(query, search_results)
print(prompt)

You're an audio engineer and sound designer instructor for beginners.
You're particularly specialized in audio home-studio set-up, computer music production and audio post-production in general (editing, mixing and mastering). 
Answer the QUESTION based on the CONTEXT from our arsonor knowledge database (articles).
Use only the facts from the CONTEXT when answering the QUESTION.
Finally, recommend the top 3 Arsonor articles that are the best to read for answering this question.
For each recommended article, include both its title and URL.

QUESTION: Comment obtenir une musique de haute qualité au même niveau sonore que les autres?

CONTEXT:
ARTICLE: La gestion des niveaux sonores (3): Variations du loudness
KEYWORDS: balance tonale, dBA, dBC, loudness, LUFS, masquage, sonie, transitoire, volume sonore
CONTENT: – A haut niveau, plus de perception des basses donc risque d’en mettre pas assez. Ces considérations dépendent bien sûr de l’expérience individuelle de chacun en mixage. Cependan

### Final RAG flow

In [98]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [99]:
def rag(query, embedding_model, category):
    search_results = elastic_search_knn(query, embedding_model, category)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

### Example usage

In [139]:
category = 'LA POST_PROD'
query = 'Comment obtenir une musique de haute qualité au même niveau sonore que les autres?'
embedding_model = 'paraphrase'
response = rag(query, embedding_model, category)
print(response)

Pour obtenir une musique de haute qualité au même niveau sonore que les autres, il est essentiel de porter attention à plusieurs éléments clés lors du processus de mixage et de mastering :

1. **Mixage des Fréquences Médiums** : Un équilibre correct des fréquences médiums est crucial, car ces fréquences ne varient guère avec le volume et permettent de maintenir la perception des instruments à différents niveaux d'écoute. Par exemple, il est conseillé de traiter les sons de basse dans les hautes médiums pour s'assurer qu'ils soient perçus même à faible volume.

2. **Gérer la Dynamique** : Utiliser la dynamique de manière réfléchie pendant le mixage permet de diriger l’attention de l’auditeur sur les éléments les plus importants de la musique, tandis que la compression peut aider à stabiliser les niveaux perçus.

3. **Mastering et Loudness** : En phase de mastering, il est crucial de travailler sur la plage dynamique et le loudness pour que le morceau soit compétitif par rapport aux autr

# Retrieval evaluation

In [12]:
df_question = pd.read_csv('../data/ground-truth-300.csv')
df_question.head()

Unnamed: 0,question,category,chunk,article
0,Quel est l'impact de l'IA sur la post-producti...,LA POST-PROD,4615db39-1,4615db39
1,Comment les outils IA simplifient-ils le trava...,LA POST-PROD,4615db39-1,4615db39
2,Quels avantages l'IA apporte-t-elle aux artist...,LA POST-PROD,4615db39-1,4615db39
3,Comment un débutant peut-il améliorer ses prod...,LA POST-PROD,4615db39-1,4615db39
4,Quelle est l'évolution des outils audio pour l...,LA POST-PROD,4615db39-1,4615db39


In [13]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'question': "Quel est l'impact de l'IA sur la post-production audio et musicale",
 'category': 'LA POST-PROD',
 'chunk': '4615db39-1',
 'article': '4615db39'}

In [95]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['chunk']
        results = search_function(q)
        relevance = [d['chunk_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

Chunks 300_50, model 'camembert':

In [108]:
evaluate(ground_truth, lambda q: elastic_search_knn(q['question'], 'camembert', q['category']))

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.8748251748251749, 'mrr': 0.6097562160062151}

Chunks 300_50, model 'mbert':

In [112]:
evaluate(ground_truth, lambda q: elastic_search_knn(q['question'], 'mbert', q['category']))

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.877972027972028, 'mrr': 0.6076655289155277}

Chunks 300_50, model 'paraphrase', k=10:

In [117]:
evaluate(ground_truth, lambda q: elastic_search_knn(q['question'], 'paraphrase', q['category']))

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.8867132867132868, 'mrr': 0.6172445609945602}

Chunks 300_50, model 'paraphrase', k=30:

In [193]:
evaluate(ground_truth, lambda q: elastic_search_knn(q['question'], 'paraphrase', q['category']))

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.8895104895104895, 'mrr': 0.6158666333666325}

Chunks 350_30, model 'paraphrase'

In [125]:
evaluate(ground_truth, lambda q: elastic_search_knn(q['question'], 'paraphrase', q['category']))

  0%|          | 0/2215 [00:00<?, ?it/s]

{'hit_rate': 0.87313769751693, 'mrr': 0.6015434089361857}

Chunks 300_50, model 'paraphrase', hybrid search:

In [208]:
evaluate(ground_truth, lambda q: elastic_search_hybrid(q['question'], 'paraphrase', q['category']))

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.8846153846153846, 'mrr': 0.6128640803640799}

In [78]:
def chunk_paraphrase_knn(q):
    question = q['question']
    category = q['category']

    v_q = paraphrase_model.encode(question)

    return elastic_search_knn_test('chunk_vector', v_q, category)

In [79]:
evaluate(ground_truth, chunk_paraphrase_knn)

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.49965034965034966, 'mrr': 0.28753357753357806}

In [68]:
def chunk_title_tag_paraphrase_knn(q):
    question = q['question']
    category = q['category']

    v_q = generate_embedding(question, 'paraphrase')

    return elastic_search_knn_test('chunk_title_tag_vector', v_q, category)

In [69]:
evaluate(ground_truth, chunk_title_tag_paraphrase_knn)

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.49965034965034966, 'mrr': 0.28753357753357806}

In [86]:
def chunk_camembert_knn(q):
    question = q['question']
    category = q['category']

    v_q = generate_embedding(question, 'camembert')

    return elastic_search_knn_test('chunk_vector', v_q, category)

In [87]:
evaluate(ground_truth, chunk_camembert_knn)

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.1618881118881119, 'mrr': 0.05634254634254627}

In [96]:
def chunk_multiqa_knn(q):
    question = q['question']
    category = q['category']

    v_q = model.encode(question)

    return elastic_search_knn_test('chunk_vector', v_q, category)

In [97]:
evaluate(ground_truth, chunk_multiqa_knn)

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.6814685314685315, 'mrr': 0.3852813852813858}

In [102]:
def chunk_mpnet_knn(q):
    question = q['question']
    category = q['category']

    v_q = model.encode(question)

    return elastic_search_knn_test('chunk_vector', v_q, category)

In [103]:
evaluate(ground_truth, chunk_mpnet_knn)

  0%|          | 0/2860 [00:00<?, ?it/s]

{'hit_rate': 0.6391608391608392, 'mrr': 0.36160631035631086}