# Conversational Search Retrieval Augmented Generation

In this notebook you will implement the following steps:

- **Answer selection + evaluation**: Implement a *search-based* conversational framework and an evaluation framework to assess conversation topics, each made up of several conversation turns.
- **Answer ranking**: Implement a *re-ranking method* to sort the initial search results. Evaluate the re-ranked results.
- **Conversation memory**: Implement a conversational context modeling method to keep track of the conversation state. 

Submission dates:
- **20 October**: first stage retrieval + conversation memory + evaluation
- **15 November**: re-ranking with LLM + evaluation
- **15 December**: answer generation + evaluation

## Test bed and conversation topics
The TREC CAsT corpus (http://www.treccast.ai/) for Conversational Search is indexed in this cluster and can be searched through an OpenSearch API.

The queries and the relevance judgments are available through the `ConvSearchEvaluation` class:

In [1]:
import TRECCASTeval as trec
import numpy as np
import pprint
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import OpenSearchSimpleAPI as osearch
import bm25s
import pandas as pd

# Make sure the NLTK stop-word list is available locally.
nltk.download('stopwords')

pp = pprint.PrettyPrinter(indent=4)

# Gives access to the conversation topics and their relevance judgments.
test_bed = trec.ConvSearchEvaluation()

chosen_topic = 77
conversation = []  # turns of `chosen_topic`, in the order they are read
topics = {}        # "<conv_id>_<turn_id>" -> raw utterance

# Each entry pairs a topic collection with the conversation ids kept from it.
# The training collection is processed first, then the test collection.
topic_sources = (
    (test_bed.train_topics,
     (1, 2, 4, 7, 15, 17, 18, 22, 23, 24, 25, 27, 30)),
    (test_bed.test_topics,
     (31, 32, 33, 34, 37, 40, 49, 50, 54, 56, 58, 59, 61, 67, 68, 69, 75, 77, 78, 79)),
)

for topic_pool, kept_ids in topic_sources:
    for topic in topic_pool:
        conv_id = topic['number']
        if conv_id not in kept_ids:
            continue
        for turn in topic['turn']:
            turn_id = turn['number']
            utterance = turn['raw_utterance']
            topic_turn_id = '%d_%d' % (conv_id, turn_id)
            topics[topic_turn_id] = utterance
            # Only the selected conversation is replayed turn by turn later on.
            if conv_id == chosen_topic:
                conversation.append(
                    {"conv_id": conv_id, "turn_id": turn_id, "utterance": utterance}
                )


opensearch = osearch.OSsimpleAPI()

numdocs = 100


[nltk_data] Downloading package stopwords to /home/anna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'acknowledged': True, 'shards_acknowledged': True}

----------------------------------------------------------------------------------- INDEX SETTINGS
{'kwiz': {'settings': {'index': {'creation_date': '1728153198145',
                                 'knn': 'true',
                                 'number_of_replicas': '0',
                                 'number_of_shards': '1',
                                 'provided_name': 'kwiz',
                                 'refresh_interval': '-1',
                                 'similarity': {'default': {'lambda': '0.7',
                                                            'type': 'LMJelinekMercer'}},
                                 'uuid': 'qkpQ7pcwS7iT1IOTsfwRNg',
                                 'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'kwiz': {'mappings': {'properties': {'collection': {'type': 'keyword'},
                   

In [2]:


def BM_retrieval(k):
    """Run first-stage retrieval for every turn of the selected conversation.

    Reads the module-level ``conversation`` list, looks each turn's text up
    in ``topics``, sends it to ``opensearch.search_body`` and fetches the body
    of every returned passage with ``opensearch.get_doc_body``.

    NOTE(review): the name suggests BM25, but the ranking function is
    whatever similarity the OpenSearch index is configured with -- confirm.

    Args:
        k: Number of passages requested per turn (passed as ``numDocs``).

    Returns:
        A ``pandas.DataFrame`` with one row per turn and the columns
        ``turn`` (turn number as a string), ``query`` (raw utterance),
        ``expanded_query`` (the text that was actually searched; currently
        the same utterance), ``top passages`` (list of passage bodies) and
        ``_id`` (list of passage ids, in the order the search returned them).
    """
    rows = []
    for element in conversation:
        turn = str(element['turn_id'])
        # Same "<conv_id>_<turn_id>" key format used when `topics` was
        # filled; taking the id from the turn itself keeps the lookup
        # consistent with the contents of `conversation`.
        topic_turn_id = f"{element['conv_id']}_{turn}"
        query = topics[topic_turn_id]

        opensearch_results = opensearch.search_body(query, numDocs=k)

        best_docs = []
        best_passages = []
        for doc_id in opensearch_results['_id']:
            best_docs.append(doc_id)
            best_passages.append(opensearch.get_doc_body(doc_id))

        rows.append({
            'turn': turn,
            'query': element["utterance"],
            "expanded_query": query,
            'top passages': best_passages,
            '_id': best_docs,
        })

    return pd.DataFrame(rows)


# Quick look at the result: top 3 passages for each turn.
print(BM_retrieval(3))


  turn                                           query  \
0    1    What's the difference between soup and stew?   
1    2                               Is chilli a stew?   
2    3                              How about goulash?   
3    4                What are popular ones in France?   
4    5                          How is cassoulet made?   
5    6    Tell me about feijoada and its significance.   
6    7  How is it similar or different from cassoulet?   
7    8                          Tell about Bigos stew.   
8    9                            Why is it important?   
9   10              What is the history of Irish stew?   

                                   expanded_query  \
0    What's the difference between soup and stew?   
1                               Is chilli a stew?   
2                              How about goulash?   
3                What are popular ones in France?   
4                          How is cassoulet made?   
5    Tell me about feijoada and its signifi

Re-ranking example (language model with Dirichlet smoothing applied to the first-stage results):

In [3]:
# LMDRetriever
class LMDRetriever:
    """Re-rank a fixed set of documents with a Dirichlet-smoothed language model.

    The "collection" is the set of documents named by ``corpus_ids``; term
    statistics are read through ``opensearch.doc_term_vectors``.

    NOTE(review): the layout of a term-vector record is assumed, not checked
    here: element 3 is used as a ``{term: stats}`` mapping, ``stats[0]`` as
    the in-document term frequency and ``stats[2]`` as the term's
    contribution to the collection frequency -- confirm against
    ``OpenSearchSimpleAPI``.

    NOTE(review): queries are split on whitespace and matched verbatim
    against the stored terms (no lower-casing, punctuation stripping or
    stemming); if the index analyzer normalizes terms, tokens such as
    ``"stew?"`` will never match -- confirm and analyze the query the same way.
    """

    def __init__(self, opensearch, corpus_ids):
        """Collect the term statistics of ``corpus_ids``.

        Args:
            opensearch: Client exposing ``doc_term_vectors(doc_id)``.
            corpus_ids: Ids of the documents to score. May be empty.
        """
        self.opensearch = opensearch
        self.corpus_ids = corpus_ids
        self.corpus_length = 0
        self.doc_count = len(corpus_ids)
        # doc_id -> {term: stats}, or None when the document has no term
        # vectors. Each document is requested from OpenSearch at most once.
        self._terms_cache = {}
        self.index = self.build_index()
        self.collection_frequency = self.build_collection_frequency()
        self.mu = self.calculate_mu()

    def _doc_terms(self, doc_id):
        """Return the cached ``{term: stats}`` mapping of ``doc_id`` (or None)."""
        if doc_id not in self._terms_cache:
            term_vectors = self.opensearch.doc_term_vectors(doc_id)
            self._terms_cache[doc_id] = term_vectors[3] if term_vectors else None
        return self._terms_cache[doc_id]

    def calculate_mu(self):
        """Return the smoothing parameter: 10% of the average document length.

        Returns 0.0 for an empty corpus instead of dividing by zero.
        """
        if self.doc_count == 0:
            return 0.0
        avg_doc_length = self.corpus_length / self.doc_count
        return 0.1 * avg_doc_length

    def build_index(self):
        """Build ``{term: {doc_id: stats[0]}}`` and recompute ``corpus_length``."""
        # Reset so that calling this method again does not double-count.
        self.corpus_length = 0
        index = {}
        for doc_id in self.corpus_ids:
            terms = self._doc_terms(doc_id)
            if terms is None:
                continue
            for term, stats in terms.items():
                index.setdefault(term, {})[doc_id] = stats[0]
                self.corpus_length += stats[0]
        return index

    def build_collection_frequency(self):
        """Return ``{term: sum of stats[2] over the corpus documents}``."""
        collection_frequency = {}
        for doc_id in self.corpus_ids:
            terms = self._doc_terms(doc_id)
            if terms is None:
                continue
            for term, stats in terms.items():
                collection_frequency[term] = (
                    collection_frequency.get(term, 0) + stats[2]
                )
        return collection_frequency

    def score(self, query, doc_id):
        """Return the query likelihood of ``query`` under ``doc_id``'s model.

        The score is the product, over whitespace-separated query tokens, of
        ``(tf + mu * p_ml) / (doc_length + mu)`` with
        ``p_ml = cf / corpus_length``. Tokens whose smoothed probability is
        not positive are skipped rather than zeroing the whole product.

        Returns 0.0 when the document has no term vectors, or when it is
        empty and ``mu`` is 0, so that such documents rank last instead of
        receiving the neutral product 1.0 (or raising ``ZeroDivisionError``).

        NOTE(review): a plain product can underflow for very long queries;
        switch to a sum of logs if expanded queries grow large.
        """
        terms = self._doc_terms(doc_id)
        if terms is None:
            return 0.0

        doc_length = sum(stats[0] for stats in terms.values())
        denominator = doc_length + self.mu
        if denominator == 0:
            return 0.0

        score = 1.0
        for term in query.split():
            tf = terms.get(term, [0])[0]
            cf = self.collection_frequency.get(term, 0)
            p_ml = cf / self.corpus_length if self.corpus_length else 0.0
            p_lmd = (tf + self.mu * p_ml) / denominator
            if p_lmd > 0:
                score *= p_lmd
        return score

    def retrieve(self, query, k):
        """Return the ``k`` best ``(doc_id, score)`` pairs, best first.

        Ties keep the order of ``corpus_ids`` (the sort is stable).
        """
        scored = [(doc_id, self.score(query, doc_id)) for doc_id in self.corpus_ids]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:k]

# First-stage candidates: the top 100 passages for every turn.
bm25_results = BM_retrieval(100)

# One dict per turn; the DataFrame is built once after the loop instead of
# growing it with pd.concat on every iteration.
reranked_rows = []

for index, row in bm25_results.iterrows():
    bm25_doc_ids = row["_id"]
    turn = row["turn"]
    query = row["expanded_query"]

    # The re-ranker's "collection" is just this turn's candidate passages.
    lmd_retriever = LMDRetriever(opensearch=opensearch, corpus_ids=bm25_doc_ids)

    # Re-score the candidates and keep up to 100 of them, best first.
    reranked_results = lmd_retriever.retrieve(query, k=100)

    # Only the passage ids are kept; the scores are dropped.
    top_N_passages = [doc_id for doc_id, score in reranked_results]

    reranked_rows.append({"turn": turn, "query": query, "_id": top_N_passages})

# Passing `columns` keeps the expected layout even when there are no turns.
reranked_df = pd.DataFrame(reranked_rows, columns=["turn", "query", "_id"])

# Show the final DataFrame.
print(reranked_df)


  turn                                           query  \
0    1    What's the difference between soup and stew?   
1    2                               Is chilli a stew?   
2    3                              How about goulash?   
3    4                What are popular ones in France?   
4    5                          How is cassoulet made?   
5    6    Tell me about feijoada and its significance.   
6    7  How is it similar or different from cassoulet?   
7    8                          Tell about Bigos stew.   
8    9                            Why is it important?   
9   10              What is the history of Irish stew?   

                                                 _id  
0  [MARCO_16539, MARCO_7312785, MARCO_4009632, MA...  
1  [CAR_e0559ce4c6079c402ca8c187105fc3099decb965,...  
2  [CAR_33deda658ce5f81b4ffca6fd9be8efbc60436928,...  
3  [CAR_e7d160e7a8dde88710838eb8dd6dfc7898a02d4f,...  
4  [CAR_c44cfd57976b8ef0ee9f5d1b3446d7b9e3106c84,...  
5  [CAR_5a75fe18050cc4ebdb9a030

In [4]:
# Per-turn evaluation results of the re-ranked lists (parallel lists).
turns = []
LMD_ap_values = []
LMD_ndcg_values = []
LMD_precision_values = []
LMD_recall_values = []

print("LMD")

for index, row in reranked_df.iterrows():
    # Built outside the `try` so the error message can always name the turn,
    # and derived from `chosen_topic` rather than a hard-coded topic id.
    turn = f"{chosen_topic}_{row['turn']}"
    docs = row['_id']

    try:
        result_df = pd.DataFrame({"_id": docs})
        p10, recall, ap, ndcg5 = test_bed.eval(result_df, turn)
    except Exception as e:
        # Stop at the first turn that cannot be evaluated.
        print(f"Erreur sur le tour {turn}: {e}")
        break

    turns.append(turn)
    LMD_ap_values.append(ap)
    LMD_ndcg_values.append(ndcg5)
    LMD_precision_values.append(p10)
    LMD_recall_values.append(recall)

    print(f"Turn: {turn}")
    print(f"P@10: {p10}, Recall: {recall}, AP: {ap}, NDCG@5: {ndcg5}\n")

print(turns,LMD_ap_values,LMD_ndcg_values,LMD_precision_values, LMD_recall_values)


LMD
Turn: 77_1
P@10: 0.6, Recall: 0.6153846153846154, AP: 0.21890349531082884, NDCG@5: 0.1599902354470432

Turn: 77_2
P@10: 0.0, Recall: 0.4, AP: 0.009639830508474575, NDCG@5: 0.0

Turn: 77_3
P@10: 0.3, Recall: 0.7435897435897436, AP: 0.2410340073882552, NDCG@5: 0.10157051007554901

Turn: 77_4
P@10: 0.1, Recall: 0.20833333333333334, AP: 0.02203193303734692, NDCG@5: 0.25326335410655304

Turn: 77_5
P@10: 0.6, Recall: 0.9444444444444444, AP: 0.4116411106384749, NDCG@5: 0.25064969555809496

Turn: 77_6
P@10: 0.2, Recall: 0.125, AP: 0.03187957875457875, NDCG@5: 0.18322608909137006

Turn: 77_7
P@10: 0.2, Recall: 0.22857142857142856, AP: 0.03654396991052007, NDCG@5: 0.10907458963273321

Turn: 77_8
P@10: 0.2, Recall: 0.84375, AP: 0.20964821424602653, NDCG@5: 0.0

Turn: 77_9
P@10: 0, Recall: 0, AP: 0, NDCG@5: 0

Turn: 77_10
P@10: 0, Recall: 0, AP: 0, NDCG@5: 0

['77_1', '77_2', '77_3', '77_4', '77_5', '77_6', '77_7', '77_8', '77_9', '77_10'] [0.21890349531082884, 0.009639830508474575, 0.24103400

In [None]:
import matplotlib.pyplot as plt

# Drop the turns whose scores are all zero before plotting.

# A turn is kept for the AP / NDCG@5 plot when at least one of the two is non-zero.
ranking_points = [
    (turn_label, ap_score, ndcg_score)
    for turn_label, ap_score, ndcg_score in zip(turns, LMD_ap_values, LMD_ndcg_values)
    if ap_score != 0 or ndcg_score != 0
]
filtered_turns = [point[0] for point in ranking_points]
filtered_ap_values = [point[1] for point in ranking_points]
filtered_ndcg_values = [point[2] for point in ranking_points]

# A turn is kept for the precision / recall plot when at least one of the two
# is non-zero (avoids drawing an isolated point at the origin).
pr_points = [
    (precision_score, recall_score)
    for _, precision_score, recall_score in zip(turns, LMD_precision_values, LMD_recall_values)
    if precision_score != 0 or recall_score != 0
]
filtered_precision_values = [point[0] for point in pr_points]
filtered_recall_values = [point[1] for point in pr_points]

# Turn on interactive mode for more responsive figures.
plt.ion()

# 1. AP and NDCG@5 across the conversation turns.
plt.figure(figsize=(10, 6))
plt.plot(filtered_turns, filtered_ap_values, marker='o', label='AP', linestyle='-')
plt.plot(filtered_turns, filtered_ndcg_values, marker='x', label='NDCG@5', linestyle='-')
plt.title("Évolution de l'AP et du NDCG@5 au fil des tours de conversation")
plt.xlabel("Tours de conversation")
plt.ylabel("Scores")
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.show()


# 2. Precision plotted against recall, one point per kept turn.
plt.figure(figsize=(10, 6))
plt.plot(filtered_recall_values, filtered_precision_values, marker='o', label='Precision-Recall')
plt.title("Precision-Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True)
plt.show()

