# Conversational Search Retrieval Augmented Generation

In this notebook you will implement the following steps:

- **Answer selection + evaluation**: Implement a *search-based* conversation framework evaluation framework to evaluate conversation topics made up of conversation turns.
- **Answer ranking**: Implement a *re-ranking method* to sort the initial search results. Evaluate the re-ranked results.
- **Conversation memory**: Implement a conversational context modeling method to keep track of the conversation state. 

Submission dates:
- **20 October**: first stage retrieval + conversation memory + evaluation
- **15 November**: re-ranking with LLM + evaluation
- **15 December**: answer generation + evaluation

## Test bed and conversation topics
The TREC CAST corpus (http://www.treccast.ai/) for Conversational Search is indexed in this cluster and available to be searched behind an OpenSearch API.

The queries and the relevance judgments are available through class `ConvSearchEvaluation`:

In [2]:
import TRECCASTeval as trec
import numpy as np
import pprint
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import OpenSearchSimpleAPI as osearch
import bm25s
import pandas as pd

nltk.download('stopwords')

pp = pprint.PrettyPrinter(indent=4)


test_bed = trec.ConvSearchEvaluation()


topics = {}
for topic in test_bed.train_topics:
    conv_id = topic['number']
    if conv_id not in (1, 2, 4, 7, 15, 17, 18, 22, 23, 24, 25, 27, 30):
        continue
    for turn in topic['turn']:
        turn_id = turn['number']
        utterance = turn['raw_utterance']
        topic_turn_id = '%d_%d' % (conv_id, turn_id)
        topics[topic_turn_id] = utterance

for topic in test_bed.test_topics:
    conv_id = topic['number']
    if conv_id not in (31, 32, 33, 34, 37, 40, 49, 50, 54, 56, 58, 59, 61, 67, 68, 69, 75, 77, 78, 79):
        continue
    for turn in topic['turn']:
        turn_id = turn['number']
        utterance = turn['raw_utterance']
        topic_turn_id = '%d_%d' % (conv_id, turn_id)
        topics[topic_turn_id] = utterance


opensearch = osearch.OSsimpleAPI()

numdocs = 100
test_query = topics['77_1']


opensearch_results = opensearch.search_body(test_query, numDocs=numdocs)


[nltk_data] Downloading package stopwords to /home/anna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'acknowledged': True, 'shards_acknowledged': True}

----------------------------------------------------------------------------------- INDEX SETTINGS
{'kwiz': {'settings': {'index': {'creation_date': '1728153198145',
                                 'knn': 'true',
                                 'number_of_replicas': '0',
                                 'number_of_shards': '1',
                                 'provided_name': 'kwiz',
                                 'refresh_interval': '-1',
                                 'similarity': {'default': {'lambda': '0.7',
                                                            'type': 'LMJelinekMercer'}},
                                 'uuid': 'qkpQ7pcwS7iT1IOTsfwRNg',
                                 'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'kwiz': {'mappings': {'properties': {'collection': {'type': 'keyword'},
                   

In [3]:

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

def expand_query(query):
    expanded_query = query 
    return expanded_query


query = test_query
processed_query = preprocess_text(query)
expanded_query = expand_query(processed_query)
print('Unprocessed query: ' + query)
print('Processed: ' + processed_query)
print('Expanded Query:', expanded_query)

Unprocessed query: What's the difference between soup and stew?
Processed: what differ soup stew
Expanded Query: what differ soup stew


## OpenSearch

In [4]:

corpus = []
doc_ids = []
content_to_id = {}  
for index, row in opensearch_results.iterrows():
    doc_id = row['_id']
    doc_body = opensearch.get_doc_body(doc_id)
    corpus.append(doc_body)
    doc_ids.append(doc_id)
    content_to_id[doc_body] = doc_id  

# BM25 retrieval
tokenized_query = bm25s.tokenize(expanded_query)
print("Tokenized Query:", tokenized_query)


retriever = bm25s.BM25(corpus=corpus)
retriever.index(bm25s.tokenize(corpus))

k = 100
bm25_results, bm25_scores = retriever.retrieve(tokenized_query, k=k)


bm25_doc_ids = [content_to_id[doc] for doc in bm25_results[0] if doc in content_to_id]


Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenized Query: Tokenized(ids=[[0, 1, 2, 3]], vocab={'what': 0, 'differ': 1, 'soup': 2, 'stew': 3})


Split strings:   0%|          | 0/100 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/100 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/100 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Search example:

In [5]:

# LMDRetriever
class LMDRetriever:
    def __init__(test_bed, opensearch, corpus_ids):
        test_bed.opensearch = opensearch
        test_bed.corpus_ids = corpus_ids
        test_bed.corpus_length = 0
        test_bed.doc_count = len(corpus_ids)
        test_bed.index = test_bed.build_index()
        test_bed.collection_frequency = test_bed.build_collection_frequency()
        test_bed.mu = test_bed.calculate_mu()
    
    def calculate_mu(test_bed):
        avg_doc_length = test_bed.corpus_length / test_bed.doc_count
        return 0.1 * avg_doc_length

    def build_index(test_bed):
        index = {}
        for doc_id in test_bed.corpus_ids:
            term_vectors = test_bed.opensearch.doc_term_vectors(doc_id)
            if term_vectors:
                terms = term_vectors[3]
                for term, stats in terms.items():
                    if term not in index:
                        index[term] = {}
                    index[term][doc_id] = stats[0]
                    test_bed.corpus_length += stats[0]
        return index

    def build_collection_frequency(test_bed):
        collection_frequency = {}
        for doc_id in test_bed.corpus_ids:
            term_vectors = test_bed.opensearch.doc_term_vectors(doc_id)
            if term_vectors:
                terms = term_vectors[3]
                for term, stats in terms.items():
                    if term not in collection_frequency:
                        collection_frequency[term] = 0
                    collection_frequency[term] += stats[2]
        return collection_frequency

    def score(test_bed, query, doc_id):
        score = 1.0
        term_vectors = test_bed.opensearch.doc_term_vectors(doc_id)
        if term_vectors:
            terms = term_vectors[3]
            doc_length = sum([stats[0] for stats in terms.values()])
            for term in query.split():
                tf = terms.get(term, [0])[0]
                cf = test_bed.collection_frequency.get(term, 0)
                p_ml = cf / test_bed.corpus_length
                p_lmd = (tf + test_bed.mu * p_ml) / (doc_length + test_bed.mu)
                if p_lmd > 0:
                    score *= p_lmd
        return score

    def retrieve(test_bed, query, k):
        scores = []
        for doc_id in test_bed.corpus_ids:
            score = test_bed.score(query, doc_id)
            scores.append((doc_id, score))
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        return scores[:k]
    
# LMDRetriever to rerank the BM25 results
lmd_retriever = LMDRetriever(opensearch=opensearch, corpus_ids=bm25_doc_ids)
reranked_results = lmd_retriever.retrieve(expanded_query, k=10)



In [7]:
data = []
best_docs = []
for rank, (doc_id, score) in enumerate(reranked_results, start=1):
    doc_content = opensearch.get_doc_body(doc_id)
    best_docs.append(str(doc_id))
    #print(f"Rank {rank} (score: {score:.6f}):\n{doc_content}\n")

result_df = pd.DataFrame({"turn" : "77_1", "query" : query, '_id' : best_docs })

print(result_df)

   turn                                         query  \
0  77_1  What's the difference between soup and stew?   

                                                 _id  
0  [MARCO_16539, MARCO_7312785, MARCO_4009632, MA...  


In [13]:
p10, recall, ap, ndcg5 = test_bed.eval(result_df, '77_1')

print(p10,recall, ap, ndcg5 )


TypeError: '<' not supported between instances of 'str' and 'list'