#### Uncomment the `nltk.download('stopwords')` line below if you haven't downloaded the NLTK stopwords corpus yet

In [2]:
import TRECCASTeval as trec
import numpy as np
import pprint
import pandas as pd
import OpenSearchSimpleAPI as osearch
import pprint as pp

import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [6]:
# Pretty-printer with 4-space indentation. Note that this rebinds `pp`, which
# the import cell had bound to the `pprint` module itself (`import pprint as pp`).
pp = pprint.PrettyPrinter(indent=4)

# Evaluation helper from the project's TRECCASTeval module; later cells read
# its `train_topics` / `test_topics` and call its `eval` method.
test_bed = trec.ConvSearchEvaluation()

# Initialize stop words and stemmer
# Both are module-level globals read by preprocess_text().
stemmer = PorterStemmer()
# A set, so the per-word stopword membership test does not scan a list.
stop_words = set(stopwords.words('english'))

### `preprocess_text`: normalizing and tokenizing the text
This function takes in raw text (usually conversational utterances) and performs the following:

- Converts the text to lowercase.
- Removes non-alphanumeric characters (punctuation, symbols).
- Removes common stopwords using the NLTK stopwords list.
- Stems each word using the Porter stemmer to reduce words to their base forms.

In [7]:
def preprocess_text(text):
    """Normalize an utterance into a space-separated string of stemmed terms.

    The text is lowercased, every character that is not a lowercase letter,
    a digit or whitespace is deleted, words found in the module-level
    `stop_words` set are dropped, and each remaining word is passed through
    the module-level `stemmer`.
    """
    # Lowercase first so the character class below only has to allow a-z.
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # Stopwords are filtered before stemming, i.e. on the unstemmed word.
    kept_words = [token for token in cleaned.split() if token not in stop_words]
    stemmed_words = [stemmer.stem(token) for token in kept_words]
    return ' '.join(stemmed_words)

### Training and Test Data Processing

Here, we iterate over the training and testing topics provided by the test_bed. We filter out some conversation IDs based on predefined criteria, and then process each utterance in the conversation using our preprocess_text function. The preprocessed utterances are accumulated over turns of the conversation to simulate a growing context.

Key Variables:
- **previous_query_tokenized**: Keeps track of the concatenated previous utterances to simulate a conversation history.
- **topics**: Maps each turn's key (`<conv_id>_<turn_id>`) to the query text for that turn: the preprocessed utterances of all previous turns followed by the current raw utterance.

(Printing of the queries is optional and used mostly for debug purposes)

In [15]:
# Set to True to print every accumulated query while `topics` is built (debug aid)
print_queries = False

# Conversation whose raw turns are collected into `conversation`
chosen_topic = 77
conversation = []

# Maps '<conv_id>_<turn_id>' to the query text used for that turn
topics = {}

# Only these conversation ids are processed in each split; all others are skipped
train_conv_ids = frozenset((1, 2, 4, 7, 15, 17, 18, 22, 23, 24, 25, 27, 30))
test_conv_ids = frozenset((31, 32, 33, 34, 37, 40, 49, 50, 54, 56, 58, 59, 61,
                           67, 68, 69, 75, 77, 78, 79))


def _collect_queries(split_topics, kept_ids, show_titles):
    """Fill `topics` (and `conversation`) from one split of the test bed.

    For every kept conversation, the query stored for a turn is the
    preprocessed text of all earlier turns followed by the current raw
    utterance. Turns of the conversation `chosen_topic` are additionally
    appended, unprocessed, to `conversation`.

    Args:
        split_topics: iterable of topic dicts with 'number', 'title' and
            'turn' keys; each turn dict has 'number' and 'raw_utterance'.
        kept_ids: conversation ids to process; other topics are skipped.
        show_titles: when `print_queries` is on, also print a blank line and
            the conversation id and title before its turns.
    """
    for topic in split_topics:
        conv_id = topic['number']
        if conv_id not in kept_ids:
            continue

        if print_queries and show_titles:
            print()
            print(conv_id, "  ", topic['title'])

        # Preprocessed text of the turns seen so far in this conversation
        previous_query_tokenized = ''
        for turn in topic['turn']:
            turn_id = turn['number']
            utterance = turn['raw_utterance']
            # History is preprocessed, the current utterance is kept raw
            updated_utterance = previous_query_tokenized + utterance
            previous_query_tokenized += preprocess_text(utterance) + ' '
            topic_turn_id = '%d_%d' % (conv_id, turn_id)

            if print_queries:
                print(topic_turn_id, updated_utterance)
            topics[topic_turn_id] = updated_utterance
            if conv_id == chosen_topic:
                conversation.append({"conv_id": conv_id, "turn_id": turn_id, "utterance": utterance})


if print_queries:
    print("========================================== Training conversations =====")
_collect_queries(test_bed.train_topics, train_conv_ids, show_titles=True)

if print_queries:
    print()
    print("========================================== Test conversations =====")
# Titles were never printed for the test split, so keep them off here
_collect_queries(test_bed.test_topics, test_conv_ids, show_titles=False)

print(conversation)

[{'conv_id': 77, 'turn_id': 1, 'utterance': "What's the difference between soup and stew?"}, {'conv_id': 77, 'turn_id': 2, 'utterance': 'Is chilli a stew?'}, {'conv_id': 77, 'turn_id': 3, 'utterance': 'How about goulash?'}, {'conv_id': 77, 'turn_id': 4, 'utterance': 'What are popular ones in France?'}, {'conv_id': 77, 'turn_id': 5, 'utterance': 'How is cassoulet made?'}, {'conv_id': 77, 'turn_id': 6, 'utterance': 'Tell me about feijoada and its significance.'}, {'conv_id': 77, 'turn_id': 7, 'utterance': 'How is it similar or different from cassoulet?'}, {'conv_id': 77, 'turn_id': 8, 'utterance': 'Tell about Bigos stew.'}, {'conv_id': 77, 'turn_id': 9, 'utterance': 'Why is it important?'}, {'conv_id': 77, 'turn_id': 10, 'utterance': 'What is the history of Irish stew?'}]


# OpenSearch implementation

### Setup

The OpenSearch API is initialized, confirming index creation with the following settings:

**Index name**: kwiz   
**Similarity**: BM25 for text ranking and LM Jelinek-Mercer for smoothing (λ=0.7)   
**Shards**: 1 shard, no replicas   
**Documents**: 23,596 documents indexed   
**k-NN enabled**: Sentence embeddings available for vector-based queries

In [9]:
# Instantiate the project's OpenSearch wrapper. Per the cell output below,
# construction reports the settings and mappings of the 'kwiz' index.
opensearch = osearch.OSsimpleAPI()

{'acknowledged': True, 'shards_acknowledged': True}

----------------------------------------------------------------------------------- INDEX SETTINGS
{'kwiz': {'settings': {'index': {'creation_date': '1728153198145',
                                 'knn': 'true',
                                 'number_of_replicas': '0',
                                 'number_of_shards': '1',
                                 'provided_name': 'kwiz',
                                 'refresh_interval': '-1',
                                 'similarity': {'default': {'lambda': '0.7',
                                                            'type': 'LMJelinekMercer'}},
                                 'uuid': 'qkpQ7pcwS7iT1IOTsfwRNg',
                                 'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'kwiz': {'mappings': {'properties': {'collection': {'type': 'keyword'},
                   

We conduct a test search using a single preprocessed query (61_7) to retrieve the top 100 documents from the OpenSearch API. This helps verify if the query is functioning correctly and if we receive results as expected.

The results of the OpenSearch query are printed to ensure that the API returns valid documents.

In [10]:
# Number of hits requested for the smoke-test query
numdocs = 100

# Query text stored for conversation 61, turn 7 (preprocessed history + raw utterance)
test_query = topics['61_7']

# The printed result is a pandas-style table; the output below shows the
# columns _index, _type, _id, _score and _source.contents.
opensearch_results = opensearch.search_body(test_query, numDocs = numdocs)
print(opensearch_results)

   _index _type                                           _id     _score  \
0    kwiz  _doc  CAR_54ddfb93ad52e7e7bdf960f5cd3164f683eb757b  42.747547   
1    kwiz  _doc  CAR_4b18b521b30a9d32d2c2852b05a5fffce336ca4e  39.716260   
2    kwiz  _doc  CAR_db3beebe1d9e72b74daeec818f076a1e6a794b9d  36.619880   
3    kwiz  _doc                                 MARCO_3765773  36.438580   
4    kwiz  _doc  CAR_56f5109e7dcc45e4bcf50cbc789a3fff94ab1575  35.619743   
..    ...   ...                                           ...        ...   
95   kwiz  _doc                                 MARCO_6139465  25.450195   
96   kwiz  _doc  CAR_613140b2eab12517d1da86bb42d2688934a3d4e1  25.309765   
97   kwiz  _doc                                 MARCO_8019905  25.259228   
98   kwiz  _doc  CAR_d8c0ddb5a2cec36eec0eb592c845665ee060e847  25.202800   
99   kwiz  _doc                                 MARCO_8344507  25.190685   

                                     _source.contents  \
0   The Justice League is a fi

## BM25-based Retrieval

This section performs document retrieval using the BM25 ranking algorithm for every turn of the chosen conversation, using the queries stored in `topics`.   
For each query, the top *k* documents (3 in the call below) are retrieved from OpenSearch and, for each of those documents, we extract the body (passage) and its ID.   
The results are stored in a Pandas DataFrame for easier visualization and analysis.

In [23]:

def BM_retrieval(k):
    """Retrieve the top-k passages for every turn of the collected conversation.

    Each entry of the module-level `conversation` list is mapped to its
    query in `topics`, the query is sent to OpenSearch, and the body of
    every returned hit is fetched by document id.

    Args:
        k: number of documents requested per query.

    Returns:
        pandas.DataFrame with one row per turn and the columns 'turn',
        'query' (the raw utterance), 'top passages' and 'doc ids'. The
        frame is also printed, as it was before a value was returned.
    """
    BM25data = []
    for element in conversation:
        turn = str(element['turn_id'])
        # Build the '<conv_id>_<turn_id>' key from the element itself so the
        # query always belongs to the utterance reported in the same row,
        # even if `chosen_topic` is changed after `conversation` was filled.
        query = topics[str(element['conv_id']) + '_' + turn]

        opensearch_results = opensearch.search_body(query, numDocs=k)
        best_docs = []
        best_passages = []
        for _, row in opensearch_results.iterrows():
            doc_id = row['_id']
            best_docs.append(doc_id)
            best_passages.append(opensearch.get_doc_body(doc_id))
        BM25data.append({'turn': turn, 'query': element["utterance"], 'top passages': best_passages, 'doc ids': best_docs})

    df = pd.DataFrame(BM25data)
    print(df)
    return df


# Keep the frame at module level: the evaluation cell further down reads `df`.
# The function already prints it, so no extra print() here.
df = BM_retrieval(3)

  turn                                           query  \
0    1    What's the difference between soup and stew?   
1    2                               Is chilli a stew?   
2    3                              How about goulash?   
3    4                What are popular ones in France?   
4    5                          How is cassoulet made?   
5    6    Tell me about feijoada and its significance.   
6    7  How is it similar or different from cassoulet?   
7    8                          Tell about Bigos stew.   
8    9                            Why is it important?   
9   10              What is the history of Irish stew?   

                                        top passages  \
0  [What's the difference between a soup and a st...   
1  [Specifically, I've always had some confusion ...   
2  [1 Stew is more of a meat and vegetables dish ...   
3  [Goulash is a rustic stew or soup made made wi...   
4  [Goulash is a rustic stew or soup made with be...   
5  [Specifically, I've al

In [None]:
# Score the results for turn '77_1' with the test bed; four values come back
# (presumably P@10, recall, average precision and nDCG@5, going by the names
# -- TODO confirm against TRECCASTeval).
# NOTE(review): this reads a module-level `df`; confirm an earlier cell
# assigns it and that its layout is what test_bed.eval expects.
p10, recall, ap, ndcg5 = test_bed.eval(df, '77_1')

print(p10,recall, ap, ndcg5 )