In [19]:
import TRECCASTeval as trec
import numpy as np
import pprint
import pandas as pd
import OpenSearchSimpleAPI as osearch
import pprint as pp

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /home/rick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# --- Text-normalisation resources (read as globals by preprocess_text) ---
# English stop-word list from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Porter stemmer instance used to reduce each kept token to its stem.
stemmer = PorterStemmer()

# --- Evaluation harness ---
# Evaluation object from the TRECCASTeval module; later cells read its
# train_topics, test_topics and test_relevance_judgments attributes.
test_bed = trec.ConvSearchEvaluation()

# NOTE(review): this rebinds `pp`, which the import cell also defines as an
# alias of the pprint module (`import pprint as pp`). From here on `pp` is a
# PrettyPrinter instance with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

In [21]:
def preprocess_text(text):
    """Normalise a piece of text into a space-separated string of stems.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a-z, 0-9 or whitespace
         (punctuation is removed, not replaced by a space);
      3. split on whitespace and drop tokens found in the module-level
         ``stop_words`` set;
      4. stem each remaining token with the module-level ``stemmer``.

    Args:
        text: The string to normalise.

    Returns:
        The surviving stems joined by single spaces ('' if none survive).
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^a-z0-9\s]', '', lowered)

    stems = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [31]:
# Change if you want to see the tokenized queries
print_queries = True

# Topic numbers of the conversations kept from the training and test sets.
TRAIN_CONV_IDS = frozenset({1, 2, 4, 7, 15, 17, 18, 22, 23, 24, 25, 27, 30})
TEST_CONV_IDS = frozenset({31, 32, 33, 34, 37, 40, 49, 50, 54, 56, 58, 59, 61, 67, 68, 69, 75, 77, 78, 79})


def build_context_queries(conversations, selected_ids, show_titles=False, verbose=False):
    """Build one context-expanded query per turn of the selected conversations.

    For every turn, the query is the preprocessed text of all *previous*
    utterances of the same conversation (each followed by a space) with the
    raw current utterance appended. The first turn of a conversation is
    therefore just its raw utterance.

    Args:
        conversations: Iterable of topic dicts providing 'number', 'title'
            and 'turn'; each turn provides 'number' and 'raw_utterance'.
        selected_ids: Container of topic numbers to keep; others are skipped.
        show_titles: When True (and ``verbose``), print a blank line and the
            conversation number/title before its turns.
        verbose: When True, print every generated '<conv>_<turn>' id and query.

    Returns:
        dict mapping '<conv>_<turn>' ids to query strings, in iteration order.
    """
    queries = {}
    for topic in conversations:
        conv_id = topic['number']
        if conv_id not in selected_ids:
            continue

        if verbose and show_titles:
            print()
            print(conv_id, "  ", topic['title'])

        # Preprocessed text of the turns seen so far in this conversation.
        previous_query_tokenized = ''
        for turn in topic['turn']:
            turn_id = turn['number']
            utterance = turn['raw_utterance']
            updated_utterance = previous_query_tokenized + utterance
            previous_query_tokenized += preprocess_text(utterance) + ' '
            topic_turn_id = '%d_%d' % (conv_id, turn_id)

            if verbose:
                print(topic_turn_id, updated_utterance)
            queries[topic_turn_id] = updated_utterance
    return queries


# Query text for every selected turn, training conversations first.
topics = {}

if print_queries:
    print("========================================== Training conversations =====")
topics.update(
    build_context_queries(test_bed.train_topics, TRAIN_CONV_IDS, show_titles=True, verbose=print_queries)
)

if print_queries:
    print()
    print("========================================== Test conversations =====")
topics.update(
    build_context_queries(test_bed.test_topics, TEST_CONV_IDS, verbose=print_queries)
)

# Last expression of the cell: displays the test relevance judgments.
test_bed.test_relevance_judgments


1    Career choice for Nursing and Physician's Assistant
1_1 What is a physician's assistant?
1_2 physician assist What are the educational requirements required to become one?
1_3 physician assist educ requir requir becom one What does it cost?
1_4 physician assist educ requir requir becom one cost What's the average starting salary in the UK?
1_5 physician assist educ requir requir becom one cost what averag start salari uk What about in the US?
1_6 physician assist educ requir requir becom one cost what averag start salari uk us What school subjects are needed to become a registered nurse?
1_7 physician assist educ requir requir becom one cost what averag start salari uk us school subject need becom regist nurs What is the PA average salary vs an RN?
1_8 physician assist educ requir requir becom one cost what averag start salari uk us school subject need becom regist nurs pa averag salari vs rn What the difference between a PA and a nurse practitioner?
1_9 physician assist educ req

Unnamed: 0,topic_turn_id,dummy,docid,rel
0,31_1,Q0,CAR_116d829c4c800c2fc70f11692fec5e8c7e975250,0
1,31_1,Q0,CAR_1463f964653c5c9f614a0a88d26b175e4a8120f1,1
2,31_1,Q0,CAR_172e16e89ea3d5546e53384a27c3be299bcfe968,2
3,31_1,Q0,CAR_1c93ef499a0c2856c4a857b0cb4720c380dda476,0
4,31_1,Q0,CAR_2174ad0aa50712ff24035c23f59a3c2b43267650,3
...,...,...,...,...
29345,79_9,Q0,MARCO_8795229,0
29346,79_9,Q0,MARCO_8795231,0
29347,79_9,Q0,MARCO_8795233,0
29348,79_9,Q0,MARCO_8795236,0


# OpenSearch implementation

In [23]:
# Create the OpenSearch client wrapper. Judging by the cell output, constructing
# it also prints the settings and mappings of the index it is connected to.
opensearch = osearch.OSsimpleAPI()


# Number of hits to request for the single sample query below.
numdocs = 3
# Query built earlier for conversation 34, turn 3 (keys are '<conv>_<turn>').
test_query = topics['34_3']

{'acknowledged': True, 'shards_acknowledged': True}

----------------------------------------------------------------------------------- INDEX SETTINGS
{'kwiz': {'settings': {'index': {'creation_date': '1728153198145',
                                 'knn': 'true',
                                 'number_of_replicas': '0',
                                 'number_of_shards': '1',
                                 'provided_name': 'kwiz',
                                 'refresh_interval': '-1',
                                 'similarity': {'default': {'lambda': '0.7',
                                                            'type': 'LMJelinekMercer'}},
                                 'uuid': 'qkpQ7pcwS7iT1IOTsfwRNg',
                                 'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'kwiz': {'mappings': {'properties': {'collection': {'type': 'keyword'},
                   

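#### The index settings above show that OpenSearch already ranks passages with a built-in similarity (configured here as `LMJelinekMercer` with lambda = 0.7, rather than BM25), so we won't be implementing a ranking function ourselves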

#### Now, search_body will scour opensearch's corpus and rank the best passages found using the settings above

In [24]:
# Run the sample query, requesting `numdocs` hits. The printed output shows a
# table with one row per hit (columns such as `_id`, `_score`, `_source.contents`).
opensearch_results = opensearch.search_body(test_query, numDocs = numdocs)
# NOTE(review): ending the cell with a bare `opensearch_results` would give the
# richer notebook rendering instead of the plain-text print.
print(opensearch_results)

  _index _type            _id     _score  \
0   kwiz  _doc  MARCO_2934667  22.235218   
1   kwiz  _doc  MARCO_6161702  22.137203   
2   kwiz  _doc  MARCO_4322101  19.237360   

                                    _source.contents    _source.doc  \
0  Causes of bone pain. There are also some other...  MARCO_2934667   
1  The following medical conditions are some of t...  MARCO_6161702   
2  The following medical conditions are some of t...  MARCO_4322101   

  _source.collection  
0            msmarco  
1            msmarco  
2            msmarco  


#### Building one record per turn (turn id; query; top N passages; their doc ids), with N = 3

In [30]:
# Number of passages retrieved for every conversational query.
k = 3

# One record per turn: the query that was sent, plus the ids and bodies of the
# hits returned for it, in the order the search returned them.
data = []
for topic_turn_id, query in topics.items():
    opensearch_results = opensearch.search_body(query, numDocs = k)

    # Only the `_id` column is needed, so read it directly instead of building
    # a Series for every row with iterrows(). A result without that column
    # (assumed to mean "no hits" - TODO confirm) yields empty lists, which is
    # what looping over an empty frame produced before.
    if '_id' in opensearch_results.columns:
        best_docs = list(opensearch_results['_id'])
    else:
        best_docs = []

    # Fetch the passage text for each hit, in ranking order.
    best_passages = [opensearch.get_doc_body(doc_id) for doc_id in best_docs]

    data.append({'turn': topic_turn_id, 'query': query, 'top passages': best_passages, 'doc ids': best_docs})

df = pd.DataFrame(data)
print(df)


     turn                                              query  \
0     1_1                   What is a physician's assistant?   
1     1_2  physician assist What are the educational requ...   
2     1_3  physician assist educ requir requir becom one ...   
3     1_4  physician assist educ requir requir becom one ...   
4     1_5  physician assist educ requir requir becom one ...   
..    ...                                                ...   
309  79_5  taught sociolog main contribut august comt rol...   
310  79_6  taught sociolog main contribut august comt rol...   
311  79_7  taught sociolog main contribut august comt rol...   
312  79_8  taught sociolog main contribut august comt rol...   
313  79_9  taught sociolog main contribut august comt rol...   

                                          top passages  \
0    [What is the difference between a medical assi...   
1    [What Education Do I Need for a Career As a Ph...   
2    [NEW: Follow this link to view the updated 201...   