In [1]:
import pandas as pd
from uuid import uuid4

# Read the transcript so that every line of the file becomes one row.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (assumes the transcript is UTF-8 -- TODO confirm).
with open("./Datasets/Group 1_Older Wearers - Rev Transcript.txt", "r", encoding="utf-8") as f:
    transcript_lines = f.readlines()

input_data = pd.DataFrame({"text": transcript_lines})
# One random uuid4 per line; the ids are regenerated on every run of this cell.
input_data["text_id"] = [uuid4() for _ in range(len(input_data))]

input_data #make sure data has [text, text_id] columns

Unnamed: 0,text,text_id
0,Moderator ():\n,a9dcf230-8c00-48e6-89dd-064367a194ba
1,... in here. I realize we're starting a few mi...,d2851d5d-4639-43d9-a033-3533faaf9061
2,I come around the country and I talk to people...,a8f277c9-e9d5-40b7-8bcd-5cd12b8412d8
3,"guys, so that's kind of a nice time for me. Gu...",59c75244-da06-4d15-9204-8373373cad71
4,then just talk about different brands and type...,ebd36c3e-714a-4ce5-a8fe-9890b32a60ee
...,...,...
1820,Because people don't... If they don't see it e...,96cbd583-0d89-4eb4-a445-59b4cf8ec61b
1821,look for it unless they see it all the time. I...,b72f16d2-9a1f-4313-b2b2-b265301bd91a
1822,think they need to push yourself up front. I l...,24a8978c-7da9-4c04-b651-46cca21be2ee
1823,Moderator ():\n,7c0ca282-fb74-498a-9147-e00287568c5f


In [2]:
from pipeline import DocumentRetrievalPipeline

# Create the retrieval pipeline with its default settings. The logged output of
# this cell shows that construction loads the `paraphrase-MiniLM-L6-v2`
# SentenceTransformer model.
pipeline = DocumentRetrievalPipeline()

  from tqdm.autonotebook import tqdm, trange

2024-08-14 22:09:17,670 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cuda
2024-08-14 22:09:17,671 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2


In [3]:
# The index builder is given only a plain list of texts, not the DataFrame.
transcript_texts = input_data["text"].tolist()
pipeline.build_index(transcript_texts)

2024-08-14 22:09:21,122 - root - INFO - Building indices using 2 index builders ...
2024-08-14 22:09:21,125 - root - INFO - Building index with TFIDFIndexBuilder on 1825 documents ...
2024-08-14 22:09:21,173 - root - INFO - Building index with DocumentRetriever on 1825 documents ...
Batches: 100%|██████████| 58/58 [00:01<00:00, 37.07it/s]
Batches: 100%|██████████| 532/532 [00:06<00:00, 86.11it/s]
2024-08-14 22:09:31,108 - root - INFO - Building document index ...
2024-08-14 22:09:31,181 - root - INFO - Built index for 1825 documents | 1825 keyword pairs extracted
2024-08-14 22:09:31,182 - root - INFO - Building inverse index ...
2024-08-14 22:09:31,187 - root - INFO - Built inverse index for 2672 keywords



In [15]:
def sentence_lookup(texts, files):
    """Map each text in ``texts`` to its ``text_id`` from ``files``.

    Parameters
    ----------
    texts : list of str
        Texts to look up; matched by exact string equality.
    files : pandas.DataFrame
        Table with at least the columns ``text`` and ``text_id``.

    Returns
    -------
    list
        Exactly one entry per element of ``texts``, in the same order.
        Texts that do not occur in ``files`` yield a missing value (NaN).
    """
    # The same text can occur on several rows of `files` (e.g. repeated speaker
    # labels). Joining against those duplicates would return several ids for a
    # single text and break the one-to-one alignment with `texts`, so only the
    # first id seen for each distinct text is kept.
    id_by_text = (
        files[["text_id", "text"]]
        .drop_duplicates(subset="text", keep="first")
        .set_index("text")["text_id"]
    )

    # reindex keeps the order (and repeats) of `texts` and fills unknown texts
    # with NaN; it also copes with an empty `texts` list.
    return id_by_text.reindex(list(texts)).tolist()

def answer_query(query:str,
                 document_retriever=pipeline,
                 topk:int=5):
    """Retrieve the documents most relevant to ``query`` and attach transcript ids.

    Parameters
    ----------
    query : str
        Question to run against the index.
    document_retriever : object, optional
        Retriever providing ``retrieve_topk(query=..., topk=...)`` and a
        ``sentence_window_delimiter`` attribute. The default is the ``pipeline``
        object that exists when this cell is executed, so re-run this cell
        after re-creating ``pipeline``.
    topk : int, optional
        Number of documents requested from the retriever (default 5).

    Returns
    -------
    pandas.DataFrame
        The retriever's results sorted by ascending ``score``, extended with
        ``text_id`` (id of the row in ``input_data`` whose text equals
        ``document``) and ``sentence_window_text_ids`` (ids looked up for every
        piece of ``sentence_window``). An empty frame with the expected columns
        is returned when nothing was retrieved.
    """
    #get the documents that can answer the query first
    retrieved_docs = pd.DataFrame(
        document_retriever.retrieve_topk(query=query, topk=topk)
    )
    if retrieved_docs.empty:
        return pd.DataFrame(columns=['index', 'document', 'keywords', 'score', 'method', 'sentence_window',
                                    'text_id', 'sentence_window_text_ids'])

    # Identical lines occur more than once in the transcript (e.g. speaker
    # labels). Joining against those duplicates would emit one result row per
    # occurrence, so only the first text_id of each distinct text is kept.
    # NOTE(review): if the retriever's `index` column is the row position in
    # `input_data`, joining on it would identify the exact line -- confirm.
    document_ids = (
        input_data[["text_id", "text"]]
        .drop_duplicates(subset="text", keep="first")
        .rename(columns={"text": "document"})
    )
    retrieved_docs = pd.merge(retrieved_docs,
                              document_ids,
                              on="document",
                              how="left")

    # NOTE(review): ascending order is kept from the original; if a higher
    # score means a better match, the best hits end up last -- confirm intent.
    retrieved_docs = retrieved_docs.sort_values(by='score', ascending=True)

    #get the text_ids for all sentence_windows retrieved
    sentence_window_delimiter = document_retriever.sentence_window_delimiter
    retrieved_docs["sentence_window_text_ids"] = retrieved_docs["sentence_window"].apply(
        lambda window: sentence_lookup(window.split(sentence_window_delimiter), input_data)
    )

    return retrieved_docs

In [16]:
# Question to answer from the transcript, kept in its own variable so the
# call below stays short.
query_text = """How do the participants define "authentic" Western wear, and what specific characteristics do they associate with this term?"""

retrieved_docs = answer_query(query=query_text, document_retriever=pipeline)

retrieved_docs

2024-08-14 22:15:04,098 - root - INFO - 
Retrieving top 5 relevant documents with TFIDFIndexBuilder


2024-08-14 22:15:04,188 - root - INFO - 
Retrieving top 5 relevant documents with DocumentRetriever
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.49it/s]


query ngrams : ['this term', 'what specific characteristics', '"authentic" Western wear', 'the participants']



Batches: 100%|██████████| 1/1 [00:00<00:00, 78.00it/s]
Batches: 100%|██████████| 84/84 [00:00<00:00, 104.17it/s]


Unnamed: 0,index,document,keywords,score,method,sentence_window,text_id,sentence_window_text_ids
6,338,"Okay, perfect. Thank you. All right guys, I wa...","[{'entity': 'you western wear', 'score': 0.818...",0.678738,semantic_match,Hope ():\n\n [SEP] \nI would say probably 50% ...,287b5d53-49e5-4483-ac51-22705606a2ff,"[21340af7-9c8b-41b6-a4c7-297dbd976983, 0a8dace..."
5,660,Western wear. If you were going to put Western...,"[{'entity': 'your own mind', 'score': 0.8}, {'...",0.681936,semantic_match,good look. That's a good look.\n\n [SEP] \nMod...,c4028b0c-f896-4fbd-bf2f-22bfebfbbb5f,"[2a807dc1-75eb-4435-9cbe-60302f4919db, a9dcf23..."
4,846,when you're thinking about dressing in Western...,"[{'entity': 'the starting point', 'score': 0.8...",0.685265,semantic_match,Moderator ():\n\n [SEP] \nConfident? All right...,849044e1-200b-4e14-b8d0-938570947a86,"[a9dcf230-8c00-48e6-89dd-064367a194ba, 078af36..."
3,926,"Okay, all right. Do you consider that Western ...","[{'entity': 'actually western wear', 'score': ...",0.687921,semantic_match,"Maria ():\n\n [SEP] \nI dress also, I like men...",cbe4dd50-5fc8-41d9-8a94-bcc2b8506327,"[e060bfd2-5b75-412e-9966-3f34a0835438, a939e4e..."
2,311,"Western wear, mostly.\n","[{'entity': 'dressing in western', 'score': 0....",0.695248,semantic_match,"Jeff ():\n\n [SEP] \nYeah, I'm along with Mann...",70162a96-9dfd-483f-ae41-3130318e5317,"[40e18a1c-a6cd-405e-b576-2b9cffff1310, 11fa2c4..."
1,1702,Is that a word though? I was going to ask you ...,"[{'entity': 'a word', 'score': 0.8}, {'entity'...",0.780501,semantic_keyword_match,"Eddie ():\n\n [SEP] \nYeah, you didn't have bo...",a2613b56-95e7-420a-8d54-dac75db923f9,"[1735f98a-51a2-4d46-9f46-798f546be7f9, 7c2ee93..."
0,1656,"I don't know if it's a word, but they've been ...","[{'entity': 'a word', 'score': 0.8}]",0.780501,semantic_keyword_match,Hard working? That's just a hard working perso...,09c10431-e3d6-4ee4-86d5-66e4c1a16b0e,"[6da49695-e44a-4d33-a32f-ecc65131c596, 45d9d72..."
