In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pandas as pd
import logging
import sys
import config
import json
from sentence_transformers import SentenceTransformer, util
import gzip
import os
import torch

# Import OpenAI and other necessary modules
import openai

# Import classes and functions from modules
from llama_index import (
    Document,
    VectorStoreIndex,
    ListIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SimpleKeywordTableIndex,
)
from llama_index.indices.postprocessor import (
    LLMRerank
)
from llama_index.response.notebook_utils import display_response
from llama_index.llms import OpenAI

# Configure logging: send INFO-and-above records to stdout as bare messages.
# force=True removes any handlers already attached to the root logger, so the
# level and the stdout handler are applied even if logging was configured
# earlier (a plain basicConfig call does nothing in that case) and re-running
# this cell never stacks duplicate handlers.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(message)s",
    force=True,
)


In [2]:
# OpenAI API key authentication.
# The key is read from `openai_key` in the local config.py (keep that file out
# of version control); if it is missing or empty, the OPENAI_API_KEY
# environment variable is used instead. Failing here gives a clear error
# rather than an authentication failure on the first API call.
openai_api_key = getattr(config, "openai_key", None) or os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError(
        "No OpenAI API key found: define openai_key in config.py or set OPENAI_API_KEY"
    )
openai.api_key = openai_api_key

In [3]:
# Set up the OpenAI LLM (large language model) and a service context built on it
LLM_MODEL = "gpt-3.5-turbo"
CHUNK_SIZE = 1024

llm = OpenAI(model=LLM_MODEL)
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=CHUNK_SIZE)

In [4]:
# Read and process query, document, and relevance data.
# usecols keeps read_csv from loading columns that are dropped anyway; the
# trailing [...] selection pins the column order.
query_cols = ['query_id', 'text']
df_queries = pd.read_csv('antique_query_test.csv', usecols=query_cols)[query_cols]

doc_cols = ['doc_id', 'text']
df_docs = pd.read_csv('antique_sample_404k.csv', usecols=doc_cols)[doc_cols]

qrel_cols = ['query_id', 'doc_id', 'relevance']
df_qrel = pd.read_csv('antique_qurel_test.csv', usecols=qrel_cols)[qrel_cols]

# Attach the document text to every relevance judgement. With how='left' a
# judged doc_id that is absent from df_docs is kept, with a missing text.
merged_df = df_qrel.merge(df_docs, on='doc_id', how='left')

# Extract text data from merged DataFrame
df_text = merged_df[['doc_id', 'text']]

# One passage string per merged row, in row order. str() turns a missing text
# into the literal "nan". Iterating the column directly avoids building a
# Series per row, which is what iterrows() does.
# NOTE(review): search() indexes this list by corpus position, so it is assumed
# to line up with the pre-computed embeddings loaded later — confirm.
passages = [str(text) for text in df_text['text']]

In [5]:
# Load the pre-computed document embeddings from disk.
# pickle.load can execute arbitrary code while deserializing, so only open a
# file that you produced yourself or otherwise trust.
import pickle

EMBEDDINGS_PATH = 'corpus_embeddings_text.pickle'
with open(EMBEDDINGS_PATH, 'rb') as embeddings_file:
    doc_embedding = pickle.load(embeddings_file)

In [6]:
# Bi-encoder used to embed queries for semantic search
BI_ENCODER_MODEL = 'intfloat/e5-base-v2'
MAX_SEQ_LENGTH = 256  # truncate long inputs to 256 tokens

bi_encoder = SentenceTransformer(BI_ENCODER_MODEL)
bi_encoder.max_seq_length = MAX_SEQ_LENGTH

Load pretrained SentenceTransformer: intfloat/e5-base-v2
Created a temporary directory at /var/folders/75/0dtb1gc52pdfr40bnpp97qj00000gn/T/tmp_hi35l7g
Writing /var/folders/75/0dtb1gc52pdfr40bnpp97qj00000gn/T/tmp_hi35l7g/_remote_module_non_scriptable.py
Use pytorch device: cpu


In [7]:
# Function to perform semantic search

# This function will search all the articles for passages that
# answer the query
def search(input_query, top_k=50):
    """Return the passages most similar to ``input_query``, best match first.

    The query is embedded with the module-level ``bi_encoder`` and compared
    against the pre-computed ``doc_embedding``; each hit's ``corpus_id`` is
    used as a position in the module-level ``passages`` list.

    Args:
        input_query: The question text to search for.
        top_k: Maximum number of passages to return (defaults to 50, the
            value that was previously hard-coded).

    Returns:
        A list of passage strings with newlines replaced by spaces, ordered
        by decreasing similarity score.
    """
    # NOTE(review): the query is encoded exactly as given. The intfloat/e5
    # model card describes "query: " / "passage: " input prefixes — confirm
    # how doc_embedding was built before adding one here.
    question_embedding = bi_encoder.encode(input_query, convert_to_tensor=True)

    # semantic_search returns one hit list per query; only one query is passed,
    # so take the first. The hits already come back sorted by decreasing score,
    # so no extra sort is needed.
    hits = util.semantic_search(question_embedding, doc_embedding, top_k=top_k)[0]

    return [passages[hit['corpus_id']].replace("\n", " ") for hit in hits]

In [8]:
# Run the semantic search for a sample query and preview the first five passages
sample_query = "Why doesn't the water fall off  earth if it's round?"
retrieved_docs = search(sample_query)
retrieved_docs[:5]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['because there is no gravity strong enough around earth for water to fall of the earth',
 "Water doesn't spill off of the earth because there is gravity in the center of the earth that holds down the water.",
 "This goes along with the question of why don't we fall off the earth if it is round. The answer is because gravity is holding us (and the water) down.",
 "The world isn't round. It's a cube.",
 "Why don't we fall off?  Simple, gravity.  Large objects have a natural gravitational pull...  Earth is a LARGE object haha  A very large object.  So Earth is just pulling water towards it."]

In [9]:
# Wrap each retrieved passage in a llama_index Document
documents = [Document(text=t) for t in retrieved_docs]

# Build a VectorStoreIndex over the retrieved documents. The service_context
# configured earlier in the notebook (LLM and chunk size) is passed explicitly;
# it was previously created but never handed to the index.
retrieved_docs_index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [10]:
# LLM-based reranker that keeps at most 10 nodes. The service_context
# configured earlier in the notebook is passed explicitly so the reranker
# uses the same LLM settings; it was previously created but never used here.
reranker = LLMRerank(top_n=10, service_context=service_context)

# Query engine: retrieve the 10 most similar nodes from the index, then let
# the reranker post-process them before the answer is synthesized.
query_engine = retrieved_docs_index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[reranker],
)

In [11]:
# Ask the query engine the sample question and keep the response for display
question = "Why doesn't the water fall off  earth if it's round?"
response = query_engine.query(question)

In [12]:
# Render the final response together with its source nodes
display_response(response, source_length=500, show_source=True)

**`Final Response:`** The water doesn't fall off the earth because of the gravitational pull of the earth. The round shape of the earth helps to create a stronger gravitational pull, which helps to keep the water in place. The round shape also helps to create a more stable state, which helps to keep the water from spilling off the earth.

---

**`Source Node 1/3`**

**Node ID:** c90b5590-4e0d-41ef-953e-2d54dbddf694<br>**Similarity:** 8.0<br>**Text:** Water doesn't spill off of the earth because there is gravity in the center of the earth that holds down the water....<br>

---

**`Source Node 2/3`**

**Node ID:** 17968708-6d30-4ec2-8ec5-1f50c630a9a9<br>**Similarity:** 7.0<br>**Text:** Why don't we fall off?  Simple, gravity.  Large objects have a natural gravitational pull...  Earth is a LARGE object haha  A very large object.  So Earth is just pulling water towards it....<br>

---

**`Source Node 3/3`**

**Node ID:** c967249c-a4ed-4b66-abd1-7cbd8fdacf02<br>**Similarity:** 6.0<br>**Text:** The same reason a drop of mercury in a bowl is round. 'Round' is the most stable state. It's the only configuration that gets the most 'earth' closest to the center and uses up the most gravitational potential energy. You can see how this works by imagining an avalanche. The rocks move as close as they can toward the center of the earth by falling and along with erosion, tends to make the earth even more 'rounder'....<br>