# Importing OpenAI

In [None]:
# Import libraries
!pip install -qU langchain==0.0.354 \
    openai==1.6.1 \
    pinecone-client==3.1.0 \
    tiktoken==0.5.2

In [None]:
import os
from getpass import getpass

# Make the OpenAI key available through the OPENAI_APIKEY environment variable.
# A key that is already set in the environment is kept; otherwise it is asked
# for interactively so the secret is never written into the notebook itself.
if not os.environ.get("OPENAI_APIKEY"):
    os.environ["OPENAI_APIKEY"] = getpass("OpenAI API key: ")

In [None]:
# Read the key back from the environment
OPENAI_APIKEY = os.getenv('OPENAI_APIKEY')
# Report only whether a key is present: printing the value itself would leave
# the secret in the notebook's saved output.
print("OPENAI_APIKEY is set" if OPENAI_APIKEY else "OPENAI_APIKEY is NOT set")

In [None]:
import os
from langchain.chat_models import ChatOpenAI

# Chat model client; the API key is taken from the OPENAI_APIKEY environment variable.
chat = ChatOpenAI(model='gpt-3.5-turbo-1106', openai_api_key=os.environ["OPENAI_APIKEY"])

In [None]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Seed conversation: a system instruction, one greeting exchange, and the
# first real question from the user.
messages = []
messages.append(SystemMessage(content="You are a helpful assistant."))
messages.append(HumanMessage(content="Hi AI, how are you today?"))
messages.append(AIMessage(content="I'm great thank you. How can I help you?"))
messages.append(
    HumanMessage(content="I'd like to know more about how mobile networks are being used in mining operations.")
)


In [None]:
# Send the full message history to the chat model and keep its reply in `res`
res = chat(messages)
# Display the returned message object
res

In [None]:
# Show only the text content of the reply
print(res.content)

In [None]:
# Follow-up question from the user
prompt = HumanMessage(
    content="how are these networks different from regular every day mobile networks in the cities?"
)

# Grow the history with the model's previous answer followed by the new question
messages.extend([res, prompt])

# Query the chat model again with the longer history and show its answer
res = chat(messages)
print(res.content)

# Importing Relevant Patent Data

In [None]:
from datasets import load_dataset
import json
import pandas as pd

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. a stray empty line at the end of
        # the file) are skipped instead of crashing json.loads.
        return [json.loads(line) for line in file if line.strip()]

# Path to your .jsonl file. Set the PATENTS_JSONL_PATH environment variable to
# point at your own copy; the original hard-coded location is the fallback.
path_to_jsonl_file = os.environ.get(
    'PATENTS_JSONL_PATH', '/Users/alexsar/Downloads/train.jsonl'
)

# Load the data (one parsed record per line of the file)
dataset = load_jsonl(path_to_jsonl_file)

# Convert the list of records to a DataFrame
data = pd.DataFrame(dataset)

# Preview only the first few records; displaying the whole list floods the output
dataset[:5]


In [None]:
# Inspect a single record (index 2) in full
dataset[2]

In [None]:
import pandas as pd

# Concise summary of the DataFrame built from the loaded records.
# `DataFrame.info()` prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
data.info()

# First few rows, rendered as a table, to see what each column contains
data.head()


In [None]:
# Downloading preprocessed data from Hugging Face

In [None]:
from datasets import load_dataset

# Fetch the `train` split of the chunked patents dataset
dataset = load_dataset(
    path="alsolacol/schlumberger-patents-dataset-chunked",
    split="train",
)

# Display the dataset object
dataset


In [None]:
# Inspect the first record of the dataset
dataset[0]

# Upserting to Pinecone DB

In [None]:
import os
from getpass import getpass

# Make the Pinecone key available through the PINECONE_API_KEY environment
# variable. A key that is already set is kept; otherwise it is asked for
# interactively so the secret is never written into the notebook itself.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

In [None]:
# Read the key back from the environment
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
# Report only whether a key is present: printing the value itself would leave
# the secret in the notebook's saved output.
print("PINECONE_API_KEY is set" if PINECONE_API_KEY else "PINECONE_API_KEY is NOT set")

In [None]:
from pinecone import Pinecone

# Pinecone client, authenticated with the key stored in PINECONE_API_KEY
# (keys are issued at app.pinecone.io)
api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)


In [None]:
from pinecone import ServerlessSpec

# Serverless index specification: AWS cloud, us-east-1 region
spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [None]:
import time

index_name = 'llama-2-rag'

# Collect the names of the indexes the client currently lists
existing_indexes = []
for index_info in pc.list_indexes():
    existing_indexes.append(index_info["name"])

# Create the index only when it is not there yet (expected on a first run)
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='cosine',
        spec=spec
    )
    # Poll once per second until the index reports itself ready
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index, pause briefly, then show its statistics
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model used for the documents and, later, for the queries.
# The key is read from the same OPENAI_APIKEY environment variable the chat
# model uses, rather than from a hard-coded placeholder string.
embed_model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.environ["OPENAI_APIKEY"],
)


In [None]:
# Two throw-away sentences to try the embedding model on
texts = ['this is the first chunk of text', 'then another second chunk of text is here']

# Embed them and show (number of vectors, length of the first vector)
res = embed_model.embed_documents(texts)
(len(res), len(res[0]))

In [None]:
from tqdm.auto import tqdm  # for progress bar

data = dataset.to_pandas()  # a DataFrame is easier to slice into batches

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    # rows [i, i_end) form the current batch; the end is clamped to the frame length
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]

    # Walk the batch once (instead of three separate iterrows() passes) and
    # collect the three parallel lists needed for the upsert.
    ids, texts, metadata = [], [], []
    for _, row in batch.iterrows():
        # NOTE(review): the id is built from publication number + filing date
        # only; if several chunks share both values they would collide on
        # upsert — confirm the pair is unique per chunk.
        ids.append(f"{row['publication_number']}-{row['filing_date']}")
        # text that gets embedded
        texts.append(row['split_preprocessed_combined_text'])
        # metadata stored alongside the vector in Pinecone
        metadata.append({
            'text': row['aggregated_representative_docs'],
            'inventors': row['inventors'],
            'title': row['title'],
        })

    # embed the batch, then write ids, vectors and metadata to the index
    embeds = embed_model.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadata))

In [None]:
# Show the index statistics again, now that the upsert loop has run
index.describe_index_stats()

# RAG

In [None]:
# Imported under an alias so it does not rebind the name `Pinecone`, which
# already refers to the client class imported from the `pinecone` package.
from langchain.vectorstores import Pinecone as PineconeVectorStore

text_field = "text"  # the metadata field that contains our text

# Vector store wrapper around the index; queries are embedded with embed_query
vectorstore = PineconeVectorStore(
    index, embed_model.embed_query, text_field
)

In [None]:
# Question used to try out retrieval (and reused by the prompt-building cells)
query = "What is so special about mobile networks in mining operations?"

# Display the three closest matches from the vector store
vectorstore.similarity_search(query=query, k=3)

In [None]:
def augment_prompt(query: str, k: int = 3) -> str:
    """Build a retrieval-augmented prompt for ``query``.

    Looks up the ``k`` most similar entries in the module-level ``vectorstore``
    and embeds their text, one per line, in a prompt that ends with the query.

    Args:
        query: The user's question; used both for retrieval and in the prompt.
        k: How many results to retrieve from the vector store. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        The prompt string containing the retrieved contexts and the query.
    """
    # get the top-k results from the knowledge base
    results = vectorstore.similarity_search(query, k=k)
    # get the text from the results, one entry per line
    source_knowledge = "\n".join(x.page_content for x in results)
    # feed into an augmented prompt (string kept exactly as before)
    augmented_prompt = f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    return augmented_prompt

In [None]:
# Show the full augmented prompt that would be sent for `query`
print(augment_prompt(query))

In [None]:
# Build a user prompt that carries retrieved context (RAG) for `query`
prompt = HumanMessage(content=augment_prompt(query))

# Store it in the history, then ask the chat model and show the answer
messages.append(prompt)
res = chat(messages)
print(res.content)

In [None]:
# Plain question with no retrieved context. It is sent together with the
# existing history but is not stored in `messages`.
prompt = HumanMessage(content="who are some of the inventors who worked on mobile networks in mining?")

res = chat([*messages, prompt])
print(res.content)

In [None]:
# Same kind of question, this time wrapped by augment_prompt so the retrieved
# contexts are included. Again sent with the history without being stored in it.
rag_question = "who are some of the inventors who worked on mobile networks in mining based on the data provided below?"
prompt = HumanMessage(content=augment_prompt(rag_question))

res = chat([*messages, prompt])
print(res.content)

In [None]:
# create a new user prompt
# `augmented_prompt` exists only as a local variable inside augment_prompt(),
# so referencing it here raised NameError; the prompt text is built by calling
# the function for `query` instead.
prompt = HumanMessage(
    content=augment_prompt(query)
)
# add to messages
messages.append(prompt)

# send to OpenAI
res = chat(messages)

print(res.content)

In [None]:
# Clean up: delete the Pinecone index named by `index_name`
pc.delete_index(index_name)