In [None]:
# Install the pinned dependencies (-q: quiet output, -U: upgrade if already present).
! pip3 install -qU \
  langchain==0.0.162 \
  openai==0.27.7 \
  tiktoken==0.4.0 \
  "pinecone-client[grpc]"==2.2.1

In [None]:
import tiktoken

# Look up the encoding tiktoken associates with gpt-3.5-turbo; as the last
# expression of the cell, the returned encoding object is shown as its output.
tiktoken.encoding_for_model('gpt-3.5-turbo')

In [None]:
import tiktoken

# Load the cl100k_base encoding by name; tiktoken_len below counts tokens with it.
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`.

    `disallowed_special=()` is passed so that text which happens to look like
    a special token is encoded as ordinary text instead of raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Sanity check: token count of a sample sentence (shown as the cell output).
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut document text into chunks. Lengths are measured with
# tiktoken_len (passed as length_function), so chunk_size and chunk_overlap
# are counted in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    # separators listed from coarsest (blank line) to finest (empty string)
    separators=["\n\n", "\n", " ", ""]
)

In [13]:
import os

# Read the OpenAI API key (created at platform.openai.com) from the
# environment; the value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [14]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used for both documents and queries.
model_name = 'text-embedding-ada-002'

# LangChain embedding client; its embed_documents / embed_query methods are
# used later in the notebook when upserting chunks and building the vectorstore.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

In [65]:
import pinecone

# find API key in console at app.pinecone.io
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
# find ENV (cloud region) next to API key in console; falls back to
# 'asia-southeast1-gcp-free' when the variable is unset or empty
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT') or 'asia-southeast1-gcp-free'

# Output dimensionality of text-embedding-ada-002. Stated explicitly so this
# cell does not depend on a `res` embedding result left over from another cell
# (on a fresh kernel `res` is undefined here and the cell raised NameError).
EMBEDDING_DIMENSION = 1536

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)
index_name = 'langchain-retrieval-augmentation'

# create the index only when it does not exist yet, so the cell can be re-run
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=EMBEDDING_DIMENSION
    )

In [68]:
from tqdm.auto import tqdm
from uuid import uuid4

def retrieve_vec(text, contract_name):
    """Chunk `text`, embed the chunks and upsert them into the Pinecone index.

    Despite its name, this function writes to the index; it does not query it.

    Args:
        text: Full document text, split with the module-level `text_splitter`.
        contract_name: Stored as the `title` metadata of every chunk and used
            as the Pinecone namespace the vectors are upserted into.

    Returns:
        The `pinecone.GRPCIndex` handle for `index_name` that was written to.
    """
    index = pinecone.GRPCIndex(index_name)
    # maximum number of chunks embedded and upserted in a single request
    batch_limit = 100

    # split the document and build one metadata dict per chunk
    record_texts = text_splitter.split_text(text)
    record_metadatas = [
        {"chunk": chunk_no, "text": chunk_text, "title": contract_name}
        for chunk_no, chunk_text in enumerate(record_texts)
    ]

    # Send the chunks in slices of at most batch_limit. Previously a document
    # with batch_limit or more chunks was embedded and upserted in one request
    # of unbounded size; documents with no chunks still perform no upsert.
    for start in range(0, len(record_texts), batch_limit):
        batch_texts = record_texts[start:start + batch_limit]
        batch_metadatas = record_metadatas[start:start + batch_limit]
        # each vector gets a fresh random id
        ids = [str(uuid4()) for _ in batch_texts]
        embeds = embed.embed_documents(batch_texts)
        index.upsert(
            vectors=list(zip(ids, embeds, batch_metadatas)),
            namespace=contract_name
        )
    return index

In [21]:
from langchain.vectorstores import Pinecone

# metadata key holding each chunk's raw text (retrieve_vec writes it as "text")
text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

# NOTE(review): no namespace is given here, while retrieve_vec upserts into
# namespace=contract_name — confirm that retrieval targets the intended namespace.
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

In [69]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# completion llm: gpt-3.5-turbo chat model with temperature set to 0.0
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# Retrieval QA chain: documents come from the Pinecone vectorstore's retriever
# and are handed to the llm using the "stuff" chain type (presumably all
# retrieved documents are placed into one prompt — confirm in the LangChain docs).
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [None]:
import pandas as pd
# NOTE(review): the path is an empty string — set it to the input CSV before running.
# Later cells read a 'GPT-3.5 Response' column from this frame.
df = pd.read_csv('')

In [None]:
# Work on a copy so the frame loaded from the CSV is not modified when the
# answers column is added (a plain `df_new = df` only aliases the same object).
df_new = df.copy()
# NOTE(review): `list_answers` is not defined in the cells visible here —
# presumably one entry of retrieved answers per row; confirm it is created
# before this cell runs.
df_new['Retrieved Answer'] = list_answers

In [None]:
# Build one numbered answer string per row of df_new:
#   * rows whose response is exactly "No Clarification Needed" get "No Answers";
#   * otherwise the retrieved answers are numbered "1. ...", "2. ..." with one
#     answer per clarification question.
list_ans_p = []
row_pairs = zip(df_new['GPT-3.5 Response'], df_new['Retrieved Answer'])
for i, (res, retrieved) in enumerate(row_pairs):
    if res == "No Clarification Needed":
        list_ans_p.append("No Answers")
        continue
    cl_questions = res.split('\n')
    print(i)
    # materialise the row's answers once instead of on every inner iteration
    answers = list(retrieved)
    if len(answers) == len(cl_questions):
        n_questions = len(cl_questions)
    else:
        # counts disagree: blank lines in the response are not questions
        n_questions = sum(1 for q in cl_questions if q.strip() != "")
    # Slicing uses at most the answers that exist, so a row with fewer
    # answers than questions no longer aborts the loop with IndexError.
    ans = "".join(
        f"{number}. {answer}\n"
        for number, answer in enumerate(answers[:n_questions], start=1)
    )
    list_ans_p.append(ans)

In [None]:
# Attach the formatted answers, drop the raw answers column and save the result.
df_new['Processed Answers'] = list_ans_p
# `axis` is not needed when `columns=` is given; errors='ignore' keeps the cell
# re-runnable after 'Retrieved Answer' has already been dropped.
df_new = df_new.drop(columns=['Retrieved Answer'], errors='ignore')
# NOTE(review): the output path is an empty string — set it before running.
df_new.to_csv('', index=False)