# Query text-related information in custom files
LangChain + Pinecone + OpenAI

## Import packages

In [None]:
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import boto3
import os
import io
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
import tempfile

## Set up the API keys

In [None]:
# Configuration is read from the environment so that no secret is ever pasted
# into (and committed with) the notebook. Export these before starting the
# kernel:
#   OPENAI_API_KEY        - read by the OpenAI / LangChain clients themselves
#   PINECONE_API_KEY      - read by pinecone.Pinecone() when no key is passed
#   PINECONE_INDEX_NAME   - required: name of the index created / queried below
#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION
# A Pinecone "environment" is not needed: the index below is serverless.
pinecone_index_name = os.environ.get("PINECONE_INDEX_NAME")

# None is acceptable for the AWS values: boto3 then falls back to its default
# credential/region resolution (shared config files, instance role, ...).
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_region_name = os.environ.get("AWS_DEFAULT_REGION")


## Pinecone init

In [None]:
# Pinecone client used for index management below. No api_key argument is
# passed here, so the key is not taken from this notebook.
# NOTE(review): the Pinecone SDK is documented to fall back to the
# PINECONE_API_KEY environment variable in that case — confirm it is set.
pc = pinecone.Pinecone()

## Create a serverless index

In [None]:
# Create the index only when no index with this name exists yet, so the cell
# can be re-run without trying to create it twice.
if pinecone_index_name not in pc.list_indexes().names():
    pc.create_index(
        name=pinecone_index_name,
        # NOTE(review): presumably chosen to match the vector length produced
        # by the OpenAIEmbeddings() model used below — confirm if the
        # embedding model is ever changed.
        dimension=1536,
        metric="cosine",
        # Serverless index hosted on AWS in us-east-1.
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )


In [None]:
# Handle to the index; used below by the vector store, the stats check and
# the final cleanup.
index = pc.Index(pinecone_index_name)

In [None]:
# Embedding model with library defaults.
# NOTE(review): expects the OpenAI API key to be available to the client
# (e.g. via OPENAI_API_KEY) — confirm.
embeddings = OpenAIEmbeddings()
# LangChain vector store on top of the Pinecone index. text_key names the
# field that holds the raw document text (presumably a metadata field —
# verify against the LangChain Pinecone documentation).
vectorstore = Pinecone(index=index, embedding=embeddings, text_key="text")

## AWS S3 init

In [None]:
# S3 client used below to list and download the source files. The three aws_*
# names must already be defined by the API-key cell.
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region_name
)

In [None]:
# S3 location of the documents to index: the bucket, and the key prefix that
# is passed to the object listing when the upload cell runs.
bucket_name = "erica-model"
folder_path = "test_resume/"

## Upsert vectors (from AWS S3)

In [None]:
def _load_with_temp_file(raw_bytes, suffix, loader_cls):
    """Run a path-based document loader over in-memory bytes.

    Args:
        raw_bytes: Complete file content.
        suffix: File extension (including the dot) given to the temp file.
        loader_cls: Loader class that is constructed with a file path and
            exposes ``load()`` returning objects with ``page_content``.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines.
    """
    # The loaders only accept a filesystem path, so the bytes are spilled to
    # a temporary file that is always removed afterwards.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with tmp_file:
            tmp_file.write(raw_bytes)
        # The handle is closed before the loader opens the path: on Windows a
        # file still held open by NamedTemporaryFile cannot be reopened (or
        # removed) by name.
        documents = loader_cls(tmp_file.name).load()
        return "\n".join(doc.page_content for doc in documents)
    finally:
        os.remove(tmp_file.name)


def read_s3_file(bucket_name, key):
    """Download one S3 object and return its content as text.

    ``.pdf`` and ``.docx`` objects (matched case-insensitively on the key)
    are parsed with PyPDFLoader / Docx2txtLoader. Anything else is treated as
    plain text.

    Args:
        bucket_name: Name of the S3 bucket.
        key: Full object key inside the bucket.

    Returns:
        The extracted text; an empty string for an empty plain-text object.

    Raises:
        Whatever ``s3_client.get_object`` or the document loaders raise.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=key)
    # Read the body exactly once; every branch below works on these bytes.
    raw_bytes = response['Body'].read()

    lowered_key = key.lower()
    if lowered_key.endswith(".pdf"):
        return _load_with_temp_file(raw_bytes, ".pdf", PyPDFLoader)
    if lowered_key.endswith(".docx"):
        return _load_with_temp_file(raw_bytes, ".docx", Docx2txtLoader)

    try:
        return raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every byte value, so this fallback cannot fail; a
        # further iso-8859-1 attempt would be the same codec under another
        # name.
        return raw_bytes.decode('latin-1')

In [None]:
def upload_files_to_pinecone(bucket_name, folder_path):
    """Embed every readable file under an S3 prefix and store it in Pinecone.

    Each object is converted to text with ``read_s3_file``. Objects that
    yield no text are skipped. The file name (last path component of the key)
    is stored as the ``id`` metadata field of the resulting vector.

    Args:
        bucket_name: Name of the S3 bucket to scan.
        folder_path: Key prefix; every object whose key starts with it is
            considered.
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_path)

    texts = []
    metadatas = []
    for page in pages:
        # A page without matches has no 'Contents' entry at all.
        for obj in page.get('Contents', []):
            key = obj['Key']
            # Keys ending in '/' are folder placeholders, not files; there is
            # nothing to download for them.
            if key.endswith('/'):
                continue
            content = read_s3_file(bucket_name, key)
            if not content:
                continue
            texts.append(content)
            # Last path component, so the id is the file name whether the key
            # is at the bucket root, directly under the prefix, or nested.
            metadatas.append({"id": key.rsplit('/', 1)[-1]})

    if not texts:
        print("No readable files found; nothing uploaded to Pinecone.")
        return

    vectorstore.add_texts(texts=texts, metadatas=metadatas)
    print("Files uploaded to Pinecone.")

In [None]:
# Index every file found under the bucket/prefix configured above.
upload_files_to_pinecone(bucket_name, folder_path)

## Check the index

In [None]:
# Print the index statistics as a quick check after the upload.
print(index.describe_index_stats())


## Run a similarity search

In [None]:
def query_from_pinecone(query):
    """Print the stored file ID of each document most similar to *query*.

    Runs a similarity search for up to five matches against the module-level
    vector store and prints one ``Matched File ID`` line per match. Nothing
    is returned.
    """
    top_matches = vectorstore.similarity_search(query, k=5)
    for document in top_matches:
        # The 'id' metadata entry is the file name recorded at upload time.
        file_id = document.metadata['id']
        print(f"Matched File ID: {file_id}")

In [None]:
# Free-text job requirements used as the search query; the call prints the
# IDs of the best-matching files.
query = "Bachelor's degree in Computer Science, Information Technology, or related field.\nMinimum of 3 years of experience in front-end development. \nProficiency in HTML, CSS, JavaScript, and front-end frameworks (e.g., React, Angular, Vue.js).\nExperience with web platforms like Squiz or similar content management systems.\nStrong problem-solving and analytical skills.\nExcellent communication and collaboration abilities."
query_from_pinecone(query)

In [None]:
# Second example: search with a short name instead of a full description.
query_2 = "Kevin Jacob"
query_from_pinecone(query_2)

# Question answering

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model that writes the answers.
# NOTE(review): temperature 0.0 presumably chosen to keep answers as
# repeatable as possible — confirm.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# Question-answering chain that retrieves documents from the Pinecone vector
# store and hands them to the chat model.
# NOTE(review): the "stuff" chain type is understood to place all retrieved
# documents into a single prompt — verify against the LangChain docs.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [None]:
# Ask the QA chain the job-requirements query defined in the similarity-search
# section.
qa.run(query)

In [None]:
# Cleanup: delete_all=True removes the stored vectors from the index, so run
# this only when the indexed data is no longer needed.
# NOTE(review): presumably limited to the default namespace — confirm.
index.delete(delete_all=True)