In [1]:
# Import necessary libraries
from PyPDF2 import PdfReader
import faiss
import numpy as np

# Kor!
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# LangChain Models
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Standard Helpers
import pandas as pd
import json

# For token counting
from langchain.callbacks import get_openai_callback

In [2]:
import os
from getpass import getpass

# SECURITY: never hardcode an API key in a notebook -- it ends up in version
# control and in every shared copy. The key that used to be written here must
# be treated as compromised and revoked in the OpenAI dashboard.
# Use the key already exported in the environment; otherwise prompt for it
# interactively so that it is never stored in the notebook source or output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
# Chat model configuration, kept in one place so the settings are easy to tune.
# NOTE(review): `llm` is not referenced by any later cell visible here (the QA
# chain builds its own `OpenAI()` instance) -- confirm whether that is intended.
llm_settings = {
    "model_name": "gpt-3.5-turbo",
    "temperature": 0.7,
    "max_tokens": 2000,
}
llm = ChatOpenAI(**llm_settings)

In [4]:
# Open the resume PDF for reading.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on the machine where this file exists.
RESUME_PDF_PATH = '/Users/ccsekhar/Downloads/archive/data/data/ACCOUNTANT/10554236.pdf'
reader = PdfReader(RESUME_PDF_PATH)

In [5]:
# Collect the extracted text of every page and join it into one string.
# Pages for which extract_text() returns nothing (None or '') are skipped,
# so they contribute no text to the result.
page_texts = []
for page in reader.pages:
    text = page.extract_text()
    if text:
        page_texts.append(text)

# A single join avoids rebuilding the growing string on every page.
raw_text = ''.join(page_texts)

In [6]:
# Break the extracted text into overlapping chunks.
# Splitting happens on newlines; chunk size and overlap are measured with
# `len`, i.e. in characters.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# `texts` is the list of chunks consumed by the embedding / indexing cells.
texts = text_splitter.split_text(raw_text)

In [7]:
# Initialize the OpenAI embeddings client with its default settings.
# `embeddings` is passed to FAISS.from_texts below to vectorize the chunks.
# NOTE(review): presumably picks up the OPENAI_API_KEY environment variable
# set in an earlier cell -- confirm against the installed langchain version.
embeddings = OpenAIEmbeddings()

In [8]:
# Create a FAISS vector store for similarity search from the text chunks.
# `texts` are the chunks produced by the splitter and `embeddings` is the
# model used to vectorize them; `docsearch` is queried later via
# similarity_search() to retrieve context for the QA chain.
docsearch = FAISS.from_texts(texts, embeddings)

In [None]:
# The index dimension must equal the length of an embedding vector, not the
# number of characters in a text chunk, so probe the model with one chunk.
d = len(embeddings.embed_query(texts[0]))  # Dimension of the vectors

# Number of cells (clusters) in the IVF index. Training needs at least as many
# vectors as cells, so cap the requested 32 cells by the number of chunks.
nlist = max(1, min(32, len(texts)))

quantizer = faiss.IndexFlatL2(d)  # The quantizer index
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

In [None]:
# Embed every chunk in one batched call.
# embed_documents() expects a list of texts and returns one vector per text;
# handing it a single string makes it treat the string as a sequence of
# one-character texts, which yields a ragged, unusable nested list.
all_embeddings = embeddings.embed_documents(texts)  # Collect all embeddings for training

In [None]:
# Convert the list of embeddings to a float32 NumPy array.
# The name `all_embeddings` is rebound from the Python list to the array;
# the array is what gets passed to index.train() / index.add() afterwards.
all_embeddings = np.array(all_embeddings, dtype=np.float32)

In [None]:
# Train the index using all_embeddings.
# Training comes first: the IVF index has to learn its cells before
# vectors can be assigned to them.
index.train(all_embeddings)
# Add all_embeddings to the index (the same vectors that were used for training).
index.add(all_embeddings)

In [9]:
# Build the question-answering chain.
# A default-configured OpenAI completion model is used, and the "stuff" chain
# type is requested for combining the retrieved documents.
qa_llm = OpenAI()
chain = load_qa_chain(qa_llm, chain_type="stuff")

In [None]:
# Ask the QA chain for a key/value summary of the resume.
query = (
    "Can you summarize the resume in the form of key value pairs? "
    "(make sure to include ALL THE INFORMATION and in a listed order)"
)

# Retrieve the chunks most similar to the question; they become the context.
docs = docsearch.similarity_search(query)

# Last expression of the cell, so the answer is shown as the cell output.
chain.run(input_documents=docs, question=query)