In [2]:
import base64
import os
import json
from PyPDF2 import PdfReader
import numpy as np
import base64
from sentence_transformers import SentenceTransformer

import uuid


In [3]:
def read_txt(file_path):
    """Return the entire contents of a text file as a single string.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, so the same corpus produces the same text on every
    machine.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def read_pdf(file_path):
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string containing each page's extracted text, in page order,
        separated by a single newline.

    Raises:
        Whatever ``PdfReader`` raises for an unreadable or malformed file.
    """
    pdf = PdfReader(file_path)
    # A page that yields no text is treated as empty so that one blank page
    # cannot make str.join fail for the whole document.
    return "\n".join((page.extract_text() or "") for page in pdf.pages)


# Dispatch table: file extension (including the leading dot) -> function that
# takes a file path and returns that file's text. Lookups are exact-match, so
# only these lower-case spellings are recognised.
supported_file_types = {
    '.txt': read_txt,
    '.pdf': read_pdf
}

def iter_supported_files(directory):
    """Yield the path of every file under ``directory`` that has a known reader.

    The tree is walked recursively with ``os.walk``; a file qualifies when its
    extension is a key of ``supported_file_types``.

    Args:
        directory: Root directory to search.

    Yields:
        The walk root joined with the file name, for each matching file.
    """
    for root, _subdirs, names in os.walk(directory):
        for name in names:
            _stem, extension = os.path.splitext(name)
            if extension not in supported_file_types:
                continue
            yield os.path.join(root, name)


def iter_texts(directory):
    """Yield ``(path, text)`` for each supported file found under ``directory``.

    Each file is read with the reader registered for its extension in
    ``supported_file_types``. Reading is best-effort: if anything goes wrong
    for a file, a message is printed and that file is skipped.

    Args:
        directory: Root directory to search.

    Yields:
        Tuples of (file path, extracted text).
    """
    for file in iter_supported_files(directory):
        try:
            extension = os.path.splitext(file)[1]
            reader = supported_file_types[extension]
            yield (file, reader(file))
        except Exception as e:
            print(f"Error reading file {file}: {e}")
        


class TextEncoderPipeline:
    """Split a document into overlapping token chunks and embed each chunk.

    Attributes are set in ``__init__``:
        model: The ``SentenceTransformer`` used for tokenizing and encoding.
        chunk_size: Number of tokens per chunk.
    """

    def __init__(self, model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1", chunk_size=128):
        """Load the sentence-transformer ``model_name`` and store ``chunk_size``."""
        self.chunk_size = chunk_size
        self.model = SentenceTransformer(model_name)

    def __call__(self, text):
        """Chunk ``text`` by tokens and encode every chunk.

        Two passes of ``chunk_size``-token windows are produced: one starting
        at token 0 and one starting at ``chunk_size // 2``, so the second pass
        straddles the boundaries of the first. First-pass chunks come first in
        the output, followed by the offset chunks.

        Args:
            text: The document text to chunk and embed.

        Returns:
            ``(text_chunks, embeddings)``: the chunk strings rebuilt from their
            tokens, and the result of ``model.encode`` on that list.
        """
        tokenizer = self.model.tokenizer
        tokens = tokenizer.tokenize(text)
        size = self.chunk_size
        total = len(tokens)

        # Start offsets: the aligned windows, then the half-shifted windows.
        starts = list(range(0, total, size))
        starts += list(range(size // 2, total, size))

        # Turn each token window back into a plain string for the encoder.
        text_chunks = [
            tokenizer.convert_tokens_to_string(tokens[start:start + size])
            for start in starts
        ]
        embeddings = self.model.encode(text_chunks)

        return text_chunks, embeddings
    

class QueryEncoderPipeline:
    """Embed a search query with a sentence-transformer model."""

    def __init__(self, model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"):
        """Load the sentence-transformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, query):
        """Return ``model.encode(query)`` for the given query."""
        query_embedding = self.model.encode(query)
        return query_embedding


def ndarray_to_json(ndarray):
    """Pack a NumPy array into a JSON-serialisable dict.

    Args:
        ndarray: The array to serialise.

    Returns:
        A dict with keys ``"shape"`` (the array's shape), ``"type"`` (the
        dtype rendered with ``str``) and ``"bytes"`` (the array's raw bytes,
        base64-encoded and decoded to a UTF-8 string).
    """
    encoded = base64.b64encode(ndarray.tobytes())
    return dict(
        shape=ndarray.shape,
        type=str(ndarray.dtype),
        bytes=encoded.decode("utf-8"),
    )


def ndarray_from_json(json):
    """Rebuild a NumPy array from the dict produced by ``ndarray_to_json``.

    Note that the parameter name shadows the ``json`` module inside this
    function; it is kept because it is part of the call signature.

    Args:
        json: Mapping with ``"bytes"`` (base64 text), ``"type"`` (dtype
            string) and ``"shape"`` entries.

    Returns:
        The decoded array with the recorded dtype and shape. It is built with
        ``np.frombuffer`` over the decoded bytes, so copy it before modifying
        it in place.
    """
    raw = base64.b64decode(json["bytes"])
    flat = np.frombuffer(raw, dtype=json["type"])
    return flat.reshape(json["shape"])

# Shared document encoder used by get_embeddings_dict below. Constructing it
# loads the class's default sentence-transformer model; chunks are 128 tokens.
encoder_pipeline = TextEncoderPipeline(chunk_size=128)

def get_embeddings_dict(data_dir):
    """Build an in-memory index of embedded text chunks for a directory.

    Every readable supported file under ``data_dir`` is chunked and embedded
    with the module-level ``encoder_pipeline``. Each chunk is stored under a
    key derived from the chunk text alone (a UUID3 in the DNS namespace), so
    a chunk whose text repeats replaces the earlier entry with the same key.

    Args:
        data_dir: Root directory containing the documents.

    Returns:
        Dict mapping chunk key -> ``{"filename", "text", "embedding"}``, where
        ``"embedding"`` is the ``ndarray_to_json`` form of the chunk's vector.
    """
    index = {}

    for file, text in iter_texts(data_dir):
        chunk_texts, chunk_vectors = encoder_pipeline(text)
        for chunk, vector in zip(chunk_texts, chunk_vectors):
            # The key depends only on the chunk text, not on the file.
            chunk_id = str(uuid.uuid3(uuid.NAMESPACE_DNS, chunk))
            index[chunk_id] = {
                "filename": file,
                "text": chunk,
                "embedding": ndarray_to_json(vector),
            }

    return index

# Root folder of the document corpus, relative to the notebook's working
# directory; it is passed to get_embeddings_dict in the next cell.
data_dir = "data_directory"

In [4]:
# Build the whole index in memory BEFORE opening the output file. Opening with
# 'w' truncates immediately, so computing inside the `with` block would leave
# an empty embeddings.json behind if reading or encoding failed part-way.
embeddings_dict = get_embeddings_dict(data_dir)

with open("embeddings.json", 'w') as f:
    json.dump(embeddings_dict, f, indent=4)

Token indices sequence length is longer than the specified maximum sequence length for this model (66268 > 512). Running this sequence through the model will result in indexing errors


Error reading file data_directory/pdfs/02_CS687.pdf: EOF marker not found
Error reading file data_directory/pdfs/01_CS687.pdf: EOF marker not found
Error reading file data_directory/pdfs/03_CS687.pdf: EOF marker not found
Error reading file data_directory/pdfs/04_CS687.pdf: EOF marker not found


In [1]:
# Shared query encoder used by query_embeddings_dict below. This cell needs
# QueryEncoderPipeline from the definitions cell above to have been run first
# in the same kernel; otherwise it raises NameError.
query_encoder_pipeline = QueryEncoderPipeline()

def query_embeddings_dict(query, embeddings_dict, top_k=10):
    """Rank stored chunks by dot-product similarity to a query.

    The query is embedded with the module-level ``query_encoder_pipeline``;
    every entry's embedding is decoded with ``ndarray_from_json`` and scored
    with ``np.dot`` against the query embedding.

    Args:
        query: The query to embed.
        embeddings_dict: Index as produced by ``get_embeddings_dict``.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` tuples ``(key, filename, text, score)``, highest score
        first.
    """
    query_vector = query_encoder_pipeline(query)
    scored = [
        (
            key,
            entry["filename"],
            entry["text"],
            np.dot(query_vector, ndarray_from_json(entry["embedding"])),
        )
        for key, entry in embeddings_dict.items()
    ]
    # Stable descending sort on the score column, then keep the best top_k.
    scored.sort(key=lambda item: item[3], reverse=True)
    return scored[:top_k]

NameError: name 'QueryEncoderPipeline' is not defined

In [None]:
from transformers import pipeline

# Question-answering pipeline used in the next cell to pull an answer out of
# each retrieved chunk; the same checkpoint is named for model and tokenizer.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2", tokenizer="deepset/roberta-base-squad2")


In [None]:
# Load the index written by the embedding cell.
with open("embeddings.json", 'r') as f:
    embeddings_dict = json.load(f)

# Retrieve the best-matching chunks for the query (default top_k of
# query_embeddings_dict).
query = "How do you formally define an MDP?"
results = query_embeddings_dict(query, embeddings_dict)

# Each result is (key, filename, text, score); run the QA model on every
# retrieved chunk and print the source file, score, answer and context.
for result in results:

    context = result[2]
    answer = qa_model(question = query, context = context)

    print(f"{result[1]} - {result[3]}")
    print(f"\tAnswer: {answer['answer']}")
    print(f"\tContext: {context}")
    print()

data_directory/pdfs/Lecture_Notes_v1.0_687_F22.pdf - 27.476634979248047
	Answer: a tuple
	Context: state of the environment is st, the agent takes action at, and the environment transitions to state st + 1, the agent receives the reward rt. this differs from some other sources wherein this reward is called rt + 1. there are many definitions of mdps used in the literature, which share common terms. in each case an mdp is a tuple. four examples are : 1. ( s, a, p, r ) 2. ( s, a, p, r, γ ) 3. ( s, a, p, r, d 0, γ ) 4. ( s, a, p, d r

data_directory/pdfs/Lecture_Notes_v1.0_687_F22.pdf - 27.319942474365234
	Answer: four common ways of defining an mdp. these different definitions
	Context: , a, r, s ′, a ′ ) later. • d0is the initial state distribution : d0 : s → [ 0, 1 ], ( 5 ) and for all s : d0 ( s ) = pr ( s0 = s ). ( 6 ) 1in the remainder of the course, we will very rarely use dr — typically we will work with r. 8 • γ∈ [ 0, 1 ] is a parameter called the reward discount parameter, and wh