In [1]:
import openai
import numpy as np
import fasttext
import fasttext.util

# Train FastText model
def train_fasttext_model(file_path, model_path):
    """Train an unsupervised skip-gram FastText model and save it to disk.

    Args:
        file_path: Path of the training corpus handed to
            ``fasttext.train_unsupervised``.
        model_path: Destination path passed to the model's ``save_model``.

    Returns:
        The model object returned by ``fasttext.train_unsupervised``.
    """
    # NOTE(review): the learning rate is pinned to 0.0001 here; confirm this
    # very small value is intentional.
    training_options = {"model": "skipgram", "lr": 0.0001}
    trained = fasttext.train_unsupervised(file_path, **training_options)
    trained.save_model(model_path)
    return trained

def load_text(file_path):
    """Read a UTF-8 text file and return its lines.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: One entry per line, each keeping its trailing newline
        (if the file had one).
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        lines = handle.readlines()
    return lines

def embed_text(text_lines, model):
    """Embed every line with the model's sentence-vector method.

    Args:
        text_lines: Iterable of strings; each is stripped of surrounding
            whitespace before being embedded.
        model: Object exposing ``get_sentence_vector(text)``.

    Returns:
        numpy.ndarray: The per-line vectors stacked with ``np.array``, in
        input order.
    """
    vectors = []
    for raw_line in text_lines:
        cleaned = raw_line.strip()
        vectors.append(model.get_sentence_vector(cleaned))
    return np.array(vectors)

def retrieve_relevant_text(query, text_lines, embeddings, model, top_k=3):
    """Return the ``top_k`` lines that score highest against the query.

    Scoring is the raw dot product between each row of ``embeddings`` and the
    query's sentence vector (no length normalisation is applied).

    Args:
        query: Query string; surrounding whitespace is stripped before
            embedding, mirroring how corpus lines are treated in ``embed_text``.
        text_lines: Sequence of candidate lines, aligned row-for-row with
            ``embeddings``.
        embeddings: 2-D array whose i-th row is the vector of ``text_lines[i]``.
        model: Object exposing ``get_sentence_vector(text)``.
        top_k: Maximum number of lines to return. Values larger than the
            corpus return every line; values <= 0 return nothing.

    Returns:
        str: The selected lines, best match first, joined with ``"\\n"``.
        Empty string when there is nothing to return.
    """
    # ``scores[-0:]`` is the *whole* array, so without this guard top_k=0
    # would return every line instead of none; an empty corpus would make
    # the dot product below fail.
    if top_k <= 0 or len(text_lines) == 0:
        return ""

    query_embedding = model.get_sentence_vector(query.strip())
    similarities = np.dot(embeddings, query_embedding)
    # argsort is ascending: take the last top_k entries and flip them so the
    # best match comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return "\n".join(text_lines[i] for i in top_indices)

def generate_response(prompt, context):
    """Ask the chat model to answer ``prompt`` using the retrieved ``context``.

    Args:
        prompt: The user's question.
        context: Retrieved text that is placed ahead of the question in the
            user message.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    # NOTE(review): this uses the module-level ``openai.ChatCompletion``
    # interface; confirm it matches the installed openai package version.
    system_turn = {
        "role": "system",
        "content": "You are an AI assistant using retrieval-augmented generation.",
    }
    user_turn = {
        "role": "user",
        "content": f"Context: {context}\n\nUser question: {prompt}",
    }
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[system_turn, user_turn],
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

# Driver: load the corpus into `text_lines`. The name is assigned at module
# (kernel) level, so later cells can read it.
if __name__ == "__main__":
    file_path = "./output/full_text.txt"
    # One string per line of the file; trailing newlines are kept.
    text_lines = load_text(file_path)
    
    # The FastText retrieval + generation pipeline below is currently disabled.
    # fasttext_model = train_fasttext_model(file_path, 'fasttext_model.bin')

    # embeddings = embed_text(text_lines, fasttext_model)
    
    # user_prompt = input("Enter your question: ")
    # relevant_text = retrieve_relevant_text(user_prompt, text_lines, embeddings, fasttext_model)
    # response = generate_response(user_prompt, relevant_text)
    
    # print("\nAI Response:", response)


In [None]:
import re
from collections import defaultdict

def get_stats(vocab):
    """
    Count adjacent symbol pairs across a vocabulary, weighted by frequency.

    Args:
        vocab: Mapping from a space-separated symbol sequence (one "word")
            to its frequency count.

    Returns:
        defaultdict(int) mapping ``(left_symbol, right_symbol)`` tuples to the
        summed frequency of every word in which that adjacent pair occurs
        (once per occurrence).
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """
    Fuse every occurrence of a symbol pair into a single symbol.

    Args:
        pair: Two-element sequence of symbols to merge.
        v_in: Mapping from space-separated symbol sequences to frequencies.

    Returns:
        A new dict with the same frequencies, where each key has had the
        space between the two symbols of ``pair`` removed wherever the pair
        appears as two whole, adjacent symbols.
    """
    merged_symbol = ''.join(pair)
    spaced_pair = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or string edge) on both sides, so
    # only complete symbols match - never a fragment of a longer symbol.
    pattern = re.compile(r'(?<!\S)' + spaced_pair + r'(?!\S)')
    return {
        pattern.sub(merged_symbol, word): freq
        for word, freq in v_in.items()
    }

def get_vocab(data):
    """
    Build the initial character-level vocabulary from a list of strings.

    Args:
        data: Iterable of strings; each is split on whitespace into words.

    Returns:
        defaultdict(int) mapping each word, rewritten as its characters
        separated by single spaces (e.g. ``"low"`` -> ``"l o w"``), to the
        number of times the word occurs in ``data``.
    """
    word_counts = defaultdict(int)
    tokens = (token for sentence in data for token in sentence.split())
    for token in tokens:
        spaced = ' '.join(token)
        word_counts[spaced] += 1
    return word_counts

def byte_pair_encoding(data, n):
    """
    Run up to ``n`` byte-pair-encoding merge steps over the input data.

    Each step counts adjacent symbol pairs in the current vocabulary, picks
    the most frequent pair, and merges it everywhere it occurs.

    Args:
        data: List of strings used to build the initial character-level
            vocabulary.
        n: Maximum number of merge steps to perform.

    Returns:
        The vocabulary after merging: a mapping from space-separated symbol
        sequences to word frequencies (not a list of the merged pairs).
    """
    vocab = get_vocab(data)
    for _ in range(n):
        pairs = get_stats(vocab)
        if not pairs:
            # Every word is already a single symbol, so nothing is left to
            # merge; calling max() on an empty mapping would raise ValueError.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

# Example usage:
# `text_lines` is the list assigned by the corpus-loading cell; only its first
# line is used as the BPE corpus here.
corpus = text_lines[0]
# Lower-case the text and split it on '.' to get the list of strings that
# byte_pair_encoding expects.
data = corpus.lower().split('.')

# Number of merge steps requested.
n = 230
bpe_pairs = byte_pair_encoding(data, n)
# Despite its name, `bpe_pairs` holds the vocabulary returned by
# byte_pair_encoding, so this displays the number of vocabulary entries.
len(bpe_pairs)


# np.unique(text_lines[0].split(" ")).shape


defaultdict(<class 'int'>, {'p r o v i d e d': 1, 'p r o p e r': 1, 'a t t r i b u t i o n': 1, 'i s': 54, 'p r o v i d e d ,': 1, 'g o o g l e': 9, 'h e r e b y': 1, 'g r a n t s': 1, 'p e r m i s s i o n': 1, 't o': 117, 'r e p r o d u c e': 1, 't h e': 332, 't a b l e s': 1, 'a n d': 177, 'f i g u r e s': 1, 'i n': 139, 't h i s': 42, 'p a p e r': 1, 's o l e l y': 2, 'f o r': 51, 'u s e': 10, 'j o u r n a l i s t i c': 1, 'o r': 13, 's c h o l a r l y': 1, 'w o r k s': 1, 'a t t e n t i o n': 57, 'a l l': 20, 'y o u': 1, 'n e e d': 2, 'a s h i s h': 1, 'v a s w a n i ∗': 1, 'b r a i n': 4, 'a v a s w a n i @ g o o g l e': 1, 'c o m': 7, 'n o a m': 4, 's h a z e e r ∗': 1, 'n o a m @ g o o g l e': 1, 'n i k i': 2, 'p a r m a r ∗': 1, 'r e s e a r c h': 6, 'n i k i p @ g o o g l e': 1, 'j a k o b': 3, 'u s z k o r e i t ∗': 1, 'u s z @ g o o g l e': 1, 'l l i o n': 2, 'j o n e s ∗': 1, 'l l i o n @ g o o g l e': 1, 'a i d a n': 2, 'n': 13, 'g o m e z ∗ †': 1, 'u n i v e r s i t y': 1

1926

In [14]:
# Display the vocabulary returned by byte_pair_encoding (symbol sequence -> frequency).
bpe_pairs

{'pro v id ed': 1,
 'pro per': 1,
 'at tr i b ution': 1,
 'is': 54,
 'pro v id ed ,': 1,
 'go og le': 9,
 'h ere by': 1,
 'g r an ts': 1,
 'per m is sion': 1,
 'to': 117,
 're produc e': 1,
 'the': 332,
 't able s': 1,
 'and': 177,
 'f ig ure s': 1,
 'in': 139,
 'this': 42,
 'pa per': 1,
 'so le ly': 2,
 'for': 51,
 'u se': 10,
 'j our n al is ti c': 1,
 'or': 13,
 's ch ol ar ly': 1,
 'work s': 1,
 'attention': 57,
 'all': 20,
 'y o u': 1,
 'ne ed': 2,
 'as h is h': 1,
 'v as w an i ∗': 1,
 'b r ain': 4,
 'a v as w an i @ go og le': 1,
 'co m': 7,
 'n o am': 4,
 'sh a z e er ∗': 1,
 'n o am @ go og le': 1,
 'n i k i': 2,
 'par m ar ∗': 1,
 're se arch': 6,
 'n i k i p @ go og le': 1,
 'j a k o b': 3,
 'us z k o re it ∗': 1,
 'us z @ go og le': 1,
 'l li on': 2,
 'j on es ∗': 1,
 'l li on @ go og le': 1,
 'a id an': 2,
 'n': 13,
 'go me z ∗ †': 1,
 'un i ver si t y': 1,
 'of': 180,
 't or on to': 2,
 'a id an @ c s': 1,
 'ed u': 1,
 'ł u k as z': 3,
 'k a is er ∗': 1,
 'l u k as z k a 

In [21]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
import faiss
import fitz
import numpy as np

class RAGSystem:
    """Retrieval helper: PDF text blocks -> sentence embeddings -> FAISS search."""

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """
        Initialize the RAG system with embedding model and tokenizer

        Args:
            model_name (str): Hugging Face model for creating embeddings
        """
        # Load embedding model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Ensure model is in evaluation mode
        self.model.eval()

        # Initialize vector database attributes
        self.passages = []
        self.embeddings = None
        self.faiss_index = None

    def mean_pooling(self, model_output, attention_mask):
        """
        Average the token embeddings of each sequence, ignoring masked positions

        Args:
            model_output: Model output whose first element holds the token
                embeddings
            attention_mask (torch.Tensor): Attention mask for the input; positions
                with value 0 do not contribute to the average

        Returns:
            torch.Tensor: Pooled sentence embeddings
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked row divides by a tiny number rather than zero
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def create_embeddings(self, texts, batch_size=32):
        """
        Create L2-normalized embeddings for given texts

        Texts are pushed through the model in batches so that a long list of
        passages is not tokenized and embedded in one single forward pass.

        Args:
            texts (list | str): List of text passages (a single string is
                treated as a one-element list)
            batch_size (int): Number of texts embedded per forward pass

        Returns:
            numpy.ndarray: Embedding matrix, one row per input text

        Raises:
            ValueError: If ``texts`` is empty or ``batch_size`` is < 1
        """
        if isinstance(texts, str):
            # Slicing a bare string below would split it into characters
            texts = [texts]
        texts = list(texts)
        if not texts:
            raise ValueError("texts must contain at least one passage.")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        batches = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # Tokenize texts
            encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

            # Compute token embeddings
            with torch.no_grad():
                model_output = self.model(**encoded_input)

            # Perform mean pooling
            sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

            # Normalize embeddings
            batches.append(torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1))

        return torch.cat(batches, dim=0).numpy()

    def load_pdf(self, file_path):
        """
        Load text passages from a PDF, one passage per text block

        Replaces ``self.passages`` and discards any previously built
        embeddings / index, since they would no longer match the passages.

        Args:
            file_path (str): Path to the PDF file
        """
        doc = fitz.open(file_path)
        try:
            content = []

            # Extract text from each page
            for page in doc:
                for block in page.get_text("blocks"):
                    # Check if block is text (not image)
                    if block[6] == 0:  # 0 indicates text block
                        content.append(block[4])
        finally:
            # Release the file handle even if extraction fails
            doc.close()

        self.passages = content
        self.embeddings = None
        self.faiss_index = None

    def build_vector_database(self):
        """
        Build FAISS vector database for efficient similarity search

        Raises:
            ValueError: If no passages have been loaded
        """
        if not self.passages:
            raise ValueError("No passages loaded. Call load_pdf() first.")

        # Create embeddings for passages (FAISS needs contiguous float32)
        self.embeddings = np.ascontiguousarray(
            self.create_embeddings(self.passages), dtype=np.float32
        )

        # Create FAISS index
        dimension = self.embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(self.embeddings)
        self.faiss_index = index

    def retrieve_top_k_passages(self, query, k=5):
        """
        Retrieve top K similar passages for a given query

        Args:
            query (str): Search query
            k (int): Number of top passages to retrieve; capped at the number
                of loaded passages

        Returns:
            list: Up to K most similar passages, best match first

        Raises:
            ValueError: If the vector database has not been built
        """
        if self.faiss_index is None:
            raise ValueError("Vector database not built. Call build_vector_database() first.")

        k = min(k, len(self.passages))
        if k <= 0:
            return []

        # Create query embedding
        query_embedding = np.ascontiguousarray(
            self.create_embeddings([query]), dtype=np.float32
        )

        # Perform similarity search
        _, indices = self.faiss_index.search(query_embedding, k)

        # FAISS pads missing results with -1, which would otherwise silently
        # index the *last* passage; drop those entries.
        return [self.passages[idx] for idx in indices[0] if idx >= 0]

# Initialize RAG system (uses the default embedding model named in RAGSystem.__init__)
rag_system = RAGSystem()

# Load passages from the PDF; each extracted text block becomes one passage
rag_system.load_pdf('attention_is_all_you_need.pdf')

# Prints the complete list of passages - the output is very long.
print(rag_system.passages)
# Build vector database (embeds the loaded passages)
rag_system.build_vector_database()

# # Example query
# query = "What is the main topic of this document?"
# top_passages = rag_system.retrieve_top_k_passages(query, k=3)

# print("Query:", query)
# print("\nTop Similar Passages:")
# for i, passage in enumerate(top_passages, 1):
#     print(f"\nPassage {i}:\n{passage}")


['Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\n', 'Attention Is All You Need\n', 'Ashish Vaswani∗\nGoogle Brain\navaswani@google.com\n', 'Noam Shazeer∗\nGoogle Brain\nnoam@google.com\n', 'Niki Parmar∗\nGoogle Research\nnikip@google.com\n', 'Jakob Uszkoreit∗\nGoogle Research\nusz@google.com\n', 'Llion Jones∗\nGoogle Research\nllion@google.com\n', 'Aidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\n', 'Łukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\n', 'Illia Polosukhin∗‡\nillia.polosukhin@gmail.com\n', 'Abstract\n', 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on att

In [6]:
# Length of the first loaded passage.
# NOTE(review): this cell's execution count is lower than that of the cell defining
# RAGSystem, so its recorded output may come from an earlier run - re-run to confirm.
print(len(rag_system.passages[0]))


39495
