In [1]:
# standard library
import json
import os  # used to read the API key / PDF path from the environment

import numpy
import pandas

# for tokenizers and reading pdf
from transformers import AutoTokenizer, AutoModel
import torch
import fitz  # PyMuPDF

# Display the variable in Markdown format
from IPython.display import Markdown, display

# for api
import requests

# to calculate similarities
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_text(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        str: the ``get_text()`` output of each page joined with no separator
        (an empty string when the document has no pages).
    """
    with fitz.open(pdf_path) as pdf:
        # join builds the result in one pass instead of re-copying a growing
        # string once per page
        return "".join(page.get_text() for page in pdf)

def search(query, embeddings, chunks, top_k=5):
    """Return the text of the chunks most similar to ``query``.

    The query is embedded with ``generate_embeddings`` and compared to every
    chunk embedding with cosine similarity.

    Args:
        query: the search string.
        embeddings: one embedding vector per chunk, in the same order as
            ``chunks``.
        chunks: the chunk texts the embeddings were computed from.
        top_k: maximum number of chunks to return (default 5). It is clamped
            to the number of available chunks, so a short document no longer
            raises ``IndexError``.

    Returns:
        str: the selected chunks, best match first, joined with newlines.
        An empty string when there are no chunks.
    """
    if len(chunks) == 0:
        return ""

    query_embedding = generate_embeddings([query])[0]
    # cosine_similarity returns a (1, n_chunks) matrix; keep its single row
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    # argsort is ascending, so reverse it to rank the best match first
    ranked = similarities.argsort()[::-1]

    limit = max(0, min(top_k, len(ranked), len(chunks)))
    return "\n".join(chunks[index] for index in ranked[:limit])

def chunk_text(text, chunk_size=1000):
    """Cut ``text`` into consecutive lower-cased pieces.

    Args:
        text: the string to split.
        chunk_size: number of characters per piece (default 1000); the last
            piece may be shorter.

    Returns:
        list[str]: the pieces in order; an empty list for empty ``text``.
    """
    starts = range(0, len(text), chunk_size)
    # lower-case each slice after cutting, so the slice boundaries are taken
    # on the original text
    return [text[start:start + chunk_size].lower() for start in starts]


def generate_embeddings(chunks):
    """Embed each chunk as the mean of the model's last-layer token vectors.

    Uses the notebook-level ``tokenizer`` and ``model`` objects, so both must
    be defined before this function is called.

    Args:
        chunks: iterable of strings; every string is encoded on its own.

    Returns:
        list: one NumPy array per chunk, in input order.
    """

    def _embed_one(piece):
        # return_tensors='pt' asks for PyTorch tensors; truncation=True cuts
        # input that is longer than the model's maximum length.
        # NOTE(review): padding=True has nothing to pad for a single string -- confirm.
        encoded = tokenizer(piece, return_tensors='pt', truncation=True, padding=True)

        # Inference only: no_grad skips gradient bookkeeping.
        with torch.no_grad():
            # **encoded unpacks the tokenizer's output into keyword arguments
            # (presumably input_ids / attention_mask for this checkpoint).
            hidden_states = model(**encoded).last_hidden_state
            # Average over dim 1 (the token axis for a (batch, tokens, hidden)
            # tensor) to get one fixed-size vector, then drop the size-1 axes
            # and convert to NumPy.
            return hidden_states.mean(dim=1).squeeze().numpy()

    return [_embed_one(piece) for piece in chunks]

In [3]:
# Location of the source PDF. Set the ALMANACK_PDF environment variable to point at
# your own copy; the original author's local path is kept as the fallback.
PDF_PATH = os.environ.get(
    "ALMANACK_PDF",
    r"E:\1My_Books\Self Help\Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf",
)

# Read the text of every page into a single string.
text = extract_text(PDF_PATH)

In [4]:
# Preview only the beginning: displaying the whole book bloats the saved notebook.
text[:1000]

'T H E  A L M A N A C K  O F  N A V A L  R A V I K A N T\nE R I C  J O RG E N S O N\nT H E  A L M A N AC K  O F  N AVA L  R AV I K A N T\nCopyright © 2020 Eric Jorgenson\nAll rights reserved.\nThe Almanack of Naval Ravikant\nA Guide to Wealth and Happiness\nISBN\t 978-1-5445-1422-2\t Hardcover\n\t\n978-1-5445-1421-5\t Paperback\n\t\n978-1-5445-1420-8\t Ebook\nThis book has been created as a public service. It is available for \nfree download in pdf and e-reader versions on Navalmanack.com. \nNaval is not earning any money on this book. Naval has essays, \npodcasts and more at Nav.al and is on Twitter @Naval.\nF O R  M Y  P A R E N T S ,  W H O  G A V E  M E \nE V E R Y T H I N G  A N D  A L W AY S  S E E M  T O \nF I N D  A  W AY  T O  G I V E  M O R E .\nCONTENTS\nIMPORTANT NOTES ON THIS BOOK (DISCLAIMER)\t\n9\nFOREWORD\t\n13\nERIC’S NOTE (ABOUT THIS BOOK)\t\n17\nTIMELINE OF NAVAL RAVIKANT\t\n21\nNOW, HERE IS NAVAL IN HIS OWN WORDS…\t\n23\nPART I: WEALTH\nBUILDING WEALTH\t\n29\nUnders

In [5]:
# Split the book into fixed-size character chunks (chunk_text's default size);
# these chunks are the units that search() later ranks and returns.
chunked = chunk_text(text)

In [6]:
# Look at the first chunk to check the split and the lower-casing done by chunk_text.
chunked[0]

't h e  a l m a n a c k  o f  n a v a l  r a v i k a n t\ne r i c  j o rg e n s o n\nt h e  a l m a n ac k  o f  n ava l  r av i k a n t\ncopyright © 2020 eric jorgenson\nall rights reserved.\nthe almanack of naval ravikant\na guide to wealth and happiness\nisbn\t 978-1-5445-1422-2\t hardcover\n\t\n978-1-5445-1421-5\t paperback\n\t\n978-1-5445-1420-8\t ebook\nthis book has been created as a public service. it is available for \nfree download in pdf and e-reader versions on navalmanack.com. \nnaval is not earning any money on this book. naval has essays, \npodcasts and more at nav.al and is on twitter @naval.\nf o r  m y  p a r e n t s ,  w h o  g a v e  m e \ne v e r y t h i n g  a n d  a l w ay s  s e e m  t o \nf i n d  a  w ay  t o  g i v e  m o r e .\ncontents\nimportant notes on this book (disclaimer)\t\n9\nforeword\t\n13\neric’s note (about this book)\t\n17\ntimeline of naval ravikant\t\n21\nnow, here is naval in his own words…\t\n23\npart i: wealth\nbuilding wealth\t\n29\nunders

In [7]:
# Load model and tokenizer
# Name of the pretrained checkpoint; the tokenizer and the model below are both loaded from it.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Define the tokenizer. A pretrained tokenizer is used to avoid unknown tokens; it is also
# useful for domain adaptation, where the tokenizer is reused for a specific application.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Define the model. `tokenizer` and `model` are notebook-level names because
# generate_embeddings reads them as globals.
model = AutoModel.from_pretrained(model_name)



In [8]:
# Embed every chunk: one vector per chunk, in the same order as `chunked`.
embeddings = generate_embeddings(chunked)

In [9]:
# Show only the first embedding: displaying every vector floods the notebook output.
embeddings[:1]

[array([-1.20201841e-01,  1.35532305e-01, -3.07220165e-02,  3.79243419e-02,
         7.14482460e-03,  3.56983766e-02,  1.92105188e-04, -1.19148032e-03,
        -7.03828186e-02,  3.48695479e-02,  5.36719430e-03,  1.66381616e-02,
         1.12470105e-01, -6.55039474e-02, -6.58925027e-02,  1.04939729e-01,
        -6.09998479e-02,  5.09396661e-03, -5.39766513e-02,  1.92475058e-02,
         5.26292659e-02,  5.46974391e-02, -3.76168154e-02, -3.73474769e-02,
         2.41039600e-02, -1.65494289e-02, -3.76126245e-02, -6.05830029e-02,
        -1.10070176e-01, -9.42764163e-05, -2.30580498e-03,  2.33674377e-01,
         6.65388554e-02, -1.15526654e-03, -5.48575521e-02,  8.41637179e-02,
         1.41858347e-02,  4.28008661e-02, -4.78527305e-04,  2.11325455e-02,
        -6.34681731e-02, -6.46213070e-02,  4.39750999e-02,  4.60367203e-02,
        -3.12809236e-02, -1.06854878e-01,  6.38612732e-02,  4.75981925e-03,
         5.80895916e-02,  4.85234559e-02, -1.23284303e-01,  2.83941533e-02,
        -1.3

#### "Embeddings are values generated for each word, and those values also depend on the data the model was trained on: a neural network is trained to learn weights which, for this specific purpose, are called embeddings."
#### So embeddings should be context-specific, i.e. chosen or trained for the specific use case.
#### The embeddings generated by the model are designed to capture semantic relationships:
#### Similar texts will have embeddings that are close together in the vector space.
#### Dissimilar texts will have embeddings that are farther apart.

In [10]:
# Gemini API key, read from the GEMINI_API_KEY environment variable so that the secret is
# never saved in the notebook. Falls back to an empty string when the variable is not set.
api = os.environ.get("GEMINI_API_KEY", "")

In [11]:
# The user's question, lower-cased (chunk_text lower-cases the chunks as well).
query = "How does naval defines good investment oppertunities?".lower()

# Words left out of the retrieval query only; the full `query` is what is sent to the LLM.
NAME_WORDS = ['author', 'naval', 'ravikant']
rag_query = " ".join(filter(lambda word: word not in NAME_WORDS, query.split()))

In [12]:
# Retrieve the chunks most similar to the query; search() joins its top matches with newlines.
rag_response = search(rag_query, embeddings, chunked)

In [13]:
# Inspect the retrieved context that is passed to the LLM as reference information.
rag_response

'turn on investment = “buy-and-hold” + valuation + \nmargin of safety\xa0[72]\n224\u2002 · \u2002 t h e  a l m a n a c k  o f  n a v a l  r a v i k a n t\nnaval’s rules (2016)\n\t\n→be present above all else.\n\t\n→desire is suffering. (buddha)\n\t\n→anger is a hot coal you hold in your hand while waiting to \nthrow it at someone else. (buddha)\n\t\n→if you can’t see yourself working with someone for life, \ndon’t work with them for a day.\n\t\n→reading (learning) is the ultimate meta-skill and can be \ntraded for anything else.\n\t\n→all the real benefits in life come from compound interest.\n\t\n→earn with your mind, not your time.\n\t\n→99 percent of all effort is wasted.\n\t\n→total honesty at all times. it’s almost always possible to be \nhonest and positive.\n\t\n→praise specifically, criticize generally. (warren buffett)\n\t\n→truth is that which has predictive power.\n\t\n→watch every thought. (ask “why am i having this thought?”)\n\t\n→all greatness comes from suffering.\n\t\n

In [14]:
# Gemini generateContent endpoint. The key travels in a request header (below) instead of
# the query string, so it is not part of `url` or of anything that prints or logs the URL.
url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent'
headers = {'Content-Type': 'application/json', 'x-goog-api-key': api}

# Request body: one prompt made of the user's query plus the retrieved reference text,
# followed by the sampling settings.
data = {
    "contents": [
        {
            "parts": [
                {
                    "text": f"""Query: {query}
Reference Information:
{rag_response}
Please generate a response based on the query and the provided reference information. 
Please do not add information from yourside. Keep it pointed on query"""
                }
            ]
        }
    ],
    "generationConfig": {
        "temperature": 0.7,
        "topK": 40,
        "topP": 0.95,
        "maxOutputTokens": 1024,
    },
}

# A timeout keeps the cell from hanging forever if the service does not answer.
response = requests.post(url, headers=headers, json=data, timeout=60)
if not response.ok:
    # Fail here with the service's own error message, instead of later with a KeyError
    # when the error payload has no 'candidates' entry.
    raise requests.HTTPError(
        f"Gemini API returned HTTP {response.status_code}: {response.text}",
        response=response,
    )

r = response.json()
r

{'candidates': [{'content': {'parts': [{'text': 'The provided text doesn\'t explicitly state how Naval defines good investment opportunities. However, it does highlight several key principles that likely inform his investment decisions:\n\n* **"Buy-and-hold" strategy:**  Naval emphasizes the power of compound interest and long-term investing. This suggests he favors investments that can generate consistent returns over extended periods. \n* **Valuation and margin of safety:** Naval\'s approach includes analyzing the intrinsic value of an investment and seeking a "margin of safety," meaning buying assets at a price significantly below their estimated worth. This helps mitigate risk.\n* **Focus on long-term games and long-term people:** Naval stresses the importance of building relationships and collaborating with individuals who share his long-term vision. This suggests he prioritizes investing in companies and ventures with strong leadership and a sustainable future.\n* **Avoiding the 

In [15]:
# Take the text of the first part of the first candidate and render it as Markdown.
first_candidate = r['candidates'][0]
answer_text = first_candidate['content']['parts'][0]['text']
display(Markdown(answer_text))

The provided text doesn't explicitly state how Naval defines good investment opportunities. However, it does highlight several key principles that likely inform his investment decisions:

* **"Buy-and-hold" strategy:**  Naval emphasizes the power of compound interest and long-term investing. This suggests he favors investments that can generate consistent returns over extended periods. 
* **Valuation and margin of safety:** Naval's approach includes analyzing the intrinsic value of an investment and seeking a "margin of safety," meaning buying assets at a price significantly below their estimated worth. This helps mitigate risk.
* **Focus on long-term games and long-term people:** Naval stresses the importance of building relationships and collaborating with individuals who share his long-term vision. This suggests he prioritizes investing in companies and ventures with strong leadership and a sustainable future.
* **Avoiding the risk of ruin:**  Naval cautions against taking unnecessary risks that could lead to financial ruin. This emphasizes his preference for investments with a lower risk profile.

While the text doesn't provide a specific definition of "good investment opportunities" in Naval's view, it highlights his focus on long-term value creation, careful analysis, risk mitigation, and building relationships with trustworthy individuals. 


### Hello