In [29]:
import os
from pathlib import Path
from typing import cast
import numpy as np
from dataclasses import dataclass
import requests

import shortuuid
import torch as pt
from dotenv import load_dotenv
from llm_utils.md import chunk_md, md_to_text
from pinecone import Pinecone, PodSpec, Index
from sentence_transformers import SentenceTransformer

In [2]:
# Load settings from a local .env file into the process environment, if one is found
# (python-dotenv). PINECONE_API_KEY is later read from os.environ.
load_dotenv()

True

In [3]:
# Pick the best available torch backend: CUDA first, then Apple MPS, else CPU.
if pt.cuda.is_available():
    device = "cuda"
elif pt.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Sentence-embedding model used for both indexing and querying.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

In [4]:
# Pinecone client. The key is read from the environment; a missing
# PINECONE_API_KEY raises KeyError here rather than failing later.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [None]:
# One-time index creation, guarded so the cell can be re-run (e.g. Restart & Run All)
# without failing once the index already exists.
INDEX_NAME = "my-notes"
spec = PodSpec(environment="gcp-starter")

if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        # Index dimension must equal the size of the vectors the model emits.
        dimension=model.get_sentence_embedding_dimension(),
        metric="cosine",
        spec=spec,
    )

In [8]:
# Handle to the "my-notes" index. cast() only narrows the static type for
# type checkers; it performs no conversion or check at runtime.
index = cast(Index, pc.Index("my-notes"))

In [None]:
# Root folder of the markdown notes, with "~" resolved to the user's home directory.
NOTES_ROOT = Path("~/OneDrive/Documents/Notes").expanduser()

In [None]:
# Which notes folder to index and the Pinecone namespace its vectors go into.
# Alternative pair for the journal: NOTES_ROOT/"Journal" with namespace "journal".
namespace = "usefulcmds"
notespath = NOTES_ROOT.joinpath("Useful Cmds")

In [None]:
# vectors = []
# for md_file in md_to_text(notespath):
#     vectors.append({
#         "id": shortuuid.uuid()[:7],
#         "values": model.encode(md_file.contents),
#         "metadata": {"filepath": md_file.filepath}
#     })

In [None]:
# Chunking parameters handed to chunk_md.
# NOTE(review): units of chunk_size / chunk_overlap (characters vs. tokens) depend on
# chunk_md's implementation - confirm.
chunk_overlap = 100
chunk_size = 500

# (markdown heading marker, label) pairs. The labels reappear as metadata keys that
# semantic_search reads back ("Header 1", "Header 2").
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]

md_docs = chunk_md(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
    headers_to_split_on=headers_to_split_on,
    in_dir=notespath,
)

In [None]:
# Embed every chunk and upload the records to Pinecone in fixed-size batches.
# NOTE: record ids are random, so re-running this cell adds new records instead of
# overwriting the ones uploaded by a previous run.
UPSERT_BATCH_SIZE = 500  # records sent per upsert request


def _upload_batch(batch: list[dict]) -> None:
    """Upsert one batch of records into the index under the current namespace."""
    print(f"Uploading {len(batch)} records.")
    index.upsert(batch, namespace=namespace)


vectors = []
for md_doc in md_docs:
    vectors.append({
        "id": shortuuid.uuid()[:7],
        # Build a fresh dict instead of writing into md_doc.metadata, so the source
        # documents are left untouched and no metadata dict is shared between records.
        "metadata": {**md_doc.metadata, "contents": md_doc.page_content},
        # Plain list of floats, matching what semantic_search sends for queries.
        "values": cast(np.ndarray, model.encode(md_doc.page_content)).tolist(),
    })
    if len(vectors) == UPSERT_BATCH_SIZE:
        _upload_batch(vectors)
        vectors = []

# Upload whatever is left over after the last full batch.
if vectors:
    _upload_batch(vectors)

In [None]:
# Display the index statistics reported by Pinecone, e.g. to check that the
# upload above landed.
index.describe_index_stats()

In [23]:
@dataclass
class SearchMatch:
    """A single search hit: the stored chunk, where it came from, and its score."""

    id: str        # record id in the index
    header_1: str  # value stored under the "Header 1" metadata key ("" if absent)
    header_2: str  # value stored under the "Header 2" metadata key ("" if absent)
    contents: str  # chunk text stored in the record metadata
    file: str      # value of the "file" metadata key
    score: float   # similarity score returned by the index

    def __repr__(self) -> str:
        """Render the match as a multi-line, human-readable block ending in a newline."""
        lines = [
            f"id: {self.id}",
            f"score: {self.score:.4f}",
            f"header_1: {self.header_1}",
            f"header_2: {self.header_2}",
            f"file: {self.file}",
            f"contents:\n{self.contents}",
        ]
        return "\n".join(lines) + "\n"

In [24]:
def semantic_search(query: str, query_ns: str, top_k: int = 5) -> list[SearchMatch]:
    """Return the stored note chunks most similar to ``query``.

    The query is embedded with the module-level ``model`` and looked up in the
    module-level Pinecone ``index``.

    Args:
        query: Free-text search query.
        query_ns: Pinecone namespace to search in.
        top_k: Maximum number of matches to request from the index (default 5).

    Returns:
        One ``SearchMatch`` per returned match, in the order the index returned them.

    Raises:
        KeyError: If a match's metadata has no ``"contents"`` or ``"file"`` entry.
    """
    query_emb = cast(np.ndarray, model.encode(query))
    response = index.query(
        namespace=query_ns,
        vector=query_emb.tolist(),
        top_k=top_k,
        include_metadata=True,
        include_values=False,
    )

    search_matches = []
    for hit in response["matches"]:
        metadata = hit["metadata"]
        search_matches.append(
            SearchMatch(
                id=hit["id"],
                # Header keys may be absent from the metadata; fall back to "".
                header_1=metadata.get("Header 1", ""),
                header_2=metadata.get("Header 2", ""),
                contents=metadata["contents"],
                file=metadata["file"],
                score=hit["score"],
            )
        )
    return search_matches

In [None]:
# Sanity check of retrieval against the "usefulcmds" namespace.
matches = semantic_search(
    query="How to create a conda environment?",
    query_ns="usefulcmds",
)
for match in matches:
    print(match)

id: SEsX7xd
score: 0.6878
header_1: Conda
header_2: Common Commands
file: /Users/avilay/OneDrive/Documents/Notes/Useful Cmds/Conda.md
contents:
Conda
Common Commands
Create a new environment:  
```
conda create -n <env name> python=<ver> <pkg> <pkg>
```  
List all existing environments:  
```
conda info --envs
```  
Remove an environment and all its packages:  
```
conda remove -n <env name> --all
```  
List all packages installed in current environment:  
```
conda list
```  
Check if a specific package has been installed or not:  
```
conda list <pkg>
```  
Delete unused packages and caches:  
```
conda clean --all
```  
Install a package:  
```

id: nyTVpmS
score: 0.6342
header_1: Conda
header_2: Conda Config
file: /Users/avilay/OneDrive/Documents/Notes/Useful Cmds/Conda.md
contents:
Conda
Conda Config
To get the current configuration run `conda info`. Any time I change any of these default configs, conda will create a `~/.condarc` file. I can see the config filename as part of the 

In [52]:
# Question for the RAG example; `query` and `matches` are reused when the prompt is built.
# NOTE(review): this searches the "journal" namespace, so the journal notes must have
# been indexed beforehand.
query = "Should I do another startup?"
matches = semantic_search(query=query, query_ns="journal")
for match in matches:
    print(match)

id: 7yfcTU8
score: 0.5876
header_1: February 2016
header_2: 2016-02-17: Startup Tactics
file: /Users/avilay/OneDrive/Documents/Notes/Journal/2016-02.md
contents:
February 2016
2016-02-17: Startup Tactics
I should have a very convincing (to myself) answer of why I am getting into this. Initially I was in it to solve cool problems. That is totally the wrong reason to start my own startup. This might be a good reason to join an early stage startup. Doing my own startup involves so much more than solving cool problems, to the point where solving the cool problem is just a small part of running a startup. This also leads me to ignore a bunch of different opportunities because the problem space is

id: PtrSsiJ
score: 0.5218
header_1: March 2012
header_2: Preparing for Startup
file: /Users/avilay/OneDrive/Documents/Notes/Journal/2012-03.md
contents:
March 2012
Preparing for Startup
There are two conflicting views that I have heard on the subject of when to leave - one is that I should have so

In [53]:
# Prompt template for retrieval-augmented generation. It has two str.format
# placeholders: {context} (the retrieved note chunks) and {query} (the question).
rag_template = """
Context information from multiple sources is below.
---------------------------------------------------
{context}
---------------------------------------------------
Given the information from multiple sources and not prior knowledge, answer the query.
Query: {query}
Answer:
"""

In [54]:
# Concatenate the text of every retrieved chunk, each followed by a blank line.
context = "".join(hit.contents + "\n\n" for hit in matches)
print(context)

February 2016
2016-02-17: Startup Tactics
I should have a very convincing (to myself) answer of why I am getting into this. Initially I was in it to solve cool problems. That is totally the wrong reason to start my own startup. This might be a good reason to join an early stage startup. Doing my own startup involves so much more than solving cool problems, to the point where solving the cool problem is just a small part of running a startup. This also leads me to ignore a bunch of different opportunities because the problem space is

March 2012
Preparing for Startup
There are two conflicting views that I have heard on the subject of when to leave - one is that I should have something up and running with paying customers before I leave, another is that the only way I can get something up and running is to leave. I think both these approaches are wrong. I should have my MVP(s) done. If it is a B2B product, have around $10,000 revenue per year. If it is B2C, have around 1000 active users.

In [55]:
# Fill the template's two placeholders with the retrieved context and the question.
prompt = rag_template.format_map({"context": context, "query": query})
print(prompt)


Context information from multiple sources is below.
---------------------------------------------------
February 2016
2016-02-17: Startup Tactics
I should have a very convincing (to myself) answer of why I am getting into this. Initially I was in it to solve cool problems. That is totally the wrong reason to start my own startup. This might be a good reason to join an early stage startup. Doing my own startup involves so much more than solving cool problems, to the point where solving the cool problem is just a small part of running a startup. This also leads me to ignore a bunch of different opportunities because the problem space is

March 2012
Preparing for Startup
There are two conflicting views that I have heard on the subject of when to leave - one is that I should have something up and running with paying customers before I leave, another is that the only way I can get something up and running is to leave. I think both these approaches are wrong. I should have my MVP(s) done. I

In [56]:
# Request a single, non-streamed completion from the local LLM server
# (the endpoint looks like Ollama's generate API - confirm).
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "llama2",
        "prompt": prompt,
        "stream": False,
    },
    # requests applies no timeout by default, so without this a stalled server would
    # block the kernel forever. (connect, read) in seconds; the read limit is generous
    # because the whole completion is generated before the response is sent.
    timeout=(5, 600),
)
resp.status_code

200

In [57]:
# The generated text is read from the "response" field of the JSON body.
print(resp.json()["response"])

Based on the information provided in the given context, it seems that you are considering starting a new startup after your previous experience with starting one. While there are various reasons to start a new startup, the main question is whether it is the right decision for you at this point in time.

After analyzing the given information, I would advise you to take the following factors into consideration before making a decision:

1. Reasoning behind starting a new startup: Reflect on your reasons for starting a new startup. Were they based on solving cool problems or was there another motivation? Identify if your reasons are still valid and if you have a clear vision for your next startup.
2. Current situation at MS: Evaluate your current job and work environment at Microsoft. Are you happy with the work you're doing, and do you see potential for growth? Consider if leaving MS to start a new venture is necessary or if you can achieve your goals while still employed.
3. End goals a