In [143]:
import re
import os
import nltk
import numpy as np
# import pandas as pd
import json
from threading import Timer
from collections import Counter
import torch
import faiss                 
from transformers import AutoModel, AutoTokenizer
import ollama
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def clean(note):
    """Strip markdown markup from a raw note so that mostly prose remains.

    Removes, in this order: bracketed link text (``[[wiki links]]`` and the
    ``[label]`` part of ``[label](url)``), everything from a ``#`` up to and
    including the end of its line (tags and headers), ``---`` rules (replaced
    by a single space) and ``*`` emphasis markers.

    Args:
        note: Raw note text.

    Returns:
        The note text with the markup listed above removed.
    """
    # remove zero-links; every bracket pair is matched on its own so the text
    # between two links on the same line survives (a greedy `\[.*\]` would
    # swallow everything from the first "[" to the last "]" of the line)
    note = re.sub(r'\[\[[^\]\n]*\]\]|\[[^\]\n]*\]', '', note)
    # remove tags and headers
    note = re.sub(r'#.*\n', '', note)
    # remove lines
    note = note.replace('---', ' ')
    # remove **
    note = note.replace('*', '')

    return note

def clean_thought(thought):
    """Normalise one sentence/paragraph and discard it when it is little more than a link.

    URLs (with or without a leading "(") become the token ``<LINK>`` and a
    leading "- " bullet is removed. If the text contains a link and fewer than
    two space-separated chunks remain once links and non-letter characters are
    removed, an empty string is returned.

    Args:
        thought: Text fragment to normalise.

    Returns:
        The normalised, stripped fragment, or '' for link-only fragments.
    """
    link_token = '<LINK>'

    # parenthesised URLs go first so that the opening "(" disappears with the URL
    for url_pattern in (r'\(http\S+', r'http\S+'):
        thought = re.sub(url_pattern, link_token, thought)

    # drop a leading markdown bullet
    if thought.startswith('- '):
        thought = thought[2:]

    if link_token in thought:
        remainder = thought.replace(link_token, '')
        remainder = re.sub('[^a-zA-Zа-яА-Я ]', '', remainder).strip()
        chunk_count = len(remainder.split(' '))
        if chunk_count < 2:
            return ''

    return thought.strip()


def filter_thought(thought):
    """Decide whether a fragment is substantial enough to keep.

    A fragment passes when it is truthy, contains at least 30 Latin/Cyrillic
    letters and, after removing everything except those letters and spaces,
    splits on single spaces into at least 10 chunks.

    Args:
        thought: Candidate fragment (converted with ``str`` before checking).

    Returns:
        True if the fragment should be kept, False otherwise.
    """
    if not thought:
        return False

    text = str(thought)
    letter_count = len(re.sub('[^a-zA-Zа-яА-Я]', '', text))
    chunk_count = len(re.sub('[^a-zA-Zа-яА-Я ]', '', text).split(' '))
    return letter_count >= 30 and chunk_count >= 10


def find_tags(note):
    """Collect the hashtag-style tags that appear in a note.

    A tag is a "#" that does not directly follow a word character, followed by
    ASCII letters only, and terminated by a space, a newline or the end of the
    text. Accepting the end of the text means a tag on the last line is found
    even when the file has no trailing newline.

    Args:
        note: Raw note text.

    Returns:
        Tuple of tag names without the leading "#", in order of appearance.
    """
    # raw string: the non-raw original relied on invalid escape sequences;
    # the lookahead leaves the terminator unconsumed, so no post-processing
    # of the match is needed
    return tuple(re.findall(r'\B#([a-zA-Z]+)(?=[ \n]|\Z)', note))


def parse_folder(db_path, len_thr=40):
    """Recursively parse every markdown note below `db_path`.

    For each ``*.md`` file whose content is at least `len_thr` characters long
    a note dict is built with the keys 'name', 'path', 'note', 'cleaned_note',
    'llm_thoughts', 'sentences', 'paragraphs' and 'tags'. Building a note dict
    calls `llm_get_thoughts`, i.e. one LLM request per parsed note.

    Args:
        db_path: Folder to scan.
        len_thr: Minimum raw note length (in characters) for a note to be kept.

    Returns:
        List of note dicts: notes of this folder first, then those of its
        sub-folders. Empty if the folder does not exist or holds no notes.
    """
    try:
        path, folders, files = next(os.walk(db_path))
    except StopIteration:
        # os.walk yields nothing for a missing or unreadable directory
        return []

    db = []
    for fn in files:
        # only real markdown files; a substring test would also accept
        # names such as "note.md.bak"
        if not fn.endswith('.md'):
            continue

        filepath = os.path.join(path, fn)
        # explicit encoding: do not depend on the platform default
        with open(filepath, 'r', encoding='utf-8') as f:
            note = f.read()

        if len(note) < len_thr:
            continue

        cleaned_note = clean(note)
        db.append({'name': os.path.splitext(fn)[0], 'path': filepath,
                   'note': note, 'cleaned_note': cleaned_note,
                   'llm_thoughts': llm_get_thoughts(cleaned_note),
                   'sentences': get_sentences(cleaned_note),
                   'paragraphs': get_paragraphs(cleaned_note),
                   'tags': find_tags(note)})

    for folder in folders:
        db += parse_folder(os.path.join(path, folder), len_thr)

    return db


def get_sentences(note):
    """Split a cleaned note into sentences worth indexing.

    The note is cut at newlines and tabs, each piece is sentence-tokenised
    with NLTK, every sentence is normalised with `clean_thought` and only
    those accepted by `filter_thought` are returned.

    Args:
        note: Cleaned note text.

    Returns:
        List of normalised sentences, in document order.
    """
    kept = []
    for line in re.split('\n|\t', note):
        for sentence in nltk.sent_tokenize(line):
            normalised = clean_thought(sentence)
            if filter_thought(normalised):
                kept.append(normalised)
    return kept

def get_paragraphs(note):
    """Split a cleaned note into paragraphs worth indexing.

    Paragraphs are the chunks between blank lines ("\\n\\n"); each one is
    normalised with `clean_thought` and kept only if `filter_thought`
    accepts it.

    Args:
        note: Cleaned note text.

    Returns:
        List of normalised paragraphs, in document order.
    """
    normalised = (clean_thought(chunk) for chunk in note.split('\n\n'))
    return [paragraph for paragraph in normalised if filter_thought(paragraph)]


def llm(query, model='llama3'):
    """Send one user message to an Ollama model and return the reply text.

    Args:
        query: Full prompt, sent as a single 'user' message.
        model: Name of the Ollama model to query; defaults to 'llama3', the
            model this function used before the parameter existed.

    Returns:
        The 'content' of the message in the chat response.
    """
    response = ollama.chat(model=model,
                           messages=[{'role': 'user',
                                      'content': query}])
    return response['message']['content']

def llm_get_thoughts(text):
    """Ask the LLM for a short summary of `text` and split it into sentences.

    Only the last non-empty line of the reply is used (replies may start with
    a preamble line), and that line is split on '.'.

    Args:
        text: Note text to summarise.

    Returns:
        List of non-empty, stripped summary fragments; [] for an empty reply.
    """
    prompt = '''Summarize the following text in 2-3 sentences, formulate it very concisely. Text: {} Output only the concise summary, 2-3 sentences.'''
    query = prompt.format(text)
    ans = llm(query)

    # ignore blank lines: a reply ending in "\n" used to make the "last line"
    # empty and the whole summary was lost
    lines = [line for line in ans.split('\n') if line.strip()]
    if not lines:
        return []

    # strip before filtering so whitespace-only fragments (e.g. after the
    # final ". ") do not survive as empty strings
    fragments = (fragment.strip() for fragment in lines[-1].split('.'))
    return [fragment for fragment in fragments if fragment]

In [144]:
SEARCH_FIELDS = ['cleaned_note', 'sentences', 'paragraphs', 'llm_thoughts']
class NoteManager:
    """Parses a folder of markdown notes, embeds them and answers nearest-neighbour queries.

    State kept on the instance:
        db:    list of note dicts as returned by `parse_folder`, each extended
               with cached ``<field>_emb`` lists of embedding tensors.
        f2i:   per search field, a dict mapping a row number of that field's
               FAISS index to ``(note index in db, element index inside the field)``.
        index: per search field, a ``faiss.IndexFlatL2`` over all embeddings
               of that field (fields without any text have no entry).
    """

    def __init__(self, db_path, 
                        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
                        device='cpu',
                        save_path='../saved',
                        batch_size=32,
                        from_scratch=False):
        """Load the model and the cached DB, then parse, embed and save the notes.

        Args:
            db_path: Folder containing the markdown notes.
            model_name: Hugging Face model id used for the embeddings.
            device: Torch device the model runs on.
            save_path: Folder holding the cached database ('note_db.npy').
            batch_size: Default number of texts embedded per forward pass.
            from_scratch: If True, ignore an existing cache and start empty.
        """
        os.makedirs(save_path, exist_ok=True)

        self.db_path, self.save_path, self.batch_size = db_path, save_path, batch_size
        self.init_model(model_name, device)
        self.load_db(from_scratch)
        self.parse_notes()
        # self.start_timer()

    @staticmethod
    def _field_texts(note, field):
        """Return the texts stored under `field` as a list (a plain string becomes a one-item list)."""
        value = note.get(field)
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return list(value)
        # missing or unexpected type: contributes no rows to the index
        return []

    def parse_notes(self):
        """Re-read the note folder, merge it into the DB, rebuild the indices and save."""
        print("### Parsing notes ###")
        loaded = parse_folder(self.db_path, len_thr=40)

        self.add_notes(loaded)
        self.build_index()
        self.embed_database()
        self.save()

    def add_notes(self, notes):
        """Merge freshly parsed `notes` into `self.db`, keyed by note path.

        Unchanged notes keep their stored dict (and therefore their cached
        embeddings); notes whose raw text changed are replaced by the new
        dict; notes that are no longer present are dropped.
        Resulting order: kept notes, new notes, changed notes.
        """
        print("### Adding notes ###")
        stored = {n['path']: n for n in self.db}
        loaded = {n['path']: n for n in notes}

        new_notes = [n for path, n in loaded.items() if path not in stored]
        changed_notes = [n for path, n in loaded.items()
                         if path in stored and stored[path]['note'] != n['note']]
        changed_paths = {n['path'] for n in changed_notes}
        kept_notes = [n for path, n in stored.items()
                      if path in loaded and path not in changed_paths]

        self.db = kept_notes + new_notes + changed_notes

    def build_index(self):
        """Build `self.f2i`: for every search field, index row -> (note index, element index)."""
        print("### Buliding index ###")
        self.f2i = dict()
        for field in SEARCH_FIELDS:
            positions = [(note_ind, field_ind)
                         for note_ind, note in enumerate(self.db)
                         for field_ind in range(len(self._field_texts(note, field)))]
            self.f2i[field] = dict(enumerate(positions))

    def embed_database(self):
        """Embed every search field of every note and build one FAISS index per field.

        Embeddings are cached on the note dict under ``<field>_emb`` and reused
        as long as their count matches the number of texts in the field, which
        keeps the index rows aligned with `self.f2i`.
        """
        print("### Embedding DB ###")
        self.index = dict()
        for field in SEARCH_FIELDS:
            embeddings = []
            emb_field = f"{field}_emb"
            for note in self.db:
                texts = self._field_texts(note, field)
                emb = note.get(emb_field)
                if emb is None or len(emb) != len(texts):
                    emb = self.embed(texts)
                    note[emb_field] = emb
                embeddings += emb
            if not embeddings:
                continue

            index = faiss.IndexFlatL2(self.model.config.hidden_size)
            # FAISS works on float32 numpy arrays; convert explicitly
            index.add(torch.vstack(embeddings).float().numpy())
            self.index[field] = index

    def get_nearest(self, text, k=5, by_field='cleaned_note'):
        """Return up to `k` notes whose `by_field` content is closest to `text`.

        Args:
            text: Query text; it is passed through `clean` before embedding.
            k: Maximum number of results.
            by_field: One of SEARCH_FIELDS.

        Returns:
            Copies of the matching note dicts, each with 'nearest_field' (the
            element index inside `by_field`) and 'distance', sorted by
            ascending distance. Empty when the field has no indexed texts.

        Raises:
            KeyError: if `by_field` is not one of SEARCH_FIELDS.
        """
        if by_field not in SEARCH_FIELDS:
            raise KeyError(by_field)
        index = self.index.get(by_field)
        if index is None or index.ntotal == 0 or k <= 0:
            return []
        # asking for more neighbours than exist makes FAISS pad the result
        # with id -1, which has no entry in f2i
        k = min(k, index.ntotal)

        text_emb = self.embed([clean(text)])
        D, I = index.search(torch.stack(text_emb).float().numpy(), k)

        nearest = self.get_notes_by_field(by_field, I[0])
        for i, n in enumerate(nearest):
            n['distance'] = D[0][i]
        nearest = sorted(nearest, key=lambda n: n['distance'])
        return nearest

    def get_notes_by_field(self, by_field, inds):
        """Map index rows `inds` of `by_field` back to shallow copies of their notes.

        Each copy gets 'nearest_field', the element index inside the field.
        """
        f2i = self.f2i[by_field]
        out = []
        for i in inds:
            note_ind, field_ind = f2i[int(i)]
            o = dict(**self.db[note_ind])
            o['nearest_field'] = field_ind
            out.append(o)
        return out

    def create_index(self, emb_matrix):
        """Replace `self.index` with a single flat L2 index over `emb_matrix`.

        NOTE(review): `embed_database` and `get_nearest` treat `self.index` as
        a dict of per-field indices, so calling this breaks `get_nearest`;
        nothing in the visible code calls it.
        """
        self.index = faiss.IndexFlatL2(self.model.config.hidden_size)
        self.index.add(emb_matrix)

    def embed(self, texts, batch_size=None):
        """Embed `texts` by mean-pooling the model's last hidden state over real tokens.

        Args:
            texts: List of strings.
            batch_size: Texts per forward pass; defaults to the `batch_size`
                given to the constructor.

        Returns:
            List with one 1-D CPU tensor per input text ([] for no texts).
        """
        if batch_size is None:
            batch_size = self.batch_size

        embeddings = []
        for i in range(0, len(texts), batch_size):
            text_batch = texts[i:i+batch_size]
            # pad only to the longest text of the batch; padding positions are
            # excluded from the pooling below, so padding every batch to the
            # model maximum only costs compute
            tokenized = self.tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
            tokenized = {name: tensor.to(self.device) for name, tensor in tokenized.items()}
            with torch.no_grad():
                encoded = self.model(**tokenized)
            for bn, states in enumerate(encoded.last_hidden_state):
                token_mask = tokenized['attention_mask'][bn] == 1
                embeddings.append(states[token_mask].mean(dim=0).cpu().detach())

        return embeddings

    def init_model(self, model_name, device):
        """Load tokenizer and model, switch the model to eval mode and move it to `device`."""
        print("### Loading model ###")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        self.model.to(device)
        self.device = device

    def load_db(self, from_scratch=False):
        """Load the cached note list from '<save_path>/note_db.npy' into `self.db`.

        `self.db` is always a list afterwards; it is empty when no cache
        exists or `from_scratch` is True.
        """
        db_path = os.path.join(self.save_path, 'note_db.npy')
        if from_scratch or not os.path.exists(db_path):
            self.db = []
        else:
            # SECURITY: allow_pickle unpickles arbitrary objects; only load
            # cache files written by this program.
            self.db = np.load(db_path, allow_pickle=True).tolist()

    def save(self):
        """Write `self.db` to '<save_path>/note_db.npy' as a pickled object array."""
        os.makedirs(self.save_path, exist_ok=True)
        db_path = os.path.join(self.save_path, 'note_db.npy')
        # fill an explicit 1-D object array so each note dict is stored as one element
        packed = np.empty(len(self.db), dtype=object)
        for position, note in enumerate(self.db):
            packed[position] = note
        np.save(db_path, packed)

    def suggest_tags(self, text):
        """Suggest up to four tags: the most common tags among the 10 notes nearest to `text`.

        Empty tags and the tag 'voice' are never suggested.
        """
        drop_tags = {'', 'voice'}
        nearest = self.get_nearest(text, k=10)
        all_tags = [tag
                    for note in nearest
                    for tag in (note.get('tags') or ())
                    if tag not in drop_tags]
        suggested_tags = [tag for tag, _ in Counter(all_tags).most_common(4)]
        return suggested_tags

    def start_timer(self):
        """Start a background thread that re-runs `parse_notes` every 1800 seconds."""
        class RepeatTimer(Timer):
            def run(self):
                # keep firing once per interval until the timer is cancelled
                while not self.finished.wait(self.interval):
                    self.function(*self.args, **self.kwargs)

        self.timer = RepeatTimer(1800, self.parse_notes)
        self.timer.start()

In [145]:
# loaded = parse_folder("/home/booydar/sync/obsidian-db", len_thr=40)

In [146]:
# Build the manager over the test vault. The constructor loads the embedding
# model, loads any cached DB from save_path, then parses, embeds and saves the notes.
# NOTE(review): db_path is an absolute, machine-specific path.
tm = NoteManager(db_path="/home/booydar/sync/obsidian-db-test",
                    device='cpu',
                    save_path="./saved_thoughts",
                    batch_size=32,
                    # from_scratch=True
                    )

### Loading model ###


  return torch.load(checkpoint_file, map_location="cpu")


### Parsing notes ###
### Adding notes ###
### Buliding index ###
### Embedding DB ###


In [97]:
# Show the paragraph that matched for the second search result.
# NOTE(review): `nearest` is assigned by the search cell below this one, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
nearest[1]['paragraphs'][nearest[1]['nearest_field']]

'Actually many of retrieval methods use question or query in each segment. This requires processing segments separately and this might be the point where recurrence is stronger, along reasoning over long contexts.'

In [98]:
# Paragraph-level search: for each hit print the note name and the matching
# paragraph ('nearest_field' is the index of that paragraph inside the note).
nearest = tm.get_nearest('Retrieval does not necessarily reflect what a model needs to complete the task. ', by_field='paragraphs')
for n in nearest:
    print(n['name'], n['paragraphs'][n['nearest_field']])

Unlimiformer seminar notes Retrieval does not necessarily VERY  reflect what a model needs to complete the task. 
!
Unlimiformer seminar notes Actually many of retrieval methods use question or query in each segment. This requires processing segments separately and this might be the point where recurrence is stronger, along reasoning over long contexts.
workshop on ML&bioinf Пример
процесс сплайсинга
Кодирующие участки записаны не подряд, между ними некодирующая последовательность. От него зависит, получится ли петелечка при сплайсинге)
Простого ответа на то, какая должна быть последовательность, нет, только частично.
Unlimiformer seminar notes Another interesting thing is how robust are retrieval methods to changes in the datastore. It affects how well it holds up over time and whether the whole system needs constant re-training.
Unlimiformer seminar notes The long input is encoded in chunks and stored in a datastore. 6000 tokens -> 6000 embeddings. Then the retrieval happens on embed

In [None]:
def 

In [138]:
import ollama
def llm(query, model='llama3'):
    """Send one user message to an Ollama model and return the reply text.

    Args:
        query: Full prompt, sent as a single 'user' message.
        model: Name of the Ollama model to query; defaults to 'llama3', the
            model this function used before the parameter existed.

    Returns:
        The 'content' of the message in the chat response.
    """
    response = ollama.chat(model=model,
                           messages=[{'role': 'user',
                                      'content': query}])
    return response['message']['content']

In [114]:
text = '''08-09-22 11:08
#nlp #science 

---
## A Mathematical Framework for Transformer Circuits
[thread](https://transformer-circuits.pub/2021/framework/index.html#notation)
[YT playlist](https://www.youtube.com/playlist?list=PLoyGOS2WIonajhAVqKUgEMNmeq3nEeM51)

An attempt to interpret transformer as a combination of basic mathematical operations. In this paper only 0-2 layer **attention-only** (no MLP) transformers are studied, larger models are leaved for future work.

**Simplifications**
- MLP are hard to understand, so authors leave them out for elegance and simplicity.
- No biases
- No layer-norm
- decoder-only

### Summary of results

-   Zero layer transformers model bigram statistics. The bigram table can be accessed directly from the weights.
-   One layer attention-only transformers are an ensemble of bigram and “skip-trigram” The bigram and skip-trigram tables can be accessed directly from the weights.
-   Two layer attention-only transformers can implement much more complex algorithms using compositions of attention heads (also be detected directly from the weights).
-   One layer and two layer attention-only transformers use very different algorithms to perform in-context learning.

![[Pasted image 20220908112200.png]]

### Key takeaways
- Attention head is an individual operation outputting a result into a residual system.
- Attention is a sum of interpretable end-to-end functions
- QK and OV are largely independent
- q,k,v are intermediate results of multiplying low-rank Wq^TWk, WoWv
- Composition of heads greatly increases the expressivity
- token embedding, attention heads, MLP layers, and unembedding communicate with each other by reading and writing to different subspaces of the residual system **(?)**

# Paradigm

**Virtual weights**
Authors separate residual connection to a so-called **stream**, that serves as a communication highway for layers. It can be thought of as virtual weights that link any pair of layers. Information flows in this stream until specifically removed with an element. The bandwith of the stream is relatively low in comparison to the number of preceding neurons => hiddens are challenging to interpret, easier to find out what a single head does.

Each element (head) computes its own projection and adds it to the stream with own weight. Attention heads have low-dim projections (~64).

![[Pasted image 20220908112455.png]]

Some heads perform kind of **memory management** by writing out the negative version of some previously made changes.

![[Pasted image 20220908113642.png]]

It's important to think of attention heads as independent and additive elements that operate in parallel and add their output into the residual stream.

The fundamental action of attention heads is moving information. They gather information from stream tokens and rewrite it into another stream token. Read and Write tokens here are completely sepate from each other! So are operations performing them

### Observations
- Attention heads move informations between tokens
- Attention head is applying 2 linear operations: 
	- A=SM(qT\*k) - (non-linear one), governs information flow
	- WoWv - (linear one), which onformation is read and how to pass to destination
	- these operations act on different subspaces and act differently
- Wq & Wk, Wo & Wv always operate together => may be substituted by low-rank matrices Wov, Wqk
- q, k, v  are superficial, not really something crucial
- ![[Pasted image 20220912115534.png]]


## 0- and 1-layer Transformer
**0 - layer** just counts bi-gram statistics
**1 - layer**:

![[Pasted image 20220912115742.png]]

Rewrite using weight matrices and expand product: 
![[Pasted image 20220912130643.png]]
First term does not move informations, just updates bigram statistics
Second one links outputs and logits (kind of akin to [[gradient-based attention]] idea)

### Split second term into circuits
![[Pasted image 20220912133427.png]]

So we've got 2 independent patterns that have different purposes. It allows us e.g. to first compute all attentions, freeze them and then produce output values for all tokens. This way logits are a **linear function** of input tokens!

So we can just deal with matrices like *source*, *destination* -> *out*

**Problems**:
- large matrices for decent vocabs
- qk weights can have different scale across heads
- correlated variables

**Positives**
We can [explore](https://transformer-circuits.pub/2021/framework/head_dump/small_a.html) influence of bi-grams on output token probs by looking at maps.

**Occurring patterns**
- enormous fraction of attention capacity is copying ![[Pasted image 20220912134947.png]]
- In-context learning patterns ![[Pasted image 20220912135230.png]]
- Common phrases and constructions (e.g. `keep … [in → mind / at → bay / under → wraps]`, `difficult … not → impossible`)
- Python patterns:
	- Predicting that the python keywords `else`, `elif` and `except` are more likely after an indentation is reduced using skip-trigrams of the form: `\n\t\t\t … \n\t\t → else/elif/except` where the first part is indented N times, and the second part N-1, for various values of N, and where the whitespace can be tabs or spaces.
	- Predicting that `open()` will have a file mode string argument: `open … "," → [rb / wb / r / w]` (for example `open("abc.txt","r")`)
	- The first argument to a function is often `self`: `def … ( → self` (for example `def method_name(self):`)
- **(!)** most trigrams relate to tokenization specifics
- **(!)** many are hard to interpret without specific knowledge
- **(!)** OV and QK matrices (50k x 50k) have extremely low rank (64, 128)

### How common is copy behaviour?
Copy is generally mapping a word to itself in a different context
**Approach**: compare eigenvectors of OV matrices to the (1, 0) vector to find copying heads.

![[Pasted image 20220912140252.png]]

It appears that 10 of 12 heads are significantly copying! Well, on average. such eigenvalues are not rigorous proof.


## Two-layer Transformers
### Induction heads
### More circuits




---
[[00 NLP]]

---'''

In [135]:
def llm_get_thoughts(text):
    """Ask the LLM for a short summary of `text` and split it into sentences.

    Only the last non-empty line of the reply is used (replies may start with
    a preamble line), and that line is split on '.'.

    Args:
        text: Note text to summarise.

    Returns:
        List of non-empty, stripped summary fragments; [] for an empty reply.
    """
    # alternative prompt, kept for reference:
    # prompt = '''Find 1-3 main ideas from the following text, formulate them very concisely. Text: {} Output !!ONLY!! 1-3 sentences: concise main ideas, separated by newline. '''
    prompt = '''Summarize the following text in 2-3 sentences, formulate it very concisely. Text: {} Output only the concise summary, 2-3 sentences.'''
    query = prompt.format(text)
    ans = llm(query)

    # ignore blank lines: a reply ending in "\n" used to make the "last line"
    # empty and the whole summary was lost
    lines = [line for line in ans.split('\n') if line.strip()]
    if not lines:
        return []

    # strip each fragment (they used to keep their leading space) and drop
    # the ones that are empty after stripping
    fragments = (fragment.strip() for fragment in lines[-1].split('.'))
    return [fragment for fragment in fragments if fragment]

In [136]:
thoughts = llm_get_thoughts(text)

In [137]:
thoughts

['The paper presents a mathematical framework for understanding transformer circuits as combinations of basic mathematical operations',
 ' Specifically, it studies attention-only transformers with 0-2 layers and shows that these models can implement bigram statistics, skip-trigrams, and more complex algorithms using compositions of attention heads',
 ' The framework also reveals that attention heads move information between tokens and can be viewed as independent, additive elements operating in parallel']

In [103]:
ans = llm("Hello!")

In [104]:
ans

"Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?"

In [3]:
# Smoke test of the raw ollama chat call: one user message to 'llama3'.
response = ollama.chat(model='llama3', messages=[
    {
        'role': 'user',
        'content': 'Hello!',
    },
    ])

In [None]:
available_models = ['llama3', 'llava']

In [4]:
import base64

In [3]:
# Read a local screenshot and base64-encode it (img_enc is a bytes object).
# NOTE(review): absolute, machine-specific path.
file_path = "/home/booydar/mpv-shot0001.jpg"
with open(file_path, 'rb') as f:
    img = f.read()
img_enc = base64.b64encode(img)

In [12]:
ollama.__version__

AttributeError: module 'ollama' has no attribute '__version__'

In [10]:
import ollama
# Ask the multimodal 'llava' model about the image; img_enc is the
# base64-encoded image produced by the image-loading cell.
response = ollama.chat(model='llava', messages=[
    {
        'role': 'user',
        'content': 'Whats on the image?',
        'images': [img_enc]
    },
    ])

In [11]:
response

{'model': 'llava',
 'created_at': '2024-04-25T16:50:32.524342442Z',
 'message': {'role': 'assistant',
  'content': ' The image shows a landscape with rocks and vegetation in the foreground. In the background, there appears to be a river or body of water running through a valley or canyon. The terrain is rugged with varying elevations, suggesting it might be part of a mountainous region.\n\nOn the right side of the image, there\'s an overlay of text and numerical data that seems to be related to a GPS or geographic tracking system. It includes information such as "F2.8 5041," which likely refers to the camera settings and GPS coordinates, along with "1/32s ISO: 640," which indicates exposure settings of the camera. The text also includes numbers like 2569, 5041, 4.73, 859, 1234, 0.83m, 200.00m/s, and 13.91, which might be additional data related to the GPS system or the image\'s metadata.\n\nThe specific details about what exactly is being measured or recorded with this equipment are no

In [5]:
response['message']

{'role': 'assistant',
 'content': "Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?"}

In [10]:
# NOTE(review): `ThoughtManager` is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel; presumably it is an earlier
# version of NoteManager imported from the project - confirm.
# NOTE(review): absolute, machine-specific paths.
save_path = "/home/booydar/Desktop/_projects/tg_notebot/data/thoughts_cache"
note_db_path = "/home/booydar/Sync/obsidian-db"
model_name = "intfloat/multilingual-e5-large"
tm = ThoughtManager(note_db_path, 
                    model_name=model_name, 
                    save_path=save_path, 
                    device='cuda',
                    batch_size=64)

### Loading existing model ###


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


### Parsing notes ###
### Finished parsing ###


In [11]:
# NOTE(review): relies on a `note_db` attribute that NoteManager (defined above)
# does not set; presumably `tm` is the ThoughtManager instance here - confirm.
thoughts = tm.note_db.thoughts.values

In [12]:
# Distinct cleaned note texts; used as input for the prompt experiments below.
# NOTE(review): relies on a `note_db` attribute that NoteManager does not set - confirm the type of `tm`.
notes = tm.note_db.cleaned_note.unique()

In [None]:
def summarize()

In [17]:
prompt = '''Summarize the context in one short sentence.\nContext: {}'''

def summarize(text, prompt=prompt):
    """Return the LLM's one-sentence summary of `text`.

    Args:
        text: Text inserted into the prompt template.
        prompt: Template with one ``{}`` placeholder; the default is the value
            the module-level `prompt` had when this function was defined.

    Returns:
        Whatever `llm` returns for the formatted prompt.
    """
    return llm(prompt.format(text))

In [18]:
notes[0]

'2022-11-30 01-48-47\n\n \nКлассно было в универе не париться о том, что поправишься, о том что будешь тупой с утра. Просто бухаешь а потом идёшь на пары.\n\n '

In [19]:
# Summarise the first note; the prompt is passed explicitly, so the result does
# not depend on the default bound when `summarize` was defined.
prompt = '''Summarize the context in one short sentence.\nContext: {}'''
summarize(notes[0], prompt)

"A student is expressing frustration that they didn't bother to review their notes the night before and are now having to go to class without being prepared."

In [21]:
note = notes[10]
note

'2022-12-23 13-17-12\n\n \nзабавно, что работу с обученными языковыми моделями можно сравнить с упрощением, потому что у самой модели нет никакой цели или глобального критерия правильности, кроме лоса, но вас не может нам помочь для решения задачи, на которую модель не была обучена.. это превращает inference или 0-shot модели в некоторое подобие укрощения или родео, где с помощью заправок и уговоров. Мы пытаемся заставить модели сделать то, что нам нужно.\n\n '

In [35]:
# Keyword-extraction prompt; this rebinds the module-level `prompt` used by later cells.
prompt = '''Context: {}\n Get 1-5 keywords for the context. Output ONLY keywords, split by a comma. Do NOT output anything else.'''
llm(prompt.format(note))

'language, models, simplification, inference, task'

In [36]:
# One LLM call per note for the first 30 notes, using the current `prompt`.
keywords = [llm(prompt.format(note)) for note in notes[:30]]

In [37]:
# Print each keyword string next to its note; zip stops at the shorter
# sequence, so only the notes that have keywords are shown.
for k, n in zip(keywords, notes):
    print(k, n)

university, morning, classes, studying, habits 2022-11-30 01-48-47

 
Классно было в универе не париться о том, что поправишься, о том что будешь тупой с утра. Просто бухаешь а потом идёшь на пары.

 
photography, timeline, classification, images, sorting 2022-12-24 11-16-56

 
Сайт с фотографиями, которые расположены в виде ленты. Сначала самая последняя потом по возрастанию возраста плюс, когда открываешь одну отдельную фотографию. Только ней подбираются наиболее похожи похожи стреляется базам случаев по среднему вектору, который получен с помощью обычного картона. Что-то типа разрезанная взять, а классификационная, чтобы получить распределение классов как бы.

 
death, life, fear, suffering, happiness 2023-06-16 18-28-26

 
Человек смерти боится, потому что жизнь любит вот как я понимаю. Итак, природа велела. Но это подло и тут весь обман жизнь есть боль жизни. Страх и человек несчастен. Теперь всё боль и страх. Теперь человек жизнь любит, потому что боль и страх любит и так сделали