In [26]:
import re
import os
import nltk
import numpy as np
# import pandas as pd
import json
from threading import Timer
from collections import Counter
import torch
import faiss                 
from transformers import AutoModel, AutoTokenizer
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def clean(note):
    # remove zero-links
    note = re.sub(r'\[.*\]', '', note)
    # remove tags and headers
    note = re.sub(r'\#.*\n', '', note)
    # remove lines
    note = re.sub('---', ' ', note)
    # remove **
    note = re.sub('\*', '', note)
    
    return note

def clean_thought(thought):
    thought = re.sub(r'\(http\S+', '<LINK>', thought)
    thought = re.sub(r'http\S+', '<LINK>', thought)

    if thought[:2] == '- ':
        thought = thought[2:]

    if '<LINK>' in thought:
        linkless = re.sub('<LINK>', '', thought)
        linkless = re.sub('[^a-zA-Zа-яА-Я ]', '',  linkless)
        linkless = linkless.strip()
        if len(linkless.split(' ')) < 2:
            return ''
    
    return thought.strip()


def filter_thought(thought):
    if not thought:
        return False
    
    thought = str(thought)
    letters_only = re.sub('[^a-zA-Zа-яА-Я]', '',  thought)
    if len(letters_only) < 30:
        return False
    
    words_only = re.sub('[^a-zA-Zа-яА-Я ]', '',  thought)
    if len(words_only.split(' ')) < 10:
        return False
    
    return True


def find_tags(note):
    tags = re.findall("\B(\#[a-zA-Z]+(\n|\ ))", note)
    tags = [t.split(s)[0][1:] for (t, s) in tags]
    return tuple(tags)


def parse_folder(db_path, len_thr=40):
    path, folders, files = next(os.walk(db_path))

    subfolder_dbs = []
    if len(folders) > 0:
        for f in folders:
            folder_path = os.path.join(path, f)
            folder_db = parse_folder(folder_path, len_thr)
            subfolder_dbs.append(folder_db)

    db = []
    for fn in files:
        if '.md' not in fn:
            continue

        filepath = os.path.join(path, fn)
        with open(filepath, 'r') as f:
            note = f.read()

        if len(note) < len_thr:
            continue
        cleaned_note = clean(note)
        tags = find_tags(note)
        thoughts = get_thoughts(cleaned_note)
        note_dict = {'name': fn.split('.md')[0], 'path':filepath, 
                     'note':note, 'cleaned_note': cleaned_note, 
                     'thoughts': thoughts, 'tags': tags}
        db.append(note_dict)

    db = db + subfolder_dbs
    return db


def get_thoughts(note):
    thoughts = [t for thought in re.split('\n|\t', note) for t in nltk.sent_tokenize(thought)]
    cleaned_thoughts = list(map(clean_thought, thoughts))
    filtered_thoughts = list(filter(filter_thought, cleaned_thoughts))
    return filtered_thoughts


class NoteDatabase:
    def __init__(self, db_path, 
                        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
                        device='cpu',
                        save_path='../saved',
                        batch_size=32):
        os.makedirs(save_path, exist_ok=True)
        
        self.db_path, self.save_path, self.batch_size = db_path, save_path, batch_size
        self.init_model(model_name, device)
        self.parse_notes()
        self.start_timer()
    
    def parse_notes(self):
        print("### Parsing notes ###")
        loaded = parse_folder(self.db_path, len_thr=40)

        db_path = os.path.join(self.save_path, 'note_db.json')
        if os.path.exists(db_path):
            with open(db_path, 'r') as f:
                loaded_db = json.load(f)
            embeddings = np.load(os.path.join(self.save_path, 'embeddings.npy'))

            set_notes = set(n['note'] for n in self.db)
            new_notes = [n for n in loaded_db if n['note'] not in set_notes]
            
            # remove old notes that were changed
            new_paths = set([n['path'] for n in new_notes])
            changed_note_inds = {i for (i, n) in enumerate(self.db) if n['path'] in new_paths}
            self.db = [n for i, n in enumerate(self.db) if i not in changed_note_inds]
            self.embeddings = [n for i, n in enumerate(self.embeddings) if i not in changed_note_inds]


            # self.db = [n for n in self.db if n['path'] not in new_paths] 
            # self.embeddings = [e for e in self.embeddings if n['path'] not in new_paths] # remove old notes that were changed
            # unchanged_inds = [i for i, (old, new) in enumerate(zip(self.db, loaded_db)) if old['note'] == new['note']]
            # self.db = []

            # changed = []

            # parsed_paths = set(parsed.path)
            # loaded_paths = set(loaded.path)
            
            # drop deleted notes            
            self.embeddings = embeddings[loaded.path.isin(parsed_paths)]
            self.note_db = loaded[loaded.path.isin(parsed_paths)]

            new_paths = parsed_paths.difference(loaded_paths)
            if len(new_paths) > 0:
                # add new notes
                new_thoughts = parsed[~parsed.path.isin(loaded_paths)]
                new_embeddings = self.embed(list(new_thoughts.thoughts.values), self.batch_size)

                self.note_db = pd.concat((self.note_db, new_thoughts))
                self.embeddings = np.concatenate((self.embeddings, new_embeddings), axis=0)
        else:
            self.note_db = loaded
            self.embeddings = self.embed(loaded, self.batch_size)
        
        self.create_index(self.embeddings)
        self.save()
        print("### Finished parsing ###")

    def get_nearest(self, note, k):
        thoughts = get_thoughts(clean(note))
        nearest = [self.get_knn(t, k=k) for t in thoughts]
        if len(nearest) > 0:
            nearest = pd.concat(nearest)
        else: 
            nearest = self.get_knn(clean(note), k=k)
        return nearest.sort_values('distance')

    def get_knn(self, thought, k=5):
        text_embedding = self.embed([thought])

        D, I = self.index.search(text_embedding, k)
        nearest = self.note_db.iloc[I[0]].copy()
        nearest['distance'] = D[0]
        return nearest

    def create_index(self, emb_matrix):
        self.index = faiss.IndexFlatL2(self.model.config.hidden_size)
        self.index.add(emb_matrix)
  
    def embed(self, texts, batch_size=32):
        embeddings = []
        for i in range(0, len(texts), batch_size):
            text_batch = texts[i:i+batch_size]
            tokenized = self.tokenizer.batch_encode_plus(text_batch, return_tensors='pt', padding='max_length', truncation=True)
            for t in tokenized:
                tokenized[t] = tokenized[t].to(self.device)
            with torch.no_grad():
                encoded = self.model(**tokenized)
            for bn, states in enumerate(encoded.last_hidden_state):
                emb = states[tokenized['attention_mask'][bn] == 1].mean(dim=0).cpu().detach()
                embeddings.append(emb)

        return torch.vstack(embeddings)

    def init_model(self, model_name, device):
        model_path = os.path.join(self.save_path, 'model.pth')
        tokenizer_path = os.path.join(self.save_path, 'tokenizer.pth')
        if os.path.exists(model_path):
            print("### Loading existing model ###")
            self.tokenizer = torch.load(tokenizer_path)
            self.model = torch.load(model_path)
        else:
            print("### Downloading model ###")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModel.from_pretrained(model_name)
            torch.save(self.tokenizer, tokenizer_path)
            torch.save(self.model, model_path)
        self.model.eval()
        self.model.to(device)
        self.device = device

    def suggest_tags(self, text):
        drop_tags = {'', 'voice'}
        nearest = self.get_knn(text, 10)
        all_tags = ', '.join(nearest.tags.dropna()).split(', ')
        all_tags = list(filter(lambda x: x not in drop_tags, all_tags))
        suggested_tags = [t[0] for t in Counter(all_tags).most_common(4)]
        return suggested_tags
    
    def save(self):
        if not os.path.exists(self.save_path):
            os.system(f'mkdir {self.save_path}')

        self.note_db.to_csv(os.path.join(self.save_path, 'thoughts.csv'), sep=';', index=False)
        np.save(os.path.join(self.save_path, 'embeddings.npy'), self.embeddings)
    
    def start_timer(self):
        class RepeatTimer(Timer):
            def run(self):
                while not self.finished.wait(self.interval):
                    self.function(*self.args, **self.kwargs)

        self.timer = RepeatTimer(1800, self.parse_notes)
        self.timer.start()

In [6]:
tm = ThoughtManager(db_path="/home/booydar/Sync/obsidian-db/",
                    device='cuda',
                    save_path="./saved_thoughts",
                    batch_size=512
                    )

### Loading existing model ###
### Parsing notes ###
### Finished parsing ###


In [11]:
print(tm.note_db.cleaned_note.values[10])

2023-06-16 18-28-26

 
Человек смерти боится, потому что жизнь любит вот как я понимаю. Итак, природа велела. Но это подло и тут весь обман жизнь есть боль жизни. Страх и человек несчастен. Теперь всё боль и страх. Теперь человек жизнь любит, потому что боль и страх любит и так сделали жизнь теперь даётся за более страх. И тут весь обман теперь человек ещё не тот человек будет новый Человек счастливый и гордый. Кому будет всё равно жить или не жить. Тот будет новый человек. Кто победит боль и страх тот сам Бог будет. 

 


In [7]:
tm1 = ThoughtManager(db_path="/home/booydar/Sync/obsidian-db/",
                    device='cuda',
                    save_path="./saved_thoughts",
                    batch_size=512
                    )

### Loading existing model ###
### Parsing notes ###
### Finished parsing ###


In [23]:
class Database:
    def __init__(self, save_folder="./save"):
        self.save_folder = save_folder
        self.db = []


    def parse_folder(self, db_path, len_thr=40):
        path, folders, files = next(os.walk(db_path))

        subfolder_dbs = []
        if len(folders) > 0:
            for f in folders:
                folder_path = os.path.join(path, f)
                folder_db = parse_note_db(folder_path, len_thr)
                subfolder_dbs.append(folder_db)

        db = []
        for fn in files:
            if '.md' not in fn:
                continue

            filepath = os.path.join(path, fn)
            with open(filepath, 'r') as f:
                note = f.read()

            if len(note) < len_thr:
                continue
            cleaned_note = clean(note)
            tags = find_tags(note)
            thoughts = get_thoughts(cleaned_note)
            note_dict = {'name': fn.split('.md')[0], 'path':filepath, 'note':note, 'cleaned_note': cleaned_note, 'thoughts': thoughts, 'tags': tags}
            db.append(note_dict)

        db = db + subfolder_dbs
        self.db = db
        return db

In [28]:
db_path="/home/booydar/Sync/obsidian-db/"


DB = Database(save_folder="./saved_thoughts/")
db = DB.parse_folder(db_path)

In [29]:
len(db)

135

In [30]:
db[0]

{'name': 'RMT framework',
 'path': '/home/booydar/Sync/obsidian-db/RMT framework.md',
 'note': "17-08-22 16:20\n#rmt #nlp #science \n\n---\n### A vision of RMT as more than just one model\n\n**Idea**: \nwe've tried a lot of things with memory, but each one of them works in different situations. We can see RMT as a memory framework, where a user can experiment with flags based on his own task with his own models.\n\nTechnically, it means that all changes in RMT should be made in a clear way so that they don't interrupt with each other.\n\n---\n[[00 RMT]] \n\n---\n",
 'cleaned_note': "17-08-22 16:20\n\n \n\nIdea: \nwe've tried a lot of things with memory, but each one of them works in different situations. We can see RMT as a memory framework, where a user can experiment with flags based on his own task with his own models.\n\nTechnically, it means that all changes in RMT should be made in a clear way so that they don't interrupt with each other.\n\n \n \n\n \n",
 'thoughts': ["we've tri

### Parsing notes ###


Todo
- implement version checking??
- dont parse if notes are ok
- free memory after embedding

In [1]:
import ollama
def llm(query):
    response = ollama.chat(model='llama3', messages=[
    {
        'role': 'user',
        'content': query,
    },
    ])
    return response['message']['content']

In [2]:
ans = llm("Hello!")

"Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?"

In [3]:
response = ollama.chat(model='llama3', messages=[
    {
        'role': 'user',
        'content': 'Hello!',
    },
    ])

In [None]:
available_models = ['llama3', 'llava']

In [4]:
import base64

In [3]:
file_path = "/home/booydar/mpv-shot0001.jpg"
with open(file_path, 'rb') as f:
    img = f.read()
img_enc = base64.b64encode(img)

In [12]:
ollama.__version__

AttributeError: module 'ollama' has no attribute '__version__'

In [10]:
import ollama
response = ollama.chat(model='llava', messages=[
    {
        'role': 'user',
        'content': 'Whats on the image?',
        'images': [img_enc]
    },
    ])

In [11]:
response

{'model': 'llava',
 'created_at': '2024-04-25T16:50:32.524342442Z',
 'message': {'role': 'assistant',
  'content': ' The image shows a landscape with rocks and vegetation in the foreground. In the background, there appears to be a river or body of water running through a valley or canyon. The terrain is rugged with varying elevations, suggesting it might be part of a mountainous region.\n\nOn the right side of the image, there\'s an overlay of text and numerical data that seems to be related to a GPS or geographic tracking system. It includes information such as "F2.8 5041," which likely refers to the camera settings and GPS coordinates, along with "1/32s ISO: 640," which indicates exposure settings of the camera. The text also includes numbers like 2569, 5041, 4.73, 859, 1234, 0.83m, 200.00m/s, and 13.91, which might be additional data related to the GPS system or the image\'s metadata.\n\nThe specific details about what exactly is being measured or recorded with this equipment are no

In [5]:
response['message']

{'role': 'assistant',
 'content': "Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?"}

In [10]:
save_path = "/home/booydar/Desktop/_projects/tg_notebot/data/thoughts_cache"
note_db_path = "/home/booydar/Sync/obsidian-db"
model_name = "intfloat/multilingual-e5-large"
tm = ThoughtManager(note_db_path, 
                    model_name=model_name, 
                    save_path=save_path, 
                    device='cuda',
                    batch_size=64)

### Loading existing model ###


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


### Parsing notes ###
### Finished parsing ###


In [11]:
thoughts = tm.note_db.thoughts.values

In [12]:
notes = tm.note_db.cleaned_note.unique()

In [None]:
def summarize()

In [17]:
prompt = '''Summarize the context in one short sentence.\nContext: {}'''

def summarize(text, prompt=prompt):
    query = prompt.format(text)
    return llm(query)

In [18]:
notes[0]

'2022-11-30 01-48-47\n\n \nКлассно было в универе не париться о том, что поправишься, о том что будешь тупой с утра. Просто бухаешь а потом идёшь на пары.\n\n '

In [19]:
prompt = '''Summarize the context in one short sentence.\nContext: {}'''
summarize(notes[0], prompt)

"A student is expressing frustration that they didn't bother to review their notes the night before and are now having to go to class without being prepared."

In [21]:
note = notes[10]
note

'2022-12-23 13-17-12\n\n \nзабавно, что работу с обученными языковыми моделями можно сравнить с упрощением, потому что у самой модели нет никакой цели или глобального критерия правильности, кроме лоса, но вас не может нам помочь для решения задачи, на которую модель не была обучена.. это превращает inference или 0-shot модели в некоторое подобие укрощения или родео, где с помощью заправок и уговоров. Мы пытаемся заставить модели сделать то, что нам нужно.\n\n '

In [35]:
prompt = '''Context: {}\n Get 1-5 keywords for the context. Output ONLY keywords, split by a comma. Do NOT output anything else.'''
llm(prompt.format(note))

'language, models, simplification, inference, task'

In [36]:
keywords = [llm(prompt.format(note)) for note in notes[:30]]

In [37]:
for k, n in zip(keywords, notes):
    print(k, n)

university, morning, classes, studying, habits 2022-11-30 01-48-47

 
Классно было в универе не париться о том, что поправишься, о том что будешь тупой с утра. Просто бухаешь а потом идёшь на пары.

 
photography, timeline, classification, images, sorting 2022-12-24 11-16-56

 
Сайт с фотографиями, которые расположены в виде ленты. Сначала самая последняя потом по возрастанию возраста плюс, когда открываешь одну отдельную фотографию. Только ней подбираются наиболее похожи похожи стреляется базам случаев по среднему вектору, который получен с помощью обычного картона. Что-то типа разрезанная взять, а классификационная, чтобы получить распределение классов как бы.

 
death, life, fear, suffering, happiness 2023-06-16 18-28-26

 
Человек смерти боится, потому что жизнь любит вот как я понимаю. Итак, природа велела. Но это подло и тут весь обман жизнь есть боль жизни. Страх и человек несчастен. Теперь всё боль и страх. Теперь человек жизнь любит, потому что боль и страх любит и так сделали