In [1]:
import numpy as np
import pandas as pd
import torch
import os
import re

In [2]:
from transformers import AutoModel, AutoTokenizer

In [3]:
# Hugging Face checkpoint used to embed the notes. Alternative checkpoints are
# listed below, commented out; uncomment one (and comment the active line) to switch.
# model_name = 'DeepPavlov/rubert-base-cased-sentence'
# model_name = 'DeepPavlov/bert-base-multilingual-cased-sentence'
# model_name = 'DeepPavlov/distilrubert-small-cased-conversational'
model_name = 'DeepPavlov/distilrubert-tiny-cased-conversational'
# Load the encoder and its tokenizer from the same checkpoint name.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading: 100%|██████████| 538/538 [00:00<00:00, 131kB/s]
Downloading: 100%|██████████| 409M/409M [01:13<00:00, 5.86MB/s] 
Some weights of the model checkpoint at DeepPavlov/distilrubert-tiny-cased-conversational were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 24.0/24.0 [00:00<00:00, 14.9kB/s]
Downloading: 100%|██████████| 1.34M/1.34M [00:01<00:0

In [4]:
def get_first_header(note):
    """Return the text of the first markdown header of level two or deeper.

    A header is a line that starts (after at most three spaces) with two or
    more ``#`` characters followed by whitespace, e.g. ``## Title`` or
    ``### Title``. Level-one headers (``# Title``) are not considered.
    The header that appears earliest in the note wins, whatever its level.

    Args:
        note: Raw markdown text of a note.

    Returns:
        The header text with the ``#`` marker and surrounding whitespace
        removed, or ``''`` when the note contains no such header.
    """
    # MULTILINE makes ^/$ match at every line boundary, so only real header
    # lines are picked up and "## " in the middle of a sentence is ignored.
    match = re.search(r'^[ \t]{0,3}#{2,}[ \t]+(.*)$', note, flags=re.MULTILINE)
    if match is None:
        return ''
    return match.group(1).strip()

def clean(note):
    """Strip markdown markup from a note and flatten it onto a single line.

    Removes, in this order: bracketed links, headers/tags (everything from a
    ``#`` to the end of its line), line breaks, ``---`` rules and ``*``
    emphasis markers.

    Args:
        note: Raw markdown text of a note.

    Returns:
        The note text with the markup above removed; line breaks and rules
        are replaced by single spaces.
    """
    # Drop wiki-links ([[target]]) and bracketed link text ([text]). The
    # quantifiers are non-greedy so that plain text sitting between two links
    # on the same line is kept instead of being swallowed with them.
    note = re.sub(r'\[\[.*?\]\]|\[.*?\]', '', note)

    # Drop headers and tags: from '#' to the end of the line. The newline is
    # optional so that a header on the last line (no trailing '\n') also goes.
    note = re.sub(r'#.*\n?', '', note)

    # Flatten the remaining line breaks, then horizontal rules, into spaces.
    note = note.replace('\n', ' ')
    note = note.replace('---', ' ')

    # Remove emphasis markers (* and **); plain str.replace avoids the invalid
    # '\*' escape sequence a non-raw regex string would need.
    return note.replace('*', '')

In [5]:
# Notes whose raw text is shorter than this many characters are skipped.
length_thr = 20

# Torch device the token-id tensors are moved to.
device = 'cpu'

# Keyword arguments forwarded to tokenizer.encode for headers and notes alike.
encode_kwargs = dict(
    truncation=True,
    padding='max_length',
    pad_to_multiple_of=1,
    max_length=512,
)


In [6]:
# Root folder of the notes to index. Set the OBSIDIAN_VAULT_PATH environment
# variable to point the notebook at another folder without editing this cell;
# the previous hard-coded location remains the default.
db_path = os.environ.get('OBSIDIAN_VAULT_PATH', '/home/booydar/Documents/obsidian/fort-knox')


def _reraise(error):
    """os.walk error callback: surface the underlying OSError instead of hiding it."""
    raise error


def _collect_note_records(path):
    """Recursively gather one record dict per markdown note found under ``path``."""
    # Without onerror, os.walk swallows listing failures and yields nothing,
    # which would surface here as an unexplained StopIteration.
    root, folders, files = next(os.walk(path, onerror=_reraise))

    records = []

    # Sub-folders first, then this folder's own files. Both are sorted so the
    # row order does not depend on the file system's listing order.
    for folder in sorted(folders):
        records.extend(_collect_note_records(os.path.join(root, folder)))

    for fn in sorted(files):
        # Match the extension only; a substring test would also accept
        # names such as 'x.mdx' or 'x.md.bak'.
        if not fn.endswith('.md'):
            continue

        filepath = os.path.join(root, fn)
        # Explicit encoding so reading does not depend on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as f:
            note = f.read()

        if len(note) < length_thr:
            continue

        # Fall back to the file name without its '.md' suffix.
        header = get_first_header(note) or fn[:-3]
        cleaned_note = clean(note)

        records.append({
            'name': fn,
            'path': filepath,
            'header': header,
            'note': cleaned_note,
            'tokenized_header': tokenizer.encode(header, **encode_kwargs),
            'tokenized_note': tokenizer.encode(cleaned_note, **encode_kwargs),
        })

    return records


def parse_folder(path):
    """Recursively read every ``.md`` note under ``path`` into a DataFrame.

    Notes shorter than the module-level ``length_thr`` are skipped. Each
    remaining note is cleaned with ``clean`` and both its header and its
    cleaned text are tokenized with the module-level ``tokenizer`` and
    ``encode_kwargs``.

    Args:
        path: Directory to scan; sub-directories are visited recursively.

    Returns:
        A DataFrame with one row per note, a 0..n-1 index and the columns
        ``name``, ``path``, ``header``, ``note``, ``tokenized_header`` and
        ``tokenized_note`` (the last two hold lists of token ids). The
        columns are present even when no note was found.

    Raises:
        OSError: If ``path`` or one of its sub-directories cannot be listed.
    """
    columns = ['name', 'path', 'header', 'note', 'tokenized_header', 'tokenized_note']
    # Build the frame once from all records instead of concatenating one-row
    # frames inside the loop, which is quadratic and leaves a duplicated index.
    return pd.DataFrame(_collect_note_records(path), columns=columns)


In [7]:
# Recursively index every note under db_path into one DataFrame (one row per note).
db_df = parse_folder(db_path)

In [8]:
# Stack the per-note token-id lists into integer tensors, one row per note.
# Building them as int64 directly avoids the float32 round trip that
# torch.Tensor(...).long() goes through.
tokenized_headers = torch.tensor(np.vstack(db_df.tokenized_header.values), dtype=torch.long, device=device)
tokenized_notes = torch.tensor(np.vstack(db_df.tokenized_note.values), dtype=torch.long, device=device)

# The sequences are padded, so tell the model which positions are padding;
# without a mask the pad tokens are attended to and leak into every embedding.
header_attention_mask = (tokenized_headers != tokenizer.pad_token_id).long()
note_attention_mask = (tokenized_notes != tokenizer.pad_token_id).long()

# Keep the model on the same device as its inputs.
model.to(device)

# Inference only: skip autograd bookkeeping to save memory.
# NOTE(review): all notes go through the model as a single batch; split into
# mini-batches if the collection grows too large for memory.
with torch.no_grad():
    vectorized_headers = model(tokenized_headers, attention_mask=header_attention_mask)
    vectorized_notes = model(tokenized_notes, attention_mask=note_attention_mask)

In [9]:
num_notes = db_df.shape[0]
# Alternative pooling strategies, kept for reference:
# header_embeddings = vectorized_headers.last_hidden_state.reshape(num_notes, -1)
# header_embeddings = vectorized_headers.last_hidden_state.mean(dim=-2)

# Use the hidden state at sequence position 0 as the embedding of each text.
# .cpu() makes the conversion to NumPy work when `device` is not the CPU too.
header_embeddings = vectorized_headers.last_hidden_state[:, 0, :].detach().cpu().numpy()
note_embeddings = vectorized_notes.last_hidden_state[:, 0, :].detach().cpu().numpy()

In [10]:
from sklearn.cluster import KMeans

### headers

In [45]:
# Cluster the header embeddings. The cluster count is capped at the number of
# notes because KMeans refuses to fit more clusters than samples.
n_header_clusters = min(10, len(header_embeddings))
# random_state pins the centroid initialisation so reruns give the same
# clusters; n_init is explicit because its default differs between
# scikit-learn versions.
cluster = KMeans(n_clusters=n_header_clusters, n_init=10, random_state=0)
# fit_predict fits the estimator and returns the label of every sample.
header_clusters = cluster.fit_predict(header_embeddings)
headers = db_df.header.values

In [48]:
# Print the headers that landed in each cluster, one cluster at a time.
for cluster_id in np.unique(header_clusters):
    in_cluster = header_clusters == cluster_id
    members = headers[in_cluster]
    print(f'Cluster {cluster_id}\n{members}\n\n')

### notes

In [11]:
# Cleaned note texts as a NumPy array, so they can be selected with a boolean cluster mask.
notes = db_df.note.values

In [2]:
# Cluster the note embeddings. The cluster count is capped at the number of
# notes because KMeans refuses to fit more clusters than samples.
n_note_clusters = min(10, len(note_embeddings))
# random_state pins the centroid initialisation so reruns give the same
# clusters; n_init is explicit because its default differs between
# scikit-learn versions.
cluster = KMeans(n_clusters=n_note_clusters, n_init=10, random_state=0)
# fit_predict fits the estimator and returns the label of every sample.
note_clusters = cluster.fit_predict(note_embeddings)

# Print the notes that landed in each cluster, one cluster at a time.
for c in np.unique(note_clusters):
    group = notes[note_clusters == c]
    print(f'Cluster {c}\n{group}\n\n')