In [3]:
import numpy as np
import pandas as pd
import torch
import os
import re

In [4]:
from transformers import AutoModel, AutoTokenizer

In [5]:
# Checkpoint identifier; the same name is reused for the tokenizer so that
# model and vocabulary come from the same checkpoint.
model_name = 'DeepPavlov/rubert-base-cased-sentence'
# Load the encoder weights for that checkpoint.
model = AutoModel.from_pretrained(model_name)

In [6]:
# Tokenizer loaded from the same checkpoint name as `model`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# Directory that is scanned for note files.
# NOTE(review): hard-coded absolute path to one user's home directory —
# must be edited before the notebook can run on another machine.
db_path = '/home/booydar/Documents/obsidian/fort-knox'

In [9]:
# Take only the first item produced by os.walk: the top-level directory itself.
# `files` therefore lists files directly inside db_path; sub-folders are named
# in `folders` but not descended into. If db_path does not exist, os.walk
# yields nothing and next() raises StopIteration.
path, folders, files = next(os.walk(db_path))

In [10]:
def get_first_header(note):
    """Return the text of a note's header line, or '' when it has none.

    A header is located by plain substring search for '## ' or '### '.
    If '### ' occurs anywhere in the note it takes precedence over '## ',
    regardless of which one appears first. The returned text runs from just
    after the first marker occurrence up to the end of that line (or up to
    the next marker occurrence, whichever comes first).
    """
    # Guard clause: '### ' contains '## ', so this single test covers both levels.
    if '## ' not in note:
        return ''

    marker = '### ' if '### ' in note else '## '

    # Text between the first and second occurrence of the marker.
    after_marker = note.split(marker)[1]

    # Keep only what sits on the header's own line.
    title, _, _ = after_marker.partition('\n')
    return title

def clean(note):
    """Strip Markdown markup from a note and flatten it to a single line.

    Steps, applied in this order:
      1. drop bracketed link text: wiki links ``[[...]]`` and the ``[...]``
         part of Markdown links;
      2. drop tags and headers: everything from a ``#`` to the end of its line;
      3. replace newlines with spaces;
      4. replace ``---`` rules with a space;
      5. drop ``*`` emphasis markers.

    Args:
        note: raw note text.

    Returns:
        The cleaned text as a single-line string.
    """
    # remove zero-links
    # Each bracket pair is matched on its own (no greedy '.*'), so prose that
    # sits between two links on the same line is preserved.
    note = re.sub(r'\[\[[^\]\n]*\]\]|\[[^\]\n]*\]', '', note)

    # remove tags and headers
    # '\Z' also catches a tag/header on the last line when the file has no
    # trailing newline.
    note = re.sub(r'#.*(?:\n|\Z)', '', note)

    # remove \n
    note = note.replace('\n', ' ')

    # remove lines
    note = note.replace('---', ' ')

    # remove **
    # Plain str.replace avoids the invalid '\*' escape in a non-raw string.
    note = note.replace('*', '')

    return note

In [11]:
# Notes whose raw text is shorter than this many characters are skipped.
length_thr = 20
# Device the token-id tensors are moved to.
device = 'cpu'
# Keyword arguments forwarded to tokenizer.encode for both headers and notes.
# Truncation plus padding to max_length gives every encoding the same length,
# which the later np.vstack over all notes relies on.
encode_kwargs = {'truncation': True, 'padding': 'max_length', 'pad_to_multiple_of': 1, 'max_length':512}


In [12]:
# Build one record per Markdown note and create the DataFrame in a single call.
# (Growing a DataFrame with pd.concat inside the loop is quadratic and leaves
# every row with index 0.)
note_columns = ['name', 'path', 'header', 'note', 'tokenized_header', 'tokenized_note']
note_records = []
for fn in files:
    # Match on the extension only: a substring test would also accept names
    # such as 'a.md.bak', for which fn[:-3] below would be wrong.
    if not fn.endswith('.md'):
        continue
    filepath = os.path.join(path, fn)
    # Explicit encoding so reading does not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        note = f.read()

    # Skip near-empty notes.
    if len(note) < length_thr:
        continue

    # Fall back to the file name (without '.md') when the note has no header.
    header = get_first_header(note)
    if not header:
        header = fn[:-3]

    cleaned_note = clean(note)

    tokenized_header = tokenizer.encode(header, **encode_kwargs)
    tokenized_note = tokenizer.encode(cleaned_note, **encode_kwargs)

    # Each token-id list is stored as a single cell value.
    note_dict = {'name': fn, 'path': filepath, 'header': header, 'note': cleaned_note,
                 'tokenized_header': tokenized_header, 'tokenized_note': tokenized_note}
    note_records.append(note_dict)

# Passing `columns` keeps the expected schema even when no note qualified.
db_df = pd.DataFrame(note_records, columns=note_columns)

In [13]:
# Stack the per-note token-id lists row-wise and build int64 tensors directly.
# torch.Tensor(...) would first create float32 values and then cast back with
# .long(); torch.tensor(..., dtype=torch.long) skips that detour.
tokenized_headers = torch.tensor(np.vstack(db_df.tokenized_header.values), dtype=torch.long).to(device)
tokenized_notes = torch.tensor(np.vstack(db_df.tokenized_note.values), dtype=torch.long).to(device)

In [14]:
# The inputs are padded to a fixed length, so padding positions must be masked
# out; without an attention mask the model attends to the pad tokens as well.
header_attention_mask = (tokenized_headers != tokenizer.pad_token_id).long()

# Inference only: no_grad avoids building an autograd graph for the batch.
with torch.no_grad():
    vectorized_headers = model(tokenized_headers, attention_mask=header_attention_mask)
# TODO: encode tokenized_notes the same way (mask + no_grad).

In [None]:
# Display the raw model output for inspection.
vectorized_headers

AttributeError: 'list' object has no attribute 'astype'

In [104]:
# Scratch check of str.join; not used by the rest of the notebook.
','.join(['1','2'])

'1,2'

In [96]:
# Inspect the most recently built note record (whatever was last assigned to note_dict).
note_dict

{'name': 'linux commands.md',
 'header': 'linux commands',
 'tokenized_header': [101, 12126, 11659, 279, 97386, 10929, 269, 102],
 'tokenized_note': [101,
  13510,
  268,
  13776,
  130,
  196,
  130,
  242,
  130,
  254,
  10918,
  16306,
  11270,
  10545,
  156,
  68433,
  153,
  156,
  10918,
  16306,
  11270,
  10545,
  156,
  68433,
  155,
  93921,
  263,
  237,
  10710,
  10829,
  14599,
  275,
  263,
  259,
  130,
  130,
  10685,
  74627,
  134,
  12328,
  134,
  21018,
  47040,
  4228,
  134,
  14322,
  65252,
  134,
  34385,
  201,
  209,
  134,
  10616,
  17317,
  11871,
  275,
  27023,
  134,
  274,
  263,
  259,
  132,
  10685,
  243,
  244,
  263,
  273,
  244,
  15225,
  10725,
  271,
  130,
  248,
  138,
  130,
  236,
  263,
  273,
  201,
  12281,
  12181,
  267,
  21185,
  73690,
  23369,
  263,
  11180,
  269,
  10037,
  11931,
  72291,
  12362,
  11180,
  139,
  132,
  11180,
  141,
  13510,
  10937,
  257,
  244,
  14335,
  19747,
  141,
  130,
  130,
  28668,
  1082

In [59]:
# Show the text currently bound to `note`, with its line breaks rendered.
print(note)

12-08-22 10:41
#nlp  #science #rmt 

---
### Do transformers need long contexts to solve long tasks

**Vasily's idea:**
Prove that no long attention is needed, instead we can somehow shorten the input so that it fits in the transformer and manages to keep the task quality still nice.

### Idea
compress long input 
(i.e. using extractive summarization techniques)
and then use a transformer to perform summarization



[analog]([https://www.dialog-21.ru/media/5514/iazykovatplusetal037.pdf](https://www.dialog-21.ru/media/5514/iazykovatplusetal037.pdf) attacking russian superGLUE:


---
[[00 NLP]]


In [85]:
# Echo `note` as a string repr.
# NOTE(review): the saved output looks already cleaned, so `note` was presumably
# reassigned in a cell that is not part of this notebook any more — confirm.
note

"12-08-22 10:41Vasily's idea:Prove that no long attention is needed, instead we can somehow shorten the input so that it fits in the transformer and manages to keep the task quality still nice.compress long input (i.e. using extractive summarization techniques)and then use a transformer to perform summarization(https://www.dialog-21.ru/media/5514/iazykovatplusetal037.pdf) attacking russian superGLUE:"