In [4]:
import urllib.request
import sqltables
import json
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from adapters import AutoAdapterModel

In [5]:
database_file = "or_submissions.sqlite3"
# database_file = "submissions.sqlite3"

In [6]:
db = sqltables.sqlite3.Database(database_file)
submissions = db.open_table("submissions")
    
if "specter" in list(db.tables):
    specter = db.open_table("specter")
else:
    specter = db.create_table(name="specter", column_names=["arxiv_id", "paper_info"])

In [7]:
processed = set(row.arxiv_id for row in specter)
len(processed)

0

In [8]:
batch_size = 32

In [9]:
rows = list(submissions.view("""
select _.* from _ left join specter using (arxiv_id)
where specter.arxiv_id is null
group by arxiv_id
"""))
len(rows)

3704

In [10]:
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)

BertAdapterModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

'specter2'

In [11]:
from tqdm.notebook import tqdm

In [12]:
progress = tqdm(total=len(rows))
for i in range(0, len(rows), batch_size):
    print(f"{i}/{len(rows)}")
# for i in [0]:
    batch_rows = rows[i:(i+batch_size)]
    title_abs = [row.title + tokenizer.sep_token + row.abstract for row in batch_rows]
    inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    result = model(**inputs)
    # take the first token in the batch as the embedding
    embeddings = result.last_hidden_state[:, 0, :]
    specter_rows = []
    for row, embedding in zip(batch_rows, embeddings.tolist()):
        paper_info = {"embedding": {"model": "specter2@local", "vector": embedding}}
        specter_rows.append([row.arxiv_id, json.dumps(paper_info)])
    specter.insert(specter_rows)
    progress.update(batch_size)

  0%|          | 0/3704 [00:00<?, ?it/s]

0/3704
32/3704
64/3704
96/3704
128/3704
160/3704
192/3704
224/3704
256/3704
288/3704
320/3704
352/3704
384/3704
416/3704
448/3704
480/3704
512/3704
544/3704
576/3704
608/3704
640/3704
672/3704
704/3704
736/3704
768/3704
800/3704
832/3704
864/3704
896/3704
928/3704
960/3704
992/3704
1024/3704
1056/3704
1088/3704
1120/3704
1152/3704
1184/3704
1216/3704
1248/3704
1280/3704
1312/3704
1344/3704
1376/3704
1408/3704
1440/3704
1472/3704
1504/3704
1536/3704
1568/3704
1600/3704
1632/3704
1664/3704
1696/3704
1728/3704
1760/3704
1792/3704
1824/3704
1856/3704
1888/3704
1920/3704
1952/3704
1984/3704
2016/3704
2048/3704
2080/3704
2112/3704
2144/3704
2176/3704
2208/3704
2240/3704
2272/3704
2304/3704
2336/3704
2368/3704
2400/3704
2432/3704
2464/3704
2496/3704
2528/3704
2560/3704
2592/3704
2624/3704
2656/3704
2688/3704
2720/3704
2752/3704
2784/3704
2816/3704
2848/3704
2880/3704
2912/3704
2944/3704
2976/3704
3008/3704
3040/3704
3072/3704
3104/3704
3136/3704
3168/3704
3200/3704
3232/3704
3264/3704
3296/37

In [13]:
db.close()