In [1]:
import json
import urllib.request

import sqltables
import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

In [2]:
# Open the SQLite database file that backs this notebook.
db = sqltables.sqlite3.Database("submissions.sqlite3")

# "submissions" table: create it on first use, otherwise reuse the existing one.
if "submissions" not in list(db.tables):
    submissions = db.create_table(
        name="submissions",
        column_names=["arxiv_id", "date", "title", "authors", "url", "abstract"],
    )
else:
    submissions = db.open_table("submissions")

# "specter" table (arxiv_id -> paper_info): same open-or-create handling.
if "specter" not in list(db.tables):
    specter = db.create_table(
        name="specter",
        column_names=["arxiv_id", "paper_info"],
    )
else:
    specter = db.open_table("specter")

In [3]:
# Collect the arxiv ids that already have a row in the specter table,
# then display how many there are.
processed = {entry.arxiv_id for entry in specter}
len(processed)

19291

In [4]:
# Number of submission rows tokenized and passed through the model per loop iteration.
batch_size = 64

In [5]:
# Submissions with no matching row in the specter table (left join + null check),
# grouped by arxiv_id so each id appears once.
# NOTE(review): `_` looks like sqltables' placeholder for the table the view is
# built on (here `submissions`) -- confirm against the sqltables docs.
missing_embeddings = submissions.view("""
select _.* from _ left join specter using (arxiv_id)
where specter.arxiv_id is null
group by arxiv_id
""")

# Materialize the view into a list so it can be sliced into batches, and show its size.
rows = list(missing_embeddings)
len(rows)

1322

In [6]:
# Load the pretrained SPECTER tokenizer and encoder.
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

# Embed the pending submissions batch by batch and store each result in the
# specter table as a JSON document keyed by arxiv_id.
for i in range(0, len(rows), batch_size):
    print(f"{i}/{len(rows)}")
    batch_rows = rows[i:(i+batch_size)]

    # Model input is "<title><sep><abstract>" per paper, padded to the longest
    # sequence in the batch and truncated to at most 512 tokens.
    title_abs = [row.title + tokenizer.sep_token + row.abstract for row in batch_rows]
    inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Inference only: disable autograd so no computation graph is built and
    # retained for every batch (the gradients are never used).
    with torch.no_grad():
        result = model(**inputs)

    # Use the hidden state of the first token of each sequence as that
    # paper's embedding.
    embeddings = result.last_hidden_state[:, 0, :]

    specter_rows = []
    for row, embedding in zip(batch_rows, embeddings.tolist()):
        paper_info = {"embedding": {"model": "specter@local", "vector": embedding}}
        specter_rows.append([row.arxiv_id, json.dumps(paper_info)])
    specter.insert(specter_rows)

0/1322
64/1322
128/1322
192/1322
256/1322
320/1322
384/1322
448/1322
512/1322
576/1322
640/1322
704/1322
768/1322
832/1322
896/1322
960/1322
1024/1322
1088/1322
1152/1322
1216/1322
1280/1322


In [7]:
# Close the database handle opened at the start of the notebook.
db.close()