In [43]:
import json
import urllib.request

import sqltables
import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

In [70]:
def _open_or_create(database, table_name, column_names):
    """Return table `table_name` from `database`, creating it with `column_names` if it is absent."""
    if table_name not in list(database.tables):
        return database.create_table(name=table_name, column_names=column_names)
    return database.open_table(table_name)


db = sqltables.sqlite3.Database("submissions.sqlite3")

# Source table: one row per arXiv submission.
submissions = _open_or_create(
    db, "submissions", ["arxiv_id", "date", "title", "authors", "url", "abstract"]
)
# Output table: arxiv_id plus a JSON-encoded `paper_info` payload.
specter = _open_or_create(db, "specter", ["arxiv_id", "paper_info"])

In [71]:
# Ids that already have a row in the `specter` table.
processed = {entry.arxiv_id for entry in specter}
processed

set()

In [63]:
# Number of papers tokenized and passed through the model per iteration of the embedding loop.
batch_size = 64

In [74]:
# One row per paper: `group by arxiv_id` collapses duplicate submissions
# (for the non-grouped columns SQLite returns values from one row of each group).
rows = list(submissions.view("""select * from _ group by arxiv_id"""))

# Skip papers that already have an embedding stored, so re-running this cell
# resumes where it left off instead of inserting duplicate rows. The ids are
# re-read from the table here (rather than reused from an earlier cell) so the
# check stays correct even when cells are executed out of order.
embedded_ids = {row.arxiv_id for row in specter}
pending_rows = [row for row in rows if row.arxiv_id not in embedded_ids]

tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

for i in tqdm(range(0, len(pending_rows), batch_size)):
    batch_rows = pending_rows[i:(i+batch_size)]
    # Model input is "<title><sep><abstract>" for each paper.
    title_abs = [row.title + tokenizer.sep_token + row.abstract for row in batch_rows]
    inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    # Inference only: without no_grad() every forward pass records an autograd
    # graph that is never used and only costs memory.
    with torch.no_grad():
        result = model(**inputs)
    # take the first token of each sequence as that paper's embedding
    embeddings = result.last_hidden_state[:, 0, :]
    specter_rows = [
        [row.arxiv_id, json.dumps({"embedding": {"model": "specter@local", "vector": embedding}})]
        for row, embedding in zip(batch_rows, embeddings.tolist())
    ]
    # Written once per batch, so an interrupted run keeps the batches already inserted.
    specter.insert(specter_rows)

  0%|          | 0/265 [00:00<?, ?it/s]

In [75]:
# Close the database handle opened from "submissions.sqlite3".
db.close()