# arXiv Paper Embedding

## On a Single GPU
This notebook utilizes an NVIDIA T4 on Saturn Cloud.

In [None]:
%%capture
!pip install sentence_transformers

In [None]:
%%capture
!pip install semanticscholar

In [None]:
import cudf
import pandas as pd
import json
import os
import re
from tqdm import tqdm
import string
import pickle
import src.scholar_citations as sch
import src.vectors as vect

# Location of the arXiv metadata snapshot read by this notebook.
DATA_PATH = "arxiv-metadata-oai-snapshot.json"
# Four-digit year in the range 1900-2099, captured as group 1.
# The century prefixes are wrapped in a non-capturing group so the
# alternation cannot match a bare "19" (which `19|20[0-9]{2}` did).
YEAR_PATTERN = r"((?:19|20)[0-9]{2})"

In [None]:
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides pandas warnings raised by the column assignments further down.
warnings.filterwarnings('ignore')
# Enable tqdm's pandas integration (registers the progress_* methods).
tqdm.pandas()

## Step 1: Data preprocessing
Before we do anything else, we need to load the papers dataset, do some basic cleaning, and get it into a workable format. Below,
we will use cuDF to house the data and apply some transformations in a generator, loading from file.

In [None]:
def clean_description(description: str) -> str:
    """Normalise free text into lowercase, ASCII-only, punctuation-free words.

    Steps, in order: drop non-ASCII characters, replace punctuation with
    spaces, collapse whitespace runs, replace newlines with spaces, insert a
    space before every capital letter, collapse whitespace again, lowercase.

    Args:
        description: Raw text. Empty strings, ``None`` and non-string
            missing markers (such as the float NaN pandas uses) are treated
            as "no text".

    Returns:
        The cleaned text, or ``""`` when there is nothing to clean. The
        result is not stripped, so it may start or end with a single space.
    """
    # A NaN from pandas is truthy and has no .encode(), so check the type
    # instead of relying on truthiness alone.
    if not isinstance(description, str) or not description:
        return ""

    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing (raw string: "\s" is not a valid str escape)
    description = re.sub(r'\s{2,}', " ", description)

    # remove urls (disabled)
    #description = re.sub(r"https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers (disabled)
    #description = re.sub(r'\w*\d+\w*', '', description)

    # split on capitalized words; the zero-width lookahead splits *before*
    # each capital, so "deepLearning" becomes "deep Learning" and an
    # acronym such as "GPU" becomes "G P U"
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub(r'\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

In [None]:
# Load the raw arXiv metadata through the project loader and materialise it
# as a pandas DataFrame. The path comes from the DATA_PATH constant defined
# with the imports instead of a second hard-coded copy of the file name.
data = vect.load_raw_data(DATA_PATH)
data = pd.DataFrame.from_records(data)

In [None]:
# Lowercase the category strings so the match below is case-insensitive.
data["categories"] = data["categories"].str.lower()

# arXiv categories kept for this notebook.
ml_categories = ("cs.lg", "cs.ai", "cs.cl", "cs.cv", "cs.ne")

# Match whole, whitespace-delimited category tokens only:
#  * re.escape makes "." literal instead of "any character";
#  * the lookarounds stop substring hits such as "cs.cl" inside
#    "physics.class-ph".
category_pattern = r"(?<!\S)(?:%s)(?!\S)" % "|".join(
    re.escape(category) for category in ml_categories
)

# na=False treats rows with a missing category string as non-matching
# rather than producing NaN in the boolean mask.
condition = data["categories"].str.contains(category_pattern, regex=True, na=False)

# Copy so the column assignments in later cells write to an independent
# frame rather than to a slice of `data`.
data_ds = data[condition].copy()

In [None]:
def get_categories(cat_str: str):
    """Turn a space-separated category string into a comma-separated one.

    Empty pieces produced by leading, trailing or repeated spaces are
    dropped. Only the space character is a separator; tabs and newlines
    stay inside their token.
    """
    non_empty_tokens = filter(None, cat_str.split(" "))
    return ", ".join(non_empty_tokens)
# Rewrite every row's category string in the comma-separated form.
data_ds['categories'] = data_ds['categories'].map(get_categories)

In [None]:
# Hand the frame to the project helper and keep what it returns. Later cells
# read a `year` column (cdf.year), so this is presumably where it is filled.
# NOTE(review): how the year is derived lives in src.scholar_citations and is
# not visible here; confirm before relying on it.
data_ds = sch.fill_year_column(data_ds)

In [None]:
# Join title and abstract with an explicit space so the last word of the
# title is never fused with the first word of the abstract; any surplus
# whitespace is collapsed by clean_description.
input_ = data_ds['title'] + " " + data_ds['abstract']
# Clean each combined text; this is the string that gets embedded later.
data_ds['input'] = input_.apply(clean_description)

In [None]:
# Convert the cleaned pandas frame into a cuDF DataFrame. The constructor
# lives on the cudf module; a pandas DataFrame has no `.DataFrame` attribute.
cdf = cudf.DataFrame.from_pandas(data_ds)

In [None]:
# Row count of the cuDF frame, i.e. the papers kept by the category filter.
len(cdf)

In [None]:
# Number of papers for each distinct value of the `year` column.
cdf.year.value_counts()

## Step 2: Create sentence embeddings
Here I use a cookie-cutter -- **out of the box** -- model from HuggingFace to transform paper abstracts + titles into vectors.

**This takes a long time**... so it is best to take a subset, or to use the Dask cluster for multi-GPU encoding.

In [None]:
# Replace the current index with a fresh 0..n-1 one and discard the old
# labels (drop=True). The rows were selected with a boolean mask earlier, so
# the labels carried over are presumably non-contiguous.
cdf = cdf.reset_index(drop=True)

In [None]:
# change if needed
# `sample` is how many leading rows get embedded; len(cdf) selects them all.
sample = len(cdf)
# Row slice of the first `sample` rows.
batch = cdf[:sample]

In [None]:
from sentence_transformers import SentenceTransformer

# Pretrained, off-the-shelf sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Options forwarded unchanged to SentenceTransformer.encode.
encode_options = dict(
    normalize_embeddings=True,
    batch_size=64,
    show_progress_bar=True,
)

# `values_host` hands the cleaned `input` texts to the encoder as a
# host-side array rather than a cuDF column.
vectors = model.encode(batch.input.values_host, **encode_options)

In [None]:
# Vectors created!
# Store the embeddings as a new `vector` column. tolist() converts the
# encoder output to nested Python lists and index=batch.index gives the new
# Series the same index as `batch`.
# NOTE(review): assumes `vectors` holds one row per row of `batch` - confirm.
batch['vector'] = cudf.Series(vectors.tolist(), index=batch.index)

In [None]:
# Write the whole batch frame to ds_embeddings.json in the working directory.
batch.to_json("ds_embeddings.json")

In [None]:
# The `id` column of the batch; the following cell looks up each id.
# NOTE(review): presumably arXiv identifiers - confirm against the dataset.
ids = batch.id

In [None]:
# A cuDF Series cannot be iterated directly (it raises TypeError), so walk
# the host-side copy of the ids instead. Each call is given the same output
# path, ./citations.json, which a later cell reads back.
for id_ in ids.values_host:
    sch.get_sch_paper(id_, write_to = "./citations.json")

In [None]:
# Build the citations table from ./citations.json; the helper is also given
# ./citations_df.json as its output path.
# NOTE(review): the resulting columns are defined in src.scholar_citations;
# the next cell relies on a `citations` column being present.
citations_df = sch.get_citations_df(
    from_ = "./citations.json",
    write_to = "./citations_df.json"
)

In [None]:
# Run the project's citation post-processing over every `citations` entry
# and store the results back in the same column.
citations_df['citations'] = citations_df['citations'].apply(sch.process_citation)

In [None]:
# TODO: merge `batch` with citations_df[['id', 'citations']]