In [4]:
import pandas as pd
import json
import re
import string
from datasets import Dataset
import torch
import numpy as np

DATA_PATH = "arxiv-metadata-oai-snapshot.json"
YEAR_PATTERN = r"(19|20[0-9]{2})"

## Load data

In [5]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

# Generator functions that iterate through the file and process/load papers
def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        # Attempt to parse the date using Regex: this could be improved
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract'],    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            # Yield only papers that have a year I could process
            if paper['year']:
                yield paper


In [6]:
df = pd.DataFrame(papers())

In [167]:
df_sample = df.copy()
df_sample_clean = df.copy()

## Prepare text and labels

In [168]:
df_sample['text'] = df_sample.apply(lambda r: clean_description(r['title'] + ' ' + r['abstract']), axis=1).tolist()
df_sample = df_sample[['text', 'categories']]


In [169]:
# concatenate df_sample and dummies (ooe_df will be used to inverse the preds and get category names)
ooe_df = df_sample['categories'].str.get_dummies(sep=',')
num_classes = ooe_df.shape[1]

In [170]:
category_cols = ooe_df.columns.tolist()
parse_labels = lambda x : [x[c] for c in category_cols]

# parse the labels
df_sample['labels'] = ooe_df.apply(parse_labels, axis=1)
df_sample = df_sample[['text', 'labels']]

In [172]:
df_dataset = Dataset.from_pandas(df_sample)


## Modelling

In [173]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments

In [174]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny",
                                          problem_type="multi_label_classification",
                                          model_max_length=512)

def tokenize_and_encode(examples):
  return tokenizer(examples["text"], truncation=True)
cols = df_dataset.column_names
cols.remove('labels')
df_dataset = df_dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)

df_dataset.set_format("torch")
df_dataset = (df_dataset
          .map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels", "token_type_ids"])
          .rename_column("float_labels", "labels"))

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--prajjwal1--bert-tiny/snapshots/6f75de8b60a9f8a2fdf7b69cbd86d9e64bcb3837/config.json
Model config BertConfig {
  "_name_or_path": "prajjwal1/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/jovyan/.cache/huggingface/hub/models--prajjwal1--bert-tiny

  0%|          | 0/718 [00:00<?, ?ba/s]

  0%|          | 0/717846 [00:00<?, ?ex/s]

In [175]:
model = AutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny",
    num_labels=num_classes,
    problem_type="multi_label_classification"
    )

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--prajjwal1--bert-tiny/snapshots/6f75de8b60a9f8a2fdf7b69cbd86d9e64bcb3837/config.json
Model config BertConfig {
  "_name_or_path": "prajjwal1/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL

In [177]:
args = TrainingArguments(
    save_strategy="epoch",
    num_train_epochs=2,
    output_dir = '.outputs'
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=df_dataset,
                  tokenizer=tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 717846
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 179462
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.2884
1000,0.0848
1500,0.0607
2000,0.0526
2500,0.0499


In [None]:
preds = trainer.predict(df_dataset)

In [None]:
preds = torch.nn.functional.softmax(torch.tensor(preds.predictions))

## Get top category name from predictions

In [None]:
def get_best_args_and_score(row):
    # get 5 best predictions
    best_args = np.argpartition(row, -3)[-3:]
    best_score = row[best_args]
    return best_args, best_score

# map get_best_args_and_score to all rows on preds
best_args_score_vec = np.apply_along_axis(get_best_args_and_score, 1, preds.predictions)

In [None]:
def get_category_names(args, ooe_df):
    # get category names from args
    return ooe_df.columns[args[0].astype(int).tolist()]

In [None]:
categories_vec = np.apply_along_axis(get_category_names, 1, best_args_score_vec, ooe_df)

In [None]:
best_score =  best_args_score_vec[:,1]

In [None]:
soft_tags = {'category' : categories_vec.tolist(), 'score' : np.around(best_score, 3).tolist()}

In [None]:
df_sample_clean['category_predicted'] = soft_tags['category']
df_sample_clean['category_predicted'] = df_sample_clean['category_predicted'].str.join(',')

df_sample_clean['category_score'] = soft_tags['score']

In [None]:
df_sample_clean['category_score']

In [None]:
# Dump these to file with pickle or write them to Redis
with open('papers_with_soft_labels.pkl', 'wb') as f:
    pickle.dump(df_sample_clean, f)