In [6]:
import pandas as pd
import json
import re
import string
from datasets import Dataset
import torch
import numpy as np

DATA_PATH = "arxiv-metadata-oai-snapshot.json"
YEAR_PATTERN = r"(19|20[0-9]{2})"

In [141]:
import transformers

In [142]:
transformers.logging

<module 'transformers.utils.logging' from '/srv/conda/envs/saturn/lib/python3.7/site-packages/transformers/utils/logging.py'>

## Load data

In [7]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

# Generator functions that iterate through the file and process/load papers
def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        # Attempt to parse the date using Regex: this could be improved
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract'],    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            # Yield only papers that have a year I could process
            if paper['year']:
                yield paper


In [8]:
df = pd.DataFrame(papers())

In [9]:
df_sample = df.copy()
df_sample_clean = df.copy()

## Prepare text and labels

In [10]:
df_sample['text'] = df_sample.apply(lambda r: clean_description(r['title'] + ' ' + r['abstract']), axis=1).tolist()
df_sample = df_sample[['text', 'categories']]


In [37]:
df_tmp = df.copy()

In [38]:
df_tmp['categories'].str.get_dummies(sep=',')

Unnamed: 0,adap-org,alg-geom,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,chao-dyn,...,q-fin.TR,quant-ph,solv-int,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH,supr-con
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717841,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
717842,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
717843,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
717844,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
# concatenate df_sample and dummies (ooe_df will be used to inverse the preds and get category names)
ooe_df = df_sample['categories'].str.get_dummies(sep=',')
num_classes = ooe_df.shape[1]

In [12]:
category_cols = ooe_df.columns.tolist()
parse_labels = lambda x : [x[c] for c in category_cols]

# parse the labels
df_sample['labels'] = ooe_df.apply(parse_labels, axis=1)
df_sample = df_sample[['text', 'labels']]

In [13]:
df_dataset = Dataset.from_pandas(df_sample)


## Modelling

In [14]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments

In [15]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny",
                                          problem_type="multi_label_classification",
                                          model_max_length=512)

def tokenize_and_encode(examples):
  return tokenizer(examples["text"], truncation=True)
cols = df_dataset.column_names
cols.remove('labels')
df_dataset = df_dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)

df_dataset.set_format("torch")
df_dataset = (df_dataset
          .map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels", "token_type_ids"])
          .rename_column("float_labels", "labels"))

  0%|          | 0/718 [00:00<?, ?ba/s]

  0%|          | 0/717846 [00:00<?, ?ex/s]

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny",
    num_labels=num_classes,
    problem_type="multi_label_classification"
    )

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [17]:
args = TrainingArguments(
    save_strategy="epoch",
    num_train_epochs=1,
    output_dir = '.outputs',
    logging_steps = 10000
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=df_dataset,
                  tokenizer=tokenizer)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 717846
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 89731
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.2923
1000,0.0852
1500,0.0609
2000,0.0527
2500,0.0499
3000,0.0484
3500,0.0482
4000,0.0481
4500,0.0472
5000,0.0469


Saving model checkpoint to .outputs/checkpoint-89731
Configuration saved in .outputs/checkpoint-89731/config.json
Model weights saved in .outputs/checkpoint-89731/pytorch_model.bin
tokenizer config file saved in .outputs/checkpoint-89731/tokenizer_config.json
Special tokens file saved in .outputs/checkpoint-89731/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=89731, training_loss=0.03297703592773346, metrics={'train_runtime': 1361.8695, 'train_samples_per_second': 527.103, 'train_steps_per_second': 65.888, 'total_flos': 586731813565920.0, 'train_loss': 0.03297703592773346, 'epoch': 1.0})

In [19]:
preds = trainer.predict(df_dataset)

***** Running Prediction *****
  Num examples = 717846
  Batch size = 8


In [20]:
preds = torch.nn.functional.softmax(torch.tensor(preds.predictions))

  """Entry point for launching an IPython kernel.


## Get top category name from predictions

In [23]:
def get_best_args_and_score(row):
    # get 5 best predictions
    best_args = np.argpartition(row, -3)[-3:]
    best_score = row[best_args]
    return best_args, best_score

# map get_best_args_and_score to all rows on preds
best_args_score_vec = np.apply_along_axis(get_best_args_and_score, 1, preds)

In [24]:
def get_category_names(args, ooe_df):
    # get category names from args
    return ooe_df.columns[args[0].astype(int).tolist()]

In [25]:
categories_vec = np.apply_along_axis(get_category_names, 1, best_args_score_vec, ooe_df)

In [26]:
best_score =  best_args_score_vec[:,1]

In [30]:
soft_tags = {'category' : categories_vec.tolist(), 'score' : np.around(best_score, 2).tolist()}

In [31]:
df_sample_clean['category_predicted'] = soft_tags['category']
df_sample_clean['category_predicted'] = df_sample_clean['category_predicted'].str.join(',')

df_sample_clean['category_score'] = soft_tags['score']

In [110]:
import pickle
# Dump these to file with pickle or write them to Redis
with open('papers_with_soft_labels.pkl', 'wb') as f:
    pickle.dump(df_sample_clean, f)