In [1]:
import pymongo
import pandas as pd
import numpy as np
from pathlib import Path
from logging import Logger, StreamHandler

In [2]:
dataset_path = Path("../../dataset")

In [3]:
import os

os.listdir(dataset_path)

['arxiv-metadata-oai-snapshot.json',
 'cache_dir',
 'data',
 'lemmatized_test_df_dataset_1.pq',
 'lemmatized_test_df_dataset_2.pq',
 'lemmatized_test_df_dataset_3.pq',
 'lemmatized_test_df_dataset_4.pq',
 'lemmatized_test_df_dataset_5.pq',
 'lemmatized_train_df_dataset_1.pq',
 'lemmatized_train_df_dataset_2.pq',
 'lemmatized_train_df_dataset_3.pq',
 'lemmatized_train_df_dataset_4.pq',
 'lemmatized_train_df_dataset_5.pq',
 'lemmatized_validation_df_dataset_1.pq',
 'lemmatized_validation_df_dataset_2.pq',
 'lemmatized_validation_df_dataset_3.pq',
 'lemmatized_validation_df_dataset_4.pq',
 'lemmatized_validation_df_dataset_5.pq',
 'outliers_df.pq',
 'parquet',
 'split-test_dataset-1_model-distilbert-base-nli-mean-tokens_embeddings.npy',
 'split-test_dataset-1_model-sentence-transformers_distilbert-base-nli-stsb-quora-ranking_embeddings.npy',
 'split-test_dataset-1_model-sentence-transformers_distilroberta-base-paraphrase-v1_embeddings.npy',
 'split-validation_dataset-4_model-sentence-tran

### Arguments

In [4]:
# Index of the dataset variant to work with; it is interpolated into the
# embeddings and parquet file names built further down.
dataset_index = 4
# Embedding model identifier; used to build the embeddings file name.
model_name = "sentence-transformers/distilroberta-base-paraphrase-v1"
# NOTE(review): `splits` is not referenced by any other visible cell.
splits = ['train', 'validation', 'test']

### Load saved embeddings

In [5]:
import re
from pathlib import Path

def load_embeddings(split_name: str, model_name: str):
    """Load the saved embeddings file for one split.

    Args:
        split_name: Split whose embeddings are wanted; forwarded to
            ``get_embeddings_filename``.
        model_name: Embedding model name; forwarded to
            ``get_embeddings_filename``.

    Returns:
        The object returned by ``np.load`` for the resolved ``.npy`` file.

    Raises:
        FileNotFoundError: If the resolved embeddings file does not exist.
    """
    embeddings_filename = get_embeddings_filename(split_name, model_name=model_name)

    # Fail with an explicit error instead of falling through to an unbound
    # local variable when the file is missing.
    if not Path(embeddings_filename).exists():
        raise FileNotFoundError(f"Expected file named {embeddings_filename} was not found")

    return np.load(embeddings_filename)

def get_embeddings_filename(split_name, model_name, index=None, base_dir=None):
    """Build the path of the embeddings ``.npy`` file for a split and model.

    Args:
        split_name: Split name placed after ``split-`` in the file name.
        model_name: Model name; every ``/`` is replaced by ``_`` so the name
            can be used inside a single file name.
        index: Dataset index placed after ``dataset-``. Defaults to the
            notebook-level ``dataset_index``.
        base_dir: Directory that holds the file. Defaults to the
            notebook-level ``dataset_path``.

    Returns:
        The file path as a string.
    """
    # Fall back to the notebook-level settings only when no override is given,
    # so existing two-argument calls behave exactly as before.
    if index is None:
        index = dataset_index
    if base_dir is None:
        base_dir = dataset_path

    model_normalized_name = model_name.replace("/", "_")
    filename = f"split-{split_name}_dataset-{index}_model-{model_normalized_name}_embeddings.npy"

    return str(Path(base_dir) / filename)

In [6]:
train_embeddings = load_embeddings(split_name='train', model_name=model_name)
train_embeddings.shape

(1580762, 768)

In [7]:
validation_embeddings = load_embeddings(split_name='validation', model_name=model_name)
validation_embeddings.shape

(225752, 768)

### Prepare labels

In [8]:
cache_dir = dataset_path / 'cache_dir'

In [9]:
from datasets import load_dataset

def load_target_dataset(split: str, dataset_index: int = None):
    """Load one split's parquet file through ``load_dataset``.

    The file is looked up as ``{split}_df_dataset_{dataset_index}.pq`` inside
    ``dataset_path``, and ``cache_dir`` is passed on as the cache directory.

    Args:
        split: Split name used as the file-name prefix.
        dataset_index: Index of the dataset variant used in the file name.

    Returns:
        The ``'train'`` entry of the object returned by ``load_dataset``.

    Raises:
        ValueError: If ``dataset_index`` is not provided.
    """
    # Without this guard the default would be formatted into the file name as
    # the literal text "None".
    if dataset_index is None:
        raise ValueError("dataset_index must be provided to locate the parquet file")

    parquet_file = dataset_path / f"{split}_df_dataset_{dataset_index}.pq"
    loaded = load_dataset('parquet',
                          data_files=[str(parquet_file)],
                          cache_dir=cache_dir)

    # The result is read under the 'train' key whichever `split` was requested.
    return loaded['train']

In [10]:
train_dataset = load_target_dataset(split='train', dataset_index=dataset_index)
validation_dataset = load_target_dataset(split='validation', dataset_index=dataset_index)
test_dataset = load_target_dataset(split='test', dataset_index=dataset_index)

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/classifier/../../dataset/cache_dir/parquet/default-ff94db2645fa5e2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/classifier/../../dataset/cache_dir/parquet/default-cf285e0ca1bbb326/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/classifier/../../dataset/cache_dir/parquet/default-5433279a47d817c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
test_embeddings = load_embeddings(split_name='test', model_name=model_name)
test_embeddings.shape

(451833, 768)

In [12]:
# Gather the category lists of all three splits into a single list.
# NOTE(review): `categories_list` is not read by any later visible cell, and
# `.extend` assumes that column access returns a plain Python list — confirm
# for the installed `datasets` version.
categories_list = train_dataset['categories_list']
categories_list.extend(validation_dataset['categories_list'])
categories_list.extend(test_dataset['categories_list'])

In [13]:
# Collect the distinct category labels that occur in the training split.
# A plain loop avoids building a throwaway list just for its side effects.
unique_category_set = set()
for sample_categories in train_dataset['categories_list']:
    unique_category_set.update(sample_categories)

# Sorted so the list order is the same on every run.
all_unique_categories = sorted(unique_category_set)
len(all_unique_categories)

176

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [15]:
models_dir = Path('../../models/')

In [16]:
import pickle

In [17]:
# Load the pickled object stored as `multilabel_binarizer.pkl` in `models_dir`
# and show how many classes it holds.
# NOTE(security): `pickle.load` can execute arbitrary code contained in the
# file; only load a file that you produced yourself or otherwise trust.
with open(models_dir / 'multilabel_binarizer.pkl', 'rb') as f:
    multilabel_binarizer = pickle.load(f)
len(multilabel_binarizer.classes_)

176

### Data Prep

In [18]:
from typing import List

def transform_labels(labels: List[List[str]]):
    """Encode per-sample category lists with the loaded ``multilabel_binarizer``.

    Args:
        labels: One list of category names per sample.

    Returns:
        The result of ``multilabel_binarizer.transform(labels)``, unchanged.
    """
    return multilabel_binarizer.transform(labels)

In [19]:
train_y = transform_labels(train_dataset['categories_list'])
train_y.shape

(1580762, 176)

In [20]:
validation_y = transform_labels(validation_dataset['categories_list'])
validation_y.shape

(225752, 176)

In [21]:
test_y = transform_labels(test_dataset['categories_list'])
test_y.shape

(451833, 176)

In [22]:
import gc

# The label matrices (train_y, validation_y, test_y) have already been built,
# so the three loaded split datasets can be dropped in one statement.
del train_dataset, test_dataset, validation_dataset

gc.collect()

0

### Build a classifier

In [23]:
import torch

torch.cuda.is_available()

True

In [24]:
out_dim = train_embeddings.shape[1]
out_dim

768

#### Custom Dataset

In [25]:
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split
from typing import List, Union
import numpy as np


class ArxivDataset(Dataset):
    """Map-style dataset that pairs ``data[i]`` with ``targets[i]``.

    Args:
        data: Indexable collection of samples.
        targets: Indexable collection of labels, one per sample.

    Raises:
        ValueError: If ``data`` and ``targets`` differ in length.
    """

    def __init__(self, data: np.ndarray, targets: np.ndarray):
        # A length mismatch would otherwise only show up later, as an
        # IndexError mid-epoch or as silently ignored trailing targets.
        if len(data) != len(targets):
            raise ValueError(
                f"data and targets must have the same length, "
                f"got {len(data)} and {len(targets)}"
            )
        self.data = data
        self.targets = targets

    def __getitem__(self, index):
        """Return the ``(sample, target)`` pair stored at ``index``."""
        return self.data[index], self.targets[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl


# Define the model architecture
class ArxivAbstractClassifier(pl.LightningModule):
    """Fully connected multi-label classifier over fixed-size input vectors.

    The network is four linear layers (input_size -> 512 -> 256 -> 128 ->
    num_classes) with ReLU between them and a sigmoid on the output, so
    ``forward`` returns one value in (0, 1) per class.

    The class derives from ``pl.LightningModule`` because it is handed to
    ``pl.Trainer`` and calls ``self.log``; a plain ``nn.Module`` offers
    neither the logging hook nor ``configure_optimizers``.

    Args:
        input_size: Number of features in each input vector.
        num_classes: Number of output units.
        learning_rate: Learning rate for the Adam optimizer created in
            ``configure_optimizers``.
    """

    def __init__(self, input_size, num_classes, learning_rate=1e-3):
        super().__init__()
        self.learning_rate = learning_rate

        self.fc1 = nn.Linear(input_size, 512)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(256, 128)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(128, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        return self.sigmoid(self.fc4(x))

    def _bce_step(self, batch, log_name):
        """Compute binary cross-entropy for one batch and log it as ``log_name``."""
        inputs, targets = batch
        outputs = self(inputs)
        # Same mean-reduced loss as nn.BCELoss(), without creating a module
        # object on every step.
        loss = nn.functional.binary_cross_entropy(outputs, targets)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch``; logged as ``train_loss``."""
        return self._bce_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Log the validation loss for ``batch`` as ``val_loss``."""
        self._bce_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Log the test loss for ``batch`` as ``test_loss``."""
        self._bce_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return the Adam optimizer over all parameters of this module."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)

2023-06-14 03:03:21.990947: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [29]:
# Wrap each split's embeddings and label matrix in an ArxivDataset.
# Labels are cast to float32, presumably so they match the float outputs they
# are compared against in nn.BCELoss — TODO confirm.
# NOTE(review): these names previously held the parquet-backed datasets that
# were deleted in an earlier cell; from here on they refer to ArxivDataset objects.
train_dataset = ArxivDataset(train_embeddings, train_y.astype(np.float32))
validation_dataset = ArxivDataset(validation_embeddings, validation_y.astype(np.float32))
test_dataset = ArxivDataset(test_embeddings, test_y.astype(np.float32))

In [30]:
num_classes = train_y.shape[1]
num_classes

176

In [31]:
model = ArxivAbstractClassifier(input_size=768, num_classes=num_classes)
model

ArxivAbstractClassifier(
  (fc1): Linear(in_features=768, out_features=512, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=128, out_features=176, bias=True)
  (sigmoid): Sigmoid()
)

In [34]:
# Loss and optimizer for a hand-written training loop.
# NOTE(review): `criterion` and `optimizer` are not referenced by any other
# complete visible cell, and `optimizer` holds the parameters of the `model`
# created above — the training cell further down creates a new `model`.
learning_rate = 1e-2

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
import logging
from logging import Logger, StreamHandler

# Obtain the logger from the logging registry instead of instantiating
# Logger directly, so the same object is returned on every lookup by name.
logger = logging.getLogger(__name__)
# A directly constructed Logger emits every level and has no parent to
# propagate to; keep both properties so the output is unchanged.
logger.setLevel(logging.DEBUG)
logger.propagate = False
# Re-running this cell returns the same logger, so attach the handler only once.
if not logger.handlers:
    logger.addHandler(StreamHandler())

In [None]:
from tqdm import tqdm

num_epochs = 10

def train(train_dataloader: DataLoader, validation_dataloader: DataLoader):
    for epoch in range(num_epochs):
        
        for X, y in tqdm(train_dataloader):

### Train a Classifier

In [31]:
batch_size = 768

# One loader per split, all with the same batch size and two worker
# processes; only the training loader shuffles its samples.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

In [33]:
import os

# os.cpu_count() may return None; treat that as a single CPU.
# NOTE(review): `n_cpus` is not used by any visible cell.
n_cpus = max((os.cpu_count() or 1) - 3, 1)

num_epochs = 10

# Create an instance of the model; the input width is taken from the
# embeddings themselves rather than repeated as a literal.
input_size = train_embeddings.shape[1]
model = ArxivAbstractClassifier(input_size, num_classes)

# Create a PyTorch Lightning trainer
trainer = pl.Trainer(max_epochs=num_epochs)

# Train the model
trainer.fit(model, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A4000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params
------------------------------------
0 | fc1     | Linear  | 393 K 
1 | relu1   | ReLU    | 0     
2 | fc2     | Linear  | 131 K 
3 | relu2   | ReLU    | 0     
4 | fc3     | Linear  | 32.9 K
5 | relu3   | ReLU    | 0     
6 | fc4     | Linear  | 22.7 K
7 | sigmoid | Sigmoid | 0     
------------------------------------
580 K     Trainable params
0         Non-trainable params
580 K     Total params
2.323     Tota

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


### Evaluate a classifier on Test Dataset

In [36]:
trainer.test(model=model, dataloaders=test_loader)

In [None]:
mod