In [1]:
import pymongo
import pandas as pd
import numpy as np
from pathlib import Path
from logging import Logger, StreamHandler

In [2]:
dataset_path = Path("../../dataset")

In [3]:
import os

os.listdir(dataset_path)

['arxiv-metadata-oai-snapshot.json',
 'cache_dir',
 'data',
 'lemmatized_test_df_dataset_1.pq',
 'lemmatized_test_df_dataset_2.pq',
 'lemmatized_test_df_dataset_3.pq',
 'lemmatized_test_df_dataset_4.pq',
 'lemmatized_test_df_dataset_5.pq',
 'lemmatized_train_df_dataset_1.pq',
 'lemmatized_train_df_dataset_2.pq',
 'lemmatized_train_df_dataset_3.pq',
 'lemmatized_train_df_dataset_4.pq',
 'lemmatized_train_df_dataset_5.pq',
 'lemmatized_validation_df_dataset_1.pq',
 'lemmatized_validation_df_dataset_2.pq',
 'lemmatized_validation_df_dataset_3.pq',
 'lemmatized_validation_df_dataset_4.pq',
 'lemmatized_validation_df_dataset_5.pq',
 'outliers_df.pq',
 'parquet',
 'split-test_dataset-1_model-distilbert-base-nli-mean-tokens_embeddings.npy',
 'split-test_dataset-1_model-sentence-transformers_distilbert-base-nli-stsb-quora-ranking_embeddings.npy',
 'split-test_dataset-1_model-sentence-transformers_distilroberta-base-paraphrase-v1_embeddings.npy',
 'split-validation_dataset-4_model-sentence-tran

### Arguments

In [4]:
# Index of the dataset variant to use; interpolated into the embeddings and
# parquet file names built in later cells.
dataset_index = 5
# Model whose saved embeddings are loaded; '/' is replaced with '_' when the
# embeddings file name is built.
model_name = "sentence-transformers/distilroberta-base-paraphrase-v1"
# Names of the dataset splits.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
splits = ['train', 'validation', 'test']

### Load saved embeddings

In [5]:
import re
from pathlib import Path

def load_embeddings(split_name: str, model_name: str):
    """Load the saved embedding matrix for one dataset split.

    The file path is built by ``get_embeddings_filename`` from the split and
    model names, and the array is read with ``np.load``.

    Args:
        split_name: Split whose embeddings to load (e.g. ``'train'``).
        model_name: Name of the model that produced the embeddings.

    Returns:
        The array stored in the ``.npy`` file.

    Raises:
        FileNotFoundError: If the expected embeddings file does not exist.
    """
    embeddings_filename = get_embeddings_filename(split_name, model_name=model_name)

    # Fail with an explicit error instead of falling through to an unbound
    # ``embeddings`` variable (and an undefined ``logger``) when the file is
    # missing.
    if not Path(embeddings_filename).exists():
        raise FileNotFoundError(
            f"Expected file named {embeddings_filename} was not found"
        )

    return np.load(embeddings_filename)

def get_embeddings_filename(split_name, model_name):
    """Return the path (as a string) of the embeddings file for a split and model.

    Slashes in ``model_name`` are replaced with underscores so the name fits
    in a single file name. The module-level ``dataset_path`` and
    ``dataset_index`` supply the directory and the dataset number.
    """
    model_normalized_name = model_name.replace("/", "_")
    embeddings_file = f"split-{split_name}_dataset-{dataset_index}_model-{model_normalized_name}_embeddings.npy"

    return str(dataset_path / embeddings_file)

In [6]:
train_embeddings = load_embeddings(split_name='train', model_name=model_name)
train_embeddings.shape

(112830, 768)

In [7]:
validation_embeddings = load_embeddings(split_name='validation', model_name=model_name)
validation_embeddings.shape

(112830, 768)

### Prepare labels

In [8]:
cache_dir = dataset_path / 'cache_dir'

In [9]:
from datasets import load_dataset

def load_target_dataset(split: str, dataset_index: int = None):
    """Load one split of a numbered dataset from its parquet file.

    Reads ``<dataset_path>/<split>_df_dataset_<dataset_index>.pq`` through
    ``load_dataset`` using the module-level ``cache_dir``.

    Args:
        split: Split name used as the file-name prefix (e.g. ``'train'``).
        dataset_index: Number of the dataset to load. Must be provided; the
            ``None`` default is kept only for signature compatibility.

    Returns:
        The ``'train'`` entry of the object returned by ``load_dataset``.

    Raises:
        ValueError: If ``dataset_index`` is ``None``.
    """
    # Without an index the f-string below would silently look for
    # "..._dataset_None.pq"; fail early with a clear message instead.
    if dataset_index is None:
        raise ValueError("dataset_index must be provided")

    parquet_file = dataset_path / f"{split}_df_dataset_{dataset_index}.pq"

    loaded = load_dataset('parquet',
                          data_files=[str(parquet_file)],
                          cache_dir=cache_dir)

    # Files passed as an un-keyed ``data_files`` list are exposed under the
    # 'train' key, whichever split the file actually holds.
    return loaded['train']

In [10]:
train_dataset = load_target_dataset(split='train', dataset_index=dataset_index)
validation_dataset = load_target_dataset(split='validation', dataset_index=dataset_index)
test_dataset = load_target_dataset(split='test', dataset_index=dataset_index)

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/classifier/../../dataset/cache_dir/parquet/default-a97d8722ee27f6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/classifier/../../dataset/cache_dir/parquet/default-64457f2740b93e09/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset parquet (/mnt/NVMe/workspace/github_projects/arxiv_dataset_insights/apps/classifier/../../dataset/cache_dir/parquet/default-6d7d74b312461a4f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
test_embeddings = load_embeddings(split_name='test', model_name=model_name)
test_embeddings.shape

(112830, 768)

### Collect unique categories (note: 2 minor categories are not included because they are missing from dataset 5)

In [12]:
# Concatenate the per-sample category lists of all three splits.
# NOTE(review): `categories_list` is not read by any later cell visible here —
# the unique-category set and the binarizer are built from the training split
# only; confirm whether this cell is still needed.
categories_list = train_dataset['categories_list']
categories_list.extend(validation_dataset['categories_list'])
categories_list.extend(test_dataset['categories_list'])

In [13]:
# Gather every distinct category label that occurs in the training split.
all_unique_categories = set()

# Plain loop instead of a list comprehension run only for its side effects
# (which would build and discard a list of None values).
for paper_categories in train_dataset['categories_list']:
    all_unique_categories.update(paper_categories)
all_unique_categories = list(all_unique_categories)
len(all_unique_categories)

174

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [65]:
# Learn the label vocabulary from the training split only. ``fit`` is enough
# here: the binarized matrices are produced later via ``transform``, so there
# is no need to compute and discard one with ``fit_transform``. The trailing
# ``;`` suppresses the estimator repr without assigning to ``_`` (which would
# shadow IPython's last-output variable).
multilabel_binarizer = MultiLabelBinarizer(sparse_output=False)
multilabel_binarizer.fit(train_dataset['categories_list']);

In [20]:
len(multilabel_binarizer.classes_)

174

In [17]:
models_dir = Path('../../models/')

In [18]:
import pickle

In [69]:
# Make sure the target directory exists before writing; ``open`` does not
# create missing parent directories.
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted binarizer so the same label encoding can be reloaded.
with open(models_dir / 'multilabel_binarizer.pkl', 'wb') as f:
    pickle.dump(multilabel_binarizer, f)

In [19]:
# Reload the binarizer from disk and show how many classes it holds.
# NOTE(review): pickle.load can execute arbitrary code; only load files this
# project wrote itself.
with open(models_dir / 'multilabel_binarizer.pkl', 'rb') as f:
    multilabel_binarizer = pickle.load(f)
len(multilabel_binarizer.classes_)

174

### Data Prep

In [21]:
from typing import List

def transform_labels(labels: List[List[str]]):
    """Binarize per-sample label lists with the module-level ``multilabel_binarizer``.

    Args:
        labels: One list of category names per sample.

    Returns:
        Whatever ``multilabel_binarizer.transform`` returns for ``labels``.
    """
    return multilabel_binarizer.transform(labels)

In [22]:
train_y = transform_labels(train_dataset['categories_list'])
train_y.shape

(112830, 174)

In [23]:
validation_y = transform_labels(validation_dataset['categories_list'])
validation_y.shape

(112830, 174)

In [24]:
test_y = transform_labels(test_dataset['categories_list'])
test_y.shape

(112830, 174)

### Build a classifier

In [26]:
!pip install tensorflow --upgrade



In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [28]:
import torch

torch.cuda.is_available()

True

In [27]:
import tensorflow as tf   # TensorFlow registers PluggableDevices here.
tf.config.list_physical_devices()

2023-06-13 23:53:24.649446: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-13 23:53:24.695435: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-13 23:53:24.696320: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-13 23:53:27.362384: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-13 23:53:27.398859: W tensorflow/core/common_runtime/gpu/gpu_device.

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [83]:
out_dim = train_embeddings.shape[1]
out_dim

768

In [None]:
from tensorflow import keras

def get_dense_model(num_classes: int, input_dim: int = 768):
    """Build a feed-forward multi-label classifier over fixed-size input vectors.

    The network is three ReLU hidden layers (512, 256, 128 units) followed by
    a sigmoid output layer with one unit per class.

    Args:
        num_classes: Number of output units (one per label).
        input_dim: Length of each input vector. Defaults to 768, the value
            that was previously hard-coded.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        keras.layers.Dense(512, activation='relu', input_shape=(input_dim,)),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        # Sigmoid (not softmax) so each label gets an independent probability,
        # as needed for multi-label classification.
        keras.layers.Dense(num_classes, activation='sigmoid'),
    ])

    return model

In [98]:
model = get_dense_model(num_classes=len(multilabel_binarizer.classes_))

In [99]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 512)               393728    
                                                                 
 dense_12 (Dense)            (None, 256)               131328    
                                                                 
 dense_13 (Dense)            (None, 128)               32896     
                                                                 
 dense_14 (Dense)            (None, 174)               22446     
                                                                 
Total params: 580,398
Trainable params: 580,398
Non-trainable params: 0
_________________________________________________________________


### Train a classifier

In [100]:
# Compile the model with Adam and binary cross-entropy (one independent
# binary decision per label, matching the sigmoid output layer).
# NOTE(review): an earlier attempt also requested 'f1_score' and
# 'jaccard_score' as metrics; it is kept below for reference only.
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'f1_score', 'jaccard_score'])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [101]:
# Train the model
batch_size = 768
epochs = 100
model.fit(train_embeddings, train_y, batch_size=batch_size, epochs=epochs,
          validation_data=(validation_embeddings, validation_y))

# Evaluate the model on the held-out test split. The test features and labels
# in this notebook are `test_embeddings` / `test_y`; `X_test` / `y_test` were
# never defined and raised NameError after training finished.
test_loss, test_accuracy = model.evaluate(test_embeddings, test_y)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [102]:
import tensorflow as tf   # TensorFlow registers PluggableDevices here.
tf.config.list_physical_devices()

### Evaluate a classifier on test dataset