In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import torch

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

device = "cuda" if torch.cuda.is_available() else "cpu"
data_dir = Path(r'C:\Users\Stille\Desktop\Arxiv NLP task')


# Text classification with Deep learning using BERT

In [2]:
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [3]:
df = pd.read_csv(data_dir/"arxiv_update.csv") # save excel as csv utf-8

In [4]:
df.head()

Unnamed: 0,title,abstract,categories
0,New air fluorescence detectors employed in the\nTelescope Array experiment\n,"Since 2007, the Telescope Array (TA) experiment, based in Utah, USA, has\nbeen observing ultra high energy cosmic ra...",Astrophysics
1,"THE RELATION BETWEEN EJECTA VELOCITY, INTRINSIC COLOR, AND HOST-GALAXY MASS FOR HIGH-REDSHIFT TYPE Ia SUPERNOVAE\n","Recently, using a large low-redshift sample of Type Ia supernovae (SNe Ia), we discovered a relation\nbetween SN Ia ...",Astrophysics
2,The JCMT Nearby Galaxies Legacy Survey. VII. Hα\nimaging and massive star formation properties,"We present Hα fluxes, star formation rates (SFRs) and equivalent widths (EWs) for\na sample of 156 nearby galaxies o...",Astrophysics
3,The magnetic field of IRAS 16293-2422 as traced by shock-induced\nH2O masers,Context. Shock-induced H2O masers are important magnetic field tracers at very high density gas. Water masers are fo...,Astrophysics
4,Scars of Intense Accretion Episodes at Metal-Rich White\nDwarfs\n,"A re-evaluation of time-averaged accretion rates at DBZ-type white dwarfs points\nto historical, time-averaged rates...",Astrophysics


In [5]:
col = ['title', 'abstract', 'categories']
df = df[col]
df.columns = ['title', 'abstract', 'categories']

In [6]:
# Set numerical values for each category
df['category_id'] = df['categories'].factorize()[0]
df.head(20)

Unnamed: 0,title,abstract,categories,category_id
0,New air fluorescence detectors employed in the\nTelescope Array experiment\n,"Since 2007, the Telescope Array (TA) experiment, based in Utah, USA, has\nbeen observing ultra high energy cosmic ra...",Astrophysics,0
1,"THE RELATION BETWEEN EJECTA VELOCITY, INTRINSIC COLOR, AND HOST-GALAXY MASS FOR HIGH-REDSHIFT TYPE Ia SUPERNOVAE\n","Recently, using a large low-redshift sample of Type Ia supernovae (SNe Ia), we discovered a relation\nbetween SN Ia ...",Astrophysics,0
2,The JCMT Nearby Galaxies Legacy Survey. VII. Hα\nimaging and massive star formation properties,"We present Hα fluxes, star formation rates (SFRs) and equivalent widths (EWs) for\na sample of 156 nearby galaxies o...",Astrophysics,0
3,The magnetic field of IRAS 16293-2422 as traced by shock-induced\nH2O masers,Context. Shock-induced H2O masers are important magnetic field tracers at very high density gas. Water masers are fo...,Astrophysics,0
4,Scars of Intense Accretion Episodes at Metal-Rich White\nDwarfs\n,"A re-evaluation of time-averaged accretion rates at DBZ-type white dwarfs points\nto historical, time-averaged rates...",Astrophysics,0
5,"DUST EXTINCTION BIAS IN THE COLUMN DENSITY DISTRIBUTION OF GAMMA-RAY BURSTS; HIGH COLUMN\nDENSITY, LOW REDSHIFT GRBS...",The afterglows of gamma-ray bursts (GRBs) have more soft X-ray absorption than expected from the foreground gas colu...,Astrophysics,0
6,THE ZCOSMOS1 20K GROUP CATALOG\n,"We present an optical group catalog between 0.1 . z . 1 based on 16,500 high-quality spectroscopic\nredshifts in the...",Astrophysics,0
7,Making Galaxies in a Cosmological Context: The Need for\nEarly Stellar Feedback,We introduce the Making Galaxies in a Cosmological Context (MaGICC) program\nof smoothed particle hydrodynamics (SPH...,Astrophysics,0
8,THE TEMPERATURE-DENSITY RELATION IN THE INTERGALACTIC MEDIUM AT REDSHIFT hZi = 2.4,We present new measurements of the temperature-density (T − ρ) relation for neutral hydrogen\nin the 2.0 < z < 2.8 i...,Astrophysics,0
9,Dipole leakage and low CMB multipoles,A number of studies of WMAP-7 have highlighted that the power at the low\nmultipoles in CMB power spectrum are lower...,Astrophysics,0


In [7]:
# text cleaning
df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex=True)

Unnamed: 0,title,abstract,categories,category_id
0,New air fluorescence detectors employed in the Telescope Array experiment,"Since 2007, the Telescope Array (TA) experiment, based in Utah, USA, has been observing ultra high energy cosmic ray...",Astrophysics,0
1,"THE RELATION BETWEEN EJECTA VELOCITY, INTRINSIC COLOR, AND HOST-GALAXY MASS FOR HIGH-REDSHIFT TYPE Ia SUPERNOVAE","Recently, using a large low-redshift sample of Type Ia supernovae (SNe Ia), we discovered a relation between SN Ia e...",Astrophysics,0
2,The JCMT Nearby Galaxies Legacy Survey. VII. Hα imaging and massive star formation properties,"We present Hα fluxes, star formation rates (SFRs) and equivalent widths (EWs) for a sample of 156 nearby galaxies ob...",Astrophysics,0
3,The magnetic field of IRAS 16293-2422 as traced by shock-induced H2O masers,Context. Shock-induced H2O masers are important magnetic field tracers at very high density gas. Water masers are fo...,Astrophysics,0
4,Scars of Intense Accretion Episodes at Metal-Rich White Dwarfs,"A re-evaluation of time-averaged accretion rates at DBZ-type white dwarfs points to historical, time-averaged rates ...",Astrophysics,0
...,...,...,...,...
163,Probing Nuclear Matter With Jets and γ-Hadron Correlations: Results from PHENIX,"Fully reconstructed jets and direct photon-tagged jet fragments significantly reduce energy-loss bias, the bias towa...",Nuclear Experiment,6
164,The light nuclei spin structure from hadronic channels at intermediate energies,"The investigation of the d, 3H and 3He spin structure has been performed at the RIKEN(Japan) accelerator research fa...",Nuclear Experiment,6
165,Particle-yield modification in jet-like azimuthal di-hadron correlations in Pb–Pb collisions at √sNN = 2 .76TeV,The yield of charged particles associated with highp t trigger particles (8 < p t < 15GeV / c) is measured with the ...,Nuclear Experiment,6
166,Experimental study of α-induced reactions on 64Zn for the astrophysical γ-process,"For the synthesis of the heavy, proton rich isotopes in the astrophysical γ-process the precise knowledge of α-induc...",Nuclear Experiment,6


## Step 1. Data exploration and preprocess

In [8]:
df['categories'].value_counts()

Nuclear Experiment                          24
Mathematical Physics                        24
General Relativity and Quantum Cosmology    24
Quantum Physics                             24
High Energy Physics                         24
Condensed Matter                            24
Astrophysics                                24
Name: categories, dtype: int64

In [9]:
possible_labels = df.categories.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'Astrophysics': 0,
 'Condensed Matter': 1,
 'General Relativity and Quantum Cosmology': 2,
 'High Energy Physics': 3,
 'Mathematical Physics': 4,
 'Quantum Physics': 5,
 'Nuclear Experiment': 6}

In [10]:
df['label'] = df.categories.replace(label_dict)

In [11]:
df

Unnamed: 0,title,abstract,categories,category_id,label
0,New air fluorescence detectors employed in the\nTelescope Array experiment\n,"Since 2007, the Telescope Array (TA) experiment, based in Utah, USA, has\nbeen observing ultra high energy cosmic ra...",Astrophysics,0,0
1,"THE RELATION BETWEEN EJECTA VELOCITY, INTRINSIC COLOR, AND HOST-GALAXY MASS FOR HIGH-REDSHIFT TYPE Ia SUPERNOVAE\n","Recently, using a large low-redshift sample of Type Ia supernovae (SNe Ia), we discovered a relation\nbetween SN Ia ...",Astrophysics,0,0
2,The JCMT Nearby Galaxies Legacy Survey. VII. Hα\nimaging and massive star formation properties,"We present Hα fluxes, star formation rates (SFRs) and equivalent widths (EWs) for\na sample of 156 nearby galaxies o...",Astrophysics,0,0
3,The magnetic field of IRAS 16293-2422 as traced by shock-induced\nH2O masers,Context. Shock-induced H2O masers are important magnetic field tracers at very high density gas. Water masers are fo...,Astrophysics,0,0
4,Scars of Intense Accretion Episodes at Metal-Rich White\nDwarfs\n,"A re-evaluation of time-averaged accretion rates at DBZ-type white dwarfs points\nto historical, time-averaged rates...",Astrophysics,0,0
...,...,...,...,...,...
163,Probing Nuclear Matter With Jets and γ-Hadron\nCorrelations: Results from PHENIX,"Fully reconstructed jets and direct photon-tagged jet fragments\nsignificantly reduce energy-loss bias, the bias tow...",Nuclear Experiment,6,6
164,The light nuclei spin structure from hadronic\nchannels at intermediate energies,"The investigation of the d, 3H and 3He spin structure has been performed at the RIKEN(Japan)\naccelerator research f...",Nuclear Experiment,6,6
165,Particle-yield modification in jet-like azimuthal di-hadron correlations in\nPb–Pb collisions at\n√sNN\n=\n2\n.76TeV,The yield of charged particles associated with highp\nt\ntrigger particles (8\n<\np\nt\n< 15GeV\n/\nc) is measured w...,Nuclear Experiment,6,6
166,Experimental study of α-induced reactions on 64Zn\nfor the astrophysical γ-process\n,"For the synthesis of the heavy, proton rich isotopes in the astrophysical γ-process the precise\nknowledge of α-indu...",Nuclear Experiment,6,6


In [12]:
# Train and validation split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.2, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['categories', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,title,abstract,category_id
categories,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Astrophysics,0,train,20,20,20
Astrophysics,0,val,4,4,4
Condensed Matter,1,train,19,19,19
Condensed Matter,1,val,5,5,5
General Relativity and Quantum Cosmology,2,train,19,19,19
General Relativity and Quantum Cosmology,2,val,5,5,5
High Energy Physics,3,train,19,19,19
High Energy Physics,3,val,5,5,5
Mathematical Physics,4,train,19,19,19
Mathematical Physics,4,val,5,5,5


## Step 2. Tokenization

In [13]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].title.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].title.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## Step 3. Model and training

In [14]:
# BERT Pre-trained Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
# Dataloader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 5

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [16]:
# optimizer and scheduler
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 10

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [17]:
# metrics
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [18]:
# training
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.9386628866195679
Validation loss: 1.815054212297712
F1 Score (Weighted): 0.2871457489878542


Epoch 2:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.7339360846413507
Validation loss: 1.732507518359593
F1 Score (Weighted): 0.28740970072239425


Epoch 3:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 3
Training loss: 1.5619208812713623
Validation loss: 1.6781314781733923
F1 Score (Weighted): 0.4368512110726644


Epoch 4:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 4
Training loss: 1.4742709045056943
Validation loss: 1.6408398151397705
F1 Score (Weighted): 0.4854083344567774


Epoch 5:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 5
Training loss: 1.3377542010060064
Validation loss: 1.5537586552756173
F1 Score (Weighted): 0.48237628384687214


Epoch 6:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 6
Training loss: 1.2626292396474768
Validation loss: 1.521634680884225
F1 Score (Weighted): 0.5578367710720651


Epoch 7:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 7
Training loss: 1.1866432030995686
Validation loss: 1.4996697221483504
F1 Score (Weighted): 0.5640968508615567


Epoch 8:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 8
Training loss: 1.111799743440416
Validation loss: 1.4704238516943795
F1 Score (Weighted): 0.5594283167812579


Epoch 9:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 9
Training loss: 1.0968734798608002
Validation loss: 1.4688770430428642
F1 Score (Weighted): 0.5247211938388409


Epoch 10:   0%|          | 0/27 [00:00<?, ?it/s]


Epoch 10
Training loss: 1.0574228785656117
Validation loss: 1.4543318067278181
F1 Score (Weighted): 0.5894024276377217
