This model combines the predictions from BERT with the predictions from a GCN for node classification on the references. The BERT model is run first and its output scores (logits) are extracted; these are then added to the log-softmax scores from the GCN, and the class with the highest combined score is taken as the prediction.

Install huggingface for BERT and dgl for GCN



In [None]:
# Notebook shell magics: install Hugging Face Transformers (used for BERT)
# and DGL (used for the GCN) into the current runtime.
!pip install transformers
!pip install dgl

Import libraries


In [None]:
# Generic libraries
import random
import itertools
import time
import datetime
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (
    TensorDataset,
    DataLoader,
    RandomSampler,
    SequentialSampler,
)

from google.colab import drive
from google.colab import files
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

# GCN libraries
from dgl import DGLGraph
import networkx as nx
from dgl.nn.pytorch import GraphConv

#BERT libraries
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup


Load dataset from google drive (To run it you'll have to load the dataset from your own drive)


In [None]:
# Uncomment to mount Google Drive when running in Colab:
#drive.mount('/content/drive')
root_path = 'drive/My Drive/homework2/'


def _read_table(file_name):
    """Load one CSV file stored under ``root_path`` into a DataFrame."""
    return pd.read_csv(root_path + file_name)


# Read in the same order as before: train, text, reference, test, sample.
train = _read_table("train.csv")
text = _read_table("text.csv")
reference = _read_table("reference.csv")
test = _read_table("test.csv")
sample = _read_table("sample.csv")

Create pandas dataframe with train and text and extract values


In [None]:
# Join the text table onto the train and test tables using the shared "id"
# column.
train_text = train.merge(text, on="id")
test_text = test.merge(text, on="id")

# Plain NumPy arrays of the training titles and their labels.
titles = train_text["title"].values
labels = train_text["label"].values

**BERT MODEL**

The code in this section is partially adapted from the huggingface tutorial at https://huggingface.co/transformers/quickstart.html



**BERT** tokenisation

In [None]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-large-uncased', do_lower_case=True)

# Encode every training title into a list of token ids.
input_ids = [
    tokenizer.encode(title, add_special_tokens=True, max_length=48)
    for title in titles
]

# Bring every sequence to exactly 48 ids: shorter ones are right-padded with
# 0, longer ones are cut at the end.
input_ids = pad_sequences(
    input_ids,
    maxlen=48,
    dtype="long",
    value=0,
    truncating="post",
    padding="post",
)

# Attention mask: 1 where the token id is positive, 0 where it is padding (0).
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]

Train/validation split

In [None]:
# Hold out 20% of the training titles for validation.
#
# The token ids, attention masks and labels are split in ONE call so the three
# are guaranteed to stay row-aligned. train_test_split returns a
# (train, validation) pair for each input array, in input order. The fixed
# random_state keeps the split reproducible between runs.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=2020, test_size=0.20)

Convert to PyTorch tensors and create dataloaders

In [None]:
# Turn each split into a torch tensor so it can be wrapped in a TensorDataset.
train_inputs, train_masks, train_labels = (
    torch.tensor(part)
    for part in (train_inputs, train_masks, train_labels)
)
validation_inputs, validation_masks, validation_labels = (
    torch.tensor(part)
    for part in (validation_inputs, validation_masks, validation_labels)
)

batch_size = 32

# Each dataset item is an (input ids, attention mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(
    validation_inputs, validation_masks, validation_labels)

# Training uses a random sampler, validation a sequential one.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, sampler=validation_sampler)

**BERT** model definition (Hyperparameters are tuned here)

In [None]:
# Load pretrained "bert-large-uncased" weights into a sequence-classification
# model configured for 5 output labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-large-uncased", 
    num_labels = 5,)

# Move the model to the GPU (requires a CUDA runtime).
model.cuda()

# Optimizer over all model parameters; lr and eps are the tuned values.
optimizer = AdamW(model.parameters(),
 lr = 1.5e-5,
 eps = 1e-8 )

# One scheduler step is taken per training batch, so the schedule length is
# (batches per epoch) * (number of epochs). No warm-up steps are used.
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
 num_warmup_steps = 0,
 num_training_steps = total_steps)

Define helper functions for BERT training loop

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    preds  -- 2-D array of per-class scores, one row per example.
    labels -- NumPy array of true class ids, one per example.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

def flat_pred(preds):
    """Return the arg-max class of every row of ``preds`` as a flat array."""
    return np.argmax(preds, axis=1).flatten()

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second first.
    """
    delta = datetime.timedelta(seconds=int(round(elapsed)))
    return str(delta)

**BERT** training loop

In [None]:
# Fine-tune BERT for `epochs` epochs; after every epoch the model is evaluated
# on the validation split. This cell requires a CUDA device.
device = torch.device("cuda")    
torch.cuda.empty_cache()

# Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in epoch order.
loss_values = []
for epoch_i in range(0, epochs):

####################Training###############################
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')    
    t0 = time.time()    
    total_loss = 0   

    model.train()
    for step, batch in enumerate(train_dataloader):    
        # Progress report every 40 batches (skipping the very first one).
        if step % 40 == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, 
                                                len(train_dataloader), elapsed))        
        
        # Batch layout is (input ids, attention mask, labels), matching the
        # order in which the TensorDataset was built.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)       

        # Clear gradients left over from the previous batch.
        model.zero_grad()              

        # Labels are passed in, so the first element of the output is the loss.
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        loss = outputs[0]      
        total_loss += loss.item() 
        loss.backward()       
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update the weights, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)    
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

###########################################Validation##########################

    print("Running Validation...")    
    t0 = time.time() 

    model.eval()   
    # NOTE(review): eval_loss and nb_eval_examples are initialised here but
    # never updated or read in this loop.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0  

    for batch in validation_dataloader:

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No labels are passed and no gradients are tracked; the first element
        # of the output is used as the logits.
        with torch.no_grad():                    
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy        
        # Track the number of batches
        nb_eval_steps += 1    
    # Report the final accuracy for this validation run.
    # This is the mean of the per-batch accuracies, so a smaller final batch
    # carries the same weight as a full one.
    print("  Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

Predict and write Output

In [None]:
# Run the fine-tuned BERT model over the test titles and collect the raw
# logits, one NumPy array per batch, in `predictions`.
titles = test_text.title.values

# Encode the test titles with the same settings as the training titles
# (special tokens added, max_length 48) so both sets are preprocessed alike.
input_ids = [
    tokenizer.encode(title, add_special_tokens=True, max_length=48)
    for title in titles
]

# Right-pad with 0 / cut at the end to a fixed length of 48 ids.
input_ids = pad_sequences(input_ids, maxlen=48, dtype="long", value=0,
                          truncating="post", padding="post")

# Attention mask: 1.0 for real tokens (id > 0), 0.0 for padding.
attention_masks = [[float(token_id > 0) for token_id in seq]
                   for seq in input_ids]

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
# The test set has no labels, so none are attached. The earlier
# `prediction_labels = torch.tensor(labels)` wrapped the *training* labels and
# was never used, so it has been dropped.

batch_size = 32
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
# Sequential sampling keeps the predictions in the row order of `test_text`.
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler,
                                   batch_size=batch_size)

# Make inference mode explicit rather than relying on the state the training
# cell happened to leave the model in.
model.eval()

predictions = []
for batch in prediction_dataloader:
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels were passed, so the first output element holds the logits.
    logits = outputs[0].detach().cpu().numpy()
    predictions.append(logits)


Extract predicted probabilities

In [None]:
# `predictions` holds one NumPy array of logits per batch. Join them along the
# row axis into a single array, then convert it to a float32 tensor.
# This replaces a per-row Python loop followed by torch.Tensor(list of arrays),
# which is very slow for large lists.
pred = np.concatenate(predictions, axis=0)

BERT_pred = torch.from_numpy(pred).float()

**GCN for Node classification**

For some unknown reason both GCN and GAT with input features being the paper titles have performed very poorly. The plain GCN with only the references performed the best so this will be the model to combine with BERT. 

The code in this section is partially adapted from the dgl tutorial at https://docs.dgl.ai/en/0.4.x/tutorials/basics/1_first.html

Preprocessing and graph definition

In [None]:
# Build the node labels, the train/validation/test node index lists and the
# citation graph used by the GCN. Node i of the graph is the paper with id i.

# Titles of all papers (train followed by test); only the count is used here.
all_titles = np.concatenate((train_text.title.values, test_text.title.values))

# id -> label lookup, built once. The first occurrence of an id wins, which
# matches the previous `.loc[...].label.values[0]` lookup. A dict gives O(1)
# membership and lookup instead of scanning the whole table for every node.
train_label_by_id = {}
for paper_id, paper_label in zip(train_text.id.tolist(),
                                 train_text.label.tolist()):
    train_label_by_id.setdefault(paper_id, paper_label)

labels_filled = []
all_train_idx = []
test_idx = []

for node_id in range(len(all_titles)):
    if node_id in train_label_by_id:
        all_train_idx.append(node_id)
        labels_filled.append(train_label_by_id[node_id])
    else:
        # Not a training paper: treated as a test node and given a
        # placeholder label of 0.
        test_idx.append(node_id)
        labels_filled.append(0)

# Graph
# NOTE(review): the node count is hard-coded; presumably it is the total
# number of papers -- confirm against the dataset.
num_nodes = 25561

reference.columns = ['src', 'tgt']
G_nx = nx.Graph()
G_nx.add_nodes_from(range(num_nodes))
# Add all citation edges in one call instead of one add_edge per row.
G_nx.add_edges_from(zip(reference['src'].tolist(), reference['tgt'].tolist()))

G_dgl = DGLGraph(G_nx)

# Node features are a 300-dimensional embedding per node. The weight tensor
# itself is attached to the graph as the 'feat' node data.
embed = nn.Embedding(num_nodes, 300)
G_dgl.ndata['feat'] = embed.weight

# Labels
labels = torch.LongTensor(labels_filled)

# Train test split: the first 90% of the training node ids are used for
# training, the remaining 10% for validation (no shuffling).
split_at = round(0.90 * len(all_train_idx))
train_idx = all_train_idx[:split_at]
valid_idx = all_train_idx[split_at:]

all_tr_idx = torch.tensor(all_train_idx)
tr_idx = torch.tensor(train_idx)
va_idx = torch.tensor(valid_idx)
te_idx = torch.tensor(test_idx)

GCN  and helper functions definition

In [None]:
class GCN(nn.Module):
    """Two-layer graph convolutional network.

    The first GraphConv layer maps ``in_feats`` to ``hidden_size``; after a
    ReLU, the second maps ``hidden_size`` to ``num_classes``.
    """

    def __init__(self, in_feats, hidden_size, num_classes):
        super().__init__()
        self.conv1 = GraphConv(in_feats, hidden_size)
        self.conv2 = GraphConv(hidden_size, num_classes)

    def forward(self, g, inputs):
        """Return the second layer's output for graph ``g`` and node features
        ``inputs``."""
        hidden = torch.relu(self.conv1(g, inputs))
        return self.conv2(g, hidden)

def accuracy(logits, labels):
    """Return the fraction of rows of ``logits`` whose arg-max equals the
    corresponding entry of ``labels`` (as a Python float)."""
    predicted = torch.max(logits, dim=1)[1]
    n_hits = (predicted == labels).sum().item()
    return n_hits * 1.0 / len(labels)

def predict(logits):
    """Return, for every row of ``logits``, the index of its largest entry."""
    return torch.max(logits, dim=1)[1]


GCN Training loop

In [None]:
# Train the GCN (300-d node embeddings -> 128 hidden units -> 5 classes) on
# the citation graph. The node embeddings are learned jointly with the
# network, so both parameter sets are handed to the optimizer.
net = GCN(300, 128, 5)
inputs = embed.weight

optimizer = torch.optim.Adam(
    itertools.chain(net.parameters(), embed.parameters()), lr=0.01)

dur = []
for epoch in range(12):
    # Start the timer BEFORE the work being timed. Previously it was started
    # and stopped on consecutive lines, so every recorded duration was ~0.
    t0 = time.time()

    logits = net(G_dgl, inputs)
    logp = F.log_softmax(logits, 1)
    # The loss is computed on the training nodes only.
    loss = F.nll_loss(logp[tr_idx], labels[tr_idx])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # The first three epochs are excluded from the timing average.
    if epoch >= 3:
        dur.append(time.time() - t0)
    # Avoid np.mean([]) (a RuntimeWarning) while no duration is recorded yet;
    # nan prints exactly as it did before.
    mean_dur = np.mean(dur) if dur else float("nan")

    train_acc = accuracy(logp[tr_idx], labels[tr_idx])
    val_acc = accuracy(logp[va_idx], labels[va_idx])
    print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | TrainAcc {:.4f} |"
              " ValAcc {:.4f}".
              format(epoch, mean_dur, loss.item(), train_acc,
            val_acc,))

# Score all nodes once more with the FINAL weights. The `logp` left over from
# the loop was computed before the last optimizer step and still carries
# autograd history.
with torch.no_grad():
    logp = F.log_softmax(net(G_dgl, inputs), 1)

# Pick the test rows in the row order of `test_text`, which is the order the
# BERT predictions are produced in, so the two score matrices line up row by
# row when they are combined. Node index == paper id, as assumed when the
# labels were filled in.
graph_pred = logp[torch.as_tensor(test_text.id.values, dtype=torch.long)]

**Model outputs combination**

Combine logits from the two models for prediction

In [None]:
# Add the per-class scores of the two models element-wise. torch.stack
# requires both matrices to have exactly the same shape, so a row-count
# mismatch fails loudly here instead of broadcasting silently.
stack_prob = torch.stack([BERT_pred, graph_pred])
final_prob = torch.sum(stack_prob, dim=0)

# Predicted class = index of the largest combined score in each row.
final_pred = predict(final_prob)

# Store plain integers in the DataFrame by converting the tensor to NumPy
# first, so the CSV holds class ids rather than tensor objects.
test_text["label"] = final_pred.cpu().numpy()
print(test_text.head())

# Drop the title column by keyword; the positional axis argument of
# DataFrame.drop is no longer accepted by pandas 2.
test = test_text.drop(columns="title")
test.to_csv('test.csv', index=False)

files.download('test.csv')