In [None]:
# Install the required libraries
!pip install transformers
!pip install torch-scatter
!pip install torch-sparse
!pip install torch-geometric

In [None]:
# Import libraries

import os
import sys
import math
import random
import numpy as np
import pandas as pd
from google.colab import drive
import xml.etree.ElementTree as ET
import glob
from scipy import io
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel, XLMRobertaConfig, XLMRobertaModel, XLMRobertaTokenizer
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils as utils
import torch.optim.lr_scheduler as lr_scheduler
from torch.nn import Parameter
from sklearn.utils import shuffle

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, classification_report, f1_score, recall_score, precision_score, accuracy_score
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from nltk.tokenize import WordPunctTokenizer
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Select the compute device once; later cells read the module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("working on gpu ", torch.cuda.get_device_name(0))
else:
    print("working on cpu")

In [None]:
# Resolve the data directory: mount Google Drive when running in Colab,
# otherwise keep the fallback path below.
data_path = "/Colab Notebooks/"

try:
    drive.mount('/content/drive')
    data_path = "/content/drive/MyDrive/Colab Notebooks/UCCA-CNN/"
except Exception as mount_error:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the cause
    # instead of hiding it.
    print("You are not working in Colab at the moment :(")
    print("Drive mount failed:", repr(mount_error))

In [None]:
# Hyperparameters for the GAT model and its training loop.

# Reproducibility
RANDOM_SEED = 42            # seed for the K-fold splitter

# Optimisation
epoch_size = 50             # number of training epochs per fold
batch_size = 1              # samples per DataLoader batch
learning_rate = 0.0005      # Adam learning rate
init_weight_decay = 0.2     # only referenced in a commented-out optimizer argument
init_clip_max_norm = 0.1    # not read by the visible training code

# Architecture
nhid = 800                  # hidden size of every attention head
n_out = 2                   # number of output classes
dropout = 0.2               # dropout probability used throughout the model

In [None]:
# Model dataset

class CustomDataset(Dataset):
    """Dataset of (node features, adjacency matrix, label, sentence length) samples.

    Args:
        split: Name of the split; accepted but never stored or read.
        feature: Sequence of per-sample feature arrays (converted with ``np.array``).
        adj: Sequence of per-sample adjacency matrices (converted with ``np.array``).
        label: Indexable collection of labels; each one is cast to ``int`` on access.
        length: Indexable collection of per-sample lengths, returned untouched.
    """

    def __init__(self, split, feature, adj, label, length):
        self.label_array = label
        self.s_length = length
        self.adj_array = np.array(adj)
        self.feature_array = np.array(feature)

    def __len__(self):
        """Return the number of samples (length of the feature array)."""
        return len(self.feature_array)

    def __getitem__(self, idx):
        """Return ``(feature, adjacency, label, length)`` for sample ``idx``."""
        label = int(self.label_array[idx])
        return self.feature_array[idx], self.adj_array[idx], label, self.s_length[idx]

    
def collate_fn(data):
    """Pad a list of variable-size graph samples into dense batch arrays.

    Samples are ordered by node count (largest first) and zero-padded at the
    FRONT of the node axis, so a graph with ``n`` nodes occupies the last ``n``
    rows of its feature matrix and the bottom-right ``n x n`` corner of its
    adjacency matrix.

    Args:
        data: List of ``(feature, adj, label, sentence_length)`` tuples, where
            ``feature`` is a 2-D array ``(nodes, feature_dim)`` and ``adj`` is
            a square ``(nodes, nodes)`` array.

    Returns:
        Tuple ``(targets, targets_adj, labels, sentence_length)``:
        ``targets`` has shape ``(batch, longest, feature_dim)``, ``targets_adj``
        has shape ``(batch, longest, longest)``, and ``labels`` and
        ``sentence_length`` are tuples with one entry per sample, in the same
        (sorted) order as the arrays.
    """
    # sorted() keeps the caller's list untouched (the original sorted in place).
    ordered = sorted(data, key=lambda sample: sample[0].shape[0], reverse=True)
    arrays, adjs, labels, sentence_length = zip(*ordered)
    lengths = [array.shape[0] for array in arrays]
    longest = max(lengths)
    # Take the feature width from the data instead of hard-coding 768.
    feature_dim = arrays[0].shape[1]
    targets = np.zeros([len(arrays), longest, feature_dim])
    targets_adj = np.zeros([len(arrays), longest, longest])
    for i, (array, adj, n_nodes) in enumerate(zip(arrays, adjs, lengths)):
        pad = longest - n_nodes
        targets[i, pad:, :] = array
        # Rows AND columns are shifted by the same offset so that adjacency
        # entry (r, c) keeps pointing at feature rows r and c after padding.
        targets_adj[i, pad:, pad:] = adj
    # Return every sample's sentence length, not just the last one visited.
    return targets, targets_adj, labels, sentence_length

In [None]:
class GraphAttentionLayer(nn.Module):
    """
    Simple GAT layer, similar to https://arxiv.org/abs/1710.10903

    Works on a single graph (``h``: ``(N, in_features)``, ``adj``: ``(N, N)``)
    and on batched graphs (``h``: ``(B, N, in_features)``, ``adj``:
    ``(B, N, N)``). All axis handling is relative to the last two dimensions.

    Args:
        in_features: Size of each input node feature vector.
        out_features: Size of each output node feature vector.
        dropout: Dropout probability applied to the attention coefficients.
        alpha: Negative slope of the LeakyReLU used on the attention logits.
        concat: If True, apply ELU to the output (intermediate layer);
            otherwise return the raw aggregated features (final layer).
    """
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.empty(size=(2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h, adj):
        """Return attention-aggregated node features, shape ``(..., N, out_features)``."""
        Wh = torch.matmul(h, self.W)  # (..., N, out_features)
        e = self._prepare_attentional_mechanism_input(Wh)  # (..., N, N)

        # Non-edges get a huge negative logit so softmax gives them ~0 weight.
        masked = torch.where(adj > 0, e, torch.full_like(e, -9e15))
        # Normalise over the neighbour axis (last dim). With dim=1 a batched
        # input would be normalised over the source-node axis instead.
        attention = F.softmax(masked, dim=-1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, Wh)

        if self.concat:
            return F.elu(h_prime)
        return h_prime

    def _prepare_attentional_mechanism_input(self, Wh):
        """Build the pairwise attention logits ``e[..., i, j]`` from ``Wh``."""
        # Wh.shape (..., N, out_feature)
        # self.a.shape (2 * out_feature, 1)
        # Wh1&2.shape (..., N, 1)
        # e.shape (..., N, N)
        Wh1 = torch.matmul(Wh, self.a[:self.out_features, :])
        Wh2 = torch.matmul(Wh, self.a[self.out_features:, :])
        # Broadcast add. transpose(-1, -2) swaps only the last two axes;
        # `.T` would reverse every axis of a batched (3-D) tensor.
        e = Wh1 + Wh2.transpose(-1, -2)
        return self.leakyrelu(e)

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'


class SpecialSpmmFunction(torch.autograd.Function):
    """Sparse-times-dense matmul whose backward pass only touches the sparse entries.

    ``forward`` builds a COO matrix from ``indices``/``values``/``shape`` and
    multiplies it by the dense matrix ``b``. ``backward`` returns gradients
    for ``values`` and ``b`` only.
    """

    @staticmethod
    def forward(ctx, indices, values, shape, b):
        assert not indices.requires_grad
        sparse_a = torch.sparse_coo_tensor(indices, values, shape)
        ctx.save_for_backward(sparse_a, b)
        # Row count; used in backward as the stride when flattening the dense gradient.
        ctx.N = shape[0]
        return torch.matmul(sparse_a, b)

    @staticmethod
    def backward(ctx, grad_output):
        sparse_a, b = ctx.saved_tensors
        grad_values = None
        grad_b = None
        if ctx.needs_input_grad[1]:
            # Compute the dense gradient, then gather only the stored positions.
            dense_grad = grad_output.matmul(b.t())
            positions = sparse_a._indices()
            flat_idx = positions[0, :] * ctx.N + positions[1, :]
            grad_values = dense_grad.view(-1)[flat_idx]
        if ctx.needs_input_grad[3]:
            grad_b = sparse_a.t().matmul(grad_output)
        # One slot per forward argument: indices, values, shape, b.
        return None, grad_values, None, grad_b


class SpecialSpmm(nn.Module):
    """Module wrapper so ``SpecialSpmmFunction`` can be used like a layer."""

    def forward(self, indices, values, shape, b):
        """Apply ``SpecialSpmmFunction`` to the given sparse pieces and dense ``b``."""
        spmm = SpecialSpmmFunction.apply
        return spmm(indices, values, shape, b)

    
class SpGraphAttentionLayer(nn.Module):
    """
    Sparse version GAT layer, similar to https://arxiv.org/abs/1710.10903

    Expects a single graph: ``input`` of shape ``(N, in_features)`` (it is
    multiplied with ``torch.mm``) and a dense ``adj`` whose non-zero entries
    mark the edges.

    Args:
        in_features: Size of each input node feature vector.
        out_features: Size of each output node feature vector.
        dropout: Dropout probability applied to the per-edge attention values.
        alpha: Negative slope of the LeakyReLU used on the attention logits.
        concat: If True, apply ELU to the output (intermediate layer);
            otherwise return the raw aggregated features (final layer).
    """

    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(SpGraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_normal_(self.W.data, gain=1.414)

        self.a = nn.Parameter(torch.zeros(size=(1, 2*out_features)))
        nn.init.xavier_normal_(self.a.data, gain=1.414)

        self.dropout = nn.Dropout(dropout)
        self.leakyrelu = nn.LeakyReLU(self.alpha)
        self.special_spmm = SpecialSpmm()

    def forward(self, input, adj):
        """Return the attention-aggregated node features, shape ``(N, out_features)``."""
        N = input.size()[0]
        edge = adj.nonzero().t()
        # edge: 2 x E (row indices, column indices)

        h = torch.mm(input, self.W)
        # h: N x out
        assert not torch.isnan(h).any()

        # Self-attention on the nodes - Shared attention mechanism
        edge_h = torch.cat((h[edge[0, :], :], h[edge[1, :], :]), dim=1).t()
        # edge_h: 2*D x E

        edge_e = torch.exp(-self.leakyrelu(self.a.mm(edge_h).squeeze()))
        assert not torch.isnan(edge_e).any()
        # edge_e: E

        # new_ones() inherits device AND dtype from h. The previous
        # 'cuda'/'cpu' string always chose the default GPU and float32, which
        # breaks for inputs on another GPU or in another precision.
        ones = h.new_ones((N, 1))
        e_rowsum = self.special_spmm(edge, edge_e, torch.Size([N, N]), ones)
        # e_rowsum: N x 1

        edge_e = self.dropout(edge_e)
        # edge_e: E

        h_prime = self.special_spmm(edge, edge_e, torch.Size([N, N]), h)
        assert not torch.isnan(h_prime).any()
        # h_prime: N x out

        # NOTE(review): a node without any edge has e_rowsum == 0, so this
        # division yields NaN and trips the assert below — confirm that callers
        # always include self-loops.
        h_prime = h_prime.div(e_rowsum)
        # h_prime: N x out
        assert not torch.isnan(h_prime).any()

        if self.concat:
            # if this layer is not last layer,
            return F.elu(h_prime)
        # if this layer is last layer,
        return h_prime

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'

In [None]:
class UCCA_GAT(nn.Module):
    """Dense multi-head GAT classifier over batched graphs.

    Args:
        nfeat: Size of the input node features.
        nhid: Hidden size of every attention head.
        nclass: Number of output classes.
        dropout: Dropout probability (inputs, between layers, inside heads).
        alpha: Negative slope of the LeakyReLU inside each attention head.
        nheads: Number of attention heads per stack.
    """

    def __init__(self, nfeat, nhid, nclass, dropout, alpha, nheads):
        """Dense version of GAT."""
        super(UCCA_GAT, self).__init__()
        self.dropout = dropout

        def make_heads(in_features):
            # nn.ModuleList registers every head under a unique name. The
            # previous add_module('attention_{i}') calls reused the same names
            # for every stack, so later stacks overwrote earlier ones and the
            # heads used in forward() were missing from parameters() and
            # state_dict() (never optimised, never saved).
            return nn.ModuleList([
                GraphAttentionLayer(in_features, nhid, dropout=dropout, alpha=alpha, concat=True)
                for _ in range(nheads)
            ])

        # No per-layer .to(device): registered submodules follow the model when
        # the caller moves it with model.to(...).
        self.attentions = make_heads(nfeat)
        self.attentions1 = make_heads(nhid * nheads)
        # NOTE(review): attentions2..attentions6 are not used by forward();
        # they are kept only so existing attribute access keeps working.
        self.attentions2 = make_heads(nhid * nheads)
        self.attentions3 = make_heads(nhid * nheads)
        self.attentions4 = make_heads(nhid * nheads)
        self.attentions5 = make_heads(nhid * nheads)
        self.attentions6 = make_heads(nhid * nheads)

        self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, adj):
        """Return per-graph class scores.

        Args:
            x: Node features, shape ``(batch, nodes, nfeat)``.
            adj: Adjacency matrices, shape ``(batch, nodes, nodes)``.

        Returns:
            Tensor of shape ``(batch, nclass)``: the output layer's node scores
            averaged over the node axis.
        """
        x = F.dropout(x, self.dropout, training=self.training)
        # Concatenate heads along the FEATURE axis (last dim). dim=1 is the
        # node axis for batched input and only worked by accident for nheads=1.
        x = torch.cat([att(x, adj) for att in self.attentions], dim=-1)
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x, adj) for att in self.attentions1], dim=-1)
        x = F.elu(self.out_att(x, adj))
        # Mean-pool over nodes to get one score vector per graph.
        x = torch.mean(x, 1)
        return x

In [None]:
def save_checkpoint(state, location):
    """Serialise ``state`` with ``torch.save`` to ``<location>/best.pth.tar``.

    Args:
        state: Any object accepted by ``torch.save`` (here: a dict with the
            epoch number and the model / optimizer state dicts).
        location: Target directory; it is created if it does not exist yet.
    """
    if location:
        # torch.save does not create parent directories, so make sure the
        # folder exists before writing.
        os.makedirs(location, exist_ok=True)
    filepath = os.path.join(location, 'best.pth.tar')
    torch.save(state, filepath)
    
def train(train_dl, model, optimizer):
    """Run one training epoch and return the mean loss per batch.

    Reads the module-level ``criterion``, ``n_out`` and ``device``.

    Args:
        train_dl: Iterable yielding ``(feature, adj, label, length)`` batches.
        model: Module called as ``model(feature, adj)``.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0.
    for feature, adj, label, _length in train_dl:
        # Convert the collated batch to tensors on the target device.
        feature = torch.FloatTensor(feature).to(device)
        adj = torch.FloatTensor(adj).to(device)
        label = torch.LongTensor(label).to(device)

        optimizer.zero_grad()
        logits = model(feature, adj)
        batch_loss = criterion(logits.view(-1, n_out), label)
        batch_loss.backward()
        # Clip the gradient norm at 0.5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        running_loss += batch_loss.item()

    n_batches = float(len(train_dl))
    return running_loss / n_batches


def evaluate(model, dl):
    """Evaluate ``model`` on ``dl`` without gradient tracking.

    Reads the module-level ``criterion``, ``n_out`` and ``device``.

    Args:
        model: Module called as ``model(feature, adj)``.
        dl: Iterable yielding ``(feature, adj, label, length)`` batches.

    Returns:
        Tuple ``(macro_f1, total_loss, labels, predictions, lengths)`` where
        ``total_loss`` is the SUM of the batch losses (not the mean) and
        ``lengths`` holds one entry per batch, exactly as the loader yields it.
    """
    model.eval()
    loss_sum = 0
    predictions, targets, lengths = [], [], []
    with torch.no_grad():
        for feature, adj, label, length in dl:
            feature = torch.FloatTensor(feature).to(device)
            adj = torch.FloatTensor(adj).to(device)
            label = torch.LongTensor(label).to(device)

            logits = model(feature, adj)
            loss_sum += criterion(logits.view(-1, n_out), label).item()

            # The predicted class is the index of the largest score per sample.
            batch_pred = torch.argmax(logits, dim=1)
            predictions.extend(batch_pred.data.cpu().numpy())
            targets.extend(label.data.cpu().numpy())
            lengths.append(length)
    macro_f1 = f1_score(targets, predictions, average='macro')
    return macro_f1, loss_sum, targets, predictions, lengths

def train_and_evaluate(model, optimizer, train_dl, val_dl, test_dl=None, fold=0):
    """Train for ``epoch_size`` epochs and keep the results of the best epoch.

    The learning rate decays by a factor of 0.95 after every epoch. Whenever
    the validation score improves, a checkpoint is written to
    ``data_path + 'result/'`` and that epoch's validation outputs are kept.

    Args:
        model: Model to train.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_dl: Training data loader.
        val_dl: Validation data loader.
        test_dl: Unused; kept for call compatibility.
        fold: Unused; kept for call compatibility. The checkpoint file name
            does not depend on it, so each fold overwrites the previous one.

    Returns:
        Tuple ``(best_val_score, labels, predictions, lengths)``, all taken
        from the epoch with the best validation score. The score is the
        macro-F1 returned by ``evaluate``.
    """
    best_val_acc = -999.9
    # step_size is an epoch count, so pass it as an int.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
    label_best = []
    prediction_best = []
    # Tracked alongside labels/predictions so all three come from the SAME
    # epoch. Returning the last epoch's lengths would misalign them whenever
    # the validation loader shuffles, and was unbound for epoch_size == 0.
    length_best = []
    for epoch in range(1, epoch_size + 1):
        train(train_dl, model, optimizer)
        val_acc, _val_loss, label_list, prediction_list, length_list = evaluate(model, val_dl)
        if val_acc > best_val_acc:
            save_checkpoint(
                {'epoch': epoch, 'state_dict': model.state_dict(), 'optim_dict': optimizer.state_dict()},
                location=data_path + 'result/',
            )
            best_val_acc = val_acc
            label_best = label_list
            prediction_best = prediction_list
            length_best = length_list
        scheduler.step()
    print("Best Val acc = ", best_val_acc)
    return best_val_acc, label_best, prediction_best, length_best

def train_and_evaluate_fold(data=None):
    """Run 10-fold cross-validation of ``UCCA_GAT`` and collect the fold results.

    Args:
        data: Mapping with the entries ``"feature"``, ``"adjacency"``,
            ``"labels"`` and ``"lenghts"`` (spelled as in the data file).
            Defaults to the module-level ``dataset``.

    Returns:
        Tuple ``(label_all, prediction_all, length_all, gcn_model)``: the
        best-epoch validation labels, predictions and lengths concatenated
        over all folds, plus the model trained in the LAST fold.
    """
    # Read the source under a different local name: assigning to `dataset`
    # inside this function made it a local variable, so reading
    # dataset["feature"] raised UnboundLocalError before anything ran.
    raw = dataset if data is None else data

    label_all = []
    prediction_all = []
    best_accuracy = []
    length_all = []

    k_folds = 10

    torch.manual_seed(RANDOM_SEED)

    # Define the K-fold Cross Validator
    kfold = KFold(n_splits=k_folds, random_state=RANDOM_SEED, shuffle=True)
    # Start print
    print('--------------------------------')

    samples = pd.DataFrame({
        'feature': np.squeeze(np.array(raw["feature"])),
        'adj': np.squeeze(np.array(raw["adjacency"])),
        'label': np.squeeze(np.array(raw["labels"])),
        'length': np.squeeze(np.array(raw["lenghts"])),
    })
    # Seeded so that the sample order, and therefore the folds, is reproducible.
    samples = shuffle(samples, random_state=RANDOM_SEED)

    for fold, (train_idx, val_idx) in enumerate(kfold.split(samples)):
        # A fresh model and optimizer per fold so no weights leak between folds.
        gcn_model = UCCA_GAT(nfeat=768,
                             nhid=nhid,
                             nclass=n_out,
                             dropout=dropout,
                             alpha=0.1,
                             nheads=1).to(device)
        optimizer = torch.optim.Adam(gcn_model.parameters(), lr=learning_rate)

        train_df = samples.iloc[train_idx]
        valid_df = samples.iloc[val_idx]
        print(fold)

        # CustomDataset is the dataset class defined in this notebook; the
        # previously referenced `IronyDataset` is not defined anywhere in it.
        train_set = CustomDataset("train", train_df["feature"].to_numpy(), train_df["adj"].to_numpy(),
                                  list(train_df["label"]), list(train_df["length"]))
        valid_set = CustomDataset("val", valid_df["feature"].to_numpy(), valid_df["adj"].to_numpy(),
                                  list(valid_df["label"]), list(valid_df["length"]))
        dl_train = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        # Validation order is fixed so labels, predictions and lengths line up
        # across epochs.
        dl_val = DataLoader(valid_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

        # Pass `fold` by keyword; positionally it landed in the `test_dl` slot.
        best_a, label_list, prediction_list, length_list = train_and_evaluate(
            gcn_model, optimizer, dl_train, dl_val, fold=fold)
        best_accuracy.append(best_a)
        label_all.extend(label_list)
        prediction_all.extend(prediction_list)
        length_all.extend(length_list)
    print(np.mean(best_accuracy))
    return label_all, prediction_all, length_all, gcn_model

In [None]:
# Loss function; train() and evaluate() read `criterion` as a module-level global.
criterion = nn.CrossEntropyLoss()
# Load the MATLAB-format dataset; train_and_evaluate_fold() reads its
# "feature", "adjacency", "labels" and "lenghts" entries.
dataset = io.loadmat(data_path + 'dependency_dataset/bert_mr_xlm.mat')
# Run the 10-fold cross-validation. `gcn_model` is the model from the last fold.
label_all, prediction_all, length_all, gcn_model = train_and_evaluate_fold()