In [1]:
import argparse
import sys
from sentence_transformers import SentenceTransformer
from sentence_transformers_local import models, losses, SentenceTransformerSequential
from models.Transformers import SCCLBert
from learners.cluster import ClusterLearner
from dataloader.dataloader import augment_loader, augment_loader_split
from training import training
from utils.kmeans import get_kmeans_centers
from utils.logger import setup_path
from utils.randomness import set_global_random_seed
import torch
import pandas as pd
import os
from torch import nn

In [2]:
# !pip install torch
import os

# Select the GPU through the environment before the CUDA queries below
# (device ordering by PCI bus id: see issue #152).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "6",
})

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
print(torch.cuda.device_count())

# Extra diagnostics that are only meaningful on a CUDA device.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))
    print("Memory Usage:")
    print(f"Allocated: {round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1)} GB")
    print(f"Cached:    {round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1)} GB")

Using device: cuda
1
Tesla V100-SXM2-32GB-LS
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
# Short aliases (the values accepted by --bert) mapped to the pretrained
# checkpoint name handed to SentenceTransformer.
MODEL_CLASS = dict([
    ("distil", "distilbert-base-nli-stsb-mean-tokens"),
    ("robertabase", "roberta-base-nli-stsb-mean-tokens"),
    ("robertalarge", "roberta-large-nli-stsb-mean-tokens"),
    ("msmarco", "distilroberta-base-msmarco-v2"),
    ("xlm", "xlm-r-distilroberta-base-paraphrase-v1"),
    ("bertlarge", "bert-large-nli-stsb-mean-tokens"),
    ("bertbase", "bert-base-nli-stsb-mean-tokens"),
    ("paraphrase", "paraphrase-mpnet-base-v2"),
    ("paraphrase-distil", "paraphrase-distilroberta-base-v2"),
    ("paraphrase-Tiny", "paraphrase-TinyBERT-L6-v2"),
])

def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` evaluates ``bool("False")``, which is
    ``True`` for every non-empty string, so a boolean option could never be
    switched off from an argument list. This converter parses the text
    instead.

    Args:
        value: A bool (returned unchanged) or a string such as
            "true"/"false", "yes"/"no", "1"/"0" (case-insensitive).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


parser = argparse.ArgumentParser()

# ---------------------------------------------------------------- General
# (disabled) parser.add_argument('--gpuid', nargs="+", type=int, default=[0],
#            help="The list of gpuid, ex:--gpuid 3 1. Negative value means cpu-only")
parser.add_argument('--seed', type=int, default=0, help="")
parser.add_argument('--print_freq', type=float, default=400, help="")
parser.add_argument('--result_path', type=str, default='./results/')

# Key into MODEL_CLASS; an earlier run used default='distil'.
parser.add_argument('--bert', type=str, default='paraphrase', help="")

parser.add_argument('--bert_model', type=str, default='bert-base-uncased', help="")
parser.add_argument('--note', type=str, default='_search_snippets_distil_lre-4_JSD', help="")

# ---------------------------------------------------------------- Dataset
# Earlier alternatives for --dataset: 'stackoverflow' (stackoverflow/stackoverflow_true_text).
parser.add_argument('--dataset', type=str, default='search_snippets', help="")
parser.add_argument('--max_length', type=int, default=32)
# NOTE(review): the default is a two-element list, which argparse passes
# through untouched, while a value given on the command line becomes a single
# float. Left as is because other code compares this option against -1.
parser.add_argument('--train_val_ratio', type=float, default= [0.9, 0.1])

# ------------------------------------------------ Data for train and test
# Presets used in earlier experiments (set all four options together):
#
#   dataset         data_path                   dataname / dataname_val                  num_classes
#   AgNews          ./datasets/                 agnewsdataraw-8000 / agnewsdataraw-8000  4
#   SearchSnippets  ./datasets/augmented/contextual_30_2col_roberta/
#                                               search_snippets / search_snippets        8
#                   (older file names: train_search_snippets.csv / test_search_snippets.csv)
#   StackOverflow   ./datasets/stackoverflow/   stackoverflow / stackoverflow_            20
#   Biomedical      ./datasets/biomedical/      biomedical / biomedical                  20
#   Tweet           ./datasets/                 tweet_remap_label / tweet_remap_label    89
#   GoogleNewsTS    ./datasets/                 TS / TS                                  152
#   GoogleNewsT     ./datasets/                 T / T                                    152
#   GoogleNewsS     ./datasets/                 S / S                                    152
#
# NOTE(review): the active defaults pair the augmented data_path listed under
# SearchSnippets with the Biomedical dataname/num_classes, and --dataset is
# still 'search_snippets' -- confirm this combination is intended.
parser.add_argument('--data_path', type=str, default='./datasets/augmented/contextual_30_2col_roberta/')
parser.add_argument('--dataname', type=str, default='biomedical', help="")
parser.add_argument('--dataname_val', type=str, default='biomedical', help="")
parser.add_argument('--num_classes', type=int, default=20, help="")

# ---------------------------------------------------- Learning parameters
parser.add_argument('--lr', type=float, default=1e-6, help="")  # learning rate
parser.add_argument('--lr_scale', type=int, default=100, help="")
parser.add_argument('--max_iter', type=int, default=30000)
parser.add_argument('--batch_size', type=int, default=256)  # batch size

# ------------------------------------------------- CNN setting (disabled)
# parser.add_argument('--out_channels', type=int, default=768)
# parser.add_argument('--use_cnn', type=str, default='cnn_1')
#   other values tried: 'cnn_3', 'cnn_5', 'cnn_7', 'cnn_cat', 'cnn_avg'

# --------------------------------------------------- Contrastive learning
# Boolean options use _str2bool so that e.g. "--use_head False" is False.
parser.add_argument('--use_head', type=_str2bool, default=False)
parser.add_argument('--use_normalize', type=_str2bool, default=False)

parser.add_argument('--weighted_local', type=_str2bool, default=False, help="")
# An earlier run used default='inverse_prob'.
parser.add_argument('--normalize_method', type=str, default='none', help="")

parser.add_argument('--contrastive_local_scale', type=float, default=0.00)  # scale of contrastive loss
parser.add_argument('--contrastive_global_scale', type=float, default=0.01)  # scale of contrastive loss
parser.add_argument('--temperature', type=float, default=0.5, help="temperature required by contrastive loss")
parser.add_argument('--base_temperature', type=float, default=0.1, help="temperature required by contrastive loss")

# ------------------------------------------------------------- Clustering
parser.add_argument('--clustering_scale', type=float, default=0.02)  # scale of clustering loss
parser.add_argument('--use_perturbation', action='store_true', help="")
parser.add_argument('--alpha', type=float, default=1)

# An explicit empty list keeps argparse from reading the notebook kernel's
# own sys.argv, so every option takes its default.
args = parser.parse_args(args=[])
# (disabled) args.use_gpu = args.gpuid[0] >= 0
# Placeholders; both attributes are filled in once the results path is set up.
args.resPath = None
args.tensorboard = None

In [None]:
# End-to-end training cell: set up output paths and seed, initialise cluster
# centres with k-means on Sentence-BERT embeddings, build the feature
# extractor and clustering model, create the optimizer and learner, then train.

# Store what setup_path returns on args so later code can read it from there.
resPath, tensorboard = setup_path(args)
args.resPath, args.tensorboard = resPath, tensorboard
# Seed through the project helper before any data loading or model construction.
set_global_random_seed(args.seed)

# Dataset loader
# This loader feeds the k-means initialisation below; it is also the training
# loader in the no-split branch at the bottom of the cell.
train_loader = augment_loader(args)

# torch.cuda.set_device(args.gpuid[0])
# torch.cuda.set_device(device)

# Initialize cluster centers
# by performing k-means after getting embeddings from Sentence-BERT with mean-pooling (default)
sbert = SentenceTransformer(MODEL_CLASS[args.bert])
cluster_centers = get_kmeans_centers(sbert, train_loader, args.num_classes) 



# Model
# 1. Transformer model 
# use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
# word_embedding_model = models.Transformer(MODEL_CLASS[args.bert])
# NOTE(review): the checkpoint is hard-coded here, whereas the k-means step
# above uses MODEL_CLASS[args.bert]. The two only agree while --bert keeps its
# default 'paraphrase'.
word_embedding_model = models.Transformer('sentence-transformers/paraphrase-mpnet-base-v2')
# word_embedding_model = models.Transformer('sentence-transformers/stanford-sentiment-treebank-roberta.2021-03-11')

# model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embedding width reported by the transformer; reused as the pooling width.
dimension = word_embedding_model.get_word_embedding_dimension()
# word_embedding_model = torch.nn.DataParallel(word_embedding_model)


# 2. CNN model
# cnn = models.CNN(in_word_embedding_dimension = word_embedding_model.get_word_embedding_dimension(), 
#                  use_cnn = args.use_cnn, out_channels = word_embedding_model.get_word_embedding_dimension())

# 3. Pooling 
# pooling_model = models.Pooling(cnn.get_word_embedding_dimension(),
#                                pooling_mode_mean_tokens=True,
#                                pooling_mode_cls_token=False,
#                                pooling_mode_max_tokens=False)
# Only the weighted-token mode is enabled; mean, CLS and max pooling are off.
# NOTE(review): pooling_mode_weighted_tokens comes from the local
# sentence_transformers_local package -- confirm its semantics there.
pooling_model = models.Pooling(dimension,
                               pooling_mode_mean_tokens=False,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False,
                               pooling_mode_weighted_tokens=True)

# 4. Feature extractor 
#feature_extractor = SentenceTransformerSequential(modules=[word_embedding_model, cnn, pooling_model])
# NOTE(review): device is hard-coded to 'cuda' instead of using the `device`
# selected in the setup cell; presumably this fails on a CPU-only machine.
feature_extractor = SentenceTransformerSequential(modules=[word_embedding_model, pooling_model], device = 'cuda')

# 5. main model
model = SCCLBert(feature_extractor, cluster_centers=cluster_centers, alpha = args.alpha, use_head = args.use_head)  


# Optimizer 
# Three parameter groups with different learning rates: transformer at 6x the
# base lr, pooling at the base lr (the optimizer default), cluster centres at
# 60x the base lr.
optimizer = torch.optim.Adam([
    {'params':word_embedding_model.parameters(), 'lr': args.lr*6},
#    {'params':cnn.parameters(), 'lr': args.lr*50},
    {'params':pooling_model.parameters()},
#    {'params':model.head.parameters(), 'lr': args.lr*args.lr_scale},
    {'params':model.cluster_centers, 'lr': args.lr*60}], lr=args.lr)
# # optimizer = torch.optim.Adam(lr=1e-4,params=model.parameters())
# optimizer = torch.optim.AdamW([
#     {'params':word_embedding_model.parameters(), 'lr': args.lr},
# #    {'params':cnn.parameters(), 'lr': args.lr*50},
#     {'params':pooling_model.parameters()},
# #    {'params':model.head.parameters(), 'lr': args.lr*args.lr_scale},
#     {'params':model.cluster_centers, 'lr': args.lr*20}], lr=args.lr)
# # optimizer = torch.optim.Adam(lr=1e-4,params=model.parameters())
print(optimizer)


# Set up the trainer    
learner = ClusterLearner(model, feature_extractor, optimizer, args.temperature, args.base_temperature,
                         args.contrastive_local_scale, args.contrastive_global_scale, args.clustering_scale, use_head = args.use_head, use_normalize = args.use_normalize)
# learner = torch.nn.DataParallel(learner)
# NOTE(review): unconditional .cuda() -- requires a GPU, like the hard-coded
# device above.
learner = learner.cuda()

# split train - validation
# The default train_val_ratio is a list, so this branch runs unless the option
# is explicitly set to -1. It rebinds train_loader to the training split.
if(args.train_val_ratio != -1):
    train_loader, val_loader = augment_loader_split(args)
    training(train_loader, learner, args, val_loader = val_loader)
# normal
else:
    training(train_loader, learner, args)

results path: ./results/SCCL.paraphrase.search_snippets.lr1e-06.lrscale100.tmp0.5.alpha1.seed0/
all_embeddings:(20000, 768), true_labels:20000, pred_labels:20000
true_labels tensor([ 9,  7, 11,  ..., 10,  0, 15])
pred_labels tensor([ 0, 10, 12,  ...,  7, 17, 15], dtype=torch.int32)
Iterations:58, Clustering ACC:0.441, centers:(20, 768)
initial_cluster_centers =  torch.Size([20, 768])
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 6e-06
    weight_decay: 0

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-06
    weight_decay: 0

Parameter Group 2
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 5.9999999999999995e-05
    weight_decay: 0
)
train_sample 0.9 18000
val_sample 0.1 2000

=30000/71=Iterations/Batches




[0]-----
contrastive_local_loss:	 1.24337
contrastive_global_loss:	 0.01378
clustering_loss:	 0.00000
local_consistency_loss:	 0.00000
------------- Evaluate Training Set -------------
------------- 71 batches -------------


In [None]:
class ScoringAttention(nn.Module):
    """Additive attention scorer.

    Computes ``dense1(tanh(dense2(local) + dense3(global)))``: both inputs are
    projected into a shared hidden space, summed, squashed with tanh and
    reduced to one raw score per position.

    Args:
        input_dim: Size of the last dimension of both inputs (default 768).
        hidden_dim: Size of the shared projection space (default 256).
    """

    def __init__(self, input_dim=768, hidden_dim=256):
        super(ScoringAttention, self).__init__()

        # dense1 consumes the tanh output, which lives in the hidden space, so
        # its input width must be hidden_dim. It used to be 768, which made
        # forward() fail with a shape mismatch on every call.
        self.dense1 = nn.Linear(hidden_dim, 1)
        self.dense2 = nn.Linear(input_dim, hidden_dim)
        self.dense3 = nn.Linear(input_dim, hidden_dim)

        self.tanh = torch.nn.Tanh()
        # Not applied in forward(); kept as an attribute for any code that
        # normalises the raw scores itself.
        self.softm = torch.nn.Softmax(dim=1)

    def forward(self, l, g):
        """Return unnormalised attention scores.

        Args:
            l: Local representations; last dimension ``input_dim``.
            g: Global representations; last dimension ``input_dim``. After
                projection it is added to the projected ``l``, so the two must
                have broadcast-compatible leading dimensions.

        Returns:
            Tensor with the broadcast leading dimensions and a trailing
            dimension of size 1.
        """
        # score = dense1(tanh(dense2(local) + dense3(global)))
        local_proj = self.dense2(l)
        global_proj = self.dense3(g)
        hidden = self.tanh(local_proj + global_proj)
        return self.dense1(hidden)

In [None]:
# sentence_0_lengths [20 18 17 27 31 25 32 32 23 22 32 23 21 22 21 32 22 21 25 29 26 27 32 32
#  30 27 24 26 29 32 32 32 19 27 28 32 20 24 21 32 17 30 32 28 29 30 28 32
#  23 32 18 26 23 23 32 32 25 32 25 32 31 32 21 32 32 28 32 24 32 19 30 32
#  23 29 27 17 20 29 32 23 17 26 31 32 16 32 24 19 27 29 25 32 32 29 21 32
#  31 32 32 32 32 26 32 32 26 32 18 27 28 32 31 28 22 30 24 31 24 32 24 32
#  24 23 23 32 27 32 32 25 21 22 29 32 27 19 15 32 30 27 30 32 29 24 25 32
#  30 31 25 20 26 28 24 20 25 32 25 32 24 18 13 26 24 29 32 26 21 24 15 19
#  32 32 19 23 26 13 28 25 32 30 32 32 32 32 31 32 18 25 22 30 32 16 32 27
#  28 27 32 32 32 32 29 20 18 24 27 32 19 27 17 26 21 32 21 22 28 16 28 31
#  24 32 24 26 28 20 22 14 21 32 19 22 23 22 16 27 22 21 22 22 26 31 27 29
#  22 32 17 32 19 28 26 25 23 32 21 18 24 26 32 32]

# tok_rep_0 256
# tok_rep_1 256
# tok_rep_0[0].shape torch.Size([20, 768])
# tok_rep_1[0].shape torch.Size([19, 768])
# local_rep_0 torch.Size([6743, 768])
# local_rep_1 torch.Size([6640, 768])
# global_rep_0 torch.Size([256, 768])
# global_rep_1 torch.Size([256, 768])
# pos_mask.shape torch.Size([13383, 256])
# pos_mask tensor([[1., 0., 0.,  ..., 0., 0., 0.],
#         [1., 0., 0.,  ..., 0., 0., 0.],
#         [1., 0., 0.,  ..., 0., 0., 0.],
#         ...,
#         [0., 0., 0.,  ..., 0., 0., 1.],
#         [0., 0., 0.,  ..., 0., 0., 1.],
#         [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')
# neg_mask.shape torch.Size([13383, 256])
# neg_mask tensor([[0., 1., 1.,  ..., 1., 1., 1.],
#         [0., 1., 1.,  ..., 1., 1., 1.],
#         [0., 1., 1.,  ..., 1., 1., 1.],
#         ...,
#         [1., 1., 1.,  ..., 1., 1., 0.],
#         [1., 1., 1.,  ..., 1., 1., 0.],
#         [1., 1., 1.,  ..., 1., 1., 0.]], device='cuda:0')

In [None]:
import numpy as np
import torch

# Toy batch: token counts for three sentences, plus one random weight per token.
sentence_0_lengths = [8, 6, 4]
att_weight = torch.rand(sum(sentence_0_lengths))

def create_local_masks_attention(lens_a, att_weight, device=None):
    """Build token-to-sentence masks, with positives weighted by attention.

    Rows enumerate every token of every sentence in order (``sum(lens_a)``
    rows); columns enumerate the sentences (``len(lens_a)`` columns).

    Args:
        lens_a: Sequence of per-sentence token counts.
        att_weight: 1-D tensor (or sequence) with at least ``sum(lens_a)``
            entries; entry ``j`` is the weight of token row ``j``.
        device: Device on which the masks are allocated. ``None`` (default)
            selects CUDA when it is available, as the function always did,
            and otherwise falls back to the CPU instead of crashing.

    Returns:
        ``(pos_mask, neg_mask)``: ``pos_mask[j, i]`` is ``att_weight[j]`` when
        token ``j`` belongs to sentence ``i`` and 0 elsewhere; ``neg_mask`` is
        0 at those same positions and 1 elsewhere.

    Raises:
        IndexError: If ``att_weight`` has fewer than ``sum(lens_a)`` entries.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    lengths = torch.as_tensor(lens_a, dtype=torch.long, device=device).reshape(-1)
    num_sentences = lengths.numel()
    num_tokens = int(lengths.sum())

    weights = torch.as_tensor(att_weight)
    if weights.shape[0] < num_tokens:
        # Fail loudly rather than let a short weight vector broadcast.
        raise IndexError(
            "att_weight has %d entries but %d tokens were requested"
            % (weights.shape[0], num_tokens)
        )

    pos_mask = torch.zeros((num_tokens, num_sentences), device=device)
    neg_mask = torch.ones((num_tokens, num_sentences), device=device)

    # One (row, column) pair per token: row j maps to the sentence owning token j.
    # A single indexed write replaces the per-element Python loop.
    rows = torch.arange(num_tokens, device=device)
    cols = torch.repeat_interleave(torch.arange(num_sentences, device=device), lengths)

    pos_mask[rows, cols] = weights[:num_tokens].to(device=device, dtype=pos_mask.dtype)
    neg_mask[rows, cols] = 0.

    return pos_mask, neg_mask

In [None]:
# Build the attention-weighted positive mask and the matching negative mask
# for the toy sentence lengths and random weights defined above.
pos_mask, neg_mask = create_local_masks_attention(sentence_0_lengths, att_weight)

In [None]:
from allennlp.predictors.predictor import Predictor
from allennlp.interpret.saliency_interpreters import SimpleGradient 
import allennlp_models.tagging

# Load an AllenNLP predictor from the public archive at this URL
# (file name: stanford-sentiment-treebank-roberta, dated 2021-03-11).
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/stanford-sentiment-treebank-roberta.2021-03-11.tar.gz")
# Alternative archive (bert-base-srl, dated 2020.11.19), currently disabled:
# predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.11.19.tar.gz")

In [None]:
# Sample input for the predictor. Despite the name `word`, this is a whole
# lower-cased text, not a single word.
word = "ofcom ultimatum bt open network access face competition lt gt lt gt bt work harder open network rivals order aid uptake broadband internet access risk investigation uk competition commission uk communications regulator warned"

In [None]:
# Run the predictor on the sample text and show the 'token_ids' entry of its output.
print(predictor.predict(word)['token_ids'])

In [None]:
# Literal list of 32 integer token ids kept for inspection.
# NOTE(review): presumably pasted from the predictor's 'token_ids' output -- confirm.
a = [
    0, 1116, 175, 20265, 757, 15368, 741, 90,
    490, 1546, 899, 652, 1465, 784, 90, 821,
    90, 784, 90, 821, 90, 741, 90, 173,
    4851, 490, 1546, 4346, 645, 2887, 33646, 2,
]

In [None]:
from captum.attr import IntegratedGradients

import torch
import torch.nn as nn
import torch.nn.functional as F
class ToyModel(nn.Module):
    r"""
    Toy network taken from the Integrated Gradients paper (page 10):

    https://arxiv.org/pdf/1703.01365.pdf

        f(x1, x2) = ReLU(ReLU(x1) - 1 - ReLU(x2))
    """

    def __init__(self):
        super().__init__()

    def forward(self, input1, input2):
        # Rectify each input on its own, then rectify the shifted difference.
        rectified_first, rectified_second = (F.relu(t) for t in (input1, input2))
        shifted_difference = rectified_first - 1 - rectified_second
        return F.relu(shifted_difference)
    
# NOTE(review): this rebinds the notebook-level name `model`, which the
# training cell assigned to the SCCLBert instance. After this cell runs,
# `model` is the toy network.
model = ToyModel()

# defining model input tensors
input1 = torch.tensor([3.0], requires_grad=True)
input2 = torch.tensor([1.0], requires_grad=True)

# defining baselines for each input tensor
# NOTE(review): these two tensors are not passed anywhere, because the
# `baselines=` argument below is commented out.
baseline1 = torch.tensor([0.0])
baseline2 = torch.tensor([0.0])

# defining and applying integrated gradients on ToyModel and the two inputs;
# returns one attribution per input plus the convergence delta.
ig = IntegratedGradients(model)
attributions, approximation_error = ig.attribute((input1, input2),
#                                                  baselines=(baseline1, baseline2),
                                                 method='gausslegendre',
                                                 return_convergence_delta=True)

In [None]:
def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None, position=0):
    """Forward wrapper that reduces one selected model output to a per-row maximum.

    Calls ``predict`` with the given inputs, picks element ``position`` of the
    result and returns its maximum values along dimension 1.

    NOTE(review): ``predict`` is not defined in the visible notebook cells; it
    must exist in the kernel namespace before this function is called.
    """
    forward_kwargs = {
        "token_type_ids": token_type_ids,
        "position_ids": position_ids,
        "attention_mask": attention_mask,
    }
    outputs = predict(inputs, **forward_kwargs)
    selected = outputs[position]
    return selected.max(1).values

# Attribute the wrapped forward function to the embedding layer.
# NOTE(review): this cell cannot run from the visible notebook as written:
#   * LayerIntegratedGradients is not imported (only IntegratedGradients is);
#   * `model` was last bound to ToyModel, which defines no `.bert` attribute;
#   * input_ids, ref_input_ids, token_type_ids, position_ids and
#     attention_mask are not defined in any visible cell.
lig = LayerIntegratedGradients(squad_pos_forward_func, model.bert.embeddings)

# The trailing 0 in additional_forward_args is passed as `position` to
# squad_pos_forward_func.
attributions, delta = lig.attribute(inputs=input_ids,
                                  baselines=ref_input_ids,
                                  additional_forward_args=(token_type_ids, position_ids, attention_mask, 0),
                                  return_convergence_delta=True)

In [None]:
# !pip uninstall -y torch
!pip install torch