In [20]:
#pip install sacremoses

In [1]:
import torch
from simplediff import diff


## TAKEN FROM GITHUB CODE OF NEUTRALIZING BIAS
## https://github.com/rpryzant/neutralizing-bias

# from https://spacy.io/api/annotation#section-dependency-parsing
RELATIONS = [
    'det', 'amod', 'nsubj', 'prep', 'pobj', 'ROOT', 
    'attr', 'punct', 'advmod', 'compound', 'acl', 'agent', 
    'aux', 'ccomp', 'dobj', 'cc', 'conj', 'appos', 'nsubjpass', 
    'auxpass', 'poss', 'nummod', 'nmod', 'relcl', 'mark', 
    'advcl', 'pcomp', 'npadvmod', 'preconj', 'neg', 'xcomp', 
    'csubj', 'prt', 'parataxis', 'expl', 'case', 'acomp', 'predet',
    'quantmod', 'dep', 'oprd', 'intj', 'dative', 'meta', 'csubjpass', 
    '<UNK>'
]
REL2ID = {x: i for i, x in enumerate(RELATIONS)}

# from https://spacy.io/api/annotation#section-pos-tagging
POS_TAGS = [
    'DET', 'ADJ', 'NOUN', 'ADP', 'NUM', 'VERB', 'PUNCT', 'ADV', 
    'PART', 'CCONJ', 'PRON', 'X', 'INTJ', 'PROPN', 'SYM',
    '<UNK>'
]
POS2ID = {x: i for i, x in enumerate(POS_TAGS)}

import nltk
import numpy as np

from nltk.parse.stanford import StanfordDependencyParser

class Featurizer:

    def __init__(self, tok2id={}, pad_id=0, lexicon_feature_bits=1):
        self.tok2id = tok2id
        self.id2tok = {x: tok for tok, x in tok2id.items()}
        self.pad_id = pad_id

        self.pos2id = POS2ID
        self.rel2id = REL2ID

        self.lexicons = {
            'assertives': self.read_lexicon('lexicons/assertives_hooper1975.txt'),
            'entailed_arg': self.read_lexicon('lexicons/entailed_arg_berant2012.txt'),
            'entailed': self.read_lexicon('lexicons/entailed_berant2012.txt'), 
            'entailing_arg': self.read_lexicon('lexicons/entailing_arg_berant2012.txt'), 
            'entailing': self.read_lexicon('lexicons/entailing_berant2012.txt'), 
            'factives': self.read_lexicon('lexicons/factives_hooper1975.txt'),
            'hedges': self.read_lexicon('lexicons/hedges_hyland2005.txt'),
            'implicatives': self.read_lexicon('lexicons/implicatives_karttunen1971.txt'),
            'negatives': self.read_lexicon('lexicons/negative_liu2005.txt'),
            'positives': self.read_lexicon('lexicons/positive_liu2005.txt'),
            'npov': self.read_lexicon('lexicons/npov_lexicon.txt'),
            'reports': self.read_lexicon('lexicons/report_verbs.txt'),
            'strong_subjectives': self.read_lexicon('lexicons/strong_subjectives_riloff2003.txt'),
            'weak_subjectives': self.read_lexicon('lexicons/weak_subjectives_riloff2003.txt')
        }
        self.lexicon_feature_bits = lexicon_feature_bits


    def get_feature_names(self):

        lexicon_feature_names = list(self.lexicons.keys())
        context_feature_names = [x + '_context' for x in lexicon_feature_names]
        pos_names = list(list(zip(*sorted(self.pos2id.items(), key=lambda x: x[1])))[0])
        rel_names = list(list(zip(*sorted(self.rel2id.items(), key=lambda x: x[1])))[0])

        return lexicon_feature_names + context_feature_names + pos_names + rel_names        

    def read_lexicon(self, fp):
        out = set([
            l.strip() for l in open(fp, errors='ignore') 
            if not l.startswith('#') and not l.startswith(';')
            and len(l.strip().split()) == 1
        ])
        return out


    def lexicon_features(self, words, bits=2):
        assert bits in [1, 2]
        if bits == 1:
            true = 1
            false = 0
        else:
            true = [1, 0]
            false = [0, 1]
    
        out = []
        for word in words:
            out.append([
                true if word in lexicon else false 
                for _, lexicon in self.lexicons.items()
            ])
        out = np.array(out)

        if bits == 2:
            out = out.reshape(len(words), -1)

        return out


    def context_features(self, lex_feats, window_size=2):
        out = []
        nwords = lex_feats.shape[0]
        nfeats = lex_feats.shape[1]
        for wi in range(lex_feats.shape[0]):
            window_start = max(wi - window_size, 0)
            window_end = min(wi + window_size + 1, nwords)

            left = lex_feats[window_start: wi, :] if wi > 0 else np.zeros((1, nfeats))
            right = lex_feats[wi + 1: window_end, :] if wi < nwords - 1 else np.zeros((1, nfeats))

            out.append((np.sum(left + right, axis=0) > 0).astype(int))

        return np.array(out)


    def features(self, id_seq, rel_ids, pos_ids):
        if self.pad_id in id_seq:
            pad_idx = id_seq.index(self.pad_id)
            pad_len = len(id_seq[pad_idx:])
            id_seq = id_seq[:pad_idx]
            rel_ids = rel_ids[:pad_idx]
            pos_ids = pos_ids[:pad_idx]
        else:
            pad_len = 0

        toks = [self.id2tok[x] for x in id_seq]
        # build list of [word, [tok indices the word came from]]
        words = []
        word_indices = []
        for i, tok in enumerate(toks):
            if tok.startswith('##'):
                words[-1] += tok.replace('##', '')
                word_indices[-1].append(i)
            else:
                words.append(tok)
                word_indices.append([i])

        # get expert features
        lex_feats = self.lexicon_features(words, bits=self.lexicon_feature_bits)
        context_feats = self.context_features(lex_feats)
        expert_feats = np.concatenate((lex_feats, context_feats), axis=1)
        # break word-features into tokens
        feats = np.concatenate([
            np.repeat(np.expand_dims(word_vec, axis=0), len(indices), axis=0) 
            for (word_vec, indices) in zip(expert_feats, word_indices)
        ], axis=0)

        # add in the pos and relational features
        pos_feats = np.zeros((len(pos_ids), len(POS2ID)))
        pos_feats[range(len(pos_ids)), pos_ids] = 1
        rel_feats = np.zeros((len(rel_ids), len(REL2ID)))
        rel_feats[range(len(rel_ids)), rel_ids] = 1
        
        feats = np.concatenate((feats, pos_feats, rel_feats), axis=1)

        # add pad back in                
        feats = np.concatenate((feats, np.zeros((pad_len, feats.shape[1]))))

        return feats


    def featurize_batch(self, batch_ids, rel_ids, pos_ids, padded_len=0):
        """ takes [batch, len] returns [batch, len, features] """
        #print(rel_ids)
        #print(batch_ids)
        #print(pos_ids)
        batch_feats = [
            self.features(list(id_seq), list(rel_ids), list(pos_ids)) 
            for id_seq, rel_ids, pos_ids in zip(batch_ids, rel_ids, pos_ids)]
        batch_feats = np.array(batch_feats)
        return batch_feats

In [2]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tok2id = tokenizer.vocab
tok2id['<del>'] = len(tok2id)
bertmodel = BertModel.from_pretrained("bert-base-uncased")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
layers = 12
hidden_size = 768

## TAKEN FROM GITHUB CODE OF NEUTRALIZING BIAS
## https://github.com/rpryzant/neutralizing-bias

class ConcatCombine(torch.nn.Module):
    def __init__(self, hidden_size, feature_size, out_size, layers,
            dropout_prob, small=False, pre_enrich=False, activation=False,
            include_categories=False, category_emb=False,
            add_category_emb=False):
        super(ConcatCombine, self).__init__()

        self.include_categories = include_categories
        self.add_category_emb = add_category_emb
        if include_categories:
            if category_emb and not add_category_emb:
                feature_size *= 2
            elif not category_emb:
                feature_size += 43

        if layers == 1:
            self.out = torch.nn.Sequential(
                torch.nn.Linear(hidden_size + feature_size, out_size),
                torch.nn.Dropout(dropout_prob))
        elif layers == 2:
            waist_size = min(hidden_size, feature_size) if small else max(hidden_size, feature_size)
            if activation:
                self.out = torch.nn.Sequential(
                    torch.nn.Linear(hidden_size + feature_size, waist_size),
                    torch.nn.Dropout(dropout_prob),
                    torch.nn.ReLU(),
                    torch.nn.Linear(waist_size, out_size),
                    torch.nn.Dropout(dropout_prob))
            else:
                self.out = torch.nn.Sequential(
                    torch.nn.Linear(hidden_size + feature_size, waist_size),
                    torch.nn.Dropout(dropout_prob),
                    torch.nn.Linear(waist_size, out_size),
                    torch.nn.Dropout(dropout_prob))
        if pre_enrich:
            if activation:
                self.enricher = torch.nn.Sequential(
                    torch.nn.Linear(feature_size, feature_size),
                    torch.nn.ReLU())
            else:
                self.enricher = torch.nn.Linear(feature_size, feature_size)
        else:
            self.enricher = None
        # manually set cuda because module doesn't see these combiners for bottom 
#         if CUDA:
#             self.out = self.out.cuda()
#             if self.enricher: 
#                 self.enricher = self.enricher.cuda()
                
    def forward(self, hidden, features, categories=None):
        if self.include_categories:
            categories = categories.unsqueeze(1)
            categories = categories.repeat(1, features.shape[1], 1)
            if self.add_category_emb:
                features = features + categories
            else:
                features = torch.cat((features, categories), -1)

        if self.enricher is not None:
            features = self.enricher(features)

        return self.out(torch.cat((hidden, features), dim=-1))
    

class BertDetector(torch.nn.Module):
    def __init__(self,cls_num_labels=2,token_num_labels=2,tok2id=None):
        super(BertDetector,self).__init__()
        self.bert = bertmodel
        self.featurizer = Featurizer(tok2id,lexicon_feature_bits=1)
        self.cls_dropout = torch.nn.Dropout(0.15)
        self.cls_classifier = torch.nn.Linear(hidden_size,cls_num_labels)
        
        self.token_dropout = torch.nn.Dropout(0.15)
        #self.token_classifier = torch.nn.Linear(hidden_size,token_num_labels)
        
        self.token_classifier = ConcatCombine(
                hidden_size, 90, token_num_labels, 
                1, 0, False, pre_enrich=False,
                activation=False,
                include_categories=False,
                category_emb=False,
                add_category_emb=False)
        
    def forward(self,input_ids,train=None,token_type_ids=None,attention_mask=None, 
        labels=None,rel_ids=None,pos_ids=None,categories=None,pre_len=None):
        features = torch.tensor(self.featurizer.featurize_batch(
            input_ids.numpy(), 
            rel_ids.numpy(), 
            pos_ids.numpy(), 
            padded_len=input_ids.shape[1]),dtype=torch.float)
        results = self.bert(input_ids)
        sequence = results.last_hidden_state
        pooled = results.pooler_output
        if train == "train":
            cls_logits = self.cls_dropout(self.cls_classifier(pooled))
            token_logits = self.token_dropout(self.token_classifier(sequence,features))
        else:
            cls_logits = self.cls_classifier(pooled)
            token_logits = self.token_classifier(sequence,features)
        return cls_logits,token_logits

In [4]:
text_1 = "China vowed on Wednesday to \"fight back\" after the United States announced a possible second round of tariff hikes on $200 billion worth of Chinese goods. The U.S. proposal released on Tuesday included increased taxes on imported food products and consumer electronics."
text_2 = "not sure"
indexed_tokens = tokenizer.encode(text_1, add_special_tokens=True)

tokens_tensor = torch.tensor([indexed_tokens])
lengths = (tokens_tensor == 102).nonzero(as_tuple=True)[1]
try: 
    segments_ids = [0] * (lengths[0] + 1) + [1] * (lengths[1] - lengths[0])
except:
    segments_ids = [0] * (lengths[0] + 1)
segments_tensors = torch.tensor([segments_ids])
segments_tensors
rel_tensor = torch.tensor([REL2ID[x] for x in ["<UNK>"]*len(indexed_tokens)])
pos_tensor = torch.tensor([POS2ID[x] for x in ["<UNK>"]*len(indexed_tokens)])

(tokens_tensor,segments_tensors,rel_tensor,pos_tensor)

(tensor([[  101,  2859, 18152,  2006,  9317,  2000,  1000,  2954,  2067,  1000,
           2044,  1996,  2142,  2163,  2623,  1037,  2825,  2117,  2461,  1997,
          23234, 21857,  2015,  2006,  1002,  3263,  4551,  4276,  1997,  2822,
           5350,  1012,  1996,  1057,  1012,  1055,  1012,  6378,  2207,  2006,
           9857,  2443,  3445,  7773,  2006, 10964,  2833,  3688,  1998,  7325,
           8139,  1012,   102]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]]),
 tensor([45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
         45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
         45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45]),
 tensor([15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
         15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,

In [5]:
model = BertDetector(tok2id=tok2id)
results = model(tokens_tensor,token_type_ids=segments_tensors,rel_ids=rel_tensor.unsqueeze(0),pos_ids=pos_tensor.unsqueeze(0))
(results,torch.nn.Softmax(dim=1)(results[1]))

((tensor([[-0.0517, -0.5244]], grad_fn=<AddmmBackward>),
  tensor([[[ 0.4846,  0.2814],
           [ 0.3863,  0.0817],
           [ 0.2775, -0.3158],
           [ 0.5077, -0.1507],
           [ 0.1739, -0.0308],
           [-0.1166, -0.1629],
           [ 0.0812, -0.3776],
           [-0.2618, -0.6185],
           [-0.0981, -0.3988],
           [ 0.4846,  0.0514],
           [ 0.4730, -0.0056],
           [ 0.5697, -0.3963],
           [ 0.5234, -0.2881],
           [ 0.6422, -0.1745],
           [ 0.3207, -0.4543],
           [ 0.1040, -0.0873],
           [-0.1369,  0.3556],
           [-0.0221, -0.0388],
           [-0.2431, -0.0989],
           [ 0.0042, -0.2524],
           [ 0.0117, -0.7513],
           [-0.3080, -0.4874],
           [-0.1520, -0.2115],
           [ 0.0286, -0.0834],
           [ 0.3734,  0.0252],
           [ 0.1491,  0.0577],
           [-0.0652,  0.1527],
           [-0.1598, -0.0655],
           [-0.3144, -0.3606],
           [-0.1854, -0.1669],
           [-

In [6]:
text_test= "165188319	ch ##lor ##of ##or ##m \" the molecular life ##sa ##ver \" an article at oxford university providing interesting facts about ch ##lor ##of ##or ##m .	ch ##lor ##of ##or ##m \" the molecular life ##sa ##ver \" an article at oxford university providing facts about ch ##lor ##of ##or ##m .	chloroform \"the molecular lifesaver\" an article at oxford university providing interesting facts about chloroform.	chloroform \"the molecular lifesaver\" an article at oxford university providing facts about chloroform.	NOUN NOUN NOUN NOUN NOUN PUNCT DET ADJ NOUN NOUN NOUN PUNCT DET NOUN ADP NOUN NOUN VERB ADJ NOUN ADP NOUN NOUN NOUN NOUN NOUN PUNCT	ROOT ROOT ROOT ROOT ROOT punct det amod dobj dobj dobj punct det appos prep compound pobj acl amod dobj prep pobj pobj pobj pobj pobj punct"
[revid, _, _, biased, nonbiased, pos, rels] = text_test.strip().split("\t")

indexed_tokens = tokenizer.encode(biased.strip(),nonbiased.strip(), add_special_tokens=True)

tokens_tensor = torch.tensor([indexed_tokens])
lengths = (tokens_tensor == 102).nonzero(as_tuple=True)[1]
try: 
    segments_ids = [0] * (lengths[0] + 1) + [1] * (lengths[1] - lengths[0])
except:
    segments_ids = [0] * (lengths[0] + 1)
segments_tensors = torch.tensor([segments_ids])
segments_tensors
rel_tensor = torch.tensor([REL2ID[x] for x in rels.strip().split(" ")]*2)
pos_tensor = torch.tensor([POS2ID[x] for x in pos.strip().split(" ")]*2)
while rel_tensor.size()[0] < len(indexed_tokens):
    rel_tensor = torch.cat((rel_tensor, torch.tensor([0])), dim=-1)
while pos_tensor.size()[0] < len(indexed_tokens):
    pos_tensor = torch.cat((pos_tensor, torch.tensor([0])), dim=-1)
    
ground_truth_list = [(x[0],len(x[1])) for x in diff(biased.strip().split(),nonbiased.strip().split())]
ground_truth = []
for i in ground_truth_list:
    if i[0] == "-":
        ground_truth.extend([1]*i[1])
    else:
        ground_truth.extend([0]*i[1])
ground_truth
#rel_tensor = torch.tensor([REL2ID[x] for x in ["<UNK>"]*len(indexed_tokens)])
#pos_tensor = torch.tensor([POS2ID[x] for x in ["<UNK>"]*len(indexed_tokens)])

(tokens_tensor,segments_tensors,rel_tensor,pos_tensor)

(tensor([[  101, 10381, 10626, 11253,  2953,  2213,  1000,  1996,  8382,  2166,
           3736,  6299,  1000,  2019,  3720,  2012,  4345,  2118,  4346,  5875,
           8866,  2055, 10381, 10626, 11253,  2953,  2213,  1012,   102, 10381,
          10626, 11253,  2953,  2213,  1000,  1996,  8382,  2166,  3736,  6299,
           1000,  2019,  3720,  2012,  4345,  2118,  4346,  8866,  2055, 10381,
          10626, 11253,  2953,  2213,  1012,   102]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]]),
 tensor([ 5,  5,  5,  5,  5,  7,  0,  1, 14, 14, 14,  7,  0, 17,  3,  9,  4, 10,
          1, 14,  3,  4,  4,  4,  4,  4,  7,  5,  5,  5,  5,  5,  7,  0,  1, 14,
         14, 14,  7,  0, 17,  3,  9,  4, 10,  1, 14,  3,  4,  4,  4,  4,  4,  7,
          0,  0]),
 tensor([2, 2, 2, 2, 2, 6, 0, 1, 2, 2, 2, 6, 0, 2, 3, 2, 2, 5, 1, 2, 3, 2, 2, 2,
 

In [7]:
diff(biased.strip().split(),nonbiased.strip().split())

[('=',
  ['chloroform',
   '"the',
   'molecular',
   'lifesaver"',
   'an',
   'article',
   'at',
   'oxford',
   'university',
   'providing']),
 ('-', ['interesting']),
 ('=', ['facts', 'about', 'chloroform.'])]

In [8]:
(tokens_tensor.size(),segments_tensors.size(),rel_tensor.size(),pos_tensor.size())

(torch.Size([1, 56]), torch.Size([1, 56]), torch.Size([56]), torch.Size([56]))

In [9]:
model = BertDetector(tok2id=tok2id)
results = model(tokens_tensor,token_type_ids=segments_tensors,rel_ids=rel_tensor.unsqueeze(0),pos_ids=pos_tensor.unsqueeze(0))
(results,torch.nn.Softmax(dim=1)(results[1]),torch.argmax(torch.nn.Softmax(dim=1)(results[1]),dim=-1))

((tensor([[ 0.1812, -0.2723]], grad_fn=<AddmmBackward>),
  tensor([[[-0.6615,  0.1716],
           [-0.2087,  0.2696],
           [-0.4593, -0.0786],
           [ 0.1960, -0.6619],
           [-0.5898,  0.0152],
           [-0.4499, -0.0224],
           [-0.1283, -0.2362],
           [-0.4974, -0.1788],
           [-0.3294, -0.4225],
           [-0.0648, -0.2173],
           [ 0.4301, -0.2815],
           [ 0.2340, -0.3602],
           [-0.0687, -0.1605],
           [ 0.0427, -0.3235],
           [-0.0901, -0.3714],
           [ 0.2663, -0.3083],
           [-0.1482, -0.1498],
           [ 0.1569,  0.2401],
           [ 0.2422, -0.3405],
           [-0.0475, -0.4424],
           [-0.1028, -0.2259],
           [-0.1633, -0.5786],
           [ 0.0893, -0.5959],
           [-0.2626, -0.3703],
           [ 0.3370, -0.8774],
           [-0.4027, -0.2592],
           [-0.3603, -0.5741],
           [-0.4923, -0.6132],
           [ 0.2358, -0.2823],
           [ 0.0046, -0.4434],
           [-

In [10]:
x = np.array(results[1].detach())[:,:,:2]
x = x - x.max(axis=2, keepdims=True)
y = np.exp(x)
y / y.sum(axis=2, keepdims=True)
y[:,:,-1]

array([[1.        , 1.        , 1.        , 0.4240568 , 1.        ,
        1.        , 0.8977299 , 1.        , 0.9110744 , 0.8585662 ,
        0.49084106, 0.5519678 , 0.91231334, 0.69338435, 0.7548474 ,
        0.56288207, 0.9984426 , 1.        , 0.55840206, 0.67377377,
        0.88414544, 0.660167  , 0.5039606 , 0.8979253 , 0.296886  ,
        1.        , 0.80752534, 0.8861072 , 0.5956413 , 0.6389143 ,
        1.        , 0.3483299 , 1.        , 1.        , 0.7427272 ,
        1.        , 0.78771454, 0.7905991 , 0.49470252, 0.44892636,
        0.8805472 , 0.61353207, 0.7707484 , 0.61257994, 1.        ,
        0.74654776, 0.5383597 , 0.7703232 , 0.6371283 , 0.55286795,
        0.794155  , 0.28399527, 1.        , 0.8098489 , 0.8163168 ,
        0.5473603 ]], dtype=float32)

In [11]:
f = open("biased.word.train",encoding="utf-8")
i = 0
model = BertDetector(tok2id=tok2id)
weights = torch.ones(2)
weights[-1] = 0
celossfn = torch.nn.CrossEntropyLoss(weight=weights)
softmax = torch.nn.Softmax(dim=-1)
optimizer = torch.optim.Adam(model.parameters(),lr=0.0001)

for line in f:
    linedata = line.strip().split('\t')
    if len(linedata) == 7:
        [revid, biased2, nonbiased2, biased, nonbiased, pos, rels] = linedata
        indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
    elif len(linedata) == 5:
        [revid, biased2, nonbiased2, biased, nonbiased] = linedata
        indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
        pos = ["<UNK>"]*len(indexed_tokens)
        rels = ["<UNK>"]*len(indexed_tokens)
    else:
        continue
    
    tokens_tensor = torch.tensor([indexed_tokens])
    lengths = (tokens_tensor == 102).nonzero(as_tuple=True)[1]
    try: 
        segments_ids = [0] * (lengths[0] + 1) + [1] * (lengths[1] - lengths[0])
    except:
        segments_ids = [0] * (lengths[0] + 1)
    segments_tensors = torch.tensor([segments_ids])
    rel_tensor = torch.tensor([REL2ID[x] for x in rels.strip().split(" ")])
    pos_tensor = torch.tensor([POS2ID[x] for x in pos.strip().split(" ")])
    while rel_tensor.size()[0] < len(indexed_tokens):
        rel_tensor = torch.cat((rel_tensor, torch.tensor([0])), dim=-1)
    while pos_tensor.size()[0] < len(indexed_tokens):
        pos_tensor = torch.cat((pos_tensor, torch.tensor([0])), dim=-1)

    ground_truth_list = [(x[0],len(x[1])) for x in diff(biased2.strip().split(),nonbiased2.strip().split())]
    ground_truth = []
    for j in ground_truth_list:
        if j[0] == "-":
            ground_truth.extend([1]*j[1])
        elif j[0] == "=":
            ground_truth.extend([0]*j[1])
    
    #print((tokens_tensor,segments_tensors,rel_tensor,pos_tensor))
    i = i + 1
    
    results = model(tokens_tensor,train="train",token_type_ids=segments_tensors,rel_ids=rel_tensor.unsqueeze(0),pos_ids=pos_tensor.unsqueeze(0))
    _,token_logits = results
    
    optimizer.zero_grad()
    while len(ground_truth) + 2 < len(softmax(token_logits)[0]):
        ground_truth.extend([0])
    loss = celossfn(softmax(token_logits)[0],torch.Tensor([0] + ground_truth + [0]).type('torch.LongTensor'))
    loss.backward()
    optimizer.step()
    if i % 10 == 0:
        print("Line: " + str(i))
        print(loss)
        print(" ")
        #break
    if i >= 10000:
        break
    
f.close()

Line: 10
tensor(0.3227, grad_fn=<NllLossBackward>)
 
Line: 20
tensor(0.3706, grad_fn=<NllLossBackward>)
 
Line: 30
tensor(0.3134, grad_fn=<NllLossBackward>)
 
Line: 40
tensor(0.3135, grad_fn=<NllLossBackward>)
 
Line: 50
tensor(0.3349, grad_fn=<NllLossBackward>)
 
Line: 60
tensor(0.3134, grad_fn=<NllLossBackward>)
 
Line: 70
tensor(0.3134, grad_fn=<NllLossBackward>)
 
Line: 80
tensor(0.3312, grad_fn=<NllLossBackward>)
 
Line: 90
tensor(0.3135, grad_fn=<NllLossBackward>)
 
Line: 100
tensor(0.3315, grad_fn=<NllLossBackward>)
 
Line: 110
tensor(0.3234, grad_fn=<NllLossBackward>)
 
Line: 120
tensor(0.3249, grad_fn=<NllLossBackward>)
 
Line: 130
tensor(0.3240, grad_fn=<NllLossBackward>)
 
Line: 140
tensor(0.3231, grad_fn=<NllLossBackward>)
 
Line: 150
tensor(0.3133, grad_fn=<NllLossBackward>)
 
Line: 160
tensor(0.3245, grad_fn=<NllLossBackward>)
 
Line: 170
tensor(0.3513, grad_fn=<NllLossBackward>)
 
Line: 180
tensor(0.3228, grad_fn=<NllLossBackward>)
 
Line: 190
tensor(0.3398, grad_fn=<Nll

In [30]:
# linedata = line.strip().split('\t')
# if len(linedata) == 7:
#     [revid, biased2, nonbiased2, biased, nonbiased, pos, rels] = linedata
#     indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
# elif len(linedata) == 5:
#     [revid, biased2, nonbiased2, biased, nonbiased] = linedata
#     indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
#     pos = ["<UNK>"]*len(indexed_tokens)
#     rels = ["<UNK>"]*len(indexed_tokens)
# else:
#     continue
biased2 = "one of the stand ##out tracks , \" parker ' s band , \" was a tribute to legendary jazz saxophonist charlie parker ."
indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
pos = " ".join(["<UNK>"]*len(indexed_tokens))
rels = " ".join(["<UNK>"]*len(indexed_tokens))

tokens_tensor = torch.tensor([indexed_tokens])
lengths = (tokens_tensor == 102).nonzero(as_tuple=True)[1]
try: 
    segments_ids = [0] * (lengths[0] + 1) + [1] * (lengths[1] - lengths[0])
except:
    segments_ids = [0] * (lengths[0] + 1)
segments_tensors = torch.tensor([segments_ids])
rel_tensor = torch.tensor([REL2ID[x] for x in rels.strip().split(" ")])
pos_tensor = torch.tensor([POS2ID[x] for x in pos.strip().split(" ")])
while rel_tensor.size()[0] < len(indexed_tokens):
    rel_tensor = torch.cat((rel_tensor, torch.tensor([0])), dim=-1)
while pos_tensor.size()[0] < len(indexed_tokens):
    pos_tensor = torch.cat((pos_tensor, torch.tensor([0])), dim=-1)

# ground_truth_list = [(x[0],len(x[1])) for x in diff(biased2.strip().split(),nonbiased2.strip().split())]
# ground_truth = []
# for j in ground_truth_list:
#     if j[0] == "-":
#         ground_truth.extend([1]*j[1])
#     elif j[0] == "=":
#         ground_truth.extend([0]*j[1])

#print((tokens_tensor,segments_tensors,rel_tensor,pos_tensor))
i = i + 1

results = model(tokens_tensor,train="test",token_type_ids=segments_tensors,rel_ids=rel_tensor.unsqueeze(0),pos_ids=pos_tensor.unsqueeze(0))
_,token_logits = results

#optimizer.zero_grad()
#while len(ground_truth) + 2 < len(softmax(token_logits)[0]):
#    ground_truth.extend([0])
#loss = celossfn(softmax(token_logits)[0],torch.Tensor([0] + ground_truth + [0]).type('torch.LongTensor'))
#loss.backward()
#optimizer.step()
#    print(loss)
results

(tensor([[-0.1390,  0.3761]], grad_fn=<AddmmBackward>),
 tensor([[[ 12.0613, -12.1638],
          [ 12.0665, -12.1632],
          [ 12.0435, -12.1683],
          [ 12.0458, -12.1656],
          [ 12.0983, -12.1359],
          [ 12.0951, -12.1393],
          [ 12.0457, -12.1650],
          [ 12.0444, -12.1664],
          [ 12.0646, -12.1652],
          [ 12.0678, -12.2047],
          [ 12.0684, -12.2040],
          [ 12.0507, -12.1675],
          [ 12.0654, -12.2072],
          [ 12.0650, -12.2054],
          [ 12.0662, -12.1632],
          [ 12.0502, -12.1740],
          [ 12.0277, -12.1280],
          [ 12.0552, -12.0866],
          [ 12.0875, -12.2047],
          [ 12.0487, -12.0863],
          [ 12.0463, -12.1089],
          [ 12.0672, -12.1571],
          [ 12.0672, -12.1629],
          [ 12.0668, -12.1635],
          [ 12.0625, -12.1621],
          [ 12.0531, -12.1570]]], grad_fn=<AddBackward0>))

In [31]:
index = softmax(token_logits)[:,:,1][0][1:-1].detach().numpy()
idx = np.argmax(index)
(idx, np.max(index), index, (index-np.min(index))/(np.max(index)-np.min(index)))

(18,
 3.2985076e-11,
 array([3.0005255e-11, 3.0547957e-11, 3.0557164e-11, 2.9871293e-11,
        2.9863261e-11, 3.0578387e-11, 3.0578446e-11, 3.0000447e-11,
        2.8747331e-11, 2.8750840e-11, 3.0352058e-11, 2.8743492e-11,
        2.8807269e-11, 3.0010980e-11, 3.0170186e-11, 3.2307317e-11,
        3.2761797e-11, 2.8185124e-11, 3.2985076e-11, 3.2325809e-11,
        3.0165814e-11, 2.9989691e-11, 2.9985001e-11, 3.0156839e-11],
       dtype=float32),
 array([0.37919772, 0.49226162, 0.49417993, 0.35128862, 0.34961534,
        0.49860138, 0.49861366, 0.3781959 , 0.11712753, 0.11785865,
        0.45144895, 0.11632774, 0.12961477, 0.38039035, 0.4135586 ,
        0.8587986 , 0.9534829 , 0.        , 1.        , 0.86265117,
        0.41264784, 0.3759552 , 0.37497795, 0.41077796], dtype=float32))

In [32]:
for idx in np.flip(np.argsort(index)):
    print(tokenizer.decode(indexed_tokens[idx+1:idx+2]))

legendary
tribute
jazz
a
,
tracks
the
of
s
was
saxophonist
.
"
one
"
charlie
parker
stand
##out
,
'
parker
band
to


200000

In [122]:
import pandas as pd
testset = pd.read_csv("newdataset_withBias.csv")["text"].to_numpy()
# linedata = line.strip().split('\t')
# if len(linedata) == 7:
#     [revid, biased2, nonbiased2, biased, nonbiased, pos, rels] = linedata
#     indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
# elif len(linedata) == 5:
#     [revid, biased2, nonbiased2, biased, nonbiased] = linedata
#     indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
#     pos = ["<UNK>"]*len(indexed_tokens)
#     rels = ["<UNK>"]*len(indexed_tokens)
# else:
#     continue
randint = np.random.randint(0,len(testset))
biased2 = testset[randint]
punctuation = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
for punc in punctuation:
    biased2 = biased2.replace(punc,"")
indexed_tokens = tokenizer.encode(biased2.strip().replace(" ##",""), add_special_tokens=True)
pos = " ".join(["<UNK>"]*len(indexed_tokens))
rels = " ".join(["<UNK>"]*len(indexed_tokens))

tokens_tensor = torch.tensor([indexed_tokens])
lengths = (tokens_tensor == 102).nonzero(as_tuple=True)[1]
try: 
    segments_ids = [0] * (lengths[0] + 1) + [1] * (lengths[1] - lengths[0])
except:
    segments_ids = [0] * (lengths[0] + 1)
segments_tensors = torch.tensor([segments_ids])
rel_tensor = torch.tensor([REL2ID[x] for x in rels.strip().split(" ")])
pos_tensor = torch.tensor([POS2ID[x] for x in pos.strip().split(" ")])
while rel_tensor.size()[0] < len(indexed_tokens):
    rel_tensor = torch.cat((rel_tensor, torch.tensor([0])), dim=-1)
while pos_tensor.size()[0] < len(indexed_tokens):
    pos_tensor = torch.cat((pos_tensor, torch.tensor([0])), dim=-1)

# ground_truth_list = [(x[0],len(x[1])) for x in diff(biased2.strip().split(),nonbiased2.strip().split())]
# ground_truth = []
# for j in ground_truth_list:
#     if j[0] == "-":
#         ground_truth.extend([1]*j[1])
#     elif j[0] == "=":
#         ground_truth.extend([0]*j[1])

#print((tokens_tensor,segments_tensors,rel_tensor,pos_tensor))
i = i + 1

results = model(tokens_tensor,train="test",token_type_ids=segments_tensors,rel_ids=rel_tensor.unsqueeze(0),pos_ids=pos_tensor.unsqueeze(0))
_,token_logits = results

#optimizer.zero_grad()
#while len(ground_truth) + 2 < len(softmax(token_logits)[0]):
#    ground_truth.extend([0])
#loss = celossfn(softmax(token_logits)[0],torch.Tensor([0] + ground_truth + [0]).type('torch.LongTensor'))
#loss.backward()
#optimizer.step()
#    print(loss)
results

(tensor([[-0.1522,  0.3634]], grad_fn=<AddmmBackward>),
 tensor([[[ 12.0286, -12.1308],
          [ 12.0710, -12.2217],
          [ 12.0445, -12.1258],
          [ 12.0460, -12.1254],
          [ 12.0155, -12.1856],
          [ 12.0115, -12.1400],
          [ 12.0649, -12.1095],
          [ 12.0625, -12.2352],
          [ 12.0035, -12.1505],
          [ 12.0664, -12.1544],
          [ 12.0925, -12.1856],
          [ 12.0838, -12.1191],
          [ 12.0535, -12.1156],
          [ 12.1451, -12.3043],
          [ 12.0966, -12.0914],
          [ 12.0921, -12.0971],
          [ 12.0614, -12.1898],
          [ 12.0815, -12.2083],
          [ 12.0926, -12.1787],
          [ 12.0928, -12.1681],
          [ 12.0925, -12.1683],
          [ 12.0231, -12.1738],
          [ 12.0195, -12.1750],
          [ 12.0579, -12.1409],
          [ 12.0627, -12.2069],
          [ 12.0266, -12.1727],
          [ 12.0362, -12.1191],
          [ 12.0782, -12.1594],
          [ 12.0559, -12.1112],
          [ 12.0

In [123]:
index = softmax(token_logits)[:,:,1][0][1:-1].detach().numpy()
idx = np.argmax(index)
(idx, tokenizer.decode(indexed_tokens[idx:idx+3]), np.max(index), index, (index-np.min(index))/(np.max(index)-np.min(index)))

(40,
 'swallow vast swat',
 3.28825e-11,
 array([2.8172333e-11, 3.1839944e-11, 3.1804011e-11, 3.0874640e-11,
        3.2444901e-11, 3.1709156e-11, 2.8031579e-11, 3.2362328e-11,
        3.0273024e-11, 2.8587887e-11, 3.0818684e-11, 3.1879322e-11,
        2.4084942e-11, 3.1283177e-11, 3.1244122e-11, 2.9364910e-11,
        2.8254182e-11, 2.8783653e-11, 2.9082091e-11, 2.9084974e-11,
        3.1002988e-11, 3.1080787e-11, 3.0946152e-11, 2.8829475e-11,
        3.0931990e-11, 3.2323220e-11, 2.9768799e-11, 3.1943535e-11,
        3.1573306e-11, 2.8600268e-11, 2.8017092e-11, 2.8648913e-11,
        3.0788077e-11, 3.2268271e-11, 3.0120278e-11, 3.1218268e-11,
        3.1325033e-11, 2.5358883e-11, 2.8188403e-11, 2.8942753e-11,
        3.2882499e-11, 3.0950166e-11, 3.1015999e-11, 3.0030905e-11,
        2.9938711e-11], dtype=float32),
 array([0.46460515, 0.88149494, 0.87741053, 0.77177083, 0.95025915,
        0.8666286 , 0.44860598, 0.94087327, 0.70338637, 0.5118404 ,
        0.7654105 , 0.885971  , 0. 

In [124]:
print(randint)
print(testset[randint])
normindex = (index-np.min(index))/(np.max(index)-np.min(index))
for idx in np.flip(np.argsort(index)):
    print((tokenizer.decode(indexed_tokens[idx+1:idx+2]),tokenizer.decode(indexed_tokens[idx:idx+3]),index[idx],normindex[idx]))

3952
Research by Cornell University has led to the dire prediction, which could see two billion people displaced by 2100. Those who live on coastline or islands are most at risk from the rising sea levels, which threaten to swallow vast swathes of land.
('vast', 'swallow vast swat', 3.28825e-11, 1.0)
('has', 'university has led', 3.24449e-11, 0.95025915)
('the', 'to the dire', 3.2362328e-11, 0.94087327)
('or', 'coastline or islands', 3.232322e-11, 0.936428)
('rising', 'the rising sea', 3.226827e-11, 0.93018204)
('are', 'islands are most', 3.1943535e-11, 0.8932699)
('could', 'which could see', 3.1879322e-11, 0.885971)
('by', 'research by cornell', 3.1839944e-11, 0.88149494)
('cornell', 'by cornell university', 3.180401e-11, 0.87741053)
('led', 'has led to', 3.1709156e-11, 0.8666286)
('most', 'are most at', 3.1573306e-11, 0.8511868)
('which', 'levels which threaten', 3.1325033e-11, 0.8229661)
('two', 'see two billion', 3.1283177e-11, 0.81820846)
('billion', 'two billion people', 3.124412

In [16]:
softmax(token_logits)[:,:,1][0][:15]

tensor([3.0141e-11, 3.0005e-11, 3.0548e-11, 3.0557e-11, 2.9871e-11, 2.9863e-11,
        3.0578e-11, 3.0578e-11, 3.0000e-11, 2.8747e-11, 2.8751e-11, 3.0352e-11,
        2.8743e-11, 2.8807e-11, 3.0011e-11], grad_fn=<SliceBackward>)

In [17]:
len(tokens_tensor[0])

26

In [18]:
len(ground_truth)

5

In [19]:
ground_truth_list

[('-', 1), ('+', 1), ('=', 4)]

In [20]:
ground_truth

[1, 0, 0, 0, 0]

In [21]:
biased2

'one of the stand ##out tracks , " parker \' s band , " was a tribute to legendary jazz saxophonist charlie parker .'

In [22]:
nonbiased2

'taking of vie ##nti ##ane'

In [23]:
biased

'fall of vientiane'

In [24]:
tokenizer.encode(biased2.strip().replace("#",""))

[101,
 2028,
 1997,
 1996,
 3233,
 2041,
 3162,
 1010,
 1000,
 6262,
 1005,
 1055,
 2316,
 1010,
 1000,
 2001,
 1037,
 7050,
 2000,
 8987,
 4166,
 19977,
 4918,
 6262,
 1012,
 102]

In [25]:
biased2.replace(" ##","")

'one of the standout tracks , " parker \' s band , " was a tribute to legendary jazz saxophonist charlie parker .'

In [26]:
biased2.strip().split()

['one',
 'of',
 'the',
 'stand',
 '##out',
 'tracks',
 ',',
 '"',
 'parker',
 "'",
 's',
 'band',
 ',',
 '"',
 'was',
 'a',
 'tribute',
 'to',
 'legendary',
 'jazz',
 'saxophonist',
 'charlie',
 'parker',
 '.']