In [1]:
from sklearn.model_selection import train_test_split
import torch
from transformers import GPT2Tokenizer, GPT2Model, BertTokenizer, BertModel
from torch.utils import data
import numpy as np

In [2]:
model_name = "BERT"

In [3]:
data_file = []
with open("../data/train-conll200.txt") as file:
  data_file = file.readlines()

In [4]:
sentences = []
sentence_number = 0
sentences.append([])
for i,line in enumerate(data_file):
    if line!='\n':
        word,POS,prob = line.split(' ')
        sentences[sentence_number].append((word,POS))
    else:
        sentence_number = sentence_number + 1
        sentences.append([])
del sentences[-1]

In [5]:
tags = list(set(word_pos[1] for sent in sentences for word_pos in sent))

In [6]:
",".join(tags)

"JJS,VBG,CD,WP,#,RBR,VBN,VBD,DT,JJR,WP$,MD,PRP$,VBP,UH,POS,:,SYM,IN,NNS,``,VB,,,$,RB,NNP,NN,TO,RBS,PDT,'',RP,NNPS,JJ,FW,.,),EX,WDT,PRP,VBZ,CC,(,WRB"

In [7]:
tags = ["<pad>"] + tags

In [8]:
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
if model_name == "gpt2":
    special_tokens_dict = {'cls_token': '_classify_','bos_token': '_start_', 'pad_token': '<pad>'}
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    tokenizer.add_special_tokens(special_tokens_dict)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [11]:
class PosDataset(data.Dataset):
    def __init__(self, tagged_sents):
        sents, tags_li = [], [] # list of lists
        for sent in tagged_sents:
            words = [word_pos[0] for word_pos in sent]
            tags = [word_pos[1] for word_pos in sent]
            if model_name == "gpt2":
                sents.append(["_start_"] + words + ["_classify_"])
                tags_li.append(["<pad>"] + tags + ["<pad>"])
            else:
                sents.append(["[CLS]"] + words + ["[SEP]"])
                tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if(len(tokens)==0): 
                tokens=[w]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)
        if(not len(x)==len(y)==len(is_heads)):
            print(words, tags)
            print(x,y,is_heads)
        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

In [12]:
def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: <pad>
    x = f(1, maxlen)
    y = f(-2, maxlen)


    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

In [13]:
dataset = PosDataset(sentences)
data_iter = data.DataLoader(dataset=dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)

In [14]:
if model_name == "gpt2":
    model = GPT2Model.from_pretrained("gpt2")
    model.output_hidden_states = True
    model.resize_token_embeddings(len(tokenizer))
else:
    #model = BertModel.from_pretrained('bert-base-cased')
    model = torch.load('../models/BERT_Classifier_Large.pt')
    model = model.bert
    model.config.output_hidden_states = True
    model.config.is_decoder = False
    model.encoder.output_hidden_states = True
    for i in range(0,len(model.encoder.layer)): 
        model.encoder.layer[i].is_decoder = False
        model.encoder.layer[i].output_hidden_states = True
model = model.to(device)




In [15]:
from sklearn.cluster import KMeans

In [16]:
from sklearn.metrics.cluster import normalized_mutual_info_score

In [None]:
all_nmi = []
for n in range(0,10):
    Tags, Embedds = [], [[] for i in range(0,13)]

    for i, batch in enumerate(data_iter):
        _, x, is_heads_b, _, y_b, _ = batch
        x = x.to(device)
        with torch.no_grad():
            output=model(x)
            embedds_b = output[2]
            for i in range(0,13):
                for embedds, is_heads, ys in zip(embedds_b[i].cpu().numpy(), is_heads_b, y_b):
                    embbed = [hat for head, hat in zip(is_heads, embedds) if head == 1]   
                    y = [hat.item() for head, hat in zip(is_heads, ys) if head == 1]  
                    Embedds[i].extend(embbed[1:-1])
                    if(i==0):
                        Tags.extend(y[1:-1])
            if(len(Embedds[i])>=3000):
                break
    nmi=[]
    for i in range(0,13):
        kmeans = KMeans(n_clusters=44, random_state=0).fit(Embedds[i])
        nmi.append(normalized_mutual_info_score(np.array(Tags),kmeans.labels_))
    all_nmi.append(nmi)



In [18]:
np.mean(np.array(all_nmi),axis=0)

array([0.56859442, 0.59539871, 0.60914659, 0.62571013, 0.62339938,
       0.62387285, 0.6119353 , 0.5948746 , 0.52705585, 0.43248878,
       0.26141565, 0.2374198 , 0.13714923])