In [1]:
import pickle

# Absolute path of the pickled dataset.
data_path = "/mnt/HDD/bportelli/lab_avanzato/beatrice.pkl"

# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# must only ever be pointed at a trusted file.
with open(data_path, mode="rb") as pickle_file:
    data = pickle.load(pickle_file)


In [2]:
import pandas as pd
# Quick look at the loaded table: its (rows, columns) shape, then the first five rows.
n_rows_and_cols = data.shape
print(n_rows_and_cols)
data.head(n=5)

(418153, 5)


Unnamed: 0,Text,Leaf,Category,Block,Chapter
0,"RIF. TRAUMA AL 3° DITO DELLA MANO SN, RISALENT...",9249,924,17.1,CH_17
1,RIFERISCE TRAUMA EMICOSTATO DX IN SEGUITO A PD...,78002,780,16.1,CH_16
2,RISCONRO DI HB 7.5 STENOSI PILORICA DI NDD,78900,789,16.1,CH_16
3,SPALLA SX TRAUMA CONTUSIVO EMICOSTATO SIN,9249,924,17.1,CH_17
4,DOLORE E BRUCIORE ALLA BOCCA DELLO STOMACO DA ...,7999,79,1.9,CH_1


In [3]:
# Report the row count followed by the number of distinct values in every column.
column_names = data.columns.tolist()
print("number of rows: ", len(data))
row_template = "unique values for {:10}:{:10}"
for col_name in column_names:
    n_unique = len(data[col_name].unique())
    print(row_template.format(col_name, n_unique))

number of rows:  418153
unique values for Text      :    372413
unique values for Leaf      :      2390
unique values for Category  :       737
unique values for Block     :       116
unique values for Chapter   :        17


In [7]:
# Share of rows belonging to each chapter (normalize=True yields fractions, not counts).
data.loc[:, "Chapter"].value_counts(normalize=True)


CH_17    0.273845
CH_16    0.202471
CH_6     0.101752
CH_1     0.093162
CH_7     0.077058
CH_13    0.054674
CH_8     0.051371
CH_9     0.043766
CH_10    0.026199
CH_12    0.024828
CH_5     0.021923
CH_3     0.012804
CH_4     0.007452
CH_2     0.004075
CH_11    0.003350
CH_14    0.001064
CH_15    0.000206
Name: Chapter, dtype: float64

In [6]:
def sort_by_chapter_number(c):
    """Sort key for chapter identifiers such as ``"CH_17"``.

    Splits ``c`` on underscores and returns the second piece as an ``int``,
    so that chapters order numerically rather than alphabetically.

    Raises:
        IndexError: if ``c`` contains no underscore.
        ValueError: if the second piece is not an integer literal.
    """
    pieces = c.split("_")
    number_text = pieces[1]
    return int(number_text)

# Distinct chapter identifiers, ordered by their numeric suffix.
chapters = sorted(data["Chapter"].unique().tolist(), key=sort_by_chapter_number)

# Forward map: chapter identifier -> integer label (the chapter number itself).
map_chapter_to_label = {}
for chapter_name in chapters:
    map_chapter_to_label[chapter_name] = sort_by_chapter_number(chapter_name)

# Reverse map: integer label -> chapter identifier, derived from the forward map.
map_label_to_chapter = {
    label: chapter_name for chapter_name, label in map_chapter_to_label.items()
}

def chapter_to_label(c):
    """Return the integer label stored for chapter identifier ``c``.

    Raises:
        KeyError: if ``c`` is not a key of ``map_chapter_to_label``.
    """
    label = map_chapter_to_label[c]
    return label

def label_to_chapter(l):
    """Return the chapter identifier stored for integer label ``l``.

    Raises:
        KeyError: if ``l`` is not a key of ``map_label_to_chapter``.
    """
    chapter_name = map_label_to_chapter[l]
    return chapter_name

In [8]:
import random
# Seed the RNG so that the shuffle below is repeatable.
random.seed(42)

# Shuffle every row position once, then cut the shuffled list at the 80% mark.
n_rows = data.shape[0]
all_idx = list(range(n_rows))
random.shuffle(all_idx)

dim_80 = int(n_rows * 0.8)
dim_20 = n_rows - dim_80

training_loc, testing_loc = all_idx[:dim_80], all_idx[dim_80:]

print("training set length", len(training_loc))
print("test set length    ", len(testing_loc))

training set length 334522
test set length     83631


In [20]:
class sample:
    """One text example together with its label.

    Attributes:
        text: the text passed to the constructor.
        label: the label passed to the constructor.
        tokens: ``None`` until assigned by later processing.
        token_ids: ``None`` until assigned by later processing.
        id: ``None`` until assigned by later processing.
    """

    def __init__(self, text, label):
        self.text = text
        self.label = label
        self.tokens = None
        self.token_ids = None
        self.id = None

    def __repr__(self):
        # format() instead of "+" concatenation: a non-string text (for
        # example None or a float NaN) must not raise TypeError when a
        # sample is printed or displayed.
        return "sample({}, {})".format(self.text, self.label)

    def __str__(self):
        # Same rendering as __repr__, kept in one place.
        return self.__repr__()

In [15]:
# testing_loc holds row *positions* (it is a slice of a shuffled
# range(len(data))), so rows are selected positionally with .iloc rather than
# by index label with .loc. Only the two columns that are needed are read,
# which avoids building a full row Series for every sample.
test_texts = data["Text"].iloc[testing_loc]
test_chapters = data["Chapter"].iloc[testing_loc]

# One sample per test row, in testing_loc order, labelled with the chapter number.
test_samples = [
    sample(text, chapter_to_label(chapter))
    for text, chapter in zip(test_texts, test_chapters)
]

In [68]:
from pytorch_transformers import BertTokenizer, BertModel, BertConfig

# Tokenizer, config and model are all loaded from the same pretrained checkpoint.
pretrained_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(pretrained_name)
# output_hidden_states=True is requested so that the per-layer hidden states
# are available in the model output (they are read from index 2 further down).
config = BertConfig.from_pretrained(pretrained_name, output_hidden_states=True)
model = BertModel.from_pretrained(pretrained_name, config=config)

In [96]:
# Three toy sentences: "Apple" appears as a fruit and as a company, plus one
# more sentence about fruit for comparison.
text1, text2, text3 = (
    "Apple is a fruit",
    "Apple is a company",
    "Oranges are tasty fruits",
)

# Split each sentence into tokens with the loaded tokenizer.
tok1, tok2, tok3 = [tokenizer.tokenize(sentence) for sentence in (text1, text2, text3)]

(tok1, tok2, tok3)

(['apple', 'is', 'a', 'fruit'],
 ['apple', 'is', 'a', 'company'],
 ['orange', '##s', 'are', 'ta', '##sty', 'fruits'])

In [111]:
# Map every token list to the corresponding list of vocabulary ids.
ids1, ids2, ids3 = [
    tokenizer.convert_tokens_to_ids(token_list) for token_list in (tok1, tok2, tok3)
]

(ids1, ids2, ids3)

([6207, 2003, 1037, 5909],
 [6207, 2003, 1037, 2194],
 [4589, 2015, 2024, 11937, 21756, 10962])

In [112]:
import torch
import torch.nn.functional as F

# Wrapping each id list in another list gives a 2-D tensor with a leading
# dimension of size 1 (one sentence per tensor).
input1, input2, input3 = [torch.tensor([id_list]) for id_list in (ids1, ids2, ids3)]
(input1, input2, input3)

(tensor([[6207, 2003, 1037, 5909]]),
 tensor([[6207, 2003, 1037, 2194]]),
 tensor([[ 4589,  2015,  2024, 11937, 21756, 10962]]))

In [122]:
# Inference only: run the forward passes under no_grad() so autograd does not
# record them (the outputs are only read, never back-propagated through).
with torch.no_grad():
    out1 = model(input1)
    out2 = model(input2)
    out3 = model(input3)

layer_number = list(range(13))

# out[2] is read as the sequence of hidden states (presumably present because
# the config sets output_hidden_states=True) and [0] picks the single
# sentence in the batch.
# Key k maps to out[2][-k]: k=1 is the last entry, k=2 the one before it, and
# so on. Because -0 == 0, key 0 maps to the FIRST entry of out[2], not to
# "zero from the end".
layers1 = {k: out1[2][-k][0] for k in layer_number}
layers2 = {k: out2[2][-k][0] for k in layer_number}
layers3 = {k: out3[2][-k][0] for k in layer_number}

In [141]:
dics = [layers1, layers2, layers3]

# Add two derived entries to every layer dictionary:
#   "mean3" - element-wise mean of the entries under keys 1, 2, 3
#   "mean5" - element-wise mean of the entries under keys 1, 2, 3, 4, 5
for layer_dict in dics:
    for depth in (3, 5):
        stacked = torch.stack([layer_dict[key] for key in range(1, depth + 1)])
        layer_dict["mean{}".format(depth)] = stacked.mean(dim=0)


In [142]:
def print_token_similarity(layers_a, tokens_a, pos_a, layers_b, tokens_b, pos_b, depth):
    """Print two tokens and the cosine similarity of their embeddings.

    The embedding of token ``pos_a`` is read from ``layers_a[depth]`` and the
    embedding of token ``pos_b`` from ``layers_b[depth]``. The similarity is
    rounded to two decimals and printed as a tab-separated line.
    """
    vec_a = layers_a[depth][pos_a].unsqueeze(dim=0)
    vec_b = layers_b[depth][pos_b].unsqueeze(dim=0)
    similarity = F.cosine_similarity(vec_a, vec_b)
    print("{}\t{}\t{}".format(tokens_a[pos_a], tokens_b[pos_b], round(float(similarity[0]), 2)))


print("1: same, 0: ortho, -1: opposite")

embedding_depth = "mean3"

# Sentence 1 vs sentence 2, token by token at the same position.
for position in range(len(tok1)):
    print_token_similarity(layers1, tok1, position, layers2, tok2, position, embedding_depth)

# First token of sentence 1, then of sentence 2, against the first token of sentence 3.
print_token_similarity(layers1, tok1, 0, layers3, tok3, 0, embedding_depth)
print_token_similarity(layers2, tok2, 0, layers3, tok3, 0, embedding_depth)

1: same, 0: ortho, -1: opposite
apple	apple	0.79
is	is	0.82
a	a	0.82
fruit	company	0.79
apple	orange	0.6
apple	orange	0.58


In [143]:
print("1: same, 0: ortho, -1: opposite")

embedding_depth = 1

# Every comparison to print, as
# (layers A, tokens A, position A, layers B, tokens B, position B):
# sentence 1 vs sentence 2 position by position, then the first token of
# sentence 1 and of sentence 2 against the first token of sentence 3.
comparisons = [(layers1, tok1, pos, layers2, tok2, pos) for pos in range(len(tok1))]
comparisons.append((layers1, tok1, 0, layers3, tok3, 0))
comparisons.append((layers2, tok2, 0, layers3, tok3, 0))

for layers_a, tokens_a, pos_a, layers_b, tokens_b, pos_b in comparisons:
    v1, v2 = layers_a[embedding_depth][pos_a], layers_b[embedding_depth][pos_b]
    output = F.cosine_similarity(v1.unsqueeze(dim=0), v2.unsqueeze(dim=0))
    print("{}\t{}\t{}".format(tokens_a[pos_a], tokens_b[pos_b], round(float(output[0]), 2)))

1: same, 0: ortho, -1: opposite
apple	apple	0.67
is	is	0.73
a	a	0.74
fruit	company	0.67
apple	orange	0.4
apple	orange	0.43


In [28]:
from tqdm import tqdm

# Tokenize every test sample in place (filling .tokens and .token_ids) and
# keep track of the longest token sequence seen.
# NOTE(review): the output recorded further down shows token id 51965, which
# appears to be outside the bert-base-uncased vocabulary - the saved outputs
# were presumably produced with a different tokenizer than the one loaded in
# this notebook. Confirm and re-run from a fresh kernel.
max_tok_length = 0
for samp in tqdm(test_samples):
    samp.tokens = tokenizer.tokenize(samp.text)
    samp.token_ids = tokenizer.convert_tokens_to_ids(samp.tokens)
    max_tok_length = max(max_tok_length, len(samp.tokens))

max_tok_length

100%|██████████| 83631/83631 [00:22<00:00, 3665.36it/s]


274

In [30]:
# Show the tokens and the matching ids of the first test sample, one per line.
print(test_samples[0].tokens, test_samples[0].token_ids, sep="\n")


['sin', '##cope']
[10742, 51965]
