In [1]:
import json
import csv
from typing import List
import numpy as np
import torch
from torch.nn import Softmax, Sigmoid, BCELoss
import pandas
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, BatchEncoding, PreTrainedModel, PretrainedConfig, BertModel, BertConfig, AdamW
import sys
import tqdm

sys.path.append('..')
from joint_score_func import SparseRetrieveSentForPairCoOccur
from tools.BasicUtils import ntopidx

In [2]:
# Load the pretrained BERT tokenizer, then register '<RELATION>' as an extra
# special token; the queries built further down join the two entities with it.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
special_tokens = {'additional_special_tokens' : ['<RELATION>']}
tokenizer.add_special_tokens(special_tokens)

1

<h2> Training test </h2>

<h3> Score function 1 </h3>

In [None]:
# Score function 1: load the prepared dataset and configure training.
# The same directory is passed to TrainingArguments as its first argument.
sf1_dir = 'data/single-ollie'
dataset = load_from_disk(sf1_dir)
training_args = TrainingArguments(sf1_dir, evaluation_strategy="epoch")

In [None]:
# Two-label sequence classifier on top of pretrained BERT. The embedding
# matrix is resized so it also covers the token added to the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

In [None]:
def preprocess_sf1(examples):
    """Tokenize a batch of examples for score function 1.

    Each example becomes one sentence pair: the first segment is the query
    ``'<ent1> <RELATION> <ent2>'`` and the second segment is the sentence.

    Args:
        examples: Batched mapping with parallel ``'ent1'``, ``'ent2'`` and
            ``'sent'`` sequences.

    Returns:
        The tokenizer output for the query/sentence pairs.
    """
    query = ['%s <RELATION> %s' % (ent1, ent2) for ent1, ent2 in zip(examples['ent1'], examples['ent2'])]
    # Pad to the fixed max_length instead of the longest item of this batch:
    # with per-batch padding, rows tokenized in different map batches can end
    # up with different lengths and then cannot be stacked into one tensor.
    return tokenizer(query, examples["sent"], padding='max_length', truncation=True, max_length=100)
    
# Tokenize both splits with the score-function-1 preprocessing, in batches.
train_dataset, valid_dataset = (
    dataset[split].map(preprocess_sf1, batched=True) for split in ('train', 'valid')
)

In [None]:
# Wire the classifier, the training arguments and the tokenized train/valid splits into a Trainer.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=valid_dataset)

In [None]:
# Fine-tune score function 1.
trainer.train()

<h3> Score function 2 </h3>

In [None]:
# Score function 2: load its dataset and configure training.
# The same directory is passed to TrainingArguments as its first argument.
sf2_dir = 'data/single-ollie2'
dataset = load_from_disk(sf2_dir)
training_args = TrainingArguments(sf2_dir, evaluation_strategy="epoch")

In [4]:
class ScoreFunction2(PreTrainedModel):
    """Dual-encoder scorer built from two independent BERT encoders.

    A context (sentence) and a query are encoded separately; the score is the
    sigmoid of the dot product of their first-token hidden states.
    """

    # from_pretrained() resolves the saved configuration through this
    # attribute; without it, loading from a path alone (no explicit config
    # object) cannot construct the config.
    config_class = BertConfig

    def __init__(self, config:PretrainedConfig):
        """Build both encoders from ``config``; no pretrained weights are loaded here."""
        super().__init__(config)
        self._context_encoder = BertModel(config)
        self._query_encoder = BertModel(config)
        self._sigmoid = Sigmoid()

    def forward(self, 
        context_input_ids,
        query_input_ids,
        context_token_type_ids=None,
        context_attention_mask=None,
        query_token_type_ids=None,
        query_attention_mask=None):
        """Score every (context, query) pair of the batch.

        Args:
            context_input_ids: Token ids of the context sentences.
            query_input_ids: Token ids of the queries.
            context_token_type_ids: Optional segment ids for the contexts.
            context_attention_mask: Optional attention mask for the contexts.
            query_token_type_ids: Optional segment ids for the queries.
            query_attention_mask: Optional attention mask for the queries.

        Returns:
            A 1-D tensor holding one sigmoid score per pair.
        """
        context_inputs = {'input_ids': context_input_ids, 'token_type_ids': context_token_type_ids, 'attention_mask': context_attention_mask}
        query_inputs = {'input_ids': query_input_ids, 'token_type_ids': query_token_type_ids, 'attention_mask': query_attention_mask}
        # Use the hidden state at sequence position 0 as the sequence embedding.
        context_emb = self._context_encoder(**context_inputs).last_hidden_state[:, 0, :]
        query_emb = self._query_encoder(**query_inputs).last_hidden_state[:, 0, :]
        # Element-wise product summed over the hidden dimension == row-wise dot product.
        score = self._sigmoid(torch.mul(context_emb, query_emb).sum(dim=1))
        return score

class ScoreFunction2Trainer(Trainer):
    """Trainer that optimises a score-producing model with binary cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the BCE loss between the model's scores and the labels.

        Args:
            model: Model called as ``model(**inputs)``; expected to return scores.
            inputs: Batch mapping; its ``"labels"`` entry is removed (popped)
                before the remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)``.

        Returns:
            The loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss_function = BCELoss()
        # BCELoss requires floating-point targets; integer class labels are
        # cast to the dtype of the scores (a no-op when already matching).
        loss = loss_function(outputs, labels.to(outputs.dtype))
        return (loss, outputs) if return_outputs else loss

def preprocess_sf2(examples):
    """Tokenize a batch of examples for score function 2.

    Sentences (contexts) and ``'<ent1> <RELATION> <ent2>'`` queries are
    tokenized separately, and their fields are returned under ``context_*``
    and ``query_*`` keys.

    Args:
        examples: Batched mapping with parallel ``'ent1'``, ``'ent2'`` and
            ``'sent'`` sequences.

    Returns:
        Dict with input ids, token type ids and attention masks for both the
        contexts and the queries.
    """
    query = ['%s <RELATION> %s' % (ent1, ent2) for ent1, ent2 in zip(examples['ent1'], examples['ent2'])]
    # Pad to the fixed max_length instead of the longest item of this batch:
    # with per-batch padding, rows tokenized in different map batches can end
    # up with different lengths and then cannot be stacked into one tensor.
    context_tokenized = tokenizer(examples["sent"], padding='max_length', truncation=True, max_length=100)
    query_tokenized = tokenizer(query, padding='max_length', truncation=True, max_length=100)
    return {'context_input_ids': context_tokenized['input_ids'], 
            'context_token_type_ids': context_tokenized['token_type_ids'], 
            'context_attention_mask': context_tokenized['attention_mask'],
            'query_input_ids': query_tokenized['input_ids'], 
            'query_token_type_ids': query_tokenized['token_type_ids'], 
            'query_attention_mask': query_tokenized['attention_mask']}

In [None]:
# Tokenize both splits with the score-function-2 preprocessing, in batches.
train_dataset, valid_dataset = (
    dataset[split].map(preprocess_sf2, batched=True) for split in ('train', 'valid')
)

In [4]:
# Dual-encoder model built from a default BertConfig. Only the query encoder's
# embeddings are resized: the queries, not the context sentences, are the
# inputs built with the added '<RELATION>' token.
bert_config = BertConfig()
model = ScoreFunction2(bert_config)
vocab_size = len(tokenizer)
model._query_encoder.resize_token_embeddings(vocab_size)

Embedding(30523, 768)

In [None]:
# Wire the dual-encoder model and the tokenized splits into the BCE-loss Trainer subclass.
trainer = ScoreFunction2Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=valid_dataset)

In [None]:
# Train score function 2.
trainer.train()

<h3> Temp: manual training loop for score function 2 on a data subset </h3>

In [5]:
# Load the JSON dataset and keep only the first 10000 training and 2000
# validation records, then turn each split into a DataFrame.
# The context manager closes the file handle once parsing is done.
with open('data/my_dataset.json') as f_in:
    temp_dict = json.load(f_in)
temp_dict['train'] = temp_dict['train'][:10000]
temp_dict['valid'] = temp_dict['valid'][:2000]
train_dataset = pandas.DataFrame.from_dict(temp_dict['train'])
valid_dataset = pandas.DataFrame.from_dict(temp_dict['valid'])

In [6]:
from tools.BasicUtils import batch

In [9]:
# Prefer the GPU, but fall back to the CPU so the cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def preprocess_sf2_temp(examples):
    """Tokenize one DataFrame batch and move it to ``device`` as tensors.

    Args:
        examples: DataFrame slice with ``ent1``, ``ent2``, ``sent`` and
            ``labels`` columns.

    Returns:
        Dict of tensors on ``device``: the ``context_*`` and ``query_*``
        token fields (long) plus float ``labels``.
    """
    query = ['%s <RELATION> %s' % (ent1, ent2) for ent1, ent2 in zip(examples['ent1'], examples['ent2'])]
    context_tokenized = tokenizer(examples["sent"].to_list(), padding=True, truncation=True, max_length=100)
    query_tokenized = tokenizer(query, padding=True, truncation=True, max_length=100)
    features = {}
    for prefix, tokenized in (('context', context_tokenized), ('query', query_tokenized)):
        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            features['%s_%s' % (prefix, field)] = torch.LongTensor(tokenized[field]).to(device)
    # torch.Tensor(...) builds a float tensor, which is what BCELoss expects as target.
    features['labels'] = torch.Tensor(examples['labels'].to_list()).to(device)
    return features

def compute_loss(model, inputs, return_outputs=False):
    """Binary cross-entropy between the model's scores and the batch labels.

    The ``'labels'`` entry is removed from ``inputs`` (the dict is modified in
    place) and every remaining entry is passed to ``model`` as a keyword
    argument. Returns the loss, or ``(loss, outputs)`` when ``return_outputs``
    is true.
    """
    targets = inputs.pop('labels')
    predictions = model(**inputs)
    bce = BCELoss()(predictions, targets)
    if return_outputs:
        return (bce, predictions)
    return bce

# Manual training loop: 3 epochs over fixed-size batches of 16 rows.
model.to(device)
# Make sure dropout etc. are in training mode regardless of how the model was created.
model.train()
optim = AdamW(model.parameters(), lr=5e-5)
# NOTE(review): an earlier recorded run printed the identical mean loss for
# every epoch; if that persists, check whether the sigmoid score saturates
# — TODO confirm.
for epoch in range(3):
    loss_sum = 0
    batch_list = list(batch(train_dataset, 16))
    for b in tqdm.tqdm(batch_list):
        inputs = preprocess_sf2_temp(b)
        # Clear the gradients of the previous step; without this, backward()
        # keeps adding to them and every update uses the accumulated sum.
        optim.zero_grad()
        loss = compute_loss(model, inputs)
        loss.backward()
        loss_sum += loss.detach()
        optim.step()
    # Mean loss per batch for this epoch (guarded against an empty batch list).
    print(loss_sum / max(len(batch_list), 1))

100%|██████████| 625/625 [01:50<00:00,  5.64it/s]
  0%|          | 1/625 [00:00<01:37,  6.38it/s]

tensor(39.6400, device='cuda:0')


100%|██████████| 625/625 [01:51<00:00,  5.61it/s]
  0%|          | 1/625 [00:00<01:37,  6.37it/s]

tensor(39.6400, device='cuda:0')


100%|██████████| 625/625 [01:51<00:00,  5.61it/s]

tensor(39.6400, device='cuda:0')





In [10]:
# Save the current model to the score-function-2 directory.
model.save_pretrained('data/single-ollie2')

<h3> Testing </h3>

In [3]:
# Load the fine-tuned score-function-1 classifier from a training checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('data/single-ollie/checkpoint-3500')
# Alternative (disabled): evaluate the saved dual-encoder model instead.
# model = ScoreFunction2.from_pretrained('data/single-ollie2')
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30523, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
# Build the sparse sentence retriever.
# NOTE(review): the arguments look like a sentence corpus and a co-occurrence index — confirm against joint_score_func.
sparse_retriever = SparseRetrieveSentForPairCoOccur('../data/corpus/small_sent.txt', 'data/occur.json')

In [43]:
# Hand-built test case: one entity pair plus one candidate sentence.
# NOTE(review): the sentence mentions 'data mining' but not 'machine learning'.
ent1 = 'data mining'
ent2 = 'machine learning'
s = 'in this paper, we show that by using the fuzzy statistics analysis and the data mining technology, the target - oriented fuzzy correlation rules can be obtained from a given database.'
# Alternative (disabled): build one test record per sentence the sparse retriever returns for the pair.
# sent = sparse_retriever.retrieve(ent1, ent2)
# test_list = [{'sent' : s, 'ent1' : ent1, 'ent2' : ent2, 'labels' : 1} for s in sent]
# Active variant: a single record, so the length printed below is 1.
test_list = [{'sent' : s, 'ent1' : ent1, 'ent2' : ent2, 'labels' : 1}]
print(len(test_list))

58


In [None]:
# Alternative test set: the first 200 validation records of the JSON dataset.
# The context manager closes the file handle once parsing is done.
with open('data/my_dataset.json') as f_in:
    temp_dict = json.load(f_in)
test_list = temp_dict['valid'][:200]

In [44]:
# Turn the list of test records into a DataFrame (one row per record).
valid_df = pandas.DataFrame.from_dict(test_list)

In [7]:
# Helper that scores sentences against entity pairs with the loaded classifier
def get_score(sents:List[str], ent1s:List[str], ent2s:List[str], max_length:int=80):
    """Return class probabilities for each (entity pair, sentence) input.

    Args:
        sents: Candidate sentences.
        ent1s: First entity of each pair, parallel to ``sents``.
        ent2s: Second entity of each pair, parallel to ``sents``.
        max_length: Truncation length handed to the tokenizer. Defaults to the
            previous hard-coded value of 80.

    Returns:
        CPU tensor with one row per input, softmax-normalised over the
        model's logits.
    """
    query = ['%s <RELATION> %s' % (ent1, ent2) for ent1, ent2 in zip(ent1s, ent2s)]
    inputs = tokenizer(query, sents, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    # Run on whichever device the model lives on; a CPU batch fed to a GPU
    # model would raise a device-mismatch error.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        output = model(**inputs)
    # Bring the result back to the CPU so callers can convert it with .numpy().
    return Softmax(1)(output.logits).cpu()

In [45]:
# Class probabilities (softmax over the logits) for every validation row
val_output = get_score(valid_df.sent.to_list(), valid_df.ent1.to_list(), valid_df.ent2.to_list())
# Convert to NumPy once and reuse the array below
val_probs = val_output.numpy()
# Predicted label = most probable class
cls_result = np.argmax(val_probs, axis=1)
# Prediction score = probability of class 1
cls_score = val_probs[:, 1]
# Ground-truth labels
val_label = np.array(valid_df.labels.to_list())
# Boolean mask of the correct predictions
correct_prediction = val_label == cls_result
# Number of correct predictions
correct_num = np.sum(correct_prediction)
# Positions of the misclassified rows
wrong_prediction_idx = np.flatnonzero(val_label != cls_result)
# (predicted, expected, ent1, ent2, sentence) for every misclassified row
wrong_samples = [(cls_result[idx], valid_df.labels[idx], valid_df.ent1[idx], valid_df.ent2[idx], valid_df.sent[idx]) for idx in wrong_prediction_idx]
# Write the misclassified rows to a TSV file. newline='' lets the csv module
# control line endings itself, as the csv documentation requires.
with open('data/wrong_prediction.tsv', 'w', newline='') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(wrong_samples)

# Rank all rows by their class-1 score.
# NOTE(review): presumably ntopidx returns the indices of the n highest scores, best first — confirm in tools.BasicUtils.
rank_ids = ntopidx(len(cls_score), cls_score)
rank_list = [(cls_score[idx], valid_df.ent1[idx], valid_df.ent2[idx], valid_df.sent[idx]) for idx in rank_ids]
with open('data/rank_list.tsv', 'w', newline='') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(rank_list)