# Train Score Function Using 1st Sentences in Wikipedia Page

In [1]:
import pandas as pd
import random
from transformers import BertTokenizer, BertForSequenceClassification, BatchEncoding, AdamW
import torch
from typing import Iterable, List
import tqdm
from torch.nn import Softmax
import numpy as np
import csv
import json
import sys
import os
from nltk import sent_tokenize


sys.path.append('..')
from tools.BasicUtils import my_read, my_json_read, my_csv_read, my_write, ntopidx
from tools.TextProcessing import clean_text, sent_lemmatize, find_dependency_path_from_tree, nlp

file_description = [
    "wiki_cs_ent.txt ---- CS wikipedia entities from data/kw_ent_map.json which have records in 1st-sents-new.json",
    "*.pt ---- Training result",
    "train.csv ---- The training dataset",
    "valid.csv ---- The validation dataset",
    "wrong_prediction.tsv ---- The sentences that are wrongly predicted by the model"
]

if not os.path.exists('../data/temp/1st_sent'):
    os.mkdir('../data/temp/1st_sent')
    my_write('../data/temp/1st_sent/readme.txt', file_description)

## Update readme.txt

In [18]:
my_write('../data/temp/1st_sent/readme.txt', file_description)

## Load two main data files: 1st-sents-new.json and kw_ent_map.json

In [2]:
# Load 1st-sents-new.json with all words lower-cased
first_sents_dict = json.loads(open('../data/raw_data/1st-sents-new.json', 'r').read().lower())

In [3]:
# Load kw_ent_map.json
kw_ent_map = my_json_read('../data/corpus/kw_ent_map.json')

## Collect CS Wikipedia entities that have records in 1st-sents-new.json

In [19]:
# Get cs terms that have wikipedia page
wiki_cs_ent_in_1st = [item.lower() for item in set(kw_ent_map.values()) if item.lower() in first_sents_dict]
my_write('../data/temp/1st_sent/wiki_cs_ent.txt', wiki_cs_ent_in_1st)

## Collect dataset

In [20]:
wiki_cs_ent_in_1st = my_read('../data/temp/1st_sent/wiki_cs_ent.txt')

In [9]:
wiki_pages = my_json_read('../data/corpus/wiki_pages.json')

In [14]:
# Collect negative sentences
remove_list = ['See also', 'References', 'Further reading', 'summary', 'title']

def collect_neg_sents_from_term(dic:dict, n:int=5):
    term = clean_text(dic['title'])
    neg_sents = []
    section_list = list(dic.keys())
    while len(neg_sents) < n and len(section_list) != 0:
        section = section_list.pop()
        if section in remove_list:
            continue
        section_text = dic[section]
        if not section_text:
            continue
        processed_text = clean_text(section_text)
        if term not in processed_text:
            continue
        temp_sents = sent_tokenize(processed_text)
        for sent in temp_sents:
            if term in sent:
                neg_sents.append('%s\t%s' % (term, sent))
    return neg_sents if neg_sents else None

neg_sents = []
for dic in wiki_pages:
    temp = collect_neg_sents_from_term(dic)
    if temp:
        neg_sents += temp

my_write('../data/test/neg_sents.txt', neg_sents)

In [22]:
# Collect positive sentences
my_write('../data/test/pos_sents.txt', ['%s\t%s' % (clean_text(term), clean_text(first_sents_dict[term]['sentence'])) for term in wiki_cs_ent_in_1st])

In [16]:
class Occurrence:
    def __init__(self, wordtree_file:str, keyword_file:str):
        self.wordtree = my_json_read(wordtree_file)
        self.keyword_list = my_read(keyword_file)
        self.keywords_dict = {word : i for i, word in enumerate(self.keyword_list)}

    def line_operation(self, reformed_sent:list):
        i = 0
        kw_set_for_line = set()
        while i < len(reformed_sent):
            if reformed_sent[i] in self.wordtree: # If the word is the start word of a keyword
                phrase_buf = []
                it = self.wordtree
                j = i
                while j < len(reformed_sent) and reformed_sent[j] in it:
                    # Add the word to the wait list
                    phrase_buf.append(reformed_sent[j])
                    if "" in it[reformed_sent[j]]: # If the word could be the last word of a keyword, update the list
                        # self.line_record[self.keywords_dict[' '.join(phrase_buf).replace(' - ', '-')]].add(int(line_idx) - 1)
                        kw_set_for_line.add(' '.join(phrase_buf).replace(' - ', '-'))
                    # Go down the tree to the next child
                    it = it[reformed_sent[j]]
                    j += 1
                    i = j - 1
            i += 1
        return kw_set_for_line if kw_set_for_line else None

In [25]:
o = Occurrence('../data/corpus/wordtree.json', '../data/corpus/keyword_f.txt')
o.line_operation(sent_lemmatize('a hidden markov model is a markov chain for which the state is only partially observable.'))

{'hidden markov model', 'markov chain', 'partially observable'}

In [39]:
# Generate positive samples
r = my_csv_read('../data/test/pos_sents.txt', delimiter='\t')
target_list = []
for item in r:
    reformed_list = sent_lemmatize(item[1])
    reformed_sent = ' '.join(reformed_list)
    temp_kw_set = o.line_operation(reformed_list)
    if temp_kw_set is None:
        continue
    if len(temp_kw_set) < 2:
        continue
    if item[0] not in temp_kw_set:
        continue
    temp_kw_set.remove(item[0])
    for kw in temp_kw_set:
        target_list.append((item[0], kw, reformed_sent))
        target_list.append((kw, item[0], reformed_sent))
with open('../data/test/pos_samples.tsv', 'w') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(target_list)

In [43]:
# Generate negative samples
neg = my_csv_read('../data/test/neg_sents.txt', delimiter='\t')
pos = my_csv_read('../data/test/pos_sents.txt', delimiter='\t')
keyword_list = my_read('../data/corpus/keyword_f.txt')
target_list = []

for item in pos:
    reformed_list = sent_lemmatize(item[1])
    reformed_sent = ' '.join(reformed_list)
    temp_kw_set = o.line_operation(reformed_list)
    if temp_kw_set is None:
        continue
    random_false_kw = random.sample(keyword_list, 1)[0]
    random_true_kw = random.sample(temp_kw_set, 1)[0]
    target_list.append((random_false_kw, random_true_kw, reformed_sent))
    target_list.append((random_true_kw, random_false_kw, reformed_sent))

for i, item in enumerate(neg):
    reformed_list = sent_lemmatize(item[1])
    reformed_sent = ' '.join(reformed_list)
    temp_kw_set = o.line_operation(reformed_list)
    if temp_kw_set is None:
        continue
    if len(temp_kw_set) < 2:
        continue
    if item[0] not in temp_kw_set:
        continue
    temp_kw_set.remove(item[0])
    for kw in temp_kw_set:
        target_list.append((item[0], kw, reformed_sent))
        target_list.append((kw, item[0], reformed_sent))
    if i >= 10000:
        break
with open('../data/test/neg_samples.tsv', 'w') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(target_list)

In [44]:
# Generate training data

# # Positive samples
pos = pd.DataFrame(my_csv_read('../data/test/pos_samples.tsv', delimiter='\t'), columns=['head_ent', 'tail_ent', 'sent'])
pos['label'] = 'T'

# Negative samples 1
neg = pd.DataFrame(my_csv_read('../data/test/neg_samples.tsv', delimiter='\t'), columns=['head_ent', 'tail_ent', 'sent'])
neg['label'] = 'F'


# df = pos.append(neg, ignore_index=True).sample(frac=1.0).reset_index(drop=True)
df = pd.concat([pos, neg], axis=0, ignore_index=True).sample(frac=1.0).reset_index(drop=True)
df['pair'] = df.apply(lambda x: '<HEAD_ENT> %s <TAIL_ENT> %s' % (x.head_ent, x.tail_ent), axis=1)

split_line = int(len(df) * 0.8)
train_df = df[:split_line].reset_index(drop=True)
valid_df = df[split_line:].reset_index(drop=True)

train_df.to_csv('../data/temp/1st_sent/train.csv', index=False)
valid_df.to_csv('../data/temp/1st_sent/valid.csv', index=False)

In [45]:
train_df.head()

Unnamed: 0,head_ent,tail_ent,sent,label,pair
0,flash memory,write amplification,write amplification is an undesirable phenomen...,T,<HEAD_ENT> flash memory <TAIL_ENT> write ampli...
1,maximum independent set,map graph,"however , the high exponent of the algorithm t...",F,<HEAD_ENT> maximum independent set <TAIL_ENT> ...
2,inductive logic programming,structured prediction,other algorithm and model for structured predi...,F,<HEAD_ENT> inductive logic programming <TAIL_E...
3,syn flood,denial-of-service attack,a syn flood is a form of denial - of - service...,T,<HEAD_ENT> syn flood <TAIL_ENT> denial-of-serv...
4,file sharing,visual art,"the visual art are art form such as painting ,...",F,<HEAD_ENT> file sharing <TAIL_ENT> visual art


In [46]:
len(df)

30780

## Train model

In [2]:
# Load training and validation data
train_df = pd.read_csv('../data/temp/1st_sent/train.csv')
valid_df = pd.read_csv('../data/temp/1st_sent/valid.csv')

In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_special_tokens({'additional_special_tokens' : ['<HEAD_ENT>', '<TAIL_ENT>', '<DEP_PATH>']})

3

In [50]:
torch.cuda.is_available()

True

In [51]:
# Load model for training
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.resize_token_embeddings(len(tokenizer))
# model = BertForSequenceClassification.from_pretrained('temp2.pt')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Embedding(30525, 768)

In [5]:
# Function for batch generation
def batch(sents:Iterable, n:int):
    l = len(sents)
    for ndx in range(0, l, n):
        yield sents[ndx:min(ndx + n, l)]

In [None]:
# Train the model
model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)

batch_list = [item for item in batch(train_df, 32)]

for epoch in range(3):
    loss = 0
    batch_num = 0
    for batch_df in tqdm.tqdm(batch_list):
        optim.zero_grad()
        labels = torch.tensor([1 if i == 'T' else 0 for i in batch_df.label.to_list()]).unsqueeze(1).to(device)
        inputs = BatchEncoding(tokenizer(batch_df.sent.to_list(), batch_df.pair.to_list(), padding=True, truncation=True, max_length=80, return_tensors="pt")).to(device)
        output = model(**inputs, labels=labels)
        loss += output.loss
        output.loss.backward()
        optim.step()
    print(loss / len(batch_list))

In [None]:
# Save trained model
tokenizer.save_pretrained('../data/temp/1st_sent/test1.pt')
model.save_pretrained('../data/temp/1st_sent/test1.pt')

# Tests

In [3]:
# Reload trained model
reload_model = BertForSequenceClassification.from_pretrained('../data/temp/1st_sent/test1.pt')
tokenizer = BertTokenizer.from_pretrained('../data/temp/1st_sent/test1.pt')

In [6]:
# Validation check
reload_model.to('cpu')
reload_model.eval()
eval_loss = 0
eval_batch_num = 0
eval_batch_list = [item for item in batch(valid_df, 16)]
with torch.no_grad():
    for batch_df in tqdm.tqdm(eval_batch_list):
        labels = torch.tensor([1 if i == 'T' else 0 for i in batch_df.label.to_list()]).unsqueeze(1)
        inputs = BatchEncoding(tokenizer(batch_df.sent.to_list(), batch_df.pair.to_list(), padding=True, truncation=True, max_length=80, return_tensors='pt'))
        output = reload_model(**inputs, labels=labels)
        eval_loss += output.loss
    print(eval_loss / len(eval_batch_list))

100%|██████████| 385/385 [03:26<00:00,  1.87it/s]

tensor(0.0309)





In [7]:
# Function that help generate score
def get_score(sents:List[str], pairs:List[str]):
    with torch.no_grad():
        inputs = BatchEncoding(tokenizer(sents, pairs, padding=True, truncation=True, max_length=80, return_tensors='pt'))
        output = reload_model(**inputs)
        s = Softmax(1)
        return s(output.logits)

In [9]:
# Get logits score
val_output = get_score(valid_df.sent.to_list(), valid_df.pair.to_list())
# Get prediction label
cls_result = np.argmax(val_output.numpy(), axis=1)
# Get prediction score
cls_score = val_output.numpy()[:, 1]
# Get ground truth
val_label = np.array([1 if l == 'T' else 0 for l in valid_df.label.to_list()])
# Get correct ones
correct_prediction = val_label == cls_result
# Sum the number of correct ones
correct_num = np.sum(correct_prediction)
# Get the wrong prediction idx
wrong_prediction_idx = np.arange(0, len(val_label))[val_label != cls_result]
# Get the wrong ones
wrong_samples = [(cls_result[idx], valid_df.label[idx], valid_df.pair[idx], valid_df.sent[idx]) for idx in wrong_prediction_idx]
# Write the wrong ones to file
with open('../data/temp/1st_sent/wrong_prediction.tsv', 'w') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(wrong_samples)

In [None]:
# test_sents = my_read('all_occurance.txt')
# test_pairs = ['<HEAD_ENT> %s <TAIL_ENT> %s' % ('python', 'programming language')] * len(test_sents)

# test_result = get_score(test_sents, test_pairs)
# test_cls_score = test_result.numpy()[:, 1]
# test_idx = ntopidx(len(test_cls_score), test_cls_score)
# test_sentences = [('%.8f' % test_cls_score[i], test_sents[i]) for i in test_idx]
# with open('test.tsv', 'w') as f_out:
#     w = csv.writer(f_out, delimiter='\t')
#     w.writerows(test_sentences)