# Train Score Function Using 1st Sentences in Wikipedia Page

In [1]:
import pandas as pd
import random
from transformers import BertTokenizer, BertForSequenceClassification, BatchEncoding, AdamW
import torch
from typing import Iterable, List
import tqdm
from torch.nn import Softmax
import numpy as np
import csv
import sys

sys.path.append('..')
from tools.BasicUtils import my_read, my_json_read, my_csv_read, MultiThreading, my_write, clean_sent, get_wiki_page_from_kw, ntopidx, nsmallidx
from py_1st_sent import collect_neg_sents_from_term

In [17]:
import json
from tools.TextProcessing import sent_lemmatize

class Occurrence:
    def __init__(self, wordtree_file:str, keyword_file:str):
        self.wordtree = json.load(open(wordtree_file, 'r'))
        self.keyword_list = open(keyword_file, 'r').read().strip().split('\n')
        self.keywords_dict = {word : i for i, word in enumerate(self.keyword_list)}

    def line_operation(self, line:str):
        reformed_sent = sent_lemmatize(line)
        i = 0
        kw_set_for_line = set()
        while i < len(reformed_sent):
            if reformed_sent[i] in self.wordtree: # If the word is the start word of a keyword
                phrase_buf = []
                it = self.wordtree
                j = i
                while j < len(reformed_sent) and reformed_sent[j] in it:
                    # Add the word to the wait list
                    phrase_buf.append(reformed_sent[j])
                    if "" in it[reformed_sent[j]]: # If the word could be the last word of a keyword, update the list
                        # self.line_record[self.keywords_dict[' '.join(phrase_buf).replace(' - ', '-')]].add(int(line_idx) - 1)
                        kw_set_for_line.add(' '.join(phrase_buf).replace(' - ', '-'))
                    # Go down the tree to the next child
                    it = it[reformed_sent[j]]
                    j += 1
                    i = j - 1
            i += 1
        return kw_set_for_line if kw_set_for_line else None

In [None]:
# Generate json file with all strings lowercased
!cat ../data/raw_data/1st-sents-new.json | tr '[:upper:]' '[:lower:]' > ../data/corpus/1st-sents-lowercase.json

In [None]:
# Load json file
first_sents_dict = my_json_read('../data/corpus/1st-sents-lowercase.json')

In [None]:
terms_cs_cfl = my_csv_read('../data/raw_data/terms-cs-cfl-epoch200.txt', delimiter='\t')

In [None]:
# Get cs terms that have wikipedia page
wiki_cs_terms = []
for item in terms_cs_cfl:
    kw = item[0]
    if kw in first_sents_dict:
        wiki_cs_terms.append(kw)
        if len(wiki_cs_terms) >= 5000:
            break

In [None]:
my_write('wiki_cs_terms.txt', wiki_cs_terms)

In [None]:
wiki_cs_terms = my_read('wiki_cs_terms.txt')

In [None]:
# Collect negative sentences
mt = MultiThreading()
my_write('../data/test/neg_sents.txt', mt.run(collect_neg_sents_from_term, wiki_cs_terms[:3000], 10).split('\n'))

In [None]:
# Collect positive sentences
my_write('../data/test/pos_sents.txt', ['%s\t%s' % (term, clean_sent(first_sents_dict[term]['sentence'])) for term in wiki_cs_terms[:3000]])

In [18]:
# Generate samples
o = Occurrence('../data/corpus/wordtree.json', '../data/corpus/keyword_f.txt')
r = my_csv_read('../data/test/pos_sents.txt', delimiter='\t')
target_list = []
target_file = '../data/test/pos_samples.tsv'
for item in r:
    temp_kw_set = o.line_operation(item[1])
    if temp_kw_set is None:
        continue
    if item[0] in temp_kw_set:
        temp_kw_set.remove(item[0])
    if temp_kw_set:
        for kw in temp_kw_set:
            target_list.append((item[0], kw, item[1]))
with open(target_file, 'w') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(target_list)

In [42]:
o = Occurrence('../data/corpus/wordtree.json', '../data/corpus/keyword_f.txt')
o.line_operation('a hidden markov model is a markov chain for which the state is only partially observable.')

{'hidden markov model', 'markov chain'}

In [3]:
# Generate training data

# # Positive samples
pos = pd.DataFrame(my_csv_read('../data/test/pos_samples.tsv', delimiter='\t'), columns=['head_ent', 'tail_ent', 'sent'])
pos['label'] = 'T'

# Negative samples
neg = pd.DataFrame(my_csv_read('../data/test/neg_samples.tsv', delimiter='\t'), columns=['head_ent', 'tail_ent', 'sent'])
neg['label'] = 'F'

df = pos.append(neg, ignore_index=True).sample(frac=1.0).reset_index(drop=True)
df['pair'] = df.apply(lambda x: '<HEAD_ENT> %s <TAIL_ENT> %s' % (x.head_ent, x.tail_ent), axis=1)

split_line = int(len(df) * 0.8)
train_df = df[:split_line].reset_index(drop=True)
valid_df = df[split_line:].reset_index(drop=True)

train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)

In [None]:
train_df.head()

In [2]:
# Load training and validation data
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('valid.csv')

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_special_tokens({'additional_special_tokens' : ['<HEAD_ENT>', '<TAIL_ENT>', '<DEP_PATH>']})

3

In [7]:
# Load model for training
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.resize_token_embeddings(len(tokenizer))
# model = BertForSequenceClassification.from_pretrained('temp2.pt')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Embedding(30525, 768)

In [8]:
# Function for batch generation
def batch(sents:Iterable, n:int):
    l = len(sents)
    for ndx in range(0, l, n):
        yield sents[ndx:min(ndx + n, l)]

In [9]:
# Train the model
model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)

batch_list = [item for item in batch(train_df, 32)]

for epoch in range(3):
    loss = 0
    batch_num = 0
    for batch_df in tqdm.tqdm(batch_list):
        optim.zero_grad()
        labels = torch.tensor([1 if i == 'T' else 0 for i in batch_df.label.to_list()]).unsqueeze(1).to(device)
        inputs = BatchEncoding(tokenizer(batch_df.sent.to_list(), batch_df.pair.to_list(), padding=True, truncation=True, max_length=80, return_tensors="pt")).to(device)
        output = model(**inputs, labels=labels)
        loss += output.loss
        output.loss.backward()
        optim.step()
    print(loss / len(batch_list))

100%|██████████| 201/201 [00:49<00:00,  4.07it/s]
  0%|          | 0/201 [00:00<?, ?it/s]

tensor(0.1808, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 201/201 [00:50<00:00,  3.98it/s]
  0%|          | 0/201 [00:00<?, ?it/s]

tensor(0.0633, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 201/201 [00:50<00:00,  4.01it/s]

tensor(0.0359, device='cuda:0', grad_fn=<DivBackward0>)





In [10]:
# Save trained model
tokenizer.save_pretrained('temp3.pt')
model.save_pretrained('temp3.pt')

# Tests

In [2]:
# Reload trained model
reload_model = BertForSequenceClassification.from_pretrained('temp3.pt')
tokenizer = BertTokenizer.from_pretrained('temp3.pt')

In [12]:
# Validation check
reload_model.to('cpu')
reload_model.eval()
eval_loss = 0
eval_batch_num = 0
eval_batch_list = [item for item in batch(valid_df, 16)]
with torch.no_grad():
    for batch_df in tqdm.tqdm(eval_batch_list):
        labels = torch.tensor([1 if i == 'T' else 0 for i in batch_df.label.to_list()]).unsqueeze(1)
        inputs = BatchEncoding(tokenizer(batch_df.sent.to_list(), batch_df.pair.to_list(), padding=True, truncation=True, max_length=80, return_tensors='pt'))
        output = reload_model(**inputs, labels=labels)
        eval_loss += output.loss
    print(eval_loss / len(eval_batch_list))

100%|██████████| 101/101 [00:59<00:00,  1.70it/s]

tensor(0.1032)





In [3]:
# Function that help generate score
def get_score(sents:List[str], pairs:List[str]):
    with torch.no_grad():
        inputs = BatchEncoding(tokenizer(sents, pairs, padding=True, truncation=True, max_length=80, return_tensors='pt'))
        output = reload_model(**inputs)
        s = Softmax(1)
        return s(output.logits)

In [5]:
# Get logits score
val_output = get_score(valid_df.sent.to_list(), valid_df.pair.to_list())
# Get prediction label
cls_result = np.argmax(val_output.numpy(), axis=1)
# Get prediction score
cls_score = val_output.numpy()[:, 1]
# Get ground truth
val_label = np.array([1 if l == 'T' else 0 for l in valid_df.label.to_list()])
# Get correct ones
correct_prediction = val_label == cls_result
# Sum the number of correct ones
correct_num = np.sum(correct_prediction)
# Get the wrong prediction idx
wrong_prediction_idx = np.arange(0, len(val_label))[val_label != cls_result]
# Get the wrong ones
wrong_samples = [(valid_df.sent[idx], valid_df.pair[idx], valid_df.label[idx], cls_result[idx]) for idx in wrong_prediction_idx]
# Write the wrong ones to file
with open('wrong_prediction.tsv', 'w') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(wrong_samples)

In [5]:
test_sents = my_read('all_occurance.txt')
test_pairs = ['<HEAD_ENT> %s <TAIL_ENT> %s' % ('python', 'programming language')] * len(test_sents)

test_result = get_score(test_sents, test_pairs)
test_cls_score = test_result.numpy()[:, 1]
test_idx = ntopidx(len(test_cls_score), test_cls_score)
test_sentences = [('%.8f' % test_cls_score[i], test_sents[i]) for i in test_idx]
with open('test.tsv', 'w') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(test_sentences)

In [None]:
# Collect 1st_sentence like sentences
all_sents = open('../data/corpus/small_sent.txt', 'r').read().strip().split('\n')
random.shuffle(all_sents)
sents = all_sents[:2000]
output = get_score(sents)

In [None]:
score = output[:, 1]

In [None]:
sum(score > 0.5)

In [None]:
score = score.numpy()

In [None]:
idx = np.arange(len(score))[score > 0.5]

In [None]:
good_sents = [sents[i] for i in idx]

In [None]:
good_sents

In [None]:
score[score > 0.5]