# Train a Score Function Using the First Sentences of Wikipedia Pages

In [3]:
import pandas as pd
import random
from transformers import BertTokenizer, BertForSequenceClassification, BatchEncoding, AdamW
import torch
from typing import Iterable, List
import tqdm
from torch.nn import Softmax
import numpy as np
import csv
import sys
import json
from nltk import sent_tokenize


sys.path.append('..')
from tools.BasicUtils import my_read, my_json_read, my_csv_read, my_write, ntopidx
from tools.TextProcessing import clean_text, sent_lemmatize

In [4]:
# Load the first-sentence data from JSON; later cells look entries up by term
first_sents_dict = my_json_read('../data/corpus/1st-sents-lowercase.json')

In [5]:
# Read the tab-separated term file; only the first column of each row is used later
terms_cs_cfl = my_csv_read('../data/raw_data/terms-cs-cfl-epoch200.txt', delimiter='\t')

In [6]:
# Keep only the CS terms that have an entry in the first-sentence dictionary
wiki_cs_terms = [row[0] for row in terms_cs_cfl if row[0] in first_sents_dict]

In [7]:
# Save the filtered term list to wiki_cs_terms.txt
my_write('wiki_cs_terms.txt', wiki_cs_terms)

In [2]:
# Read the term list back from wiki_cs_terms.txt
wiki_cs_terms = my_read('wiki_cs_terms.txt')

In [8]:
# Load the Wikipedia page records; each one is used below as a dict with a
# 'title' key plus keys that are looked up as section names
wiki_pages = my_json_read('../data/temp/wiki_pages.json')

In [11]:
# Collect negative sentences
# Keys that are skipped when a page's sections are scanned for negative sentences
remove_list = ['See also', 'References', 'Further reading', 'summary']

def collect_neg_sents_from_term(dic:dict, n:int=5):
    """Collect sentences from a page's sections that mention the page's own term.

    Args:
        dic: One page record. Must contain a 'title' key; every other key that
            is not listed in ``remove_list`` is treated as a section name
            mapping to that section's text.
        n: Stop visiting further sections once at least this many sentences
            have been collected. Every matching sentence of a visited section
            is kept, so the result may contain more than ``n`` entries.

    Returns:
        A list of 'term<TAB>sentence' strings, or None when nothing matched.
    """
    term = clean_text(dic['title'])
    # 'title' holds the page title, not section text. Scanning it would always
    # yield the bogus sample 'term<TAB>term', so it is excluded as well.
    skipped = set(remove_list) | {'title'}
    section_list = [key for key in dic if key not in skipped]
    neg_sents = []
    while len(neg_sents) < n and section_list:
        # Sections are consumed from the end of the key list
        section_text = dic[section_list.pop()]
        if not section_text:
            continue
        processed_text = clean_text(section_text)
        # Cheap pre-check: skip sentence splitting when the term never occurs
        if term not in processed_text:
            continue
        neg_sents.extend(
            '%s\t%s' % (term, sent)
            for sent in sent_tokenize(processed_text)
            if term in sent
        )
    return neg_sents if neg_sents else None

# Gather the negative sentences of every page, then write them all to disk
neg_sents = []
for page in wiki_pages:
    page_sents = collect_neg_sents_from_term(page)
    if not page_sents:
        continue
    neg_sents.extend(page_sents)

my_write('../data/test/neg_sents.txt', neg_sents)

In [14]:
# Collect positive sentences: one 'term<TAB>cleaned first sentence' line per term
my_write(
    '../data/test/pos_sents.txt',
    [
        '%s\t%s' % (term, clean_text(first_sents_dict[term]['sentence']))
        for term in wiki_cs_terms
    ],
)

In [5]:
class Occurrence:
    """Find known keywords in a tokenized sentence using a word trie.

    ``wordtree`` is a nested dict keyed by word. A node that contains the
    key ``""`` marks the end of a complete keyword.
    """

    def __init__(self, wordtree_file:str, keyword_file:str):
        """Load the word trie and the keyword list.

        Args:
            wordtree_file: Path passed to ``my_json_read`` to load the trie.
            keyword_file: Path passed to ``my_read`` to load the keyword list.
        """
        self.wordtree = my_json_read(wordtree_file)
        self.keyword_list = my_read(keyword_file)
        # Map every keyword to its position in the keyword list
        self.keywords_dict = {word : i for i, word in enumerate(self.keyword_list)}

    def line_operation(self, reformed_sent:list):
        """Return the keywords that occur in a tokenized sentence.

        Starting at each scan position the trie is walked as far as the
        tokens allow, and every complete keyword met on the way is recorded.
        The scan then resumes right after the longest keyword found, or at
        the next token when the walk found none. Tokens consumed by a walk
        that ran past the last complete keyword are therefore still
        considered as keyword starts.

        Args:
            reformed_sent: The sentence as a list of word tokens.

        Returns:
            A set of keyword strings (tokens joined by spaces, with ' - '
            collapsed to '-'), or None when no keyword was found.
        """
        kw_set_for_line = set()
        total = len(reformed_sent)
        i = 0
        while i < total:
            node = self.wordtree
            phrase_buf = []
            last_match_end = -1
            j = i
            while j < total and reformed_sent[j] in node:
                phrase_buf.append(reformed_sent[j])
                # Go down the tree to the next child
                node = node[reformed_sent[j]]
                if "" in node:
                    # The words walked so far form a complete keyword
                    kw_set_for_line.add(' '.join(phrase_buf).replace(' - ', '-'))
                    last_match_end = j
                j += 1
            # Skip past the longest keyword; otherwise advance by one token so
            # that no potential keyword start is lost to a dead-end walk.
            i = last_match_end + 1 if last_match_end >= i else i + 1
        return kw_set_for_line if kw_set_for_line else None

In [8]:
# Generate samples: for every negative sentence, emit each ordered pair of
# distinct keywords found in it together with the lemmatized sentence
o = Occurrence('../data/corpus/wordtree.json', '../data/corpus/keyword_f.txt')
r = my_csv_read('../data/test/neg_sents.txt', delimiter='\t')
target_list = []
target_file = '../data/test/neg_samples.tsv'
for item in r:
    # item[1] is the sentence column of the 'term<TAB>sentence' rows
    reformed_list = sent_lemmatize(item[1])
    reformed_sent = ' '.join(reformed_list)
    temp_kw_set = o.line_operation(reformed_list)
    if temp_kw_set is None:
        continue
    temp_kw_list = list(temp_kw_set)
    # A single keyword produces no pair, so such sentences add nothing
    target_list.extend(
        (head, tail, reformed_sent)
        for i, head in enumerate(temp_kw_list)
        for j, tail in enumerate(temp_kw_list)
        if i != j
    )
# newline='' lets the csv module control line endings itself; without it
# extra blank rows can appear on platforms that translate '\n'
with open(target_file, 'w', newline='') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(target_list)

In [13]:
# Sanity check: show the keywords detected in one lemmatized example sentence
o = Occurrence('../data/corpus/wordtree.json', '../data/corpus/keyword_f.txt')
o.line_operation(sent_lemmatize('a hidden markov model is a markov chain for which the state is only partially observable.'))

{'hidden markov model', 'markov chain', 'partially observable'}

In [9]:
# Generate training data

# Positive samples
pos = pd.DataFrame(my_csv_read('../data/test/pos_samples.tsv', delimiter='\t'), columns=['head_ent', 'tail_ent', 'sent'])
pos['label'] = 'T'

# Negative samples 1: the keyword pairs read from neg_samples.tsv
neg_1 = pd.DataFrame(my_csv_read('../data/test/neg_samples.tsv', delimiter='\t'), columns=['head_ent', 'tail_ent', 'sent'])
neg_1['label'] = 'F'

# Every genuine positive triple. Shuffling entity columns can reproduce one of
# these by chance (fixed points of the permutation, repeated entities); such
# rows must not be labelled 'F', otherwise identical inputs get both labels.
pos_triples = set(zip(pos.head_ent, pos.tail_ent, pos.sent))


def _shuffled(column):
    """Return the column's values in random order on a fresh 0..n-1 index."""
    return column.sample(frac=1).reset_index(drop=True).to_frame()


def _corrupted_negatives(head_frame, tail_frame):
    """Pair the positive sentences with the given entities as 'F' samples,
    dropping every row that coincides with a genuine positive triple."""
    frame = pd.concat([pos.sent.to_frame(), head_frame, tail_frame], axis=1)
    frame['label'] = 'F'
    is_positive = pd.Series(
        [triple in pos_triples for triple in zip(frame.head_ent, frame.tail_ent, frame.sent)],
        index=frame.index,
        dtype=bool,
    )
    return frame.loc[~is_positive].reset_index(drop=True)


# Negative samples 2: head entities shuffled across sentences
neg_2 = _corrupted_negatives(_shuffled(pos.head_ent), pos.tail_ent.to_frame())

# Negative samples 3: tail entities shuffled across sentences
neg_3 = _corrupted_negatives(pos.head_ent.to_frame(), _shuffled(pos.tail_ent))

# Negative samples 4: both entities shuffled independently
neg_4 = _corrupted_negatives(_shuffled(pos.head_ent), _shuffled(pos.tail_ent))

# Merge everything (columns are aligned by name) and shuffle the rows
df = pd.concat([pos, neg_1, neg_2, neg_3, neg_4], axis=0, ignore_index=True).sample(frac=1.0).reset_index(drop=True)
# Second input segment for the classifier: the entity pair with marker tokens
df['pair'] = df.apply(lambda x: '<HEAD_ENT> %s <TAIL_ENT> %s' % (x.head_ent, x.tail_ent), axis=1)

# 80% of the shuffled rows for training, the remainder for validation
split_line = int(len(df) * 0.8)
train_df = df[:split_line].reset_index(drop=True)
valid_df = df[split_line:].reset_index(drop=True)

train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)

In [15]:
# Preview the first rows of the training split
train_df.head()

Unnamed: 0,head_ent,tail_ent,sent,label,pair
0,turing degree,theory of computation,"computability theory , also known as recursion...",T,<HEAD_ENT> turing degree <TAIL_ENT> theory of ...
1,engineered system,root cause,"in science and engineering , root cause analys...",F,<HEAD_ENT> engineered system <TAIL_ENT> root c...
2,system biology,pseudorandom number,a confusion network is a natural language proc...,F,<HEAD_ENT> system biology <TAIL_ENT> pseudoran...
3,standard ml,kernel space,a finite - state transducer is a finite - stat...,F,<HEAD_ENT> standard ml <TAIL_ENT> kernel space
4,internet service,optimal control,an internet exchange point is the physical inf...,F,<HEAD_ENT> internet service <TAIL_ENT> optimal...


In [16]:
# Total number of samples before the train/validation split
len(df)

162206

In [None]:
# Load training and validation data
# (re-reads the train.csv / valid.csv files written when the dataset was built)
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('valid.csv')

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Register the marker tokens as additional special tokens of the tokenizer
tokenizer.add_special_tokens({'additional_special_tokens' : ['<HEAD_ENT>', '<TAIL_ENT>', '<DEP_PATH>']})

  return torch._C._cuda_getDeviceCount() > 0


3

In [12]:
# Check whether CUDA is usable in this environment
torch.cuda.is_available()

False

In [None]:
# Load model for training
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# Resize the token embeddings to the tokenizer's current size, which includes
# the special tokens added to it
model.resize_token_embeddings(len(tokenizer))
# model = BertForSequenceClassification.from_pretrained('temp2.pt')

In [None]:
# Batch generation helper
def batch(sents:Iterable, n:int):
    """Yield consecutive slices of ``sents`` that hold at most ``n`` items each.

    ``sents`` must support ``len()`` and slicing; the last slice may be shorter.
    """
    total = len(sents)
    starts = range(0, total, n)
    yield from (sents[lo:min(lo + n, total)] for lo in starts)

In [None]:
# Train the model
model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)

# Fixed batches of 32 rows, reused in the same order for every epoch
batch_list = list(batch(train_df, 32))

for epoch in range(3):
    # Running sum of the per-batch losses, kept as a plain Python float
    loss = 0.0
    for batch_df in tqdm.tqdm(batch_list):
        optim.zero_grad()
        # 'T' -> class 1, anything else -> class 0
        labels = torch.tensor([1 if i == 'T' else 0 for i in batch_df.label.to_list()]).unsqueeze(1).to(device)
        # Encode (sentence, entity-pair) as the two input segments
        inputs = BatchEncoding(tokenizer(batch_df.sent.to_list(), batch_df.pair.to_list(), padding=True, truncation=True, max_length=80, return_tensors="pt")).to(device)
        output = model(**inputs, labels=labels)
        # .item() detaches the value; summing the loss tensor itself would
        # keep autograd history alive for the whole epoch
        loss += output.loss.item()
        output.loss.backward()
        optim.step()
    # Mean per-batch loss of this epoch
    print(loss / len(batch_list))

In [None]:
# Save trained model
# Tokenizer and model are written to the same 'temp3.pt' location
tokenizer.save_pretrained('temp3.pt')
model.save_pretrained('temp3.pt')

# Tests

In [None]:
# Reload trained model together with the tokenizer saved at the same location
reload_model = BertForSequenceClassification.from_pretrained('temp3.pt')
tokenizer = BertTokenizer.from_pretrained('temp3.pt')

In [None]:
# Validation check: mean per-batch loss of the reloaded model on the validation split
reload_model.to('cpu')
reload_model.eval()
eval_batch_num = 0
eval_batch_list = list(batch(valid_df, 16))
eval_loss = 0
with torch.no_grad():
    for batch_df in tqdm.tqdm(eval_batch_list):
        # 'T' -> class 1, anything else -> class 0
        labels = torch.tensor(
            [1 if i == 'T' else 0 for i in batch_df.label.to_list()]
        ).unsqueeze(1)
        encoded = tokenizer(
            batch_df.sent.to_list(),
            batch_df.pair.to_list(),
            padding=True,
            truncation=True,
            max_length=80,
            return_tensors='pt',
        )
        inputs = BatchEncoding(encoded)
        output = reload_model(**inputs, labels=labels)
        eval_loss += output.loss
    print(eval_loss / len(eval_batch_list))

In [None]:
# Function that helps generate scores
def get_score(sents:List[str], pairs:List[str]=None, batch_size:int=64):
    """Return the softmax class probabilities of ``reload_model`` for each input.

    The inputs are scored in chunks of ``batch_size`` so that a large list
    (for example the whole validation split) is not pushed through the model
    in a single forward pass.

    Args:
        sents: Sentences used as the first input segment.
        pairs: Optional second segments, aligned with ``sents``. When None,
            the sentences are encoded on their own.
        batch_size: Number of inputs per forward pass; must be at least 1.

    Returns:
        A tensor with one row per input and one column per class, each row
        summing to 1. An empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    s = Softmax(1)
    chunks = []
    with torch.no_grad():
        for start in range(0, len(sents), batch_size):
            sent_chunk = sents[start:start + batch_size]
            pair_chunk = None if pairs is None else pairs[start:start + batch_size]
            inputs = BatchEncoding(tokenizer(sent_chunk, pair_chunk, padding=True, truncation=True, max_length=80, return_tensors='pt'))
            output = reload_model(**inputs)
            chunks.append(s(output.logits))
    if not chunks:
        # Nothing to score: keep the (rows, classes) shape callers index into
        return torch.empty((0, reload_model.config.num_labels))
    return torch.cat(chunks, dim=0)

In [None]:
# Get softmax class probabilities for the whole validation split
val_output = get_score(valid_df.sent.to_list(), valid_df.pair.to_list())
# Get prediction label (index of the most probable class)
cls_result = np.argmax(val_output.numpy(), axis=1)
# Get prediction score (probability of class 1, i.e. label 'T')
cls_score = val_output.numpy()[:, 1]
# Get ground truth
val_label = np.array([1 if l == 'T' else 0 for l in valid_df.label.to_list()])
# Get correct ones
correct_prediction = val_label == cls_result
# Sum the number of correct ones
correct_num = np.sum(correct_prediction)
# Get the wrong prediction idx
wrong_prediction_idx = np.arange(0, len(val_label))[val_label != cls_result]
# Get the wrong ones as (sentence, pair, true label, predicted class)
wrong_samples = [(valid_df.sent[idx], valid_df.pair[idx], valid_df.label[idx], cls_result[idx]) for idx in wrong_prediction_idx]
# Write the wrong ones to file; newline='' lets the csv module control line
# endings so no extra blank rows appear on platforms that translate '\n'
with open('wrong_prediction.tsv', 'w', newline='') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(wrong_samples)

In [None]:
# Score every sentence of all_occurance.txt against one fixed entity pair
test_sents = my_read('all_occurance.txt')
test_pairs = ['<HEAD_ENT> %s <TAIL_ENT> %s' % ('python', 'programming language')] * len(test_sents)

test_result = get_score(test_sents, test_pairs)
# Probability of class 1 (label 'T') for each sentence
test_cls_score = test_result.numpy()[:, 1]
# Ordering of all sentence indices as returned by ntopidx
test_idx = ntopidx(len(test_cls_score), test_cls_score)
test_sentences = [('%.8f' % test_cls_score[i], test_sents[i]) for i in test_idx]
# newline='' lets the csv module control line endings itself
with open('test.tsv', 'w', newline='') as f_out:
    w = csv.writer(f_out, delimiter='\t')
    w.writerows(test_sentences)

In [None]:
# Collect 1st_sentence like sentences
# Read the corpus through a context manager so the file handle is closed
with open('../data/corpus/small_sent.txt', 'r') as f_in:
    all_sents = f_in.read().strip().split('\n')
random.shuffle(all_sents)
# Score a random sample of 2000 sentences
sents = all_sents[:2000]
# get_score takes a second `pairs` argument; calling it with the sentences
# alone raised a TypeError. None encodes the sentences without a pair segment.
# NOTE(review): the model is trained on (sentence, entity pair) inputs, so
# confirm that pair-less scoring is what is wanted here.
output = get_score(sents, None)

In [None]:
# Probability of class 1 for every sampled sentence
score = output[:, 1]

In [None]:
# Count the sentences whose class-1 probability exceeds 0.5
sum(score > 0.5)

In [None]:
# Convert the score tensor to a NumPy array for the indexing below
score = score.numpy()

In [None]:
# Positions of the sentences scoring above 0.5
idx = np.arange(len(score))[score > 0.5]

In [None]:
# Sentences selected by the 0.5 threshold
good_sents = [sents[i] for i in idx]

In [None]:
# Display the selected sentences
good_sents

In [None]:
# Display the scores of the selected sentences
score[score > 0.5]