# Train a Score Function Using the First Sentences of Wikipedia Pages

In [None]:
import pandas as pd
import random
from transformers import BertTokenizer, BertForSequenceClassification, BatchEncoding, AdamW
import torch
from typing import Iterable, List
import tqdm
from torch.nn import Softmax
import numpy as np
import sys

sys.path.append('..')
from tools.BasicUtils import my_read, my_json_read, my_csv_read, MultiThreading, my_write, get_wiki_page_from_kw, clean_sent
from py_1st_sent import collect_neg_sents_from_term

In [None]:
# Generate json file with all strings lowercased
# NOTE: `tr` lowercases every upper-case character in the file, so the JSON
# keys are lowercased together with the values.
!cat ../data/raw_data/1st-sents-new.json | tr '[:upper:]' '[:lower:]' > ../data/corpus/1st-sents-lowercase.json

In [None]:
# Load json file
# Later cells look terms up as keys and read each entry's 'sentence' field.
first_sents_dict = my_json_read('../data/corpus/1st-sents-lowercase.json')

In [None]:
# Tab-separated term file; only the first column of each row is used below.
terms_cs_cfl = my_csv_read('../data/raw_data/terms-cs-cfl-epoch200.txt', delimiter='\t')

In [None]:
# Keep the first 5000 cs terms (in file order) that have a Wikipedia
# first-sentence entry.
wiki_cs_terms = []
for item in terms_cs_cfl:
    kw = item[0]
    if kw not in first_sents_dict:
        # No Wikipedia entry for this term: skip it.
        continue
    wiki_cs_terms.append(kw)
    if len(wiki_cs_terms) >= 5000:
        break

In [None]:
# Persist the selected terms; the next cell reads this file back.
my_write('wiki_cs_terms.txt', wiki_cs_terms)

In [None]:
# Reload the term list written by the previous cell.
wiki_cs_terms = my_read('wiki_cs_terms.txt')

In [None]:
# Collect negative sentences
# Only the first 3000 terms are used (same slice as the positive sentences).
# NOTE(review): the result of mt.run is split on '\n', so it presumably returns
# one newline-joined string; the meaning of the third argument (10) is not
# visible here -- confirm against tools.BasicUtils.MultiThreading.
mt = MultiThreading()
my_write('neg_sents.txt', mt.run(collect_neg_sents_from_term, wiki_cs_terms[:3000], 10).split('\n'))

In [None]:
# Collect positive sentences: one "<term>\t<cleaned first sentence>" line
# for each of the first 3000 terms.
positive_terms = wiki_cs_terms[:3000]
positive_rows = [
    '%s\t%s' % (term, clean_sent(first_sents_dict[term]['sentence']))
    for term in positive_terms
]
my_write('pos_sents.txt', positive_rows)

In [9]:
# Generate training data

# Positive samples: rows of (head, sent) read from pos_sents.txt, labelled 'T'
pos = pd.DataFrame(my_csv_read('pos_sents.txt', delimiter='\t'), columns=['head', 'sent'])
pos['label'] = 'T'

# Negative samples: rows of (head, sent) read from neg_sents.txt, labelled 'F'
neg = pd.DataFrame(my_csv_read('neg_sents.txt', delimiter='\t'), columns=['head', 'sent'])
neg['label'] = 'F'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. sample(frac=1.0) shuffles all rows.
df = pd.concat([pos, neg], ignore_index=True).sample(frac=1.0).reset_index(drop=True)

# 80% of the shuffled rows for training, the remaining 20% for validation
split_line = int(len(df) * 0.8)
train_df = df[:split_line].reset_index(drop=True)
valid_df = df[split_line:].reset_index(drop=True)

train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)

In [10]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,head,sent,label
0,anomaly detection,anomaly detection benchmark data repository of...,F
1,cholesky decomposition,c programming language the gnu scientific libr...,F
2,smoothing,"in statistics and image processing , to smooth...",T
3,visual perception,another type of the unconscious inference hypo...,F
4,texture synthesis,"since then, the field of texture synthesis has...",F


In [None]:
# Load the training and validation splits from their CSV files
train_df, valid_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'valid.csv'))

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer shared by the training, validation and scoring cells
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Load model for training
# Alternative starting point (the stock checkpoint), currently disabled:
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# 'temp2.pt' is the same path the "Save trained model" cell writes with
# save_pretrained, so training starts from that checkpoint and saving
# afterwards overwrites it.
model = BertForSequenceClassification.from_pretrained('temp2.pt')

In [None]:
# Function for batch generation
def batch(sents:Iterable, n:int):
    """Yield consecutive slices of ``sents``, each holding at most ``n`` items.

    ``sents`` must support ``len()`` and slicing (list, str, DataFrame, ...).
    The last slice is shorter when ``len(sents)`` is not a multiple of ``n``.
    """
    total = len(sents)
    # Slicing is clamped explicitly to the end of the input.
    yield from (
        sents[start:min(start + n, total)]
        for start in range(0, total, n)
    )

In [None]:
# Train the model
model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)

# Materialise the mini-batches once; the same batches are reused every epoch
batch_list = list(batch(train_df, 32))

for epoch in range(3):
    # Running sum of per-batch losses as a plain float
    loss = 0.0
    for batch_df in tqdm.tqdm(batch_list):
        optim.zero_grad()
        # 'T' -> 1, anything else -> 0; unsqueeze gives one label per row
        labels = torch.tensor([1 if i == 'T' else 0 for i in batch_df.label.to_list()]).unsqueeze(1).to(device)
        inputs = BatchEncoding(tokenizer(batch_df.sent.to_list(), padding=True, truncation=True, max_length=80, return_tensors='pt')).to(device)
        output = model(**inputs, labels=labels)
        output.loss.backward()
        optim.step()
        # .item() extracts the Python number. Adding the tensor itself would
        # keep every batch's autograd history alive for the whole epoch.
        loss += output.loss.item()
    # Mean of the per-batch losses for this epoch
    print(loss / len(batch_list))

In [None]:
# Save trained model
# Writes to 'temp2.pt', the same path the model-loading and reload cells
# pass to from_pretrained.
model.save_pretrained('temp2.pt')

# Tests

In [None]:
# Reload trained model
# Bound to a separate name (`reload_model`); the validation cell and
# get_score below use this object, not `model`.
reload_model = BertForSequenceClassification.from_pretrained('temp2.pt')

In [None]:
# Validation check: mean of the per-batch losses on the validation split,
# computed on the CPU without gradient tracking
reload_model.to('cpu').eval()
eval_loss = 0
eval_batch_num = 0
eval_batch_list = list(batch(valid_df, 16))
with torch.no_grad():
    for batch_df in tqdm.tqdm(eval_batch_list):
        # 'T' -> 1, anything else -> 0
        label_ids = [int(flag == 'T') for flag in batch_df.label.to_list()]
        labels = torch.tensor(label_ids).unsqueeze(1)
        encoded = tokenizer(batch_df.sent.to_list(), padding=True, truncation=True, max_length=80, return_tensors='pt')
        inputs = BatchEncoding(encoded)
        output = reload_model(**inputs, labels=labels)
        eval_loss += output.loss
    print(eval_loss / len(eval_batch_list))

Validation loss with temp.pt: 0.0526

Validation loss with temp2.pt: 0.0511

In [None]:
# Function that helps generate scores
def get_score(sents:List[str], batch_size:int=32):
    """Return the softmax class probabilities of ``reload_model`` per sentence.

    The sentences are scored in chunks of ``batch_size`` so that a long list
    is not pushed through the model in a single forward pass. Each chunk is
    padded to its own longest sentence, so values can differ from a
    single-pass run at floating-point precision.

    Args:
        sents: Sentences to score.
        batch_size: Maximum number of sentences per forward pass (>= 1).

    Returns:
        A tensor with one row per sentence and one column per class, with
        softmax applied over the class dimension. Later cells read column 1
        as the score. An empty ``sents`` gives a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got %d' % batch_size)
    if not sents:
        # The tokenizer cannot encode an empty batch; return an empty result.
        return torch.empty((0, reload_model.config.num_labels))

    s = Softmax(1)
    chunk_scores = []
    with torch.no_grad():
        for start in range(0, len(sents), batch_size):
            chunk = sents[start:start + batch_size]
            inputs = BatchEncoding(tokenizer(chunk, padding=True, truncation=True, max_length=80, return_tensors='pt'))
            output = reload_model(**inputs)
            chunk_scores.append(s(output.logits))
    # Concatenate along the sentence dimension, preserving input order
    return torch.cat(chunk_scores)

In [None]:
# Score every validation sentence with the reloaded model.
val_output = get_score(valid_df.sent.to_list())

In [None]:
# Predicted class per sentence: index of the largest probability in each row
cls_result = val_output.numpy().argmax(axis=1)

In [None]:
# Inspect the shape of the predicted-class array.
cls_result.shape

In [None]:
# Gold labels as integers: 'T' -> 1, anything else -> 0
val_label = np.array([int(flag == 'T') for flag in valid_df.label.to_list()])

In [None]:
# Element-wise comparison: True where the predicted class equals the gold label.
correct_prediction = val_label == cls_result

In [None]:
# Number of correct predictions (a count, not a ratio).
np.sum(correct_prediction)

In [None]:
# 10 sentences from 1st_wiki
# NOTE: the next cell rebinds `sents`; get_score(sents) scores whichever of
# the two cells ran last. Some entries end at an abbreviation period
# ('world no.', 'a u.').
sents = ['andre kirk agassi is an american retired professional tennis player and former world no.',
'the austroasiatic languages , also known as mon–khmer , are a large language family of mainland southeast asia , also scattered throughout parts of india , bangladesh , nepal , and southern china .',
'afroasiatic , also known as afrasian or hamito - semitic or semito - hamitic, is a large language family of about 300 languages that are spoken predominantly in west asia , north africa , the horn of africa and parts of the sahel .',
'andorra , officially the principality of andorra , is a sovereign landlocked microstate on the iberian peninsula , in the eastern pyrenees , bordered by france to the north and spain to the south.',
'in mathematics and statistics , the arithmetic mean , or simply the mean or the average , is the sum of a collection of numbers divided by the count of numbers in the collection.',
'the american football conference is one of the two conferences of the national football league , the highest professional level of american football in the united states.',
'animal farm is an allegorical novella by george orwell , first published in england on 17 august 1945.',
'amphibians are ectotherm ic, tetrapod vertebrate s of the class amphibia.',
'alaska is a u.',
'agriculture is the science, art and practice of cultivating plants and livestock.']

In [None]:
# 10 sentences from small_sent.txt
# NOTE: this rebinds `sents`, replacing the list assigned in the previous cell.
sents = ['we describe a new algorithm, the - pebble game with colors, and use it obtain a characterization of the family of - sparse graphs and algorithmic solutions to a family of problems concerning tree decompositions of graphs.',
'special instances of sparse graphs appear in rigidity theory and have received increased attention in recent years.',
'in particular, our colored pebbles generalize and strengthen the previous results of lee and streinu and give a new proof of the tutte - nash - williams characterization of arboricity.',
'we also present a new decomposition that certifies sparsity based on the - pebble game with colors.',
'our work also exposes connections between pebble game algorithms and previous sparse graph algorithms by gabow, gabow and westermann and hendrickson.',
'in a quantum mechanical model, diosi, feldmann and kosloff arrived at a conjecture stating that the limit of the entropy of certain mixtures is the relative entropy as system size goes to infinity.',
'the conjecture is proven in this paper for density matrices.',
'the first proof is analytic and uses the quantum law of large numbers.',
'the second one clarifies the relation to channel capacity per unit cost for classical - quantum channels.',
'both proofs lead to generalization of the conjecture.']

In [None]:
# Score whichever `sents` list was assigned most recently.
get_score(sents)

In [None]:
# Collect 1st_sentence like sentences
# Read the corpus inside a context manager so the file handle is closed
# (the bare open(...).read() left it to the garbage collector).
with open('../data/corpus/small_sent.txt', 'r') as corpus_file:
    all_sents = corpus_file.read().strip().split('\n')

# Score a random sample of 2000 sentences (the shuffle is in place)
random.shuffle(all_sents)
sents = all_sents[:2000]
output = get_score(sents)

In [None]:
# Column 1 of the softmax output; label 1 is what the training cell assigns to 'T'.
score = output[:, 1]

In [None]:
# Count the sentences whose class-1 probability exceeds 0.5.
sum(score > 0.5)

In [None]:
# Convert the score tensor to a NumPy array for the mask indexing below.
score = score.numpy()

In [None]:
# Positions of the sentences whose score exceeds 0.5, in ascending order
idx = np.flatnonzero(score > 0.5)

In [None]:
# Pick the sentences at the selected positions.
good_sents = [sents[i] for i in idx]

In [None]:
# Display the selected sentences.
good_sents

In [None]:
# Scores of the selected sentences, in the same order as good_sents.
score[score > 0.5]