In [1]:
import argparse
from collections import defaultdict
import sys
sys.path.append('../../../')
from el_evaluation import *
from utils import MyBatchSampler
from torch.utils.data import DataLoader, TensorDataset
from RuleLNN_nway import *

In [21]:
def get_qald_metrics(pred_, m_labels_, ques_, mode='val'):
    """Group per-row predictions into per-question entity lists and score them.

    Args:
        pred_: iterable of per-row prediction tensors; the first element of each
            row is taken as that row's score. Rows whose score is falsy (exactly
            0) are dropped. (Original note: "pred_ are 0/1 s after applying a
            threshold"; the code itself only relies on truthiness and ordering.)
        m_labels_: per-row ';'-separated strings. The first field is used as the
            mention and the last field as the entity label; whitespace inside
            either is replaced by underscores.
        ques_: per-row question identifiers, aligned with ``pred_``.
        mode: when ``'test'``, rows read from ``data/missing.csv`` are appended
            to the predictions before scoring.

    Returns:
        ``(precision, recall, f1, df_output)`` where the three scores are read
        from ``compute_metrics(...)['macro']['named']`` and ``df_output`` is the
        prediction frame with columns ``Question``, ``Entities``, ``Classes``.
    """
    top_k = 5  # candidates kept per mention

    # question -> {mention -> set of (entity URI, score)}.
    # Keying by mention (instead of always appending to the question's most
    # recent set) keeps candidates with their own mention even when rows of
    # one mention are not contiguous. Dicts preserve insertion order.
    question_mentions = {}
    for i, pred in enumerate(pred_):
        score = pred.data.tolist()[0]
        # Register the question even if none of its rows is kept, so it still
        # yields an (empty) output row.
        mentions = question_mentions.setdefault(ques_[i], {})
        if not score:
            continue
        label_fields = m_labels_[i].split(';')
        entity_label = '_'.join(label_fields[-1].split())
        mention = '_'.join(label_fields[0].split())
        mentions.setdefault(mention, set()).add(
            ('http://dbpedia.org/resource/{}'.format(entity_label), score))

    # One row per question: a list (one entry per mention) of the top-k
    # candidates ordered by descending score.
    rows = [
        [question,
         [sorted(candidates, key=lambda c: c[1], reverse=True)[:top_k]
          for candidates in mentions.values()]]
        for question, mentions in question_mentions.items()
    ]

    df_output = pd.DataFrame(rows, columns=['Question', 'Entities'])
    df_output['Classes'] = str([])

    # generate the csv
    if mode == 'test':
        df_missing = pd.read_csv("data/missing.csv", header=None)
        df_missing.columns = ['Unnamed:0', 'Question', 'Entities', 'Classes']
        df_missing = df_missing[['Question', 'Entities', 'Classes']]
        df_output = df_output[['Question', 'Entities', 'Classes']]
        df_output = pd.concat([df_output, df_missing], ignore_index=True)
        print("df_output", df_output.shape)

    # gold
    benchmark = pd.read_csv('../../../data/gt_sparql.csv')
    benchmark = benchmark.set_index('Question')
    benchmark = benchmark.replace(np.nan, '', regex=True)
    benchmark['Entities'] = benchmark['Entities'].astype(object)

    # pred
    predictions = df_output.set_index('Question')

    metrics = compute_metrics(benchmark=benchmark, predictions=predictions, limit=410, is_qald_gt=True, eval='full')

    scores = metrics['macro']['named']
    prec, recall, f1 = scores['precision'], scores['recall'], scores['f1']
    return prec, recall, f1, df_output



In [22]:
def test(x_test, m_labels_test, ques_test, best_tuned_threshold, alpha, checkpoint_name, model_name, output_file_name):
    """Score the checkpointed model on the test split.

    A fresh model is built with ``pick_model(model_name, alpha)``, its weights
    are restored from ``checkpoint_name``, and its predictions are scored with
    ``get_qald_metrics`` in ``'test'`` mode. The prediction frame is written to
    ``output_file_name`` as CSV. ``best_tuned_threshold`` is accepted but not
    used by this function.

    Returns:
        ``(test_pred, scores)`` where ``scores`` has the keys ``'precision'``,
        ``'recall'`` and ``'f1'``.
    """
    restored = pick_model(model_name, alpha)
    restored.load_state_dict(torch.load(checkpoint_name))
    restored.eval()

    with torch.no_grad():
        test_pred = restored(x_test, m_labels_test)
        prec, recall, f1, df_output = get_qald_metrics(test_pred, m_labels_test, ques_test, mode='test')

    df_output.to_csv(output_file_name)
    print("Test -- f1 is {} ".format(f1))
    print("Test -- prec, recall, f1", prec, recall, f1)

    return test_pred, {'precision': prec, 'recall': recall, 'f1': f1}

In [23]:
def evaluate(eval_model, x_, y_, m_labels_, ques_, loss_fn):
    """Run ``eval_model`` on one split without gradients.

    Puts the model in eval mode (it is left that way on return).

    Returns:
        ``(loss, f1, predictions)``: ``loss_fn`` applied to the predictions and
        ``y_``, the f1 reported by ``get_qald_metrics``, and the raw predictions.
    """
    eval_model.eval()
    with torch.no_grad():
        scores = eval_model(x_, m_labels_)
        split_loss = loss_fn(scores, y_)
        # train and val both use 'val' mode; index 2 of the result is f1
        split_f1 = get_qald_metrics(scores, m_labels_, ques_, mode='val')[2]

    return split_loss, split_f1, scores


def pick_model(model_name, alpha):
    """Build the model registered under ``model_name``.

    ``alpha`` is forwarded to the three LNN variants; the ``"lr"`` model takes
    no arguments. For an unknown name a message is printed and ``None`` is
    returned.
    """
    # Constructors are wrapped in lambdas so only the selected one is evaluated.
    factories = (
        ("purename", lambda: PureNameLNN(alpha, -1, False)),
        ("context", lambda: ContextLNN(alpha, -1, False)),
        ("complex", lambda: ComplexRuleLNN(alpha, -1, False)),
        ("lr", lambda: LogitsRegression()),
    )
    for known_name, build in factories:
        if model_name == known_name:
            return build()

    print("WRONG name input")
    return None

In [72]:
from torch.utils.data import Dataset, Sampler
import numpy as np
class QuestionSampler(Sampler):
    r"""Batch sampler that closes a batch after every index whose label is 1.

    Indices are drawn from ``sampler`` in order and accumulated; whenever
    ``labels[idx] == 1`` the accumulated indices are yielded as one batch, so
    batches have variable length. Intended to be passed to ``DataLoader`` as
    ``batch_sampler``.
    NOTE(review): presumably each question's rows end with its positive row,
    making one batch per question -- confirm against the data layout.

    Arguments:
        sampler (Sampler or Iterable): source of dataset indices
        labels: indexable by dataset index; an entry equal to 1 ends a batch
        drop_last (bool): if ``True``, indices left over after the last
            positive label are discarded instead of yielded as a final batch
    """

    def __init__(self, sampler, labels, drop_last):
        self.sampler = sampler
        self.labels = labels
        self.drop_last = drop_last

    def __iter__(self):
        batch = []
        for idx in self.sampler:
            batch.append(idx)
            if self.labels[idx] == 1:
                yield batch
                batch = []
        # Trailing indices that were not closed by a positive label.
        if len(batch) > 0 and not self.drop_last:
            yield batch

    def __len__(self):
        # Batches have no fixed size (there is no ``batch_size`` attribute), so
        # the count is obtained by walking the batches once. This iterates the
        # underlying sampler; with a randomised sampler the count can differ by
        # one between passes depending on whether a trailing batch exists.
        return sum(1 for _ in self)
    

# Smoke test for QuestionSampler: walk the training rows in order, batch them
# with the sampler, and show the tensor shapes of the first batch only.
dataset_train = TensorDataset(x_train, y_train)
loader = DataLoader(
    dataset_train,
    batch_sampler=QuestionSampler(
        torch.utils.data.SequentialSampler(range(len(y_train))),
        y_train,
        False,
    ),
    shuffle=False,
)
for batch in loader:
    print(*[part.shape for part in batch[:2]])
    break

torch.Size([23, 6]) torch.Size([23, 1])


In [95]:
def train(model, train_data, val_data, test_data, checkpoint_name, num_epochs, batch_size=256, lr=0.1):
    """Train ``model`` with BCE loss and checkpoint the best validation epoch.

    Args:
        model: module called as ``model(x, second_arg)`` and returning
            predictions accepted by ``nn.BCELoss``.
        train_data, val_data, test_data: ``(x, y, m_labels, ques)`` tuples.
        checkpoint_name: path where ``model.state_dict()`` is saved whenever the
            validation loss improves.
        num_epochs: number of passes over the training loader.
        batch_size: mini-batch size handed to the ``DataLoader`` (default 256,
            the value previously hard-coded).
        lr: Adam learning rate (default 0.1, the value previously hard-coded).

    Returns:
        None. Progress is reported via ``print``; the best weights are on disk
        at ``checkpoint_name``. The model object itself holds the weights of
        the last epoch, which is what the "AFTER TRAINING" report evaluates.
    """
    # unwrapping the data
    x_train, y_train, m_labels_train, ques_train = train_data
    x_val, y_val, m_labels_val, ques_val = val_data
    x_test, y_test, m_labels_test, ques_test = test_data

    # initialize the loss function and optimizer
    loss_fn = nn.BCELoss()  # MSELoss() did not work either
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Start from +inf so the first epoch always writes a checkpoint, whatever
    # the scale of the loss.
    best_val_f1, best_val_loss = 0, float('inf')

    def report_all_splits():
        """Print loss and F1 of the current model on train, val and test."""
        train_loss, train_f1, _ = evaluate(model, x_train, y_train, m_labels_train, ques_train, loss_fn)
        print("Train -- loss is {}; F1 is {}".format(train_loss, train_f1))
        val_loss, val_f1, _ = evaluate(model, x_val, y_val, m_labels_val, ques_val, loss_fn)
        print("Val --  loss is {}; F1 is {}".format(val_loss, val_f1))
        test_loss, test_f1, _ = evaluate(model, x_test, y_test, m_labels_test, ques_test, loss_fn)
        print("Test -- loss is {}; F1 is {}".format(test_loss, test_f1))

    # stats before training
    print("=========BEFORE TRAINING============")
    report_all_splits()

    # start training
    print("=========TRAINING============")
    dataset_train = TensorDataset(x_train, y_train)
    # shuffle stays False: DataLoader does not allow shuffle together with a
    # custom sampler.
    loader = DataLoader(dataset_train, sampler=MyBatchSampler(y_train), batch_size=batch_size, shuffle=False)

    for epoch in range(num_epochs):
        # evaluate() leaves the model in eval mode, so re-enable train mode here.
        model.train()
        total_loss, samples_seen = 0.0, 0
        for xb, yb in loader:
            optimizer.zero_grad()
            # NOTE(review): the labels are passed as the model's second argument
            # here, whereas evaluate() passes the mention labels -- confirm this
            # is what the model expects during training.
            yhat = model(xb, yb)
            yb = yb.reshape(-1, 1)
            loss = loss_fn(yhat, yb)
            # Weight each batch by its real size so a short final batch does
            # not skew the epoch average.
            total_loss += loss.item() * xb.size(0)
            samples_seen += xb.size(0)
            loss.backward()
            optimizer.step()

        # show status after each epoch
        avg_loss = total_loss / samples_seen if samples_seen else float('nan')
        train_loss, train_f1, _ = evaluate(model, x_train, y_train, m_labels_train, ques_train, loss_fn)
        print("Epoch " + str(epoch) + ": avg train loss -- " + str(avg_loss))
        print("Train -- loss is {}; F1 is {}".format(train_loss, train_f1))
        val_loss, val_f1, _ = evaluate(model, x_val, y_val, m_labels_val, ques_val, loss_fn)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_f1 = val_f1
            torch.save(model.state_dict(), checkpoint_name)
        print("Val --  best loss is {}; F1 is {}".format(best_val_loss, best_val_f1))

    # show stats after training
    print("=========AFTER TRAINING============")
    report_all_splits()


In [96]:
# Override the parsed defaults for this run.
vars(args).update(num_epoch=10, alpha=0.85, model_name="complex")

# train model and evaluate
model = pick_model(args.model_name, args.alpha)
print("model: ", args.model_name, args.alpha)
# Untrained predictions on the training split, as a sanity check.
print(model(x_train, m_labels_train))
print(x_train.shape, x_val.shape)

# NOTE(review): train() has no return statement, so best_tuned_threshold is None.
best_tuned_threshold = train(
    model,
    train_data,
    val_data,
    test_data,
    args.checkpoint_name,
    args.num_epoch,
)


model:  complex 0.85
tensor([[9.9432e-01],
        [4.2742e-04],
        [4.3708e-04],
        ...,
        [4.2063e-04],
        [4.2099e-04],
        [4.2099e-04]], grad_fn=<RsubBackward1>)
torch.Size([25078, 6]) torch.Size([4991, 6])
Train -- loss is 1.002746343612671; F1 is 0.7086
Val --  loss is 0.6703747510910034; F1 is 0.7312
Test -- loss is 0.7552366256713867; F1 is 0.754
Epoch 0: avg train loss -- 0.47536606574669865
Train -- loss is 0.24638526141643524; F1 is 0.7569
Val --  best loss is 0.2525876760482788; F1 is 0.7933
Epoch 1: avg train loss -- 0.45008327762285866
Train -- loss is 0.22747252881526947; F1 is 0.7583
Val --  best loss is 0.23762181401252747; F1 is 0.7748
Epoch 2: avg train loss -- 0.4456175231016599
Train -- loss is 0.20541809499263763; F1 is 0.7613
Val --  best loss is 0.21387669444084167; F1 is 0.7748
Epoch 3: avg train loss -- 0.45831723763392523
Train -- loss is 0.12779732048511505; F1 is 0.7657
Val --  best loss is 0.11590004712343216; F1 is 0.7933
Epoch 4

In [97]:
# Score the checkpointed model on the test split, then append a one-line summary
# to the running log file and echo the same line to the cell output.
test_pred, best_scores = test(
    x_test,
    m_labels_test,
    ques_test,
    best_tuned_threshold,
    args.alpha,
    args.checkpoint_name,
    args.model_name,
    args.output_file_name,
)
summary_line = "model={}; use_binary={}; alpha={}; p={}; r={}; f1={}\n".format(
    args.model_name,
    args.use_binary,
    args.alpha,
    best_scores['precision'],
    best_scores['recall'],
    best_scores['f1'],
)
with open("output_w_spacy.txt", 'a') as f:
    f.write(summary_line)
    print(summary_line)


df_output (157, 3)
Test -- f1 is 0.7834 
Test -- prec, recall, f1 0.8047 0.7633 0.7834
model=complex; use_binary=False; alpha=0.85; p=0.8047; r=0.7633; f1=0.7834



# load data

In [2]:
parser = argparse.ArgumentParser(description="main training script for training lnn entity linking models")
parser.add_argument("--train_data", type=str, default="./data/train.csv", help="train csv")
parser.add_argument("--test_data", type=str, default="./data/test.csv", help="test csv")
parser.add_argument("--checkpoint_name", type=str, default="checkpoint/best_model.pt", help="checkpoint path")
parser.add_argument("--output_file_name", type=str, default="output/purename_nway_alpha09.txt", help="output csv path for test predictions")
parser.add_argument("--model_name", type=str, default="purename", help="which model we choose")
# args for dividing the corpus
parser.add_argument('--alpha', type=float, default=0.9, help='alpha for LNN')
parser.add_argument('--num_epoch', type=int, default=50, help='training epochs for LNN')
parser.add_argument("--use_binary", action="store_true", help="default is to use binary`, otherwise use stem")
# NOTE(review): presumably swallows the `-f <connection file>` argument that
# Jupyter passes to the kernel, so parse_args() does not fail -- confirm.
parser.add_argument("-f")
args = parser.parse_args()


def _parse_features(frame):
    """Parse the bracketed, comma-separated strings in ``frame.Features`` into a 2-D float array."""
    # `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
    return np.array([np.fromstring(s[1:-1], dtype=float, sep=', ') for s in frame.Features.values])


# Split by question so that all rows of one question land in the same split:
# the first N_TRAIN_QUESTIONS distinct questions train, the rest validate.
N_TRAIN_QUESTIONS = 166
df_train_val = pd.read_csv(args.train_data)
unique_questions = list(df_train_val.Question.unique())
train_ques_set = unique_questions[:N_TRAIN_QUESTIONS]
val_ques_set = unique_questions[N_TRAIN_QUESTIONS:]
df_train = df_train_val[df_train_val.Question.isin(train_ques_set)]
df_val = df_train_val[df_train_val.Question.isin(val_ques_set)]
df_test = pd.read_csv(args.test_data)

# train
features_train = _parse_features(df_train)
x_train = torch.from_numpy(features_train).float()
y_train = torch.from_numpy(df_train.Label.values).float().reshape(-1, 1)
m_labels_train = df_train.Mention_label.values
ques_train = df_train.Question.values

# val
features_val = _parse_features(df_val)
x_val = torch.from_numpy(features_val).float()
y_val = torch.from_numpy(df_val.Label.values).float().reshape(-1, 1)
m_labels_val = df_val.Mention_label.values
ques_val = df_val.Question.values

# Alternative (disabled): stratified row-level split instead of the
# question-level split above.
# features_train_val = _parse_features(df_train_val)
# X_train_val = torch.from_numpy(features_train_val).float()
# Y_train_val = torch.from_numpy(df_train_val.Label.values).float().reshape(-1, 1)
# mention_labels_train_val = df_train_val.Mention_label.values
# questions_train_val = df_train_val.Question.values
# x_train, x_val, y_train, y_val, m_labels_train, m_labels_val, ques_train, ques_val = \
#     train_test_split(X_train_val, Y_train_val, mention_labels_train_val, questions_train_val,
#                      test_size=0.2, train_size=0.8, random_state=200, stratify=Y_train_val)


# test
features_test = _parse_features(df_test)
x_test = torch.from_numpy(features_test).float()
y_test = torch.from_numpy(df_test.Label.values).float().reshape(-1, 1)
m_labels_test = df_test.Mention_label.values
ques_test = df_test.Question.values

In [3]:
# Bundle each split as (features, labels, mention labels, questions) -- the
# tuple layout train() unpacks -- and print the shape of every component.
train_data = x_train, y_train, m_labels_train, ques_train
print("train:", *[part.shape for part in train_data])
val_data = x_val, y_val, m_labels_val, ques_val
print("val:", *[part.shape for part in val_data])
test_data = x_test, y_test, m_labels_test, ques_test
print("test:", *[part.shape for part in test_data])

train: torch.Size([25078, 6]) torch.Size([25078, 1]) (25078,) (25078,)
val: torch.Size([4991, 6]) torch.Size([4991, 1]) (4991,) (4991,)
test: torch.Size([17808, 6]) torch.Size([17808, 1]) (17808,) (17808,)


In [4]:
# check class distribution: count and fraction of positive labels per split.
# Tensor.sum(dim=0) reduces in one call and keeps the shape-[1] result that the
# builtin sum() produced by iterating over the tensor row by row.
print("y_train sum", y_train.sum(dim=0), y_train.sum(dim=0) / len(y_train))
print("y_val sum", y_val.sum(dim=0), y_val.sum(dim=0) / len(y_val))
# print("TRAIN_VAL", Y_train_val.sum(dim=0), Y_train_val.sum(dim=0) / len(Y_train_val))
print("y_test sum", y_test.sum(dim=0), y_test.sum(dim=0) / len(y_test))

y_train sum tensor([175.]) tensor([0.0070])
y_val sum tensor([38.]) tensor([0.0076])
y_test sum tensor([162.]) tensor([0.0091])


In [6]:
# Average number of training rows per distinct question.
len(ques_train) / len(set(ques_train))

151.0722891566265

# my summary