In [1]:
import pandas as pd
from tqdm import tqdm
import torch
import pytorch_utils
# Root folder for input files. Later cells build paths with `DATA_DIR + '...'`,
# so the trailing slash is required.
DATA_DIR = 'data/'

[32m[PyTorch-Utils][0m: Loading anchor candidate data...
[32m[PyTorch-Utils][0m: Loading wikipedia title embedings...
[32m[PyTorch-Utils][0m: Loading wikipedia items...


In [2]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Build the train and test entity datasets. According to the module's log output,
# each one loads its pickled context text, generates entity embeddings and then
# computes syntax candidates per entity.
# NOTE(review): both are created with device='cpu' while the collate function is
# later given `device` — presumably batches are moved to the GPU at collate time; confirm.
dataset = pytorch_utils.EntityDataset(device='cpu')
test_dataset = pytorch_utils.EntityDataset(train=False, device='cpu')

[32m[PyTorch-Utils][0m: Loading train set...
[32m[PyTorch-Utils][0m: Loading train_context_text_150.pkl from data/pkl...
[32m[PyTorch-Utils][0m: Now generating entity embeddings...


Batches:   0%|          | 0/572 [00:00<?, ?it/s]

Entity Length: 18288
Entity shape: torch.Size([18288, 384])
[32m[PyTorch-Utils][0m: Now computing syntax candidates for each entity...
[32m[PyTorch-Utils][0m: Loading test set...
[32m[PyTorch-Utils][0m: Loading test_context_text_150.pkl from data/pkl...
[32m[PyTorch-Utils][0m: Now generating entity embeddings...


Batches:   0%|          | 0/287 [00:00<?, ?it/s]

Entity Length: 9166
Entity shape: torch.Size([9166, 384])
[32m[PyTorch-Utils][0m: Now computing syntax candidates for each entity...


In [4]:
# Drop the corpus embeddings held by pytorch_utils (it logs "Deleting corpus embeddings...").
# NOTE(review): presumably done to release memory once the datasets are built — confirm.
pytorch_utils.delete_corpus_embeds()

[32m[PyTorch-Utils][0m: Deleting corpus embeddings...


In [5]:
from torch.utils.data import DataLoader

# Training batches: shuffled, 32 items each. Collation is delegated to the
# dataset class, which also receives the target device and the training set's
# syntax candidates.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=lambda batch: pytorch_utils.EntityDataset.collate_fn_train(
        batch, device=device, syntax_candidates_list=dataset.syntax_candidates
    ),
)

# Evaluation batches: fixed order so predictions line up with the test rows.
# This loader must exist because the prediction cell iterates `test_dataloader`;
# leaving it commented out makes that cell fail with a NameError on a fresh kernel.
# NOTE(review): confirm that collate_fn_test still returns what the prediction cell unpacks.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=lambda batch: pytorch_utils.EntityDataset.collate_fn_test(batch, device=device),
)

In [6]:
# Instantiate the entity classifier with 'distilbert-base-uncased' as its transformer model.
# NOTE(review): hidden_size=256 presumably sizes the classifier head — confirm in pytorch_utils.
model = pytorch_utils.EntityClassifier(transformer_model='distilbert-base-uncased', hidden_size=256, device=device)

In [7]:
"""
Freeze everything except for:
transformer.transformer.layer.5.attention.q_lin.weight
transformer.transformer.layer.5.attention.q_lin.bias
transformer.transformer.layer.5.attention.k_lin.weight
transformer.transformer.layer.5.attention.k_lin.bias
transformer.transformer.layer.5.attention.v_lin.weight
transformer.transformer.layer.5.attention.v_lin.bias
transformer.transformer.layer.5.attention.out_lin.weight
transformer.transformer.layer.5.attention.out_lin.bias
transformer.transformer.layer.5.sa_layer_norm.weight
transformer.transformer.layer.5.sa_layer_norm.bias
transformer.transformer.layer.5.ffn.lin1.weight
transformer.transformer.layer.5.ffn.lin1.bias
transformer.transformer.layer.5.ffn.lin2.weight
transformer.transformer.layer.5.ffn.lin2.bias
transformer.transformer.layer.5.output_layer_norm.weight
transformer.transformer.layer.5.output_layer_norm.bias
classifier.0.weight
classifier.0.bias
classifier.3.weight
classifier.3.bias

Basically, we want to fine-tune the last layer of the transformer and the classifier
"""
# Inside the transformer only 'layer.5' (and any 'pooler' parameters) stay trainable;
# outside it only the classifier does. Everything else is frozen.
for name, param in model.named_parameters():
    if 'transformer' in name:
        param.requires_grad = 'layer.5' in name or 'pooler' in name
    else:
        param.requires_grad = 'classifier' in name

# List what is still trainable so the freeze can be verified by eye
for trainable_name in [n for n, p in model.named_parameters() if p.requires_grad]:
    print(trainable_name)


transformer.embeddings.word_embeddings.weight
transformer.embeddings.position_embeddings.weight
transformer.embeddings.LayerNorm.weight
transformer.embeddings.LayerNorm.bias
transformer.transformer.layer.0.attention.q_lin.weight
transformer.transformer.layer.0.attention.q_lin.bias
transformer.transformer.layer.0.attention.k_lin.weight
transformer.transformer.layer.0.attention.k_lin.bias
transformer.transformer.layer.0.attention.v_lin.weight
transformer.transformer.layer.0.attention.v_lin.bias
transformer.transformer.layer.0.attention.out_lin.weight
transformer.transformer.layer.0.attention.out_lin.bias
transformer.transformer.layer.0.sa_layer_norm.weight
transformer.transformer.layer.0.sa_layer_norm.bias
transformer.transformer.layer.0.ffn.lin1.weight
transformer.transformer.layer.0.ffn.lin1.bias
transformer.transformer.layer.0.ffn.lin2.weight
transformer.transformer.layer.0.ffn.lin2.bias
transformer.transformer.layer.0.output_layer_norm.weight
transformer.transformer.layer.0.output_la

In [8]:
# loss and optimizer
from torch import optim
criterion = torch.nn.CrossEntropyLoss()
# Give Adam only the parameters that are still trainable: frozen parameters never
# receive gradients, so registering them with the optimizer serves no purpose.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.001)

# training loop
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

epochs = 10
# The bar description carries the summary of the most recently finished epoch;
# before the first epoch completes there is nothing to report yet.
desc = f'Epoch {1} loss: ?? Avg shape: ??'
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    shapes = []
    # Iterating the tqdm wrapper advances the bar on its own. Calling
    # pbar.update() as well would count every batch twice, so it is not called.
    pbar2 = tqdm(dataloader, leave=False, total=len(dataloader), desc=desc)
    for data in pbar2:
        # each batch unpacks into input ids, attention mask and labels
        tokenized_inputs_input_ids, tokenized_inputs_attention_mask, labels = data
        # record the first dimension of the input-id tensor for the per-epoch average
        shapes.append(tokenized_inputs_input_ids.shape[0])
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(tokenized_inputs_input_ids, tokenized_inputs_attention_mask, '')
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # accumulate statistics
        running_loss += loss.item()
    # max(..., 1) keeps the summary from raising ZeroDivisionError on an empty loader
    avg_loss = running_loss / max(len(dataloader), 1)
    avg_shape = sum(shapes) / max(len(shapes), 1)
    desc = f'Epoch {epoch+1} loss: {avg_loss} Avg shape: {avg_shape}'
    # leave=False erases each finished bar, so also write the summary out;
    # otherwise the final epoch's numbers would never be displayed.
    tqdm.write(desc)


Epoch 1 loss: ?? Avg shape: ??:   2%|▏         | 9/572 [00:14<21:44,  2.32s/it]

In [None]:
# let's do predictions on the test set
# Requires `test_dataloader` to have been created by an earlier cell.
# NOTE(review): the training cell calls model(input_ids, attention_mask, '') whereas
# this cell passes a single 5-tuple — confirm the model's forward accepts this form.
model.eval()
with torch.no_grad():
    predictions = []
    prediction_indexes = []
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length and
    # show a total / ETA; the batch index was never used.
    for data in tqdm(test_dataloader):
        document_embeds, sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings = data
        outputs = model((document_embeds, sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings))
        # which index has the highest value?
        _, predicted = torch.max(outputs, 1)
        prediction_indexes.extend(predicted.tolist())
        # `predicted` holds one winning column index per batch item. Pairing it with a
        # row index picks, for every row of candidate_ids, the id in the winning column.
        best_candidates = candidate_ids[torch.arange(candidate_ids.size(0)), predicted]

        # Append the best candidates to the predictions list
        predictions.extend(best_candidates.tolist())
        

144it [00:09, 14.82it/s]


In [None]:
# predictions_from_statistics = []

# for i, row in tqdm(test_dataset.entity_df.iterrows()):
#     full_mention = row['full_mention'].strip().lower()
#     if full_mention in pytorch_utils.anchor_to_candidate:
#         best_candidate = pytorch_utils.anchor_to_candidate[full_mention][0]
#     else:
#         best_candidate = 0
#     predictions_from_statistics.append(best_candidate)

In [None]:
# Load the wiki item table and key it by item id in one chained expression
wiki_items = (
    pd.read_csv(DATA_DIR + 'wiki_lite/wiki_items.csv')
    .set_index('item_id')
)
# Lookup: item_id -> wikipedia_title
item_id_to_title = wiki_items['wikipedia_title'].to_dict()

In [None]:
# Redirect table: tab-separated, no header row, columns are (source title, target title).
# keep_default_na=False: titles are free text, and pandas' default NA parsing would turn
# titles such as "NA", "Null" or "NaN" into float NaN. That would silently drop or corrupt
# those redirects and make the later `.replace(' ', '_')` fail on a float.
enwiki_redirects = pd.read_csv(
    DATA_DIR + 'wiki_lite/enwiki_redirects.tsv',
    sep='\t',
    header=None,
    names=['source', 'target'],
    keep_default_na=False,
)
# index enwiki_redirects by source
enwiki_redirects = enwiki_redirects.set_index('source')
# create source to target map
source_to_target = enwiki_redirects['target'].to_dict()

In [None]:
# Turn every predicted item id into a Wikipedia URL (or the 'NOT_FOUND' marker),
# keeping one output entry per prediction, in the same order.
wiki_urls = []
not_found = 0
found = 0
redirection = 0
for predicted_id in tqdm(predictions):
    if predicted_id == 0:
        # id 0 means no candidate was chosen, so emit the placeholder
        wiki_urls.append('NOT_FOUND')
        not_found += 1
        continue
    wikipedia_title = item_id_to_title[predicted_id]
    # if the title is itself a redirect source, follow it (a single hop) to its target
    if wikipedia_title in source_to_target:
        wikipedia_title = source_to_target[wikipedia_title]
        redirection += 1
    # Wikipedia URLs use underscores where titles have spaces
    wikipedia_title = wikipedia_title.replace(' ', '_')
    wiki_urls.append(f'http://en.wikipedia.org/wiki/{wikipedia_title}')
    found += 1

# Guard the ratios so an empty or all-zero prediction list reports 0.0
# instead of raising ZeroDivisionError.
total = found + not_found
print(f'Found {found} wikipedia urls')
print(f'Not found {not_found} wikipedia urls')
print(f'Percentage of wikipedia urls found: {found / total if total else 0.0}')
print(f'Percentage of wikipedia urls redirected: {redirection / found if found else 0.0}')

100%|██████████| 9166/9166 [00:00<00:00, 1663205.30it/s]

Found 8803 wikipedia urls
Not found 363 wikipedia urls
Percentage of wikipedia urls found: 0.9603971197905302
Percentage of wikipedia urls redirected: 0.0029535385663978187





In [None]:
# not_found = 0
# found = 0
# train_wiki_urls = []
# # Now we will map these into wikipedia_urls
# for i in tqdm(range(len(predictions_train))):
#     if predictions_train[i] == 0:
#         # if the prediction is 0, we will append a blank url
#         train_wiki_urls.append('NOT_FOUND')
#         not_found += 1
#         continue
#     wikipedia_title = item_id_to_title[predictions_train[i]]
#     # does this wikipedia title exist in the redirects?
#     if wikipedia_title in source_to_target:
#         # if it does, we will replace it with the redirect
#         new_title = source_to_target[wikipedia_title]
#     # Now replace the spaces with underscores
#     wikipedia_title = wikipedia_title.replace(' ', '_')
#     # And add the wikipedia url
#     train_wiki_urls.append(f'http://en.wikipedia.org/wiki/{wikipedia_title}')
#     found += 1

# print(f'Found {found} wikipedia urls')
# print(f'Not found {not_found} wikipedia urls')
# print(f'Percentage of wikipedia urls found: {found / (found + not_found)}')

In [None]:
# Load the test split. The row preview shown further down lists the columns
# id, token, entity_tag, full_mention and wiki_url.
test = pd.read_csv(DATA_DIR + 'test.csv')
# train = pd.read_csv(DATA_DIR + 'train.csv')

In [None]:
# Two row masks over wiki_url: rows that have a value at all, and rows whose
# value is not the '--NME--' marker.
not_nan = ~test['wiki_url'].isna()
not_nme = test['wiki_url'].ne('--NME--')
# train_not_nan = train['wiki_url'].notna()
# train_not_nme = train['wiki_url'] != '--NME--'
# Spot-check one row that passes both masks
test.loc[not_nan & not_nme & (test['id'] == 65002)]

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
65002,65002,Dejan,B,Dejan Koturovic,?


In [None]:
# The URLs are written positionally into the rows that pass both masks, so
# `wiki_urls` must hold exactly one entry per selected row, in row order.
rows_to_fill = not_nan & not_nme
test.loc[rows_to_fill, 'wiki_url'] = wiki_urls
# train.loc[train_not_nan & train_not_nme, 'wiki_url'] = train_wiki_urls

In [None]:
# replace NaN or --NME-- with NOT_FOUND
test['wiki_url'] = test['wiki_url'].fillna('NOT_FOUND')
test['wiki_url'] = test['wiki_url'].replace('--NME--', 'NOT_FOUND')
# train['wiki_url'] = train['wiki_url'].fillna('NOT_FOUND')
# train['wiki_url'] = train['wiki_url'].replace('--NME--', 'NOT_FOUND')

In [None]:
# Look at the same spot-check row again, now that the URLs have been written
test.loc[test['id'] == 65002]

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
65002,65002,Dejan,B,Dejan Koturovic,http://en.wikipedia.org/wiki/Dejan_Koturović


In [None]:
# Preview the opening of the corpus: first 250 tokens, missing tokens shown as empty strings
first_tokens = test['token'].fillna('').to_list()[:250]
' '.join(first_tokens)

"-DOCSTART- (947testa CRICKET) CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY .  LONDON 1996-08-30  West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship .  Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire .  After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 .  Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 .  Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshi

In [None]:
# # One problem, Simmons does not retrieve the cricket player, but querying for Phil Simmons does.
# # TODO: Fix this, one idea is to group by per doc id, check if this token has a better previous full mention
# display(wiki_items[wiki_items.index.isin(test_dataset.anchor_to_candidate['simmons'])])

# print('____')

# display(wiki_items[wiki_items.index.isin(test_dataset.anchor_to_candidate['phil simmons'])])

In [None]:
# now create a .csv file from id, wiki_url
test[['id', 'wiki_url']].to_csv('submission_with_doc_lower_score_limit_sample_skip.csv', index=False)
# train[['id', 'wiki_url']].to_csv('train_with_doc.csv', index=False)