In [1]:
import pandas as pd
from tqdm import tqdm
import torch
import pytorch_utils
# Root folder for the input files. Later cells build paths as DATA_DIR + '<relative path>',
# so the trailing slash is required.
DATA_DIR = 'data/'

[32m[PyTorch-Utils][0m: Loading anchor candidate data...
[32m[PyTorch-Utils][0m: Loading item_id to statement embedding data...
[32m[PyTorch-Utils][0m: Loading item_id to description embedding data...
[32m[PyTorch-Utils][0m: Loading wikipedia title embedings...
[32m[PyTorch-Utils][0m: Loading wikipedia items...


In [2]:
# Pick the compute device: CUDA when PyTorch reports it as available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Build the train and test datasets on the device selected earlier instead of a
# hard-coded 'cuda', so the notebook also runs on a machine without a GPU.
# device.type is the plain device string ('cuda' or 'cpu'), i.e. the same kind of
# value that was passed here before.
# NOTE(review): assumes EntityDataset also accepts 'cpu' — confirm in pytorch_utils.
dataset = pytorch_utils.EntityDataset(device=device.type)
test_dataset = pytorch_utils.EntityDataset(train=False, device=device.type)

[32m[PyTorch-Utils][0m: Loading all-MiniLM-L6-v2 model & Generating sentence embeddings of train set...


100%|██████████| 7375/7375 [00:02<00:00, 2816.78it/s]


Batches:   0%|          | 0/231 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Now generating Doc embeddings...


100%|██████████| 946/946 [00:04<00:00, 204.45it/s]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Now generating entity embeddings...


Batches:   0%|          | 0/572 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Loading all-MiniLM-L6-v2 model & Generating sentence embeddings of test set...


100%|██████████| 3501/3501 [00:01<00:00, 2769.46it/s]


Batches:   0%|          | 0/110 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Now generating Doc embeddings...


100%|██████████| 447/447 [00:01<00:00, 357.94it/s]


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Now generating entity embeddings...


Batches:   0%|          | 0/287 [00:00<?, ?it/s]

In [4]:
from torch.utils.data import DataLoader

# Training loader: batches of 64, reshuffled on every pass, collated with the
# dataset's train collate function on the selected device.
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=lambda batch: pytorch_utils.EntityDataset.collate_fn_train(batch, device=device),
)

# Test loader: same batch size, but order is kept fixed (shuffle=False) and the
# test collate function is used.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=lambda batch: pytorch_utils.EntityDataset.collate_fn_test(batch, device=device),
)

In [5]:
# Instantiate the classifier defined in pytorch_utils, passing it the selected device.
# NOTE(review): presumably the model places its parameters on `device` itself — confirm in pytorch_utils.
model = pytorch_utils.EntityClassifier(device=device)

In [6]:
from torch import optim
from tqdm import tqdm

# Objective and optimiser: cross-entropy over the model outputs, Adam with lr=0.001.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for a fixed number of epochs; the progress bar advances once per epoch.
epochs = 20
pbar = tqdm(range(epochs))
model.train()
for epoch in pbar:
    running_loss = 0.0  # sum of the per-batch losses in this epoch
    shapes = []         # first dimension of document_embeds for every batch
    for data in dataloader:
        # Each batch is a 6-tuple: five model inputs followed by the labels.
        (document_embeds,
         sentence_embeddings,
         entity_embeddings,
         candidate_ids,
         candidate_description_embeddings,
         labels) = data
        shapes.append(document_embeds.shape[0])

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        model_inputs = (
            document_embeds,
            sentence_embeddings,
            entity_embeddings,
            candidate_ids,
            candidate_description_embeddings,
        )
        outputs = model(model_inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss and the mean batch size for this epoch.
    mean_loss = running_loss / len(dataloader)
    mean_shape = sum(shapes) / len(shapes)
    pbar.set_description(f'Epoch {epoch} loss: {mean_loss} Avg shape: {mean_shape}')

  document_embed = torch.tensor(document_embed, dtype=torch.float32, device=device)
Epoch 19 loss: 0.04415383163458583 Avg shape: 55.32517482517483: 100%|██████████| 20/20 [06:46<00:00, 20.34s/it]


In [7]:
# Predict on the test set: for every mention, pick the candidate with the highest score.
predictions = []         # chosen candidate id for each mention
prediction_indexes = []  # position of the chosen candidate within its candidate row
model.eval()
with torch.no_grad():
    # Wrap the dataloader itself (not enumerate(...)) so tqdm can read its length
    # and show a total and an ETA; the batch index was never used.
    for data in tqdm(test_dataloader):
        document_embeds, sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings = data
        outputs = model((document_embeds, sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings))

        # Index of the highest score along dimension 1, one per batch item.
        _, predicted = torch.max(outputs, 1)
        prediction_indexes.extend(predicted.tolist())

        # Pair every batch row with its predicted column to read one candidate id
        # per batch item, i.e. a result of shape (batch_size,).
        # NOTE(review): assumes candidate_ids is 2-D (batch_size, n_candidates) — confirm in collate_fn_test.
        best_candidates = candidate_ids[torch.arange(candidate_ids.size(0)), predicted]

        # Collect the chosen candidate ids across all batches.
        predictions.extend(best_candidates.tolist())
        

144it [00:11, 12.95it/s]


In [10]:
# predictions_from_statistics = []

# for i, row in tqdm(test_dataset.entity_df.iterrows()):
#     full_mention = row['full_mention'].strip().lower()
#     if full_mention in pytorch_utils.anchor_to_candidate:
#         best_candidate = pytorch_utils.anchor_to_candidate[full_mention][0]
#     else:
#         best_candidate = 0
#     predictions_from_statistics.append(best_candidate)

In [11]:
# Load the wiki items table and key it by item_id.
wiki_items = pd.read_csv(DATA_DIR + 'wiki_lite/wiki_items.csv').set_index('item_id')
# Lookup table: item_id -> wikipedia_title.
item_id_to_title = wiki_items['wikipedia_title'].to_dict()

In [12]:
# Load the redirect table (tab-separated, no header row; the two columns are named
# 'source' and 'target' here) and key it by the redirect source.
enwiki_redirects = pd.read_csv(
    DATA_DIR + 'wiki_lite/enwiki_redirects.tsv',
    sep='\t',
    header=None,
    names=['source', 'target'],
).set_index('source')
# Lookup table: redirect source -> redirect target.
# NOTE(review): with read_csv's default NA handling, values such as 'NA' or 'null'
# are parsed as NaN — confirm whether that matters for this file.
source_to_target = enwiki_redirects['target'].to_dict()

In [13]:
# Turn every predicted item id into a Wikipedia URL.
#   - a prediction of 0 becomes the placeholder 'NOT_FOUND';
#   - otherwise the item's title is looked up, resolved through the redirect table
#     when it is a redirect source, and formatted as an en.wikipedia.org URL.
# A predicted id missing from item_id_to_title raises KeyError.
wiki_urls = []
not_found = 0
found = 0
for item_id in tqdm(predictions):
    if item_id == 0:
        wiki_urls.append('NOT_FOUND')
        not_found += 1
        continue
    wikipedia_title = item_id_to_title[item_id]
    # Apply the redirect. Previously the target was stored in an unused variable,
    # so the un-redirected title always ended up in the URL.
    redirect_target = source_to_target.get(wikipedia_title)
    # Only string targets are used, so a missing/NaN target keeps the original title.
    if isinstance(redirect_target, str):
        wikipedia_title = redirect_target
    # Wikipedia URLs use underscores in place of spaces.
    wikipedia_title = wikipedia_title.replace(' ', '_')
    wiki_urls.append(f'http://en.wikipedia.org/wiki/{wikipedia_title}')
    found += 1

# Guard the ratio so an empty prediction list does not raise ZeroDivisionError.
total = found + not_found
found_ratio = found / total if total else 0.0
print(f'Found {found} wikipedia urls')
print(f'Not found {not_found} wikipedia urls')
print(f'Percentage of wikipedia urls found: {found_ratio}')

100%|██████████| 9166/9166 [00:00<00:00, 963726.82it/s]

Found 9138 wikipedia urls
Not found 28 wikipedia urls
Percentage of wikipedia urls found: 0.9969452323805368





In [16]:
# not_found = 0
# found = 0
# train_wiki_urls = []
# # Now we will map these into wikipedia_urls
# for i in tqdm(range(len(predictions_train))):
#     if predictions_train[i] == 0:
#         # if the prediction is 0, we will append a blank url
#         train_wiki_urls.append('NOT_FOUND')
#         not_found += 1
#         continue
#     wikipedia_title = item_id_to_title[predictions_train[i]]
#     # does this wikipedia title exist in the redirects?
#     if wikipedia_title in source_to_target:
#         # if it does, we will replace it with the redirect
#         new_title = source_to_target[wikipedia_title]
#     # Now replace the spaces with underscores
#     wikipedia_title = wikipedia_title.replace(' ', '_')
#     # And add the wikipedia url
#     train_wiki_urls.append(f'http://en.wikipedia.org/wiki/{wikipedia_title}')
#     found += 1

# print(f'Found {found} wikipedia urls')
# print(f'Not found {not_found} wikipedia urls')
# print(f'Percentage of wikipedia urls found: {found / (found + not_found)}')

In [17]:
# Load the test split as a DataFrame; later cells read its id, token and wiki_url columns.
test = pd.read_csv(DATA_DIR + 'test.csv')
# train = pd.read_csv(DATA_DIR + 'train.csv')

In [18]:
# Boolean masks over the test rows:
#   not_nan - wiki_url is present
#   not_nme - wiki_url is not the '--NME--' marker
not_nan = test.wiki_url.notna()
not_nme = ~(test.wiki_url == '--NME--')
# train_not_nan = train['wiki_url'].notna()
# train_not_nme = train['wiki_url'] != '--NME--'
# Spot-check one row (id 65002) among those selected by both masks.
test.loc[not_nan & not_nme & (test['id'] == 65002)]

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
65002,65002,Dejan,B,Dejan Koturovic,?


In [19]:
# Write the predicted URLs into the wiki_url column of the rows selected by both masks.
# The assignment is positional: the k-th URL goes to the k-th selected row.
# NOTE(review): assumes wiki_urls is in the same order as those rows of `test` —
# confirm against how EntityDataset orders its mentions.
test.loc[not_nan & not_nme, 'wiki_url'] = wiki_urls
# train.loc[train_not_nan & train_not_nme, 'wiki_url'] = train_wiki_urls

In [20]:
# Normalise the remaining rows: both missing values and the '--NME--' marker
# become the placeholder 'NOT_FOUND'.
test['wiki_url'] = (
    test['wiki_url']
    .fillna('NOT_FOUND')
    .replace('--NME--', 'NOT_FOUND')
)
# train['wiki_url'] = train['wiki_url'].fillna('NOT_FOUND')
# train['wiki_url'] = train['wiki_url'].replace('--NME--', 'NOT_FOUND')

In [21]:
# Show the row with id 65002 again, now that the wiki_url column has been filled in.
test.loc[test['id'] == 65002]

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
65002,65002,Dejan,B,Dejan Koturovic,http://en.wikipedia.org/wiki/Dejan_Koturović


In [22]:
# Preview the start of the test text: the first 250 tokens joined by single spaces,
# with missing tokens replaced by empty strings.
' '.join(test['token'].fillna('').to_list()[:250])

"-DOCSTART- (947testa CRICKET) CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY .  LONDON 1996-08-30  West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship .  Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire .  After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 .  Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 .  Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshi

In [23]:
# # One problem, Simmons does not retrieve the cricket player, but querying for Phil Simmons does.
# # TODO: Fix this, one idea is to group by per doc id, check if this token has a better previous full mention
# display(wiki_items[wiki_items.index.isin(test_dataset.anchor_to_candidate['simmons'])])

# print('____')

# display(wiki_items[wiki_items.index.isin(test_dataset.anchor_to_candidate['phil simmons'])])

In [24]:
# now create a .csv file from id, wiki_url
test[['id', 'wiki_url']].to_csv('submission_with_doc_lower_score_limit_sample_skip.csv', index=False)
# train[['id', 'wiki_url']].to_csv('train_with_doc.csv', index=False)