In [1]:
import pandas as pd
from tqdm import tqdm

# Root folder (relative to the notebook) that every data file below is read from.
DATA_DIR = "data/"

In [2]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
import pytorch_utils
import importlib
# Re-execute the module so edits made to pytorch_utils.py are picked up
# without restarting the kernel.
importlib.reload(pytorch_utils)

# Build the train and test datasets on the device selected earlier instead of
# a hard-coded 'cuda', so the notebook also runs on CPU-only machines.
# `device.type` is the plain string ('cuda' or 'cpu'), i.e. the same kind of
# value that was passed here before.
dataset = pytorch_utils.EntityDataset(device=device.type)
test_dataset = pytorch_utils.EntityDataset(train=False, device=device.type)

[32m[PyTorch-Utils][0m: Loading all-MiniLM-L6-v2 model & Generating sentence embeddings of Train set...


100%|██████████| 7375/7375 [00:02<00:00, 3654.50it/s]


Batches:   0%|          | 0/231 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Now generating entity embeddings...


Batches:   0%|          | 0/572 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Generating DataFrame for candidate generation...
[32m[PyTorch-Utils][0m: Generating Embedding for candidate descriptions...
[32m[PyTorch-Utils][0m: Loading all-MiniLM-L6-v2 model & Generating sentence embeddings of Test set...


100%|██████████| 3501/3501 [00:00<00:00, 3553.10it/s]


Batches:   0%|          | 0/110 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Now generating entity embeddings...


Batches:   0%|          | 0/287 [00:00<?, ?it/s]

[32m[PyTorch-Utils][0m: Generating DataFrame for candidate generation...
[32m[PyTorch-Utils][0m: Generating Embedding for candidate descriptions...


In [4]:
from torch.utils.data import DataLoader

def _collate_on_device(batch):
    """Collate one batch with EntityDataset.collate_fn, targeting `device`."""
    return pytorch_utils.EntityDataset.collate_fn(batch, device=device)


# Training batches are reshuffled on every pass; test batches keep dataset order.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=_collate_on_device,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=_collate_on_device,
)

In [5]:
# Re-execute pytorch_utils so the current on-disk version of the module is in
# effect for the cells below. `dataset` / `test_dataset` were built before this
# reload and are not rebuilt by it.
importlib.reload(pytorch_utils)

<module 'pytorch_utils' from 'c:\\Users\\aybar\\Documents\\CS423-Project-3\\pytorch_utils.py'>

In [6]:
# Instantiate the classifier defined in pytorch_utils, handing it the torch
# device chosen earlier (CUDA when available, otherwise CPU).
model = pytorch_utils.EntityClassifier(device=device)

In [7]:
# loss and optimizer
from torch import optim
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# training loop
from tqdm import tqdm


epochs = 20
pbar = tqdm(range(epochs))

# Make the mode explicit: if this cell is re-run after a cell that switched the
# model to eval mode, training would otherwise silently run with layers such
# as dropout / batch-norm in inference behaviour.
model.train()

for epoch in pbar:
    # Sum of the per-batch losses for this epoch; averaged for the progress bar.
    running_loss = 0.0
    for data in dataloader:
        # Each batch is a 5-tuple; the first four entries are the model inputs,
        # the last one is the target passed to the loss.
        sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model((sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # accumulate statistics
        running_loss += loss.item()
    # Show the mean batch loss of the epoch that just finished.
    pbar.set_description(f'Epoch {epoch} loss: {running_loss / len(dataloader)}')

  sentence_embeddings = torch.tensor(sentence_embeddings, dtype=torch.float32, device=device)
Epoch 19 loss: 0.08988286826132827: 100%|██████████| 20/20 [04:17<00:00, 12.89s/it]


In [28]:
# let's do predictions on the test set

# Layers such as dropout / batch-norm (if the classifier has any) behave
# differently in training mode, so switch to eval mode for inference. The
# previous mode is restored afterwards so re-running the training cell is
# unaffected by this cell.
was_training = model.training
model.eval()

predictions = []         # chosen candidate id for every test item
prediction_indexes = []  # position of the chosen candidate within its candidate list

try:
    # No gradients are needed for inference.
    with torch.no_grad():
        # Iterating the DataLoader directly (instead of enumerate(...)) lets
        # tqdm know the number of batches and show a proper progress bar.
        for data in tqdm(test_dataloader):
            sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings, labels = data
            outputs = model((sentence_embeddings, entity_embeddings, candidate_ids, candidate_description_embeddings))
            # which index has the highest value?
            _, predicted = torch.max(outputs, 1)
            prediction_indexes.extend(predicted.tolist())
            # predicted has one entry per batch item; candidate_ids has one row
            # of candidates per batch item (original note: shape (batch_size, 5)).
            # Pick, for every row, the candidate id at the predicted column.
            best_candidates = candidate_ids[torch.arange(candidate_ids.size(0)), predicted]

            # Append the best candidates to the predictions list
            predictions.extend(best_candidates.tolist())
finally:
    model.train(was_training)
        

287it [00:05, 54.98it/s]


In [37]:
# Read the item table and key it by item_id in one step.
wiki_items = pd.read_csv(DATA_DIR + 'wiki_lite/wiki_items.csv').set_index('item_id')
# Lookup: item_id -> wikipedia_title
item_id_to_title = wiki_items['wikipedia_title'].to_dict()

In [40]:
# The redirects file is a header-less, tab-separated list of (source, target)
# pairs; read it and key it by the source title.
enwiki_redirects = pd.read_csv(
    DATA_DIR + 'wiki_lite/enwiki_redirects.tsv',
    sep='\t',
    header=None,
    names=['source', 'target'],
).set_index('source')
# Lookup: redirect source title -> target title
source_to_target = enwiki_redirects['target'].to_dict()

In [77]:
wiki_urls = []

# Map every predicted item id to a Wikipedia URL (one output per prediction,
# in the same order as `predictions`).
for item_id in tqdm(predictions):
    if item_id == 0:
        # A prediction of 0 is treated as "no candidate": emit the placeholder.
        wiki_urls.append('NOT_FOUND')
        continue
    wikipedia_title = item_id_to_title[item_id]
    # If the title is a redirect source, follow it to the redirect target.
    # (Previously the target was stored in an unused variable, so the redirect
    # was never actually applied to the generated URL.)
    wikipedia_title = source_to_target.get(wikipedia_title, wikipedia_title)
    # Now replace the spaces with underscores
    wikipedia_title = wikipedia_title.replace(' ', '_')
    # And add the wikipedia url
    wiki_urls.append(f'http://en.wikipedia.org/wiki/{wikipedia_title}')

100%|██████████| 9166/9166 [00:00<00:00, 1527595.28it/s]


In [78]:
# Load the test split; its `wiki_url` column is overwritten further down with
# the predicted URLs before the submission file is written.
test = pd.read_csv(DATA_DIR + 'test.csv')

In [79]:
# Row masks over the wiki_url column: rows that have a value at all, and rows
# whose value is not the '--NME--' marker.
not_nan = ~test['wiki_url'].isna()
not_nme = test['wiki_url'].ne('--NME--')
# Spot-check a single row (id 65002) among the rows selected by both masks.
test.loc[not_nan & not_nme & (test['id'] == 65002)]

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
65002,65002,Dejan,B,Dejan Koturovic,?


In [80]:
# Write the predicted URLs into the rows whose wiki_url is neither NaN nor
# '--NME--'. The list is assigned positionally, so `wiki_urls` must contain
# exactly one entry per selected row, in the same order as those rows.
# NOTE(review): this assumes the test dataset yields its items in the same
# order as these rows of test.csv (the test DataLoader uses shuffle=False) —
# confirm against pytorch_utils.EntityDataset.
test.loc[not_nan & not_nme, 'wiki_url'] = wiki_urls

In [81]:
# Collapse both "no link" encodings (missing value and the '--NME--' marker)
# into the single 'NOT_FOUND' placeholder.
test['wiki_url'] = (
    test['wiki_url']
    .fillna('NOT_FOUND')
    .replace('--NME--', 'NOT_FOUND')
)

In [82]:
# Re-inspect the row with id 65002 after the predictions were written back.
test.loc[test['id'] == 65002]

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
65002,65002,Dejan,B,Dejan Koturovic,NOT_FOUND


In [83]:
# Export only the id and wiki_url columns (no index column) as the submission file.
test.to_csv('submission.csv', columns=['id', 'wiki_url'], index=False)
