In [163]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time

In [4]:
# Load the training set; the relative path means train.json must sit in the working directory.
train_data = pd.read_json("train.json")

In [5]:
# (rows, columns) of the training frame.
train_data.shape

(25793, 5)

In [6]:
# Peek at the first rows: authors / abstract / title hold lists of integer IDs.
train_data.head()

Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


In [28]:
def get_prolific_authors(authors, n_prolific=100):
    """Filter a list of author IDs down to the prolific authors only.

    An author counts as prolific when their ID is strictly below ``n_prolific``.

    Parameters
    ----------
    authors : iterable of int
        Author IDs of a single paper, e.g. ``[42, 13720, 36]``.
    n_prolific : int, optional
        Exclusive upper bound on prolific author IDs. Defaults to 100.

    Returns
    -------
    list of int
        The prolific author IDs in their original order; empty when the
        paper has no prolific author.
    """
    return [author_id for author_id in authors if author_id < n_prolific]

In [36]:
# Targets: for every paper keep only its prolific author IDs (may be an empty list).
y_train = list(map(get_prolific_authors, train_data["authors"].tolist()))
y_train[:5]

[[42, 36], [45], [], [97], [2]]

In [40]:
# Inputs: only the abstract column (variable-length lists of integer IDs) is used.
X_train = train_data["abstract"]
X_train[:5]

0    [2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...
1    [40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...
2    [40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...
3    [46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...
4    [37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...
Name: abstract, dtype: object

In [220]:
# One 1-D float32 tensor per abstract; lengths differ between papers, so they stay in a list.
# NOTE(review): the integer IDs are cast to floats and later fed to the GRU as a single
# scalar feature per step (input_size = 1) -- an nn.Embedding may be a better fit; confirm intent.
X_train_tensor = [ torch.tensor(X, dtype=torch.float32) for X in X_train ]
X_train_tensor[0].size()

torch.Size([87])

In [221]:
# Sanity check: converting the Series values yields a plain Python list.
type(X_train.values.tolist())

list

Prepare y output and loss function

In [430]:
def author_ids_to_multi_hot(authors, num_authors=100):
    """Convert a list of author IDs to a multi-hot vector.

    Parameters
    ----------
    authors : iterable of int
        Author IDs, each a valid index into a vector of length
        ``num_authors``, e.g. ``[23, 47]``. May be empty.
    num_authors : int, optional
        Length of the multi-hot vector. Defaults to 100.

    Returns
    -------
    torch.Tensor
        A float32 tensor of shape ``(num_authors,)`` holding 1.0 at every
        index listed in ``authors`` and 0.0 elsewhere.

    Raises
    ------
    IndexError
        If an ID lies outside the valid index range of the vector.
    """
    # Allocate the 1-D result directly instead of building (1, n) and squeezing.
    multi_hot = torch.zeros(num_authors, dtype=torch.float32)
    for author_id in authors:
        multi_hot[author_id] = 1
    return multi_hot

In [431]:
# Example: the first paper's prolific authors encoded as a multi-hot vector.
author_ids_to_multi_hot(y_train[0])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [432]:
# Encode every target list as a multi-hot tensor; one tensor per paper.
multi_hot_y_train = list(map(author_ids_to_multi_hot, y_train))
multi_hot_y_train[0].shape

torch.Size([100])

In [275]:
# Binary cross-entropy applied independently to each output (multi-label setting).
# BCELoss expects probabilities in [0, 1], i.e. the model must end with a sigmoid.
criterion = nn.BCELoss()

In [276]:
# Toy check of the loss: `guess` plays the prediction, `guess2` the target.
guess = torch.zeros(1, 100, dtype=torch.float32)
guess[0][1] = 0.8
guess[0][2] = 1  # confidently wrong: the target at index 2 is 0
guess2 = torch.zeros(1, 100, dtype=torch.float32)
guess2[0][1] = 1
# truth = author_ids_to_multi_hot(y_train[0])
criterion(input=guess, target=guess2)

tensor(1.0022)

In [277]:
# Show what a sigmoid does to the toy vector (zeros map to 0.5).
m = nn.Sigmoid()
m(guess)

tensor([[0.5000, 0.6900, 0.7311, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000]])

Load test data set

In [361]:
# Load the test set; the relative path means test.json must sit in the working directory.
test_data = pd.read_json("test.json")

In [362]:
# (rows, columns) of the test frame.
test_data.shape

(800, 6)

In [363]:
# Peek at the first test rows; unlike the training frame there is an `identifier`
# column and the author column is called `coauthors`.
test_data.head(5)

Unnamed: 0,identifier,coauthors,year,abstract,venue,title
0,0,"[16336, 1762, 4357, 12564]",19,"[37, 1662, 3207, 10, 33, 2037, 1738, 1642, 155...",223.0,"[3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,..."
1,1,"[21189, 14088]",19,"[1731, 2130, 3674, 1705, 1656, 3077, 1546, 367...",223.0,"[40, 1560, 1536, 1544, 1609, 1705, 1658, 1543,..."
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",19,"[1551, 1728, 3920, 1542, 1535, 1656, 1543, 153...",7.0,"[47, 1574, 1729, 1641, 11, 37, 2533, 2015, 47,..."
3,3,"[19810, 15173, 5876, 111]",19,"[51, 1535, 2115, 1543, 1811, 1700, 1657, 1684,...",21.0,"[1770, 53, 2054, 1549, 1529, 1723, 2796, 1547,..."
4,4,"[10932, 7668, 11907, 19601, 15307, 10492, 1049...",19,"[1775, 1746, 1842, 1525, 33, 2551, 1882, 1542,...",,"[18, 1924, 23, 1544, 3927, 2686, 1543, 1535, 1..."


In [424]:
# Test inputs: abstract column only, mirroring X_train.
X_test = test_data["abstract"]

In [425]:
# One 1-D float32 tensor per test abstract (same encoding as X_train_tensor).
X_test_tensor = [ torch.tensor(X, dtype=torch.float32) for X in X_test ]

In [427]:
# Row identifiers, written to the ID column of the output CSV later on.
test_ids = test_data["identifier"]

Prepare data loader

In [440]:
# Pair inputs with targets. Only the first 2000 training papers are used here.
trainset = list(zip(X_train_tensor[:2000], multi_hot_y_train[:2000]))
# The test set has no labels: the second element of each pair is the row identifier.
testset = list(zip(X_test_tensor, test_ids))
# Alternative: hold out a labelled slice of the training data for validation.
# testset = list(zip(X_train_tensor[20000:21000], multi_hot_y_train[20000:21000]))

In [441]:
# batch_size=1 because abstracts have different lengths and are not padded.
# With a batch size of 1, drop_last=True never drops anything.
train_loader = torch.utils.data.DataLoader(trainset, batch_size=1, shuffle=False, drop_last=True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, drop_last=True)
# Inspect the first two batches to check shapes coming out of the loader.
counter = 0
for i, data in enumerate(train_loader):
    if counter >= 2:
        break
    counter += 1
    abstract, author = data
    print(abstract[:5])
    print("author:", author)
    # Loss of the target against itself: a smoke test that the shapes are accepted.
    print(criterion(input=author, target=author))

tensor([[2455., 1858., 2335., 1543., 1800., 1860., 2000., 2867., 1546., 1874.,
         2059., 1525., 2590., 4196.,   12., 2634., 1543., 1800., 1586., 2866.,
         3595., 1866., 1670., 2000., 3743., 1542., 1650., 1527.,   33., 4407.,
         1543., 1535., 1962., 1961., 1543.,   33., 1700., 1543., 1535., 1647.,
         1546., 1580., 4720.,   12., 1731., 4231., 2601., 1553., 1704., 1605.,
         2456., 1543., 3281., 1594., 4407., 2168., 1542., 1586., 3781., 2471.,
         1525., 1859., 1669., 2512., 4572., 1546., 1609., 3781., 2471., 1525.,
         3393.,   12.,   37., 1712., 1586., 4196., 1650., 1527., 3281., 1594.,
         4407., 1800., 4708., 1904., 2059., 2411.,   12.]])
author: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

Baseline RNN (only use abstract as input)

In [442]:
class GRUClassifier(nn.Module):
    """GRU encoder followed by a linear layer and a sigmoid (multi-label classifier).

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Size of the GRU hidden state.
    output_size : int
        Number of independent labels to predict.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_sequence):
        """Return per-label probabilities for each sequence in the batch.

        Parameters
        ----------
        input_sequence : torch.Tensor
            Shape ``(seq_len, batch, input_size)`` (the GRU is not batch-first).

        Returns
        -------
        torch.Tensor
            Shape ``(batch, output_size)`` with values in [0, 1].
        """
        # Apply the GRU to the full sequence and keep only the final hidden state.
        _, hidden = self.gru(input_sequence)
        # hidden is (num_layers, batch, hidden_size); take the last layer and keep
        # one row per batch element instead of flattening everything into a single
        # row, which only worked for batch size 1.
        final_state = hidden[-1].reshape(-1, hidden.size(-1))
        # Independent sigmoid per label (multi-label), not a softmax.
        return self.sigmoid(self.h2o(final_state))

In [443]:
# Model / optimisation hyperparameters.
input_size = 1 # integer encoding: one scalar feature per time step
hidden_size = 32 
output_size = 100 # 100 prolific authors
learning_rate = 0.005
# NOTE(review): n_iters is not referenced by any cell visible in this notebook.
n_iters = 1000

In [444]:
def train(model, train_loader, test_loader, optimizer, n_epochs=2):
    """
    Generic training loop for supervised multi-label learning with BCE loss.

    Parameters
    ----------
    model :
        Module called as ``model(x)`` with ``x`` of shape (seq_len, batch, 1).
        It must return probabilities shaped like the targets (BCELoss input).
    train_loader :
        Iterable of ``(x, y)`` batches; ``x`` has the batch on its first
        dimension and ``y`` holds the multi-hot float targets.
    test_loader :
        Iterable handed to ``test`` after every epoch.
    optimizer :
        Optimizer already bound to ``model``'s parameters.
    n_epochs : int, optional
        Number of passes over ``train_loader``. Defaults to 2.

    Returns
    -------
    running_loss : list of float
        Loss of every training step, in order.
    running_accuracy : list
        One entry per training step. Accuracy is not implemented yet, so
        every entry is the placeholder 0.
    """
    LOG_INTERVAL = 250
    running_loss, running_accuracy = list(), list()
    start_time = time.time()
    criterion = nn.BCELoss()

    for epoch in range(n_epochs):  # Loop over training dataset `n_epochs` times

        model.train()
        epoch_loss = 0.

        for i, data in enumerate(train_loader):  # Loop over elements in training set

            x, y = data

            # (batch, seq_len) -> (seq_len, batch, 1) as expected by nn.GRU.
            # A blanket squeeze() is avoided because it collapses sequences of
            # length 1 to a 0-d tensor.
            x = x.reshape(x.shape[0], -1).transpose(0, 1).unsqueeze(-1)

            results = model(x)

            # TODO: placeholder, no training accuracy is computed yet.
            train_acc = 0

            loss = criterion(input=results, target=y)

            # Clear gradients first so leftovers (e.g. from an interrupted run
            # of this cell) cannot leak into this update.
            optimizer.zero_grad()
            loss.backward()               # Backward pass (compute parameter gradients)
            optimizer.step()              # Update weight parameters

            # Bookkeeping: per-step metrics and periodic progress log.
            running_loss.append(loss.item())
            running_accuracy.append(train_acc)

            epoch_loss += loss.item()

            if i % LOG_INTERVAL == 0:  # Log training stats
                deltaT = time.time() - start_time
                mean_loss = epoch_loss / (i+1)
                print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Train accuracy {:.5f} | Time {:.2f} s'.format(epoch, 
                    i, len(train_loader), mean_loss, train_acc, deltaT))

        # Run the model over the test loader after every epoch; the returned
        # predictions are not used here.
        test(model, criterion, test_loader)

    return running_loss, running_accuracy

In [446]:
def test(model, criterion, test_loader):
    test_loss = 0.
    test_preds, test_labels = list(), list()
    for i, data in enumerate(test_loader):
        x, y = data

        with torch.no_grad():
            
            ### to get the right shape ### 
            x = torch.squeeze(x)
            x = torch.unsqueeze(x, dim=1)
            x = torch.unsqueeze(x, dim=2)
            ### to get the right shape ### 
            
            results = model(x)  # Compute scores
#             test_loss += criterion(input=results, target=y).item()
            predictions = torch.where(results > 0.5, 1, 0)
            test_preds.append(predictions)
#             test_labels.append(y)

    test_preds = torch.cat(test_preds)
#     test_labels = torch.cat(test_labels)

#     test_accuracy = torch.eq(test_preds, test_labels).float().mean().item()

#     print('[TEST] Mean loss {:.4f} | Accuracy {:.4f}'.format(test_loss/len(test_loader), test_accuracy))
    
    return test_preds

In [439]:
# Build the baseline model, optimise it with plain SGD and run the training loop.
# NOTE: train() calls test(), so the cell defining test() must be executed before this one.
model = GRUClassifier(input_size , hidden_size, output_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
model_loss, model_acc = train(model, train_loader, test_loader, optimizer)

[TRAIN] Epoch 0 [0/5000]| Mean loss 0.7233 | Train accuracy 0.00000 | Time 0.02 s
[TRAIN] Epoch 0 [250/5000]| Mean loss 0.6781 | Train accuracy 0.00000 | Time 5.20 s
[TRAIN] Epoch 0 [500/5000]| Mean loss 0.6440 | Train accuracy 0.00000 | Time 11.18 s
[TRAIN] Epoch 0 [750/5000]| Mean loss 0.6124 | Train accuracy 0.00000 | Time 16.84 s
[TRAIN] Epoch 0 [1000/5000]| Mean loss 0.5834 | Train accuracy 0.00000 | Time 22.41 s
[TRAIN] Epoch 0 [1250/5000]| Mean loss 0.5563 | Train accuracy 0.00000 | Time 28.06 s
[TRAIN] Epoch 0 [1500/5000]| Mean loss 0.5309 | Train accuracy 0.00000 | Time 33.86 s
[TRAIN] Epoch 0 [1750/5000]| Mean loss 0.5073 | Train accuracy 0.00000 | Time 39.66 s
[TRAIN] Epoch 0 [2000/5000]| Mean loss 0.4855 | Train accuracy 0.00000 | Time 45.20 s
[TRAIN] Epoch 0 [2250/5000]| Mean loss 0.4653 | Train accuracy 0.00000 | Time 50.81 s
[TRAIN] Epoch 0 [2500/5000]| Mean loss 0.4467 | Train accuracy 0.00000 | Time 56.51 s
[TRAIN] Epoch 0 [2750/5000]| Mean loss 0.4294 | Train accuracy

ValueError: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([1, 100])) is deprecated. Please ensure they have the same size.

Prepare output format

In [451]:
def multi_hot_authors_to_integer(authors):
    """Convert a multi-hot author vector to a list of author IDs.

    Parameters
    ----------
    authors : torch.Tensor or sequence
        Zeros and ones, either 1-D of length N (``[0, 0, 1, ...]``) or 2-D of
        shape ``(1, N)``.

    Returns
    -------
    list of int
        Positions along the last dimension whose value equals 1,
        e.g. ``[23, 47]``; ``[-1]`` when no entry is set.
    """
    authors = torch.as_tensor(authors)
    # Index the last dimension so both (N,) and (1, N) inputs work; a fixed [1]
    # raised IndexError for the 1-D vectors this function is documented to accept.
    author_ids = torch.where(authors == 1)[-1]
    if len(author_ids) > 0:
        return author_ids.tolist()
    return [-1]

def list_of_integers_to_string_format(integers):
    """Render author IDs as a single space-separated string.

    Parameters
    ----------
    integers : iterable of int
        Author IDs, e.g. ``[1, 2]``.

    Returns
    -------
    str
        The IDs joined by single spaces, e.g. ``"1 2"``.
    """
    id_strings = map(str, integers)
    return " ".join(id_strings)
    
def multi_hot_to_csv(all_test_predictions, test_ids, path="foo.csv"):
    """Write multi-hot predictions to a two-column CSV file.

    Parameters
    ----------
    all_test_predictions : iterable
        One multi-hot prediction per example, each accepted by
        ``multi_hot_authors_to_integer``.
    test_ids : iterable
        One identifier per prediction, in the same order.
    path : str, optional
        Destination file. Defaults to ``"foo.csv"``.

    Raises
    ------
    ValueError
        If the number of identifiers differs from the number of predictions
        (zip would otherwise silently drop the surplus rows).
    """
    predicted_ids = [multi_hot_authors_to_integer(pred) for pred in all_test_predictions]
    predicted_strings = [list_of_integers_to_string_format(ids) for ids in predicted_ids]
    id_strings = [str(test_id) for test_id in test_ids]

    if len(id_strings) != len(predicted_strings):
        raise ValueError(
            "Got {} ids but {} predictions".format(len(id_strings), len(predicted_strings))
        )

    header = ("ID", "Predict")
    rows = [header] + list(zip(id_strings, predicted_strings))
    # Everything is already a string, so '%s' writes the cells verbatim.
    np.savetxt(path, np.asarray(rows), delimiter=",", fmt='%s')
    

In [408]:
# Toy check: a (1, 100) vector with entries 1 and 2 set should decode to [1, 2].
guess = torch.zeros(1, 100, dtype=torch.float32)
guess[0][1] = 1
guess[0][2] = 1
multi_hot_authors_to_integer(guess)

tensor([[0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
(tensor([0, 0]), tensor([1, 2]))


[1, 2]

In [423]:
# Smoke test of the CSV writer on the first five predictions with dummy ids.
# NOTE: `preds` is produced by the test(...) cell further down; run that cell first.
predictions = preds[:5].unsqueeze(dim=1)
multi_hot_to_csv(predictions, [1, 2, 3, 4, 5])

Generate test outputs

In [447]:
# Thresholded predictions for the whole test set, one row per test example.
preds = test(model, criterion, test_loader)

In [452]:
# Insert a singleton dimension so that iterating yields one 2-D slice per example,
# then write the submission file.
predictions = preds.unsqueeze(dim=1)
multi_hot_to_csv(predictions, test_ids.tolist())