In [97]:
import torch
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
import collections

In [117]:
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_coauthors = n_authors - n_prolific + 1
n_years = 19
n_venues = 466
embedding_dim = 50
batch_size = 40
input_dim = embedding_dim
hidden_dim = 200
output_dim = 100
learning_rate = 0.001
num_epochs = 100

In [48]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [89]:
train_data_path = './data/train.json'
test_data_path = './data/test.json'
# read train json file
with open(train_data_path, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
with open(test_data_path, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# extract coauthors as a new key from train.json
for i in range(len(raw_train)):
    coauthors = []
    prolific_authors = []
    for auth in raw_train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    if len(prolific_authors) == 0:
        prolific_authors = -1
    raw_train[i]['coauthors'] = coauthors
    raw_train[i]['prolific_authors'] = prolific_authors

train_df = pd.DataFrame.from_dict(raw_train)
#train_df['COMBO'] = train_df['title'] + train_df['abstract']
train_df = train_df.drop(['authors'], axis=1)
train_df['venue'] = train_df['venue'].replace('', 465)

test_df = pd.DataFrame.from_dict(raw_test)
#test_df['COMBO'] = test_df['title'] + test_df['abstract']
#test_df = test_df.drop(['title', 'abstract'], axis=1)
test_df['venue'] = test_df['venue'].replace('', 465)


train_df.head()

Unnamed: 0,year,abstract,venue,title,coauthors,prolific_authors
0,9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1...",[13720],"[42, 36]"
1,15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...","[1359, 15881]",[45]
2,17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",465,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159...","[19166, 17763]",-1
3,10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5...",[],[97]
4,10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,...",[19617],[2]


In [107]:
train_df_prolific = train_df[train_df['prolific_authors'] != -1]
train_df_noprolific = train_df[train_df['prolific_authors'] == -1]
train_df_combine = pd.concat([train_df_prolific, train_df_noprolific.tail(1000)], axis=0)
train_df_combine = train_df_combine.reset_index(drop=True)
train_df_combine

Unnamed: 0,year,abstract,venue,title,coauthors,prolific_authors
0,9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1...",[13720],"[42, 36]"
1,15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...","[1359, 15881]",[45]
2,10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5...",[],[97]
3,10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,...",[19617],[2]
4,18,"[1731, 2021, 1543, 11, 1546, 11, 1647, 2163, 1...",0,"[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160...","[9641, 5623]","[44, 2]"
...,...,...,...,...,...,...
8419,14,"[44, 4221, 1535, 1848, 1543, 1839, 1866, 2828,...",20,"[3207, 3592, 1535, 2499, 1543, 4199, 1546, 237...","[11958, 20085]",-1
8420,11,"[46, 1605, 1691, 10, 2551, 4168, 2030, 3134, 1...",278,"[47, 1574, 1541, 1549, 47, 1966, 2114, 112, 12...","[18357, 11563, 7905]",-1
8421,7,"[1611, 1543, 1535, 2817, 2073, 1647, 11, 2933,...",17,"[56, 1718, 3012, 56, 1902, 2160, 1546, 1623, 1...","[2236, 20436, 10501]",-1
8422,16,"[37, 1662, 33, 2007, 1669, 4981, 1650, 1527, 1...",0,"[3066, 1728, 11, 1546, 11, 3066, 1728, 1531, 1...","[3921, 8760]",-1


In [51]:
test_df.head()

Unnamed: 0,identifier,coauthors,year,venue,COMBO
0,0,"[16336, 1762, 4357, 12564]",19,223,"[3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,..."
1,1,"[21189, 14088]",19,223,"[40, 1560, 1536, 1544, 1609, 1705, 1658, 1543,..."
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",19,7,"[47, 1574, 1729, 1641, 11, 37, 2533, 2015, 47,..."
3,3,"[19810, 15173, 5876, 111]",19,21,"[1770, 53, 2054, 1549, 1529, 1723, 2796, 1547,..."
4,4,"[10932, 7668, 11907, 19601, 15307, 10492, 1049...",19,465,"[18, 1924, 23, 1544, 3927, 2686, 1543, 1535, 1..."


In [98]:
class AuthorDataset(Dataset):

    def __init__(self, dataframe, istrain):
        self.data = dataframe
        self.x = dataframe[['year', 'venue', 'coauthors', 'title', 'abstract']]
        self.istrain = istrain
        if self.istrain == True:
            self.y = self.data.prolific_authors

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        title = self.data.title[index]
        abstract = self.data.abstract[index]
        # abstract and title
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        abstract_freq = collections.Counter(abstract)
        title_freq = collections.Counter(title)
        for key, value in dict(abstract_freq).items():
            abstract_list[key-1] = value
        for key, value in dict(title_freq).items():
            title_list[key-1] = value
            
        
        year = self.data.year[index]
        venue = self.data.venue[index]
        # coauthors
        coauthors = self.data.coauthors[index]
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if coauthors == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in coauthors:
                coauthor_list[coauthor-n_prolific] = 1
                
        x_output = {"title": title_list, "abstract": abstract_list, "year": year, "venue": venue, "coauthors": coauthor_list}
        
        if self.istrain == True:
            prolific_list = [0] * n_prolific
            if self.y[index] != -1:
            #if len(self.y[index]) != 0:
                for prolific in self.y[index]:
                    prolific_list[prolific] = 1
            y_output = prolific_list
            return x_output, y_output
        else:
            return x_output

In [108]:
training_df = AuthorDataset(train_df_combine, istrain = True)
testing_df = AuthorDataset(test_df, istrain = False)
print(training_df[0])

({'title': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [109]:
def my_collate(batch):
    # for training set
    if len(batch[0]) == 2:
        output = {"title": [], "abstract": [], "year": [], "venue": [], "coauthors": [], "target": []}
        
        for data in batch:
            x = data[0]
            target = data[1]
            
            output['title'] += [torch.tensor(x['title'], dtype=torch.long)]
            output['abstract'] += [torch.tensor(x['abstract'], dtype=torch.long)]
            output['year'] += [x['year']]
            output['venue'] += [x['venue']]
            output['coauthors'] += [torch.tensor(x['coauthors'], dtype=torch.long)]
            output['target'] += [target]
            
        output['year'] = torch.tensor(output['year'], dtype=torch.long)
        output['venue'] = torch.tensor(output['venue'], dtype=torch.long)
        output['target'] = torch.tensor(output['target'], dtype=torch.float)
        return output
    
    # for testing set
    else:
        output = {"title": [], "abstract": [], "year": [], "venue": [], "coauthors": []}
        
        for data in batch:
            output['title'] += [torch.tensor(data['title'], dtype=torch.long)]
            output['abstract'] += [torch.tensor(data['abstract'], dtype=torch.long)]
            output['year'] += [data['year']]
            output['venue'] += [data['venue']]
            output['coauthors'] += [torch.tensor(data['coauthors'], dtype=torch.long)]
            
        output['year'] = torch.tensor(output['year'], dtype=torch.long)
        output['venue'] = torch.tensor(output['venue'], dtype=torch.long)
        return output


In [118]:
train_dataloader = DataLoader(dataset = training_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
test_dataloader = DataLoader(dataset = testing_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
dataiter = next(iter(test_dataloader))
dataiter

{'title': [tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([1, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0,  ..., 0, 0, 0]),
  tensor([0, 0, 0, 

In [121]:
class MultilabelModel(torch.nn.Module):

    def __init__(self, embed_dim, hidden_size, num_layers, num_classes):
        super(MultilabelModel, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.text_embedding = torch.nn.Embedding(n_text+1, embed_dim)
        self.year_embedding = torch.nn.Embedding(n_years+1, embed_dim)
        self.venue_embedding = torch.nn.Embedding(n_venues, embed_dim)
        self.coauthor_embedding = torch.nn.Embedding(n_authors+1, embed_dim)

        #self.linear1 = torch.nn.Linear(embed_dim*4, 128)
        #self.activation = torch.nn.ReLU()
        #self.linear2 = torch.nn.Linear(128, output_dim)
        #self.softmax = torch.nn.Softmax()
        self.lstm = torch.nn.LSTM(embed_dim, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, num_classes)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        #embed_year = self.year_embedding(x['year'])
        #embed_venue = self.venue_embedding(x['venue'])
        
        #embed_title_list = []
        #for curr_title in x['title']:
        #    embed_curr_title = self.text_embedding(curr_title)
        #    embed_title_list.append(embed_curr_title.mean(dim=0))
        #embed_title = torch.stack(embed_title_list)
        #print(embed_title.shape)
        
        embed_coauthor_list = []
        for curr_coauthor in x['coauthors']:
            embed_curr_coauthor = self.coauthor_embedding(curr_coauthor)
            embed_coauthor_list.append(embed_curr_coauthor.mean(dim=0))
        embed_coauthor=torch.stack(embed_coauthor_list)
        #print(embed_coauthor.shape)
        out = embed_coauthor.reshape([40,1,100])
        #print(out.shape)
        #combined_features = torch.cat((embed_year, embed_venue, embed_text, embed_coauthor), dim=-1)
        
        h0 = torch.zeros(self.num_layers, out.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, out.size(0), self.hidden_size)
        
        # Forward propagate LSTM
        out, (hn, cn) = self.lstm(out, (h0, c0))  
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        
        return out

In [None]:
clf = MultilabelModel(100, 128, 1, 100)
# loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(),lr=learning_rate)

# training loop
n_total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        #batch = batch.to(device)

        # forward
        outputs = clf(batch)
        
        loss = criterion(outputs, batch['target'])
        
        predictions = np.where(outputs.detach().numpy()>0.5, 1, 0)
        f1_acc = f1_score(batch['target'].detach().numpy(), predictions, average="samples", zero_division=1)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #if (i + 1) % 100 == 0:
    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}, training f1 score = {f1_acc:.4f}')

In [33]:
# testing
test_preds = []
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        outputs = clf(batch)
        
        predictions = np.where(outputs.detach().numpy()>0, 1, 0)
        test_preds.append(predictions)

In [34]:
test_preds[0]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,

In [35]:
def find(lst, num):
    result = []
    for i, x in enumerate(lst):
        if x==num:
            result.append(i)
    return result

In [36]:
NN_result = test_df[['identifier']]
NN_result.loc[:,'Predict'] = ''
final_result = []
for i in range(len(test_preds)):
    for j in range(batch_size):
        final_result.append(test_preds[i][j])
print(len(final_result))
for i in range(len(final_result)):
    result = final_result[i]
    if len(find(list(result), 1)) == 0:
        NN_result.loc[i,'Predict'] = -1
    else:
        NN_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
NN_result = NN_result.rename(columns={'identifier':'ID'})
NN_result.to_csv('./NN_results.csv', index=False)

800


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [37]:
NN_result

Unnamed: 0,ID,Predict
0,0,-1
1,1,-1
2,2,-1
3,3,78
4,4,-1
...,...,...
795,795,-1
796,796,-1
797,797,-1
798,798,-1
