In [901]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import json
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

In [1390]:
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_coauthors = n_authors - n_prolific + 1
n_years = 19
n_venues = 466
batch_size = 32
learning_rate = 0.001
num_epochs = 100

In [444]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [1291]:
train_data_path = './data/train.json'
test_data_path = './data/test.json'
# read train json file
with open(train_data_path, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
with open(test_data_path, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# extract coauthors as a new key from train.json
title_list = []
abstract_list = []
word_list = []
for i in range(len(raw_train)):
    coauthors = []
    prolific_authors = []
    for auth in raw_train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    if len(prolific_authors) == 0:
        prolific_authors = -1
    raw_train[i]['coauthors'] = coauthors
    raw_train[i]['prolific_authors'] = prolific_authors
    
train_df = pd.DataFrame.from_dict(raw_train)
train_df = train_df.drop(['authors'], axis=1)
train_df['venue'] = train_df['venue'].replace('', 465)
for i in range(len(train_df)):
    title_list.append(train_df['title'][i])
    abstract_list.append(train_df['abstract'][i])
    word_list.append(train_df['title'][i])
    word_list.append(train_df['abstract'][i])

    
test_df = pd.DataFrame.from_dict(raw_test)
test_df['venue'] = test_df['venue'].replace('', 465)
for i in range(len(test_df)):
    title_list.append(test_df['title'][i])
    abstract_list.append(test_df['abstract'][i])
    word_list.append(test_df['title'][i])
    word_list.append(test_df['abstract'][i])

train_df.shape

(25793, 6)

In [1358]:
print(title_list[1:20])

[[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3522, 2223, 1653], [2085, 1719, 1846, 1745, 2243, 1553, 1606, 1596, 42, 41, 1606, 1665, 40, 1615, 1677, 40, 127, 43, 140, 50, 1583, 43], [40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 56, 1687, 1644, 1546, 46, 1624, 1547, 4226], [38, 1592, 2088, 1543, 1574, 1727, 1597, 1813, 1926, 1527, 1623, 1621, 50, 1620, 1632], [46, 1617, 1667, 3979, 2073, 37, 53, 2080, 1545, 40, 1804, 1530, 2587, 57, 4624], [34, 3646, 2073, 2035, 2346, 1886, 1543, 57, 1627, 11, 53, 1584, 1903, 3628, 1724], [1615, 1966, 11, 3495, 1656, 4345, 24, 2353, 1826, 2156, 3781, 4692], [3591, 4914, 46, 2421, 1608, 37, 1740, 1825, 1549, 57, 45, 2303, 1539, 2154, 51], [1560, 1694, 11, 1546, 11, 3066, 1728, 47, 1603, 1553, 11, 1546, 11, 1594, 1531, 1532, 1547], [1751, 44, 3474, 1854, 1872, 1538, 24, 2574, 52, 1918, 57, 1527, 46, 1528, 1727, 1525, 2149], [47, 1570, 40, 1733, 1735, 1540, 1525, 1535, 1540, 1863, 1543, 47, 1574, 1541, 1854, 1549, 2796, 1854, 53, 1540, 1537, 4454, 2062, 15

In [1361]:
from gensim.models import Word2Vec, Doc2Vec
titlemodel = Word2Vec(title_list, min_count=1, vector_size=128)
input_size = 128
word_vectors = titlemodel.wv
keyword_list = []
for i in range(len(train_df)):
    title_i = title_list[i]
    keyword_vec = np.zeros(input_size)
    for word in title_i:
        keyword_vec += word_vectors[word]
    keyword_vec = keyword_vec/len(title_i)
    keyword_list.append(keyword_vec)
keyword_list = np.array(keyword_list)



#wordmodel = Word2Vec(word_list, min_count=1)
#wordmodel.save("word_word2vec.model")

In [1383]:
abstractmodel = Word2Vec(abstract_list, min_count=1, vector_size=128)
input_size = 128
word_vectors2 = abstractmodel.wv
keyword_list2 = []
for i in range(len(train_df)):
    abstract_i = abstract_list[i]
    keyword_vec = np.zeros(input_size)
    for word in abstract_i:
        keyword_vec += word_vectors2[word]
    keyword_vec = keyword_vec/len(abstract_i)
    keyword_list2.append(keyword_vec)
keyword_list2 = np.array(keyword_list2)

In [1384]:
keywords = keyword_list + keyword_list2
print(keywords)

[[-0.48349956 -0.51170217  0.11739261 ...  0.60842679 -0.13248341
   0.71195911]
 [-0.42123375 -0.51167642  0.72696966 ... -0.17041562  0.27131101
   0.20101655]
 [ 0.33492874 -0.41464009  0.30027379 ...  0.27510964 -0.09162126
   0.18025668]
 ...
 [ 0.08501742 -0.39033312  0.44491885 ... -0.27592777  0.33888846
   0.23611297]
 [-0.60112178 -0.60781407  0.48994653 ...  0.24793403  0.22349386
   0.41398034]
 [ 0.08621521 -0.3738243   0.1974229  ...  0.33539617  0.09805431
   0.30733421]]


In [1040]:
train_df_prolific = train_df[train_df['prolific_authors'] != -1]
train_df_noprolific = train_df[train_df['prolific_authors'] == -1]
train_df_combine = pd.concat([train_df_prolific, train_df_noprolific.tail(3000)], axis=0)
train_df_combine = train_df_combine.reset_index(drop=True)
train_df_combine.head(3)

Unnamed: 0,year,abstract,venue,title,coauthors,prolific_authors
0,9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1...",[13720],"[42, 36]"
1,15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...","[1359, 15881]",[45]
2,10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5...",[],[97]


In [1022]:
test_df.head(3)

Unnamed: 0,identifier,coauthors,year,abstract,venue,title
0,0,"[16336, 1762, 4357, 12564]",19,"[37, 1662, 3207, 10, 33, 2037, 1738, 1642, 155...",223,"[3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,..."
1,1,"[21189, 14088]",19,"[1731, 2130, 3674, 1705, 1656, 3077, 1546, 367...",223,"[40, 1560, 1536, 1544, 1609, 1705, 1658, 1543,..."
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",19,"[1551, 1728, 3920, 1542, 1535, 1656, 1543, 153...",7,"[47, 1574, 1729, 1641, 11, 37, 2533, 2015, 47,..."


In [1147]:
class AuthorDataset(Dataset):

    def __init__(self, dataframe, istrain):
        self.data = dataframe
        self.x = dataframe[['year', 'venue', 'coauthors', 'abstract', 'title']]
        self.istrain = istrain
        if self.istrain == True:
            self.y = self.data.prolific_authors

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        year = self.data.year[index]
        venue = self.data.venue[index]
        title = self.data.title[index]
        
        # word2vec on abstract
        #abstractmodel = Word2Vec.load("abstract_word2vec.model")
        abstract = self.data.abstract[index]
        #abstract_vector = abstractmodel.wv[abstract]
        
        # coauthors to one hot
        coauthors = self.data.coauthors[index]
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if coauthors == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in coauthors:
                coauthor_list[coauthor-n_prolific] = 1
                
        x_output = {"title": title, "abstract": abstract, "year": year, "venue": venue, "coauthors": coauthor_list}
        
        # target to one hot
        if self.istrain == True:
            prolific_list = [0] * n_prolific
            if self.y[index] != -1:
                for prolific in self.y[index]:
                    prolific_list[prolific] = 1
            y_output = prolific_list
            return x_output, y_output
        else:
            return x_output

In [1184]:
training_df = AuthorDataset(train_df, istrain = True)
testing_df = AuthorDataset(test_df, istrain = False)

In [1185]:
def my_collate(batch):
    # for training set
    if len(batch[0]) == 2:
        output = {"title": [], "abstract": [], "year": [], "venue": [], "coauthors": [], "target": []}
        
        for data in batch:
            x = data[0]
            target = data[1]
            
            output['title'] += [torch.tensor(x['title'], dtype=torch.long)]
            output['abstract'] += [torch.tensor(x['abstract'], dtype=torch.long)]
            output['year'] += [x['year']]
            output['venue'] += [x['venue']]
            output['coauthors'] += [torch.tensor(x['coauthors'], dtype=torch.long)]
            output['target'] += [target]
            
        output['year'] = torch.tensor(output['year'], dtype=torch.long)
        output['venue'] = torch.tensor(output['venue'], dtype=torch.long)
        output['target'] = torch.tensor(output['target'], dtype=torch.float)
        return output
    
    # for testing set
    else:
        output = {"title": [], "abstract": [], "year": [], "venue": [], "coauthors": []}
        
        for data in batch:
            output['title'] += [torch.tensor(data['title'], dtype=torch.long)]
            output['abstract'] += [torch.tensor(data['abstract'], dtype=torch.long)]
            output['year'] += [data['year']]
            output['venue'] += [data['venue']]
            output['coauthors'] += [torch.tensor(data['coauthors'], dtype=torch.long)]
            
        output['year'] = torch.tensor(output['year'], dtype=torch.long)
        output['venue'] = torch.tensor(output['venue'], dtype=torch.long)
        return output


In [1388]:
train_dataloader = DataLoader(dataset = training_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
test_dataloader = DataLoader(dataset = testing_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
dataiter = next(iter(train_dataloader))

In [1385]:
class LSTM(nn.Module):

    def __init__(self, input_size, embed_dim, hidden_dim, num_layers):
        super(LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.weights = torch.FloatTensor(keywords)
        self.embedding = nn.Embedding.from_pretrained(self.weights)
        self.embedding.requires_grad = False
        self.embedding = nn.Embedding(n_text+1, embed_dim)
        
        self.linear1 = nn.Linear(input_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 100)
        self.activation=nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        #self.lstm = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        #self.fc = nn.Linear(hidden_dim, 100)
        #self.sigmoid = nn.Sigmoid()

    def forward(self, _input):
        titles = _input['title']
        word2vec_title_list = []
        for curr_title in titles:
            title_vec = self.embedding(curr_title)
            word2vec_title_list.append(title_vec.mean(dim = 0))
        embed_title = torch.stack(word2vec_title_list)
        
        #out = embed_title.reshape([embed_title.shape[0],1,embed_title.shape[1]])
        
        
        #embed_abstract_list = []
        #for curr_abstract in x['abstract']:
        #    embed_abstract_list.append(curr_abstract.mean(dim = 0))
        #embed_abstract = torch.stack(embed_abstract_list) # torch.Size([4, 100])
        #print(pad_sequence(x['title']).size())
        #print(pad_sequence(x['abstract']).size())
        #out = pad_sequence(_input['abstract'], batch_first=True)
        #print(out.size())
        #out = torch.cat((pad_sequence(x['title']), pad_sequence(x['abstract'])), dim = 0)
        #print(out)
        
        #h0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        #c0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        
        # Forward propagate LSTM
        #out, (hn, cn) = self.lstm(out, (h0, c0))  
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        #out = self.fc(out[:, -1, :]) 
        #out = self.sigmoid(out)
        #return out
        logits=self.linear1(embed_title)
        logits=self.activation(logits)
        logits=self.linear2(logits)
        return torch.sigmoid(logits)
        

In [1386]:
clf = LSTM(64, 64, 128, 1)
print(clf(dataiter))
predictions = np.where(clf(dataiter).detach().numpy()>=0.5, 1, 0)
predictions

tensor([[0.4762, 0.4931, 0.4964,  ..., 0.5286, 0.4926, 0.5050],
        [0.4749, 0.4929, 0.5263,  ..., 0.5328, 0.4996, 0.5165],
        [0.4822, 0.4710, 0.5092,  ..., 0.5133, 0.5115, 0.5122],
        ...,
        [0.4887, 0.4891, 0.5316,  ..., 0.5226, 0.5045, 0.5177],
        [0.4652, 0.4945, 0.5133,  ..., 0.4973, 0.4881, 0.5011],
        [0.4824, 0.4771, 0.5178,  ..., 0.5319, 0.4966, 0.5232]],
       grad_fn=<SigmoidBackward0>)


array([[0, 0, 0, ..., 1, 0, 1],
       [0, 0, 1, ..., 1, 0, 1],
       [0, 0, 1, ..., 1, 1, 1],
       ...,
       [0, 0, 1, ..., 1, 1, 1],
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 1, 0, 1]])

In [1387]:
# loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(),lr=0.1)

# training loop
n_total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        #batch = batch.to(device)

        # forward
        #print(batch)
        outputs = clf(batch)
        
        loss = criterion(outputs, batch['target'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        #print(outputs.detach().numpy())
        #break
        f1_acc = f1_score(batch['target'].detach().numpy(), predictions, average="samples", zero_division=1)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #if (i + 1) % 1000 == 0:
    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}, training f1 score = {f1_acc:.4f}')
            
            

epoch 1 / 100, loss = 0.0313, training f1 score = 0.7391
epoch 2 / 100, loss = 0.0226, training f1 score = 0.6957
epoch 3 / 100, loss = 0.0249, training f1 score = 0.7826


KeyboardInterrupt: 

In [1378]:
# testing
test_preds = []
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        outputs = clf(batch)
        
        predictions = np.where(outputs.detach().numpy()>0.5, 1, 0)
        test_preds.append(predictions)

In [1379]:
test_preds[20][21]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [1370]:
def find(lst, num):
    result = []
    for i, x in enumerate(lst):
        if x==num:
            result.append(i)
    return result

In [1371]:
NN_result = test_df[['identifier']]
NN_result.loc[:,'Predict'] = ''
final_result = []
for i in range(len(test_preds)):
    for j in range(batch_size):
        final_result.append(test_preds[i][j])
print(len(final_result))
for i in range(len(final_result)):
    result = final_result[i]
    if len(find(list(result), 1)) == 0:
        NN_result.loc[i,'Predict'] = -1
    else:
        NN_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
NN_result = NN_result.rename(columns={'identifier':'ID'})
NN_result.to_csv('./NN_results.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


IndexError: index 20 is out of bounds for axis 0 with size 20

In [1267]:
NN_result

Unnamed: 0,identifier,Predict
0,0,
1,1,
2,2,
3,3,
4,4,
...,...,...
795,795,
796,796,
797,797,
798,798,
