In [3]:
import pandas as pd
import numpy as np
import json
import collections
from sklearn.model_selection import train_test_split
from skmultilearn.adapt import MLkNN
from sklearn.metrics import f1_score

In [4]:
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_years = 19
n_venues = 464

In [5]:
# read train json file
train_filename = './data/train.json'
with open(train_filename, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
test_filename = './data/test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# get a copy
train = raw_train.copy()
test = raw_test.copy()

# extract coauthors as a new key from train.json
for i in range(len(train)):
    coauthors = []
    prolific_authors = []
    for auth in train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    train[i]['coauthors'] = coauthors
    #if len(prolific_authors) == 0:
        #prolific_authors.append(-1)
    train[i]['prolific_authors'] = prolific_authors

train_df = pd.DataFrame.from_dict(train)
train_df = train_df.drop(['authors'], axis=1)
test_df = pd.DataFrame.from_dict(test)

## Nerual Network

In [6]:
import torch
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

In [7]:
batch_size = 40
hidden_dim = 200
output_dim = 101
learning_rate = 0.001
num_epochs = 100

In [8]:
def combine_features_torch(df, have_prolific):
    features = []
    targets = []
    for i in range(df.shape[0]):
        # abstract and title
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        current_row = df.loc[i]
        abstract_freq = collections.Counter(current_row['abstract'])
        title_freq = collections.Counter(current_row['title'])
        for key, value in dict(abstract_freq).items():
            abstract_list[key-1] = value
        for key, value in dict(title_freq).items():
            title_list[key-1] = value
        # year
        year_list = [0] * n_years
        year_list[current_row['year']-1] = 1
        # venue
        venue_list = [0] * (n_venues + 2) # 466 elements with the last element for empty venue
        if current_row['venue'] == '':
            venue_list[-1] = 1
        else:
            venue_list[current_row['venue']] = 1
        # coauthors
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if current_row['coauthors'] == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in current_row['coauthors']:
                coauthor_list[coauthor-n_prolific] = 1
           
        combined_features = title_list + coauthor_list
        features.append(np.array(combined_features))
        
        if have_prolific == True:
            # prolific authors
            prolific_list = [0] * (n_prolific + 1) # 101 elements with the last element for empty coauthors
            if current_row['prolific_authors'] == []:
                prolific_list[-1] = 1
            else:
                for prolific in current_row['prolific_authors']:
                    prolific_list[prolific] = 1
            targets.append(np.array(prolific_list))
    if have_prolific == True:
        X_train = np.vstack(features)
        y_train = np.array(targets)
        return X_train, y_train
    else:
        X_test = np.vstack(features)
        return X_test

In [9]:
class AuthorDataset(Dataset):

    def __init__(self, X, y, istrain):
        self.X = X
        self.istrain = istrain
        if self.istrain == True:
            self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        if self.istrain == True:
            return self.X[index], self.y[index]
        else:
            return self.X[index]

In [10]:
X_train, y_train = combine_features_torch(train_df, have_prolific=True)
X_test = combine_features_torch(test_df, have_prolific=False)
training_df = AuthorDataset(X_train, y_train, istrain = True)
testing_df = AuthorDataset(X_test, y_train, istrain = False)

In [11]:
def my_collate(batch):
    # for training set
    if len(batch[0]) == 2:
        output = {"x": [], "target": []}
        
        for data in batch:
            x = data[0]
            target = data[1]
            output['x'] += [x]
            output['target'] += [target]
            
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        output['target'] = torch.tensor(output['target'], dtype=torch.float)
        return output
    
    # for testing set
    else:
        output = {"x": []}
        for data in batch:
            output['x'] += [data]
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        return output

In [13]:
train_dataloader = DataLoader(dataset = training_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
#val_dataloader = DataLoader(dataset = validation_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
test_dataloader = DataLoader(dataset = testing_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
dataiter = next(iter(train_dataloader))
dataiter

  output['x'] = torch.tensor(output['x'], dtype=torch.float)


{'x': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'target': tensor([[0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.]])}

In [14]:
#class MultilabelModel(torch.nn.Module):
#
#    def __init__(self, input_dim, output_dim):
#        super(MultilabelModel, self).__init__()
#
#        self.linear1 = torch.nn.Linear(input_dim, 128)
#        self.Sigmoid = torch.nn.Sigmoid()
#        self.linear2 = torch.nn.Linear(128, output_dim)
#
#    def forward(self, x):
#        
#        output = self.linear1(x)
#        output = self.linear2(output)
#        output = self.Sigmoid(output)
#        return output

In [16]:
#clf = MultilabelModel(26146, output_dim)
#print(clf(dataiter['x']))

tensor([[0.5010, 0.4903, 0.4927,  ..., 0.5146, 0.5108, 0.4875],
        [0.4982, 0.4926, 0.4963,  ..., 0.5096, 0.5105, 0.4882],
        [0.4962, 0.4942, 0.4951,  ..., 0.5062, 0.5076, 0.4869],
        ...,
        [0.5000, 0.4911, 0.4916,  ..., 0.5132, 0.5087, 0.4865],
        [0.4930, 0.4922, 0.4956,  ..., 0.5104, 0.5096, 0.4833],
        [0.5013, 0.4897, 0.4945,  ..., 0.5062, 0.5109, 0.4850]],
       grad_fn=<SigmoidBackward0>)


In [327]:
class LSTM(torch.nn.Module):

    def __init__(self, input_size, embed_dim, hidden_dim, num_layers):
        super(LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = torch.nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, 101)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        
        out = x.reshape([x.shape[0],1,x.shape[1]])
        h0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        c0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        
        # Forward propagate LSTM
        out, (hn, cn) = self.lstm(out, (h0, c0))  
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = self.fc(out[:, -1, :]) 
        out = self.sigmoid(out)
        return out

In [328]:
clf = LSTM(26146, 64, 128, 1)
print(clf(dataiter['x']))

tensor([[0.4421, 0.4964, 0.4776,  ..., 0.5048, 0.5088, 0.5486],
        [0.4721, 0.5013, 0.5054,  ..., 0.5076, 0.5221, 0.5306],
        [0.5013, 0.4870, 0.4906,  ..., 0.5229, 0.4880, 0.5525],
        ...,
        [0.5169, 0.4927, 0.5296,  ..., 0.5313, 0.4994, 0.5138],
        [0.5356, 0.4562, 0.4910,  ..., 0.5017, 0.5111, 0.5173],
        [0.5174, 0.4782, 0.5048,  ..., 0.5377, 0.4981, 0.5026]],
       grad_fn=<SigmoidBackward0>)


In [17]:
# loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(),lr=learning_rate)

# training loop
n_total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        #batch = batch.to(device)

        # forward
        outputs = clf(batch['x'])
        loss = criterion(outputs, batch['target'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        f1_acc = f1_score(batch['target'].detach().numpy(), predictions, average="samples", zero_division=1)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #if (i + 1) % 100 == 0:
    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}, training f1 score = {f1_acc:.4f}')

KeyboardInterrupt: 

In [279]:
# testing
test_preds = []
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        outputs = clf(batch['x'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        test_preds.append(predictions)

In [287]:
len(test_preds[1][39])

101

In [284]:
test_final_result = []
for i in range(len(test_preds)):
    for j in range(4):
        test_final_result.append(test_preds[i][j])

In [285]:
NN_result = test_df[['identifier']]
NN_result.loc[:,'Predict'] = ''
final_result = []
for i in range(len(test_preds)):
    for j in range(batch_size):
        final_result.append(test_preds[i][j])
print(len(final_result))
for i in range(len(final_result)):
    result = final_result[i]
    if result[-1] == 1 or len(find(list(result), 1)) == 0:
        NN_result.loc[i,'Predict'] = -1
    else:
        NN_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
NN_result = NN_result.rename(columns={'identifier':'ID'})
NN_result.to_csv('./NN_results.csv', index=False)

800


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [286]:
NN_result

Unnamed: 0,ID,Predict
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1
...,...,...
795,795,54
796,796,-1
797,797,-1
798,798,-1
