# Alternative Approaches
This notebook is for other possible combinations of one-hot encoding. For example, coauthor+title, coauthor+abstract, or coauthor+title+abstract.

In [26]:
import pandas as pd
import numpy as np
import json
import collections
from sklearn.model_selection import train_test_split
from skmultilearn.adapt import MLkNN
from sklearn.metrics import f1_score

In [2]:
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_years = 19
n_venues = 464

In [97]:
# read train json file
train_filename = './data/train.json'
with open(train_filename, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
test_filename = './data/test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# get a copy
train = raw_train.copy()
test = raw_test.copy()

# extract coauthors as a new key from train.json
for i in range(len(train)):
    coauthors = []
    prolific_authors = []
    for auth in train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    train[i]['coauthors'] = coauthors
    #if len(prolific_authors) == 0:
        #prolific_authors.append(-1)
    train[i]['prolific_authors'] = prolific_authors

train_df = pd.DataFrame.from_dict(train)
train_df = train_df.drop(['authors'], axis=1)
test_df = pd.DataFrame.from_dict(test)

## Nerual Network

In [122]:
import torch
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

In [248]:
batch_size = 40
hidden_dim = 200
output_dim = 101
learning_rate = 0.001
num_epochs = 100

In [330]:
def combine_features_torch(df, have_prolific):
    features = []
    targets = []
    for i in range(df.shape[0]):
        # abstract and title
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        current_row = df.loc[i]
        abstract_freq = collections.Counter(current_row['abstract'])
        title_freq = collections.Counter(current_row['title'])
        for key, value in dict(abstract_freq).items():
            abstract_list[key-1] = value
        for key, value in dict(title_freq).items():
            title_list[key-1] = value
        # year
        year_list = [0] * n_years
        year_list[current_row['year']-1] = 1
        # venue
        venue_list = [0] * (n_venues + 2) # 466 elements with the last element for empty venue
        if current_row['venue'] == '':
            venue_list[-1] = 1
        else:
            venue_list[current_row['venue']] = 1
        # coauthors
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if current_row['coauthors'] == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in current_row['coauthors']:
                coauthor_list[coauthor-n_prolific] = 1
           
        combined_features = coauthor_list + title_list
        features.append(np.array(combined_features))
        
        if have_prolific == True:
            # prolific authors
            prolific_list = [0] * (n_prolific + 1) # 101 elements with the last element for empty coauthors
            if current_row['prolific_authors'] == []:
                prolific_list[-1] = 1
            else:
                for prolific in current_row['prolific_authors']:
                    prolific_list[prolific] = 1
            targets.append(np.array(prolific_list))
    if have_prolific == True:
        X_train = np.vstack(features)
        y_train = np.array(targets)
        return X_train, y_train
    else:
        X_test = np.vstack(features)
        return X_test

In [331]:
class AuthorDataset(Dataset):

    def __init__(self, X, y, istrain):
        self.X = X
        self.istrain = istrain
        if self.istrain == True:
            self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        if self.istrain == True:
            return self.X[index], self.y[index]
        else:
            return self.X[index]

In [332]:
X_train, y_train = combine_features_torch(train_df, have_prolific=True)
X_test = combine_features_torch(test_df, have_prolific=False)
training_df = AuthorDataset(X_train, y_train, istrain = True)
testing_df = AuthorDataset(X_test, y_train, istrain = False)

In [333]:
def my_collate(batch):
    # for training set
    if len(batch[0]) == 2:
        output = {"x": [], "target": []}
        
        for data in batch:
            x = data[0]
            target = data[1]
            output['x'] += [x]
            output['target'] += [target]
            
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        output['target'] = torch.tensor(output['target'], dtype=torch.float)
        return output
    
    # for testing set
    else:
        output = {"x": []}
        for data in batch:
            output['x'] += [data]
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        return output

In [334]:
train_dataloader = DataLoader(dataset = training_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
val_dataloader = DataLoader(dataset = validation_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
test_dataloader = DataLoader(dataset = testing_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
dataiter = next(iter(train_dataloader))
dataiter

{'x': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'target': tensor([[0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.]])}

In [335]:
class MultilabelModel(torch.nn.Module):

    def __init__(self, input_dim, output_dim):
        super(MultilabelModel, self).__init__()

        self.linear1 = torch.nn.Linear(input_dim, 128)
        self.Sigmoid = torch.nn.Sigmoid()
        self.linear2 = torch.nn.Linear(128, output_dim)

    def forward(self, x):
        
        output = self.linear1(x)
        output = self.linear2(output)
        output = self.Sigmoid(output)
        return output

In [337]:
clf = MultilabelModel(21147, output_dim)
print(clf(dataiter['x']))

tensor([[0.5190, 0.5173, 0.4898,  ..., 0.5142, 0.4920, 0.4915],
        [0.5186, 0.5184, 0.4919,  ..., 0.5156, 0.4920, 0.4909],
        [0.5199, 0.5206, 0.4938,  ..., 0.5144, 0.4908, 0.4923],
        ...,
        [0.5196, 0.5190, 0.4886,  ..., 0.5163, 0.4912, 0.4900],
        [0.5205, 0.5187, 0.4918,  ..., 0.5156, 0.4915, 0.4930],
        [0.5201, 0.5189, 0.4915,  ..., 0.5164, 0.4920, 0.4912]],
       grad_fn=<SigmoidBackward0>)


In [338]:
class LSTM(torch.nn.Module):

    def __init__(self, input_size, embed_dim, hidden_dim, num_layers):
        super(LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = torch.nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, 101)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        
        out = x.reshape([x.shape[0],1,x.shape[1]])
        h0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        c0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        
        # Forward propagate LSTM
        out, (hn, cn) = self.lstm(out, (h0, c0))  
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = self.fc(out[:, -1, :]) 
        out = self.sigmoid(out)
        return out

In [340]:
clf = LSTM(21147, 64, 128, 1)
print(clf(dataiter['x']))

tensor([[0.4947, 0.5058, 0.4799,  ..., 0.5136, 0.4823, 0.5107],
        [0.4965, 0.5014, 0.4843,  ..., 0.5103, 0.4834, 0.5148],
        [0.5087, 0.5022, 0.4805,  ..., 0.5095, 0.4829, 0.5174],
        ...,
        [0.4999, 0.5150, 0.4762,  ..., 0.5089, 0.4848, 0.5166],
        [0.5069, 0.5051, 0.4810,  ..., 0.5079, 0.4850, 0.5123],
        [0.4981, 0.5034, 0.4786,  ..., 0.5089, 0.4820, 0.5131]],
       grad_fn=<SigmoidBackward0>)


In [341]:
# loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(),lr=learning_rate)

# training loop
n_total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        #batch = batch.to(device)

        # forward
        outputs = clf(batch['x'])
        loss = criterion(outputs, batch['target'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        f1_acc = f1_score(batch['target'].detach().numpy(), predictions, average="samples", zero_division=1)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #if (i + 1) % 100 == 0:
    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}, training f1 score = {f1_acc:.4f}')

epoch 1 / 100, loss = 0.0368, training f1 score = 0.5152
epoch 2 / 100, loss = 0.0211, training f1 score = 0.7576
epoch 3 / 100, loss = 0.0144, training f1 score = 0.8788
epoch 4 / 100, loss = 0.0217, training f1 score = 0.6970
epoch 5 / 100, loss = 0.0134, training f1 score = 0.7576
epoch 6 / 100, loss = 0.0118, training f1 score = 0.7879
epoch 7 / 100, loss = 0.0115, training f1 score = 0.8182
epoch 8 / 100, loss = 0.0114, training f1 score = 0.7576
epoch 9 / 100, loss = 0.0083, training f1 score = 0.8182
epoch 10 / 100, loss = 0.0118, training f1 score = 0.8081
epoch 11 / 100, loss = 0.0059, training f1 score = 0.8990
epoch 12 / 100, loss = 0.0078, training f1 score = 0.8788
epoch 13 / 100, loss = 0.0018, training f1 score = 0.9394
epoch 14 / 100, loss = 0.0011, training f1 score = 0.9899
epoch 15 / 100, loss = 0.0060, training f1 score = 0.9293
epoch 16 / 100, loss = 0.0027, training f1 score = 0.9596
epoch 17 / 100, loss = 0.0035, training f1 score = 0.9394
epoch 18 / 100, loss = 

In [342]:
# testing
test_preds = []
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        outputs = clf(batch['x'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        test_preds.append(predictions)

In [343]:
len(test_preds[1][39])

101

In [344]:
def find(lst, num):
    result = []
    for i, x in enumerate(lst):
        if x==num:
            result.append(i)
    return result

In [345]:
test_final_result = []
for i in range(len(test_preds)):
    for j in range(4):
        test_final_result.append(test_preds[i][j])

In [346]:
NN_result = test_df[['identifier']]
NN_result.loc[:,'Predict'] = ''
final_result = []
for i in range(len(test_preds)):
    for j in range(batch_size):
        final_result.append(test_preds[i][j])
print(len(final_result))
for i in range(len(final_result)):
    result = final_result[i]
    if result[-1] == 1 or len(find(list(result), 1)) == 0:
        NN_result.loc[i,'Predict'] = -1
    else:
        NN_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
NN_result = NN_result.rename(columns={'identifier':'ID'})
NN_result.to_csv('./NN_results.csv', index=False)

800


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [347]:
NN_result

Unnamed: 0,ID,Predict
0,0,92
1,1,-1
2,2,31
3,3,-1
4,4,-1
...,...,...
795,795,54
796,796,97
797,797,13 71
798,798,71
