# Neural Network

In [1]:
import json
import collections
import torch
import math
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# set numbers that will used
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_years = 19
n_venues = 464
batch_size = 40
hidden_dim = 200
output_dim = 101
learning_rate = 0.001
num_epochs = 100

### Load Data 

In [3]:
#  data processing methods

# get prolific authors(0-99) from the author list
def retain_prolific(author_list: list):
    return [x for x in author_list if x < 100]

# get coauthors(>99) from the author list
def get_coauthor(author_list: list):
    return [x for x in author_list if x >= 100]

In [4]:
# read train json file
train_filename = 'train.json'
with open(train_filename, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
test_filename = 'test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# get a copy
train = raw_train.copy()
test = raw_test.copy()

# extract prolific authors and coauthors
data_df = pd.DataFrame.from_dict(train)
data_df['prolific'] = data_df.apply(lambda x: retain_prolific(x['authors']), axis=1)
data_df['coauthors'] = data_df.apply(lambda x: get_coauthor(x['authors']), axis=1)
data_df = data_df.drop(['authors'],axis=1)
test_df = pd.DataFrame.from_dict(test)

In [5]:
train_df, validation_df = train_test_split(data_df, test_size=0.05,random_state=42)
length = train_df['prolific'].str.len()

# number of papers without any prolific author
count_empty = (length == 0).sum()

# number of papers with prolific authors
count_non_emtpy = (length >0).sum()
print(count_empty)
print(count_non_emtpy)
train_without = train_df[train_df['prolific'].str.len() == 0]
train_with = train_df[train_df['prolific'].str.len() > 0]

# sample same amount of data and concat
train_df = pd.concat([train_with, train_without.sample(len(train_with))])

# reset index from 0
train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
validation_df = validation_df.iloc[10:]


17409
7094


In [6]:
# create validation set
validation_df = validation_df.reset_index(drop=True)
validating_prolifics = validation_df['prolific']
for element in validating_prolifics:
   if len(element) == 0:
    element.append(-1)
validation_df = validation_df.drop(['prolific'],axis=1)


### One-Hot Encoding 

In [7]:
def combine_features_torch(df, have_prolific):
    features = []
    targets = []
    for i in range(df.shape[0]):
        # abstract and title
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        current_row = df.loc[i]
        # count word frequency in title and abstract
        abstract_freq = collections.Counter(current_row['abstract'])
        title_freq = collections.Counter(current_row['title'])
        for key, value in dict(abstract_freq).items():
            abstract_list[key-1] = value
        for key, value in dict(title_freq).items():
            title_list[key-1] = value
        # year
        year_list = [0] * n_years
        year_list[current_row['year']-1] = 1
        # venue
        venue_list = [0] * (n_venues + 2) # 466 elements with the last element for empty venue
        if current_row['venue'] == '':
            venue_list[-1] = 1
        else:
            venue_list[current_row['venue']] = 1
        # coauthors
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if current_row['coauthors'] == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in current_row['coauthors']:
                coauthor_list[coauthor-n_prolific] = 1
           
        combined_features = coauthor_list
        features.append(np.array(combined_features))
        
        if have_prolific == True:
            # prolific authors
            prolific_list = [0] * (n_prolific + 1) # 101 elements with the last element for empty coauthors
            if current_row['prolific'] == []:
                prolific_list[-1] = 1
            else:
                for prolific in current_row['prolific']:
                    prolific_list[prolific] = 1
            targets.append(np.array(prolific_list))
    if have_prolific == True:
        X_train = np.vstack(features)
        y_train = np.array(targets)
        return X_train, y_train
    else:
        X_test = np.vstack(features)
        return X_test

In [8]:
class AuthorDataset(Dataset):

    def __init__(self, X, y, istrain):
        self.X = X
        self.istrain = istrain
        if self.istrain == True:
            self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        if self.istrain == True:
            return self.X[index], self.y[index]
        else:
            return self.X[index]

In [9]:
X_train, y_train = combine_features_torch(train_df, have_prolific=True)
X_validation = combine_features_torch(validation_df, have_prolific=False)
X_test = combine_features_torch(test_df, have_prolific=False)
training_df = AuthorDataset(X_train, y_train, istrain = True)
validating_df = AuthorDataset(X_validation, y_train, istrain=False)
testing_df = AuthorDataset(X_test, y_train, istrain = False)

In [10]:
# define collate function
def my_collate(batch):
    # for training set
    if len(batch[0]) == 2:
        output = {"x": [], "target": []}
        
        for data in batch:
            x = data[0]
            target = data[1]
            output['x'] += [x]
            output['target'] += [target]
            
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        output['target'] = torch.tensor(output['target'], dtype=torch.float)
        return output
    
    # for testing set
    else:
        output = {"x": []}
        for data in batch:
            output['x'] += [data]
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        return output

### Data Loader

In [11]:
train_dataloader = DataLoader(dataset = training_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
validation_dataloader = DataLoader(dataset = validating_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
test_dataloader = DataLoader(dataset = testing_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
dataiter = next(iter(test_dataloader))
dataiter

  output['x'] = torch.tensor(output['x'], dtype=torch.float)


{'x': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]])}

In [12]:
class MultilabelModel(torch.nn.Module):

    def __init__(self, input_dim, output_dim):
        super(MultilabelModel, self).__init__()

        self.linear1 = torch.nn.Linear(input_dim, 128)
        self.Sigmoid = torch.nn.Sigmoid()
        self.linear2 = torch.nn.Linear(128, output_dim)

    def forward(self, x):
        
        output = self.linear1(x)
        output = self.linear2(output)
        output = self.Sigmoid(output)
        return output

In [13]:
clf = MultilabelModel(21147, output_dim)
print(clf(dataiter['x']))

tensor([[0.5174, 0.5193, 0.4917,  ..., 0.5064, 0.4899, 0.4857],
        [0.5152, 0.5195, 0.4895,  ..., 0.5066, 0.4904, 0.4873],
        [0.5166, 0.5191, 0.4859,  ..., 0.5073, 0.4917, 0.4854],
        ...,
        [0.5147, 0.5185, 0.4886,  ..., 0.5070, 0.4905, 0.4868],
        [0.5170, 0.5195, 0.4890,  ..., 0.5065, 0.4885, 0.4851],
        [0.5157, 0.5190, 0.4888,  ..., 0.5066, 0.4909, 0.4871]],
       grad_fn=<SigmoidBackward0>)


In [14]:
class LSTM(torch.nn.Module):

    def __init__(self, input_size, embed_dim, hidden_dim, num_layers):
        super(LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = torch.nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, 101)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        
        out = x.reshape([x.shape[0],1,x.shape[1]])
        h0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim, device=out.device)
        c0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim, device=out.device)
        
        # Forward propagate LSTM
        out, (hn, cn) = self.lstm(out, (h0, c0))  
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = self.fc(out[:, -1, :]) 
        out = self.sigmoid(out)
        return out

In [15]:
clf = LSTM(21147, 64, 128, 1)
print(clf(dataiter['x']))

tensor([[0.4940, 0.5066, 0.5206,  ..., 0.5018, 0.4944, 0.5181],
        [0.4908, 0.5010, 0.5140,  ..., 0.4994, 0.4991, 0.5259],
        [0.4949, 0.5022, 0.5114,  ..., 0.5062, 0.4960, 0.5180],
        ...,
        [0.4982, 0.5026, 0.5132,  ..., 0.4971, 0.5015, 0.5190],
        [0.4954, 0.5106, 0.5176,  ..., 0.4992, 0.4975, 0.5236],
        [0.4987, 0.4915, 0.5164,  ..., 0.5033, 0.4988, 0.5225]],
       grad_fn=<SigmoidBackward0>)


### GPU Mode if Available

In [16]:
# If there's a GPU available...
if torch.cuda.is_available():

# Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [17]:
gpu_mode=torch.cuda.is_available()
if gpu_mode:
    clf.cuda()

### Model Training

In [18]:
# loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(),lr=learning_rate)

# training loop
n_total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        # if using GPU...
        if gpu_mode == True:
            gpu_batch = {}
            gpu_batch['x']=batch['x'].cuda()
            gpu_batch['target']=batch['target'].cuda()
            batch = gpu_batch

        # forward
        outputs = clf(batch['x'])
        loss = criterion(outputs, batch['target'])
        
        predictions = np.where(outputs.detach().cpu().numpy()>=0.5, 1, 0)
        f1_acc = f1_score(batch['target'].detach().cpu().numpy(), predictions, average="samples", zero_division=1)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #if (i + 1) % 100 == 0:
    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}, training f1 score = {f1_acc:.4f}')

epoch 1 / 100, loss = 0.0439, training f1 score = 0.0000
epoch 2 / 100, loss = 0.0370, training f1 score = 0.5000
epoch 3 / 100, loss = 0.0535, training f1 score = 0.4286
epoch 4 / 100, loss = 0.0451, training f1 score = 0.4286
epoch 5 / 100, loss = 0.0365, training f1 score = 0.4643
epoch 6 / 100, loss = 0.0343, training f1 score = 0.3929
epoch 7 / 100, loss = 0.0342, training f1 score = 0.3929
epoch 8 / 100, loss = 0.0247, training f1 score = 0.5714
epoch 9 / 100, loss = 0.0181, training f1 score = 0.6429
epoch 10 / 100, loss = 0.0234, training f1 score = 0.5738
epoch 11 / 100, loss = 0.0130, training f1 score = 0.7024
epoch 12 / 100, loss = 0.0099, training f1 score = 0.6429
epoch 13 / 100, loss = 0.0094, training f1 score = 0.6786
epoch 14 / 100, loss = 0.0088, training f1 score = 0.7381
epoch 15 / 100, loss = 0.0112, training f1 score = 0.8214
epoch 16 / 100, loss = 0.0087, training f1 score = 0.7810
epoch 17 / 100, loss = 0.0048, training f1 score = 0.8929
epoch 18 / 100, loss = 

### Model Prediction

In [19]:
# prediction
def generate_preds(dataloader):
    preds = []
    with torch.no_grad():
        for index, batch in enumerate(dataloader):
            if gpu_mode == True:
                gpu_batch = {}
                gpu_batch['x']=batch['x'].cuda()
                batch = gpu_batch
            outputs = clf(batch['x'])

            predictions = np.where(outputs.detach().cpu().numpy()>=0.5, 1, 0)
            preds.append(predictions)
    return preds

In [20]:
validation_preds = generate_preds(validation_dataloader)
test_preds = generate_preds(test_dataloader)

In [21]:
# generate dataframe of output
def prediction_df(predictions:list, isString: bool):
    identifiers = np.shape(predictions)[0]*np.shape(predictions)[1]
    NN_output = pd.DataFrame([*range(0,identifiers)], columns=['ID']) # column of ID
    #NN_output['Predict'] = ''  # column of predict      
    final_results = []

    for i in range(len(predictions)):
        for j in range(batch_size):
            final_results.append(predictions[i][j])
    predict = []
    for i in range(len(final_results)):
        
        result = final_results[i]
        if isString == True:
            if result[-1] == 1 or len(np.where(result == 1)[0]) == 0:  
                predict.append(-1)
            else:
                predict.append(' '.join(str(e) for e in np.where(result==1)[0]))
        else:
            if result[-1] == 1 or len(np.where(result == 1)[0]) == 0:  
                predict.append([-1])
                
            else:
                predict.append(list(np.where(result==1)[0]))
    NN_output['Predict'] = predict
    return NN_output

### Model Validation

In [22]:
# validation accuracy
validation_prediction = prediction_df(validation_preds, isString=False)['Predict']
comparison = (validation_prediction == validating_prolifics)
comparison.sum()/len(comparison)

0.50546875

### Output Result

In [23]:
NN_test_output = prediction_df(test_preds,isString=True)
NN_test_output

Unnamed: 0,ID,Predict
0,0,92
1,1,-1
2,2,31
3,3,-1
4,4,-1
...,...,...
795,795,54
796,796,97
797,797,13
798,798,71


In [24]:
NN_test_output.to_csv('./NN_split_results.csv', index=False)