In [26]:
import pandas as pd
import numpy as np
import json
import collections
from sklearn.model_selection import train_test_split
from skmultilearn.adapt import MLkNN
from sklearn.metrics import f1_score

In [2]:
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_years = 19
n_venues = 464

In [97]:
# read train json file
train_filename = './data/train.json'
with open(train_filename, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
test_filename = './data/test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# get a copy
train = raw_train.copy()
test = raw_test.copy()

# extract coauthors as a new key from train.json
for i in range(len(train)):
    coauthors = []
    prolific_authors = []
    for auth in train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    train[i]['coauthors'] = coauthors
    #if len(prolific_authors) == 0:
        #prolific_authors.append(-1)
    train[i]['prolific_authors'] = prolific_authors

train_df = pd.DataFrame.from_dict(train)
train_df = train_df.drop(['authors'], axis=1)
test_df = pd.DataFrame.from_dict(test)

In [98]:
def combine_features(df, have_prolific):
    features = []
    targets = []
    for i in range(df.shape[0]):
        # abstract and title
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        current_row = df.loc[i]
        abstract_freq = collections.Counter(current_row['abstract'])
        title_freq = collections.Counter(current_row['title'])
        for key, value in dict(abstract_freq).items():
            abstract_list[key-1] = value
        for key, value in dict(title_freq).items():
            title_list[key-1] = value
        # year
        year_list = [0] * n_years
        year_list[current_row['year']-1] = 1
        # venue
        venue_list = [0] * (n_venues + 2) # 466 elements with the last element for empty venue
        if current_row['venue'] == '':
            venue_list[-1] = 1
        else:
            venue_list[current_row['venue']] = 1
        # coauthors
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if current_row['coauthors'] == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in current_row['coauthors']:
                coauthor_list[coauthor-n_prolific] = 1
           
        combined_features = abstract_list + title_list + year_list + venue_list + coauthor_list
        features.append(np.array(combined_features))
        
        if have_prolific == True:
            # prolific authors
            prolific_list = [0] * (n_prolific + 1) # 101 elements with the last element for empty coauthors
            if current_row['prolific_authors'] == []:
                prolific_list[-1] = 1
            else:
                for prolific in current_row['prolific_authors']:
                    prolific_list[prolific] = 1
            targets.append(np.array(prolific_list))
    if have_prolific == True:
        X_train = np.vstack(features)
        y_train = np.array(targets)
        return X_train, y_train
    else:
        X_test = np.vstack(features)
        return X_test

In [120]:
X_train, y_train = combine_features(train_df, have_prolific=True)
X_test = combine_features(test_df, have_prolific=False)
X_train.shape

(25793, 31630)

In [121]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.24, random_state = 0)

In [110]:
mlknn_clf = MLkNN(k = 10, s = 0.3)

# train
mlknn_clf.fit(X_train, y_train)

# predict
mlknn_pred = mlknn_clf.predict(X_val)
mlknn_acc = f1_score(mlknn_pred, y_val, average = "samples")
print(f"f1 score under MLKNN classifier is:{mlknn_acc}")



f1 score under MLKNN classifier is:0.6542169508733876


In [111]:
# predict on test set
mlknn_pred_test = mlknn_clf.predict(X_test)

In [112]:
def find(lst, num):
    result = []
    for i, x in enumerate(lst):
        if x==num:
            result.append(i)
    return result

In [113]:
mlknn_result = test_df[['identifier']]
mlknn_result.loc[:,'Predict'] = ''

for i in range(len(mlknn_pred_test.toarray())):
    result = mlknn_pred_test.toarray()[i]
    if result[-1] == 1 or len(find(list(result), 1)) == 0:
        mlknn_result.loc[i,'Predict'] = -1
    else:
        mlknn_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
mlknn_result = mlknn_result.rename(columns={'identifier':'ID'})
mlknn_result.to_csv('./results.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [114]:
mlknn_result

Unnamed: 0,ID,Predict
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1
...,...,...
795,795,-1
796,796,-1
797,797,-1
798,798,-1


## Nerual Network

In [122]:
import torch
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

In [248]:
batch_size = 40
hidden_dim = 200
output_dim = 101
learning_rate = 0.001
num_epochs = 100

In [271]:
def combine_features_torch(df, have_prolific):
    features = []
    targets = []
    for i in range(df.shape[0]):
        # abstract and title
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        current_row = df.loc[i]
        abstract_freq = collections.Counter(current_row['abstract'])
        title_freq = collections.Counter(current_row['title'])
        for key, value in dict(abstract_freq).items():
            abstract_list[key-1] = value
        for key, value in dict(title_freq).items():
            title_list[key-1] = value
        # year
        year_list = [0] * n_years
        year_list[current_row['year']-1] = 1
        # venue
        venue_list = [0] * (n_venues + 2) # 466 elements with the last element for empty venue
        if current_row['venue'] == '':
            venue_list[-1] = 1
        else:
            venue_list[current_row['venue']] = 1
        # coauthors
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if current_row['coauthors'] == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in current_row['coauthors']:
                coauthor_list[coauthor-n_prolific] = 1
           
        combined_features = title_list + coauthor_list + abstract_list
        features.append(np.array(combined_features))
        
        if have_prolific == True:
            # prolific authors
            prolific_list = [0] * (n_prolific + 1) # 101 elements with the last element for empty coauthors
            if current_row['prolific_authors'] == []:
                prolific_list[-1] = 1
            else:
                for prolific in current_row['prolific_authors']:
                    prolific_list[prolific] = 1
            targets.append(np.array(prolific_list))
    if have_prolific == True:
        X_train = np.vstack(features)
        y_train = np.array(targets)
        return X_train, y_train
    else:
        X_test = np.vstack(features)
        return X_test

In [272]:
class AuthorDataset(Dataset):

    def __init__(self, X, y, istrain):
        self.X = X
        self.istrain = istrain
        if self.istrain == True:
            self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        if self.istrain == True:
            return self.X[index], self.y[index]
        else:
            return self.X[index]

In [273]:
X_train, y_train = combine_features_torch(train_df, have_prolific=True)
X_test = combine_features_torch(test_df, have_prolific=False)
training_df = AuthorDataset(X_train, y_train, istrain = True)
#validation_df = AuthorDataset(X_val, y_val, istrain = False)
testing_df = AuthorDataset(X_test, y_train, istrain = False)

In [274]:
def my_collate(batch):
    # for training set
    if len(batch[0]) == 2:
        output = {"x": [], "target": []}
        
        for data in batch:
            x = data[0]
            target = data[1]
            output['x'] += [x]
            output['target'] += [target]
            
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        output['target'] = torch.tensor(output['target'], dtype=torch.float)
        return output
    
    # for testing set
    else:
        output = {"x": []}
        for data in batch:
            output['x'] += [data]
        output['x'] = torch.tensor(output['x'], dtype=torch.float)
        return output

In [275]:
train_dataloader = DataLoader(dataset = training_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
val_dataloader = DataLoader(dataset = validation_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
test_dataloader = DataLoader(dataset = testing_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
dataiter = next(iter(val_dataloader))
dataiter

{'x': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]])}

In [276]:
class MultilabelModel(torch.nn.Module):

    def __init__(self, input_dim, output_dim):
        super(MultilabelModel, self).__init__()

        self.linear1 = torch.nn.Linear(input_dim, 128)
        self.Sigmoid = torch.nn.Sigmoid()
        self.linear2 = torch.nn.Linear(128, output_dim)

    def forward(self, x):
        
        output = self.linear1(x)
        output = self.linear2(output)
        output = self.Sigmoid(output)
        return output

In [None]:
clf = MultilabelModel(31145, output_dim)
# loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(),lr=learning_rate)

# training loop
n_total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        #batch = batch.to(device)

        # forward
        outputs = clf(batch['x'])
        loss = criterion(outputs, batch['target'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        #print(predictions)
        #break
        f1_acc = f1_score(batch['target'].detach().numpy(), predictions, average="samples", zero_division=1)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #if (i + 1) % 100 == 0:
    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}, training f1 score = {f1_acc:.4f}')

epoch 1 / 100, loss = 0.0331, training f1 score = 0.6364
epoch 2 / 100, loss = 0.0126, training f1 score = 0.8081
epoch 3 / 100, loss = 0.0126, training f1 score = 0.8485
epoch 4 / 100, loss = 0.0050, training f1 score = 0.9596
epoch 5 / 100, loss = 0.0016, training f1 score = 0.9596
epoch 6 / 100, loss = 0.0036, training f1 score = 0.9495
epoch 7 / 100, loss = 0.0030, training f1 score = 0.9293
epoch 8 / 100, loss = 0.0025, training f1 score = 0.8990
epoch 9 / 100, loss = 0.0032, training f1 score = 0.9232
epoch 10 / 100, loss = 0.0019, training f1 score = 0.9697
epoch 11 / 100, loss = 0.0015, training f1 score = 0.9899
epoch 12 / 100, loss = 0.0024, training f1 score = 0.9293
epoch 13 / 100, loss = 0.0009, training f1 score = 0.9596
epoch 14 / 100, loss = 0.0007, training f1 score = 0.9697
epoch 15 / 100, loss = 0.0006, training f1 score = 0.9697
epoch 16 / 100, loss = 0.0009, training f1 score = 0.9798
epoch 17 / 100, loss = 0.0008, training f1 score = 0.9899
epoch 18 / 100, loss = 

In [252]:
# testing
test_preds = []
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        outputs = clf(batch['x'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        test_preds.append(predictions)

In [253]:
test_preds[1][3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [254]:
test_final_result = []
for i in range(len(test_preds)):
    for j in range(4):
        test_final_result.append(test_preds[i][j])

In [255]:
NN_result = test_df[['identifier']]
NN_result.loc[:,'Predict'] = ''
final_result = []
for i in range(len(test_preds)):
    for j in range(batch_size):
        final_result.append(test_preds[i][j])
print(len(final_result))
for i in range(len(final_result)):
    result = final_result[i]
    if result[-1] == 1 or len(find(list(result), 1)) == 0:
        NN_result.loc[i,'Predict'] = -1
    else:
        NN_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
NN_result = NN_result.rename(columns={'identifier':'ID'})
NN_result.to_csv('./NN_results.csv', index=False)

800


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [256]:
NN_result

Unnamed: 0,ID,Predict
0,0,-1
1,1,-1
2,2,-1
3,3,23
4,4,-1
...,...,...
795,795,-1
796,796,97
797,797,-1
798,798,-1
