In [26]:
import pandas as pd
import numpy as np
import json
import collections
from sklearn.model_selection import train_test_split
from skmultilearn.adapt import MLkNN
from sklearn.metrics import f1_score

In [2]:
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_years = 19
n_venues = 464

In [97]:
# read train json file
train_filename = './data/train.json'
with open(train_filename, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
test_filename = './data/test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# get a copy
train = raw_train.copy()
test = raw_test.copy()

# extract coauthors as a new key from train.json
for i in range(len(train)):
    coauthors = []
    prolific_authors = []
    for auth in train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    train[i]['coauthors'] = coauthors
    #if len(prolific_authors) == 0:
        #prolific_authors.append(-1)
    train[i]['prolific_authors'] = prolific_authors

train_df = pd.DataFrame.from_dict(train)
train_df = train_df.drop(['authors'], axis=1)
test_df = pd.DataFrame.from_dict(test)

In [98]:
def combine_features(df, have_prolific):
    features = []
    targets = []
    for i in range(df.shape[0]):
        # abstract and title
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        current_row = df.loc[i]
        abstract_freq = collections.Counter(current_row['abstract'])
        title_freq = collections.Counter(current_row['title'])
        for key, value in dict(abstract_freq).items():
            abstract_list[key-1] = value
        for key, value in dict(title_freq).items():
            title_list[key-1] = value
        # year
        year_list = [0] * n_years
        year_list[current_row['year']-1] = 1
        # venue
        venue_list = [0] * (n_venues + 2) # 466 elements with the last element for empty venue
        if current_row['venue'] == '':
            venue_list[-1] = 1
        else:
            venue_list[current_row['venue']] = 1
        # coauthors
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if current_row['coauthors'] == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in current_row['coauthors']:
                coauthor_list[coauthor-n_prolific] = 1
           
        combined_features = abstract_list + title_list + year_list + venue_list + coauthor_list
        features.append(np.array(combined_features))
        
        if have_prolific == True:
            # prolific authors
            prolific_list = [0] * (n_prolific + 1) # 101 elements with the last element for empty coauthors
            if current_row['prolific_authors'] == []:
                prolific_list[-1] = 1
            else:
                for prolific in current_row['prolific_authors']:
                    prolific_list[prolific] = 1
            targets.append(np.array(prolific_list))
    if have_prolific == True:
        X_train = np.vstack(features)
        y_train = np.array(targets)
        return X_train, y_train
    else:
        X_test = np.vstack(features)
        return X_test

In [99]:
X_train, y_train = combine_features(train_df, have_prolific=True)
X_test = combine_features(test_df, have_prolific=False)
X_train.shape

(25793, 31630)

In [100]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.24, random_state = 0)

In [110]:
mlknn_clf = MLkNN(k = 10, s = 0.3)

# train
mlknn_clf.fit(X_train, y_train)

# predict
mlknn_pred = mlknn_clf.predict(X_val)
mlknn_acc = f1_score(mlknn_pred, y_val, average = "samples")
print(f"f1 score under MLKNN classifier is:{mlknn_acc}")



f1 score under MLKNN classifier is:0.6542169508733876


In [111]:
# predict on test set
mlknn_pred_test = mlknn_clf.predict(X_test)

In [112]:
def find(lst, num):
    result = []
    for i, x in enumerate(lst):
        if x==num:
            result.append(i)
    return result

In [113]:
mlknn_result = test_df[['identifier']]
mlknn_result.loc[:,'Predict'] = ''

for i in range(len(mlknn_pred_test.toarray())):
    result = mlknn_pred_test.toarray()[i]
    if result[-1] == 1 or len(find(list(result), 1)) == 0:
        mlknn_result.loc[i,'Predict'] = -1
    else:
        mlknn_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
mlknn_result = mlknn_result.rename(columns={'identifier':'ID'})
mlknn_result.to_csv('./results.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [114]:
mlknn_result

Unnamed: 0,ID,Predict
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1
...,...,...
795,795,-1
796,796,-1
797,797,-1
798,798,-1
