In [34]:
import json
import numpy as np
import tensorflow as tf
from typing import *
from sklearn.model_selection import train_test_split

In [2]:
# Parse the training records from train.json in the working directory.
with open("train.json", "r") as train_file:
    train_data = json.loads(train_file.read())

In [3]:
# Parse the test records from test.json in the working directory.
with open("test.json", "r") as test_file:
    test_data = json.loads(test_file.read())

In [5]:
# Split every record's author list in two: ids inside range(100) go to
# "prolific_author", all other ids go to "coauthors". Each record is updated in place.
prolific_ids = range(100)
for paper in train_data:
    prolific, others = [], []
    for author in paper["authors"]:
        # one pass over the authors, original order kept inside each list
        (prolific if author in prolific_ids else others).append(author)
    paper["coauthors"] = others
    paper["prolific_author"] = prolific

In [6]:
# Show the first training record, including the "coauthors" and
# "prolific_author" keys added by the split loop.
print(train_data[0])

{'authors': [42, 13720, 36], 'year': 9, 'abstract': [2455, 1858, 2335, 1543, 1800, 1860, 2000, 2867, 1546, 1874, 2059, 1525, 2590, 4196, 12, 2634, 1543, 1800, 1586, 2866, 3595, 1866, 1670, 2000, 3743, 1542, 1650, 1527, 33, 4407, 1543, 1535, 1962, 1961, 1543, 33, 1700, 1543, 1535, 1647, 1546, 1580, 4720, 12, 1731, 4231, 2601, 1553, 1704, 1605, 2456, 1543, 3281, 1594, 4407, 2168, 1542, 1586, 3781, 2471, 1525, 1859, 1669, 2512, 4572, 1546, 1609, 3781, 2471, 1525, 3393, 12, 37, 1712, 1586, 4196, 1650, 1527, 3281, 1594, 4407, 1800, 4708, 1904, 2059, 2411, 12], 'venue': 20, 'title': [41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1539, 1715, 1553, 1541, 1536, 1532, 1872, 1538], 'coauthors': [13720], 'prolific_author': [42, 36]}


In [27]:
# using one-hot method
# Widths of the feature blocks; the first, fourth and fifth are used by one_hot_vector.
venues_num = 466
coauthors_num =21246
polific_author_num = 100
year_num = 20
words_num = 4999
def one_hot_vector(instance:Dict):
    """Encode one record as a single flat 0/1 feature vector.

    The returned array concatenates, in this order:
      * a venue indicator of length ``venues_num`` (an empty-string venue
        is placed in the last slot),
      * a year indicator of length ``year_num``,
      * a coauthor indicator of length ``coauthors_num``,
      * a title word indicator of length ``words_num``,
      * an abstract word indicator of length ``words_num``.
    Word ids are shifted down by one before being used as positions.
    """
    def _mark(size, positions):
        # zero vector of the given size with 1 at `positions` (an int or a list of ints)
        encoded = np.zeros(size)
        encoded[positions] = 1
        return encoded

    venue_id = instance["venue"]
    venue_slot = venues_num - 1 if venue_id == "" else venue_id

    blocks = [
        _mark(venues_num, venue_slot),
        _mark(year_num, instance["year"]),
        _mark(coauthors_num, instance["coauthors"]),
        _mark(words_num, [word - 1 for word in instance["title"]]),
        _mark(words_num, [word - 1 for word in instance["abstract"]]),
    ]
    return np.concatenate(blocks)

In [15]:
# multi_label binary vector
def binary_vector(instance):
    """Return the multi-label target row for one record.

    The row has ``polific_author_num`` entries: 1 at every index listed in
    ``instance["prolific_author"]`` and 0 everywhere else.
    """
    labels = np.zeros(polific_author_num)
    # list(...) so the ids are always treated as a set of positions to mark
    labels[list(instance["prolific_author"])] = 1
    return labels

In [17]:
# Sanity check: display the label row built for the first training record.
binary_vector(train_data[0])
#print(train_data[0]["prolific_author"])

[42, 36]


In [31]:
#create the dataset
def dataset_cr(dataset,is_test = False):
    """Turn a collection of records into a feature matrix and a label matrix.

    Returns ``(X, Y)``. ``X`` stacks ``one_hot_vector(record)`` row by row.
    ``Y`` holds ``binary_vector(record)`` for every record when ``is_test``
    equals ``False``; otherwise it is an empty array.
    """
    records = list(dataset)
    with_labels = is_test == False
    feature_rows = [one_hot_vector(record) for record in records]
    label_rows = [binary_vector(record) for record in records] if with_labels else []
    return np.vstack(feature_rows), np.array(label_rows)

In [29]:
# Build the full training matrices and display them as a quick look at the encoding.
# NOTE(review): the next cell makes the same call and keeps the result, so the
# encoding work is done twice.
dataset_cr(train_data)

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [32]:
# Encode the whole training set: train_x holds the feature rows, train_y the label rows.
train_x,train_y = dataset_cr(train_data)
# Display both shapes.
train_x.shape,train_y.shape

((25793, 31730), (25793, 100))

In [35]:
# Task types, for reference:
#   binary classification      -> a single label that is 1 or 0
#   multi-class classification -> exactly one label out of {0,1,2,3,4,5,6}, e.g. [4]
#   multi-label classification -> any subset of {0,1,2,3,4,5,6}, e.g. [0,1,5]
# This project is a multi-label classification task.
# train_x has shape (n_sample, m_feature).
# Hold out 30% of the rows as a development set; random_state fixes the shuffle.
x_train,x_dev,y_train,y_dev = train_test_split(train_x,train_y,test_size = 0.3,random_state = 48)

In [37]:
# Shapes of the four split arrays; the feature arrays are (n_sample, m_feature).
# NOTE(review): the saved output shows a 20634/5159 split, i.e. about 20% held
# out, which does not match test_size = 0.3 in the split cell. It looks stale;
# re-run to confirm.
x_train.shape,x_dev.shape,y_train.shape,y_dev.shape

((20634, 31730), (5159, 31730), (20634, 100), (5159, 100))

In [46]:
#classical statistical model
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

In [47]:
# Small slices for quick experiments: the first 30 training rows and the first 5 dev rows.
x_train_samp, y_train_samp = x_train[:30], y_train[:30]
x_dev_samp, y_dev_samp = x_dev[:5], y_dev[:5]

In [54]:
# Multinomial naive Bayes wrapped in one-vs-rest, fitted on the training split
# against the multi-label target y_train.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so the
# cells below that use clssifi have no recorded output.
multino = MultinomialNB(alpha=0.01)
clssifi= OneVsRestClassifier(estimator=multino, n_jobs =4)
clssifi.fit(x_train,y_train)

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import f1_score
# Predict label rows for the dev split and score them with sample-averaged F1.
dev_predi = clssifi.predict(x_dev)
# scikit-learn metrics take the ground truth first and the prediction second.
dev_score = f1_score(y_dev,dev_predi,average='samples')
print(dev_score)

In [None]:
#testing
# Encode the test records. With is_test=True no labels are built, so test_y
# comes back as an empty array.
# NOTE(review): one_hot_vector reads instance["coauthors"], a key this notebook
# only adds to train_data; confirm the test records already carry it.
test_x,test_y = dataset_cr(test_data,is_test = True)
# Predict the labels of every test record with the fitted classifier.
test_pred = clssifi.predict(test_x)

In [None]:
import csv
answer=dict()
head = ["ID",'Predict']
# Write one row per test record: its position and the predicted label ids.
# newline="" leaves line endings to the csv module, as its documentation
# recommends; without it some platforms emit an extra blank line per row.
with open("solution.csv","w",newline="") as csvfile:
    writer = csv.DictWriter(csvfile,fieldnames = head)
    writer.writeheader()
    for idx,row in enumerate(test_pred):
        # positions in this row whose predicted value is 1, as strings
        result = [str(each) for each in np.where(row ==1)[0].tolist()]
        # -1 stands for "no label predicted"; otherwise the ids are space-separated
        predict =-1 if len(result) == 0 else " ".join(result)
        writer.writerow({"ID":idx,"Predict":predict})

In [None]:
# neural network based
#embedding method
from torch.utils.data import DataLoader
from torch import nn
from torch.utils.data import Dataset
import torch
import json

In [None]:
# using torch method
venues_num = 466
coauthors_num =21246
polific_author_num = 100
year_num = 20
words_num = 4999
def torch_vector(instance:Dict):
    """Pick out the raw id fields of one record for the embedding-based model.

    Returns a dict with the keys "venue", "year", "coauthors", "title" and
    "abstract", each holding the value stored under the same key in
    ``instance``. No encoding is applied here.

    The coauthor list is read from and returned under "coauthors", the key the
    author-split cell writes and the batch collate function reads.

    NOTE(review): a later cell defines another function with this same name;
    once that cell has run, ``torch_vector`` refers to that later definition.
    """
    return {"venue":instance["venue"],
            "year":instance["year"],
            "coauthors":instance["coauthors"],
            "title":instance["title"],
            "abstract":instance["abstract"],}

In [None]:
# multi_label torch vector
def torch_vector(instance):
    """Return the multi-label target row for one record.

    The row has ``polific_author_num`` entries: 1 at every index listed in
    ``instance["prolific_author"]`` and 0 everywhere else.

    NOTE(review): this definition rebinds the name ``torch_vector`` that an
    earlier cell also defines.
    """
    labels = np.zeros(polific_author_num)
    # list(...) so the ids are always treated as a set of positions to mark
    labels[list(instance["prolific_author"])] = 1
    return labels

In [None]:
#create the dataset
def dataset_cr_torch(dataset,is_test = False):
    """Build the per-record inputs and the label rows for the torch pipeline.

    Returns ``(inputs, labels)``:
      * ``inputs`` is a 1-D object array with one dict per record, holding the
        record's "venue", "year", "coauthors", "title" and "abstract" values.
        These are the keys the batch collate function reads.
      * ``labels`` stacks ``binary_vector(record)`` row by row, or is an empty
        array when ``is_test`` is true.

    Before this change both outputs were built from the same ``torch_vector``
    call and the inputs were stacked into a numeric matrix, so the per-record
    dicts never reached the collate step.
    """
    feature_keys = ("venue", "year", "coauthors", "title", "abstract")
    records = list(dataset)
    train_xs = [{key: instance[key] for key in feature_keys} for instance in records]
    # binary_vector builds the label row; the name torch_vector is defined
    # twice in this notebook, so it cannot serve for both inputs and labels.
    train_ys = [] if is_test else [binary_vector(instance) for instance in records]
    # dtype=object keeps one dict per row, so .shape and integer indexing still work
    return np.array(train_xs, dtype=object), np.array(train_ys)

In [None]:
# Run the torch-side dataset builder on the training records and display the
# shape of the first returned array.
x_ner,y_ner = dataset_cr_torch(train_data)
x_ner.shape

In [None]:
#create a data type for author
class authordataset(Dataset):
    """Dataset that pairs each record's inputs with its label row."""

    #input raw data
    def __init__(self,data_raw):
        """Convert ``data_raw`` with ``dataset_cr_torch`` and keep both results."""
        inputs, targets = dataset_cr_torch(data_raw)
        self.train_xs = inputs
        self.train_ys = targets

    def __len__(self):
        """Number of label rows held by the dataset."""
        return len(self.train_ys)

    def __getitem__(self,idx):
        """Return the ``(inputs, labels)`` pair stored at position ``idx``."""
        sample = self.train_xs[idx]
        target = self.train_ys[idx]
        return sample, target

In [None]:
# Small train/dev subsets for trying out the torch pipeline.
# The dev slice starts at 50 so it begins right where the training slice ends;
# starting at 51 left record 50 out of both subsets.
train_set_t = authordataset(train_data[:50])
dev_set_t = authordataset(train_data[50:100])
print(len(train_set_t))
print(len(dev_set_t))

In [None]:
def colfunction(records):
    """
    Collate a list of (data, label) pairs into one batch dict.

    Each ``data`` is indexed by 'coauthors', 'venue', 'year', 'title' and
    'abstract'. The variable-length fields ('coauthors', 'title', 'abstract')
    come back as lists of LongTensors, one per record. 'venues' and 'years'
    are single LongTensors and 'labels' is a single FloatTensor.
    """
    pairs = list(records)
    features = [data for data, _ in pairs]
    targets = [label for _, label in pairs]

    def _per_record(key):
        # one LongTensor per record, lengths left as they are
        return [torch.LongTensor(item[key]) for item in features]

    return {
        'coauthors': _per_record('coauthors'),
        'venues': torch.LongTensor([item['venue'] for item in features]),
        'years': torch.LongTensor([item['year'] for item in features]),
        'title': _per_record('title'),
        'abstract': _per_record('abstract'),
        'labels': torch.FloatTensor(targets),
    }

In [None]:
#using DataLoader to preprocess the data (iterator)
# Batches of 8 shuffled samples from train_set_t, assembled by colfunction.
# NOTE(review): this rebinds train_data, which earlier cells use for the raw
# records loaded from train.json; re-running those cells after this one would
# act on the DataLoader instead.
train_data = DataLoader(train_set_t,batch_size = 8,shuffle=True,collate_fn =colfunction )