# Alternative Approaches - Word Embedding
This notebook displays our alternative approaches of neural networks using word2vec on title and abstract. There are two kinds of neural networks: traditional networks and LSTMs.

In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import json
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from gensim.models import Word2Vec

In [2]:
max_prolific = 99
n_text = 4999
n_authors = 21245
n_prolific = 100
n_coauthors = n_authors - n_prolific + 1
n_years = 19
n_venues = 466
batch_size = 40
learning_rate = 0.001
num_epochs = 100

In [3]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


# Part 1: Data Prepossessing

In [5]:
train_data_path = '/Users/sicenxi/Documents/GitHub/COMP90051_Project2_Group9/data/train.json'
test_data_path = '/Users/sicenxi/Documents/GitHub/COMP90051_Project2_Group9/data/test.json'
# read train json file
with open(train_data_path, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
with open(test_data_path, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)
    
# extract coauthors as a new key from train.json
title_list = []
abstract_list = []
word_list = []
for i in range(len(raw_train)):
    coauthors = []
    prolific_authors = []
    for auth in raw_train[i]['authors']:
        if auth >= max_prolific:
            coauthors.append(auth)
        else:
            prolific_authors.append(auth)
    if len(prolific_authors) == 0:
        prolific_authors = -1
    raw_train[i]['coauthors'] = coauthors
    raw_train[i]['prolific_authors'] = prolific_authors
    
train_df = pd.DataFrame.from_dict(raw_train)
train_df = train_df.drop(['authors'], axis=1)
train_df['venue'] = train_df['venue'].replace('', 465)
for i in range(len(train_df)):
    title_list.append(train_df['title'][i])
    abstract_list.append(train_df['abstract'][i])
    word_list.append(train_df['title'][i])
    word_list.append(train_df['abstract'][i])

    
test_df = pd.DataFrame.from_dict(raw_test)
test_df['venue'] = test_df['venue'].replace('', 465)
for i in range(len(test_df)):
    title_list.append(test_df['title'][i])
    abstract_list.append(test_df['abstract'][i])
    word_list.append(test_df['title'][i])
    word_list.append(test_df['abstract'][i])

train_df.shape

(25793, 6)

In [6]:
train_df_prolific = train_df[train_df['prolific_authors'] != -1]
train_df_noprolific = train_df[train_df['prolific_authors'] == -1]
train_df_combine = pd.concat([train_df_prolific, train_df_noprolific.tail(7000)], axis=0)
train_df_combine = train_df_combine.reset_index(drop=True)
train_df_combine.head(3)

Unnamed: 0,year,abstract,venue,title,coauthors,prolific_authors
0,9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1...",[13720],"[42, 36]"
1,15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...","[1359, 15881]",[45]
2,10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5...",[],[97]


In [7]:
# Word2vec
titlemodel = Word2Vec(title_list, min_count=1, vector_size=128)
titleword_vectors = titlemodel.wv
titlekeyword_list = []
for i in range(len(train_df)):
    word_vec = np.zeros(128)
    for title in title_list[i]:
        word_vec += titleword_vectors[title]
    titlekeyword_list.append(word_vec/len(title_list[i]))
titlekeyword_list = np.array(titlekeyword_list)

abstractmodel = Word2Vec(abstract_list, min_count=1, vector_size=128)
abstractword_vectors = abstractmodel.wv
abstractkeyword_list = []
for i in range(len(train_df)):
    word_vec = np.zeros(128)
    for abstract in abstract_list[i]:
        word_vec += abstractword_vectors[abstract]
    abstractkeyword_list.append(word_vec/len(abstract_list[i]))
abstractkeyword_list = np.array(abstractkeyword_list)

In [8]:
class AuthorDataset(Dataset):

    def __init__(self, dataframe, istrain):
        self.data = dataframe
        self.x = dataframe[['year', 'venue', 'coauthors', 'abstract', 'title']]
        self.istrain = istrain
        if self.istrain == True:
            self.y = self.data.prolific_authors

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        year = self.data.year[index]
        venue = self.data.venue[index]
        title = self.data.title[index]
        abstract = self.data.abstract[index]
        
        # coauthors to one hot
        coauthors = self.data.coauthors[index]
        coauthor_list = [0] * (n_authors - n_prolific + 2) # 21147 elements with the last element for empty coauthors
        if coauthors == []:
            coauthor_list[-1] = 1
        else:
            for coauthor in coauthors:
                coauthor_list[coauthor-n_prolific] = 1
                
        x_output = {"title": title, "abstract": abstract, "year": year, "venue": venue, "coauthors": coauthor_list}
        
        # target to one hot
        if self.istrain == True:
            prolific_list = [0] * (n_prolific)
            if self.y[index] != -1:
                for prolific in self.y[index]:
                    prolific_list[prolific] = 1
            y_output = prolific_list
            return x_output, y_output
        else:
            return x_output

In [9]:
training_df = AuthorDataset(train_df, istrain = True)
testing_df = AuthorDataset(test_df, istrain = False)

# Part 2: Neural Networks

In [10]:
def my_collate(batch):
    # for training set
    if len(batch[0]) == 2:
        output = {"title": [], "abstract": [], "year": [], "venue": [], "coauthors": [], "target": []}
        
        for data in batch:
            x = data[0]
            target = data[1]
            
            output['title'] += [torch.tensor(x['title'], dtype=torch.long)]
            output['abstract'] += [torch.tensor(x['abstract'], dtype=torch.long)]
            output['year'] += [x['year']]
            output['venue'] += [x['venue']]
            output['coauthors'] += [torch.tensor(x['coauthors'], dtype=torch.long)]
            output['target'] += [target]
            
        output['year'] = torch.tensor(output['year'], dtype=torch.long)
        output['venue'] = torch.tensor(output['venue'], dtype=torch.long)
        output['target'] = torch.tensor(output['target'], dtype=torch.float)
        return output
    
    # for testing set
    else:
        output = {"title": [], "abstract": [], "year": [], "venue": [], "coauthors": []}
        
        for data in batch:
            output['title'] += [torch.tensor(data['title'], dtype=torch.long)]
            output['abstract'] += [torch.tensor(data['abstract'], dtype=torch.long)]
            output['year'] += [data['year']]
            output['venue'] += [data['venue']]
            output['coauthors'] += [torch.tensor(data['coauthors'], dtype=torch.long)]
            
        output['year'] = torch.tensor(output['year'], dtype=torch.long)
        output['venue'] = torch.tensor(output['venue'], dtype=torch.long)
        return output


In [11]:
train_dataloader = DataLoader(dataset = training_df, batch_size = batch_size, shuffle=True, collate_fn = my_collate)
test_dataloader = DataLoader(dataset = testing_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)

In [12]:
# We combine the traditional neural networks and LSTMs into one cell.
# We uncomment the codes when switching to another approach.

class Multilabel(nn.Module):

    def __init__(self, input_size, embed_dim, hidden_dim, num_layers):
        super(Multilabel, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.weights = torch.FloatTensor(titlekeyword_list)
        #self.weights = torch.FloatTensor(abstractkeyword_list)
        self.embedding = nn.Embedding.from_pretrained(self.weights)
        self.embedding.requires_grad = False
        self.embedding = nn.Embedding(n_text+1, embed_dim)
        
        self.linear1 = nn.Linear(input_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 100)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        
        #self.lstm = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        #self.fc = nn.Linear(hidden_dim, 100)
        #self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        
        word2vec_title_list = []
        for curr_title in x['title']:
            title_vec = self.embedding(curr_title)
            word2vec_title_list.append(title_vec.mean(dim = 0))
        embed_title = torch.stack(word2vec_title_list)
        
        word2vec_abstract_list = []
        for curr_abstract in x['abstract']:
            abstract_vec = self.embedding(curr_abstract)
            word2vec_abstract_list.append(abstract_vec.mean(dim = 0))
        embed_abstract = torch.stack(word2vec_abstract_list)
        
        #out = embed_title.reshape([embed_title.shape[0], 1, embed_title.shape[1]])
        
        #out = embed_title.reshape([embed_abstract.shape[0], 1, embed_abstract.shape[1]])
        
        
        
        #h0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        #c0 = torch.zeros(self.num_layers, out.size(0), self.hidden_dim)
        
        # Forward propagate LSTM
        #out, (hn, cn) = self.lstm(out, (h0, c0))  
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        #out = self.fc(out[:, -1, :]) 
        #out = self.sigmoid(out)
        #return out
        
        out = self.linear1(embed_title)
        #out = self.linear1(embed_abstract)
        out = self.activation(out)
        out = self.linear2(out)
        out = torch.sigmoid(out)
        return out
        

In [14]:
clf = Multilabel(64, 64, 128, 1)

In [None]:
# loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(),lr=0.1)

# training loop
n_total_steps = len(train_dataloader)
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        #batch = batch.to(device)

        # forward
        outputs = clf(batch)
        
        loss = criterion(outputs, batch['target'])
        
        predictions = np.where(outputs.detach().numpy()>=0.5, 1, 0)
        f1_acc = f1_score(batch['target'].detach().numpy(), predictions, average="samples", zero_division=1)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}, training f1 score = {f1_acc:.4f}')
            
            

In [16]:
# testing
test_preds = []
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        outputs = clf(batch)
        
        predictions = np.where(outputs.detach().numpy()>0.5, 1, 0)
        test_preds.append(predictions)

In [17]:
def find(lst, num):
    result = []
    for i, x in enumerate(lst):
        if x==num:
            result.append(i)
    return result

In [18]:
NN_result = test_df[['identifier']]
NN_result.loc[:,'Predict'] = ''
final_result = []
for i in range(len(test_preds)):
    for j in range(batch_size):
        final_result.append(test_preds[i][j])
print(len(final_result))
for i in range(len(final_result)):
    result = final_result[i]
    if len(find(list(result), 1)) == 0:
        NN_result.loc[i,'Predict'] = -1
    else:
        NN_result.loc[i,'Predict'] = ' '.join(str(e) for e in find(list(result), 1))
        
NN_result = NN_result.rename(columns={'identifier':'ID'})
NN_result.to_csv('./NN_results.csv', index=False)

800


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [None]:
NN_result