In [2]:
from simple_elmo import ElmoModel 
import collections
import torch
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
import random

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Create the ELMo wrapper first: the original cell called a method on `model`
# one line before `model` was assigned, which fails on a fresh kernel.
model = ElmoModel()
# NOTE(review): SENTENCES is not defined in any visible cell, and no weights are
# loaded into `model` in this cell — confirm both happen before this line runs.
model.get_elmo_vectors(SENTENCES)

In [None]:
# Load the raw train / test records from their JSON files.
train_filename = 'train.json'
with open(train_filename, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
test_filename = 'test.json'
with open(test_filename, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)

# Copy every training record individually. A plain `raw_train.copy()` only
# copies the outer list, so the keys added below would also appear in
# `raw_train` and the "raw" data would no longer be raw.
train = [dict(record) for record in raw_train]
# The test records are not modified in this cell, so a shallow copy suffices.
test = raw_test.copy()

# Split each paper's author ids into two new keys:
#   'prolific_authors' — ids in [0, 100)
#   'coauthors'        — every other id
prolific_ids = range(100)
for record in train:
    record['prolific_authors'] = [a for a in record['authors'] if a in prolific_ids]
    record['coauthors'] = [a for a in record['authors'] if a not in prolific_ids]

# 'authors' is now fully represented by the two derived columns, so drop it.
train_df = pd.DataFrame.from_dict(train).drop(['authors'], axis=1)
test_df = pd.DataFrame.from_dict(test)

In [None]:
# Sanity check: (rows, columns) of the training frame built above.
train_df.shape

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
train_df.head()

In [None]:
# Pick the torch device once: CPU when CUDA is unavailable, otherwise the GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    # CUDA is known to be unavailable on this branch, so this is always the CPU.
    device = torch.device('cpu')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Training hyperparameters shared by the cells below.
num_epochs = 100        # passes over the training set
batch_size = 40         # samples per DataLoader batch
learning_rate = 1e-3    # optimizer step size
hidden_dim = 200        # hidden layer width
output_dim = 101        # presumably one output per target slot (101-element target vector) — confirm

In [None]:
# Hold out 5% of the papers for validation. The seed is fixed so that
# Restart & Run All reproduces the same split (the original call was unseeded).
# NOTE(review): this cell rebinds `train_df`, so re-running it without
# re-running the loading cell shrinks the training set again.
train_df, validation_df = train_test_split(train_df, test_size=0.05, random_state=42)

# Number of prolific authors on each remaining training paper.
length = train_df['prolific_authors'].str.len()
# number of papers without any prolific author
count_empty = (length == 0).sum()
# number of papers with prolific authors
# (the misspelled name is kept in case other cells refer to it)
count_non_emtpy = (length > 0).sum()
print(count_empty)
print(count_non_emtpy)

# Reuse the per-paper counts for both masks instead of recomputing them.
train_without = train_df[length == 0]
train_with = train_df[length > 0]
len(train_with)

In [None]:
# Balance the classes: keep every paper that has a prolific author plus an
# equally sized random sample of papers that have none.
# - min(...) keeps .sample() from raising when positives outnumber negatives.
# - random_state makes the down-sampling reproducible.
# - reset_index(drop=True) restores a 0..n-1 index; the split and concat leave
#   shuffled, non-contiguous labels that later label-based lookups would miss.
n_negative = min(len(train_with), len(train_without))
train_df = pd.concat(
    [train_with, train_without.sample(n=n_negative, random_state=42)]
).reset_index(drop=True)
train_df.shape

In [None]:
def combine_features_torch(df, have_prolific):
    """Encode each paper in ``df`` as a multi-hot coauthor vector (plus targets).

    Rows are read by position, so ``df`` may carry any index (for example the
    shuffled, non-contiguous index left by ``train_test_split`` / ``pd.concat``).
    The original label lookup ``df.loc[i]`` only worked on a 0..n-1 index.

    Relies on module-level sizes defined outside this function:
    ``n_text``, ``n_years``, ``n_venues``, ``n_authors`` and ``n_prolific``.

    Args:
        df: DataFrame with columns 'abstract', 'title', 'year', 'venue' and
            'coauthors'; also 'prolific_authors' when ``have_prolific`` is truthy.
        have_prolific: when truthy, also build and return the target matrix.

    Returns:
        ``(X, y)`` when ``have_prolific`` is truthy, otherwise ``X`` alone.
        ``X`` has one row per paper and ``n_authors - n_prolific + 2`` columns;
        ``y`` has ``n_prolific + 1`` columns. For an empty ``df`` the arrays
        have zero rows and the same column counts.
    """
    # The last slot of the coauthor vector flags "no coauthors"
    # (original note: 21147 elements).
    coauthor_width = n_authors - n_prolific + 2
    # The last slot of the target vector flags "no prolific author"
    # (original note: 101 elements).
    prolific_width = n_prolific + 1

    features = []
    targets = []
    for position in range(df.shape[0]):
        current_row = df.iloc[position]

        # NOTE(review): the abstract / title / year / venue encodings below are
        # built but never added to the returned features (only the coauthor
        # vector is used). They are kept in case this is a work-in-progress
        # feature toggle — confirm whether they should be concatenated or removed.

        # abstract and title: per-token counts stored at index (token - 1)
        abstract_list = [0] * n_text
        title_list = [0] * n_text
        for key, value in collections.Counter(current_row['abstract']).items():
            abstract_list[key - 1] = value
        for key, value in collections.Counter(current_row['title']).items():
            title_list[key - 1] = value
        # year: one-hot at index (year - 1)
        year_list = [0] * n_years
        year_list[current_row['year'] - 1] = 1
        # venue: one-hot, last element flags an empty venue (original note: 466 elements)
        venue_list = [0] * (n_venues + 2)
        if current_row['venue'] == '':
            venue_list[-1] = 1
        else:
            venue_list[current_row['venue']] = 1

        # coauthors: multi-hot, ids shifted down by n_prolific
        coauthor_list = [0] * coauthor_width
        coauthors = current_row['coauthors']
        if len(coauthors) == 0:
            coauthor_list[-1] = 1
        else:
            for coauthor in coauthors:
                coauthor_list[coauthor - n_prolific] = 1

        combined_features = coauthor_list
        features.append(np.array(combined_features))

        if have_prolific:
            # prolific authors: multi-hot target
            prolific_list = [0] * prolific_width
            prolific_authors = current_row['prolific_authors']
            if len(prolific_authors) == 0:
                prolific_list[-1] = 1
            else:
                for prolific in prolific_authors:
                    prolific_list[prolific] = 1
            targets.append(np.array(prolific_list))

    # np.vstack raises on an empty list; return correctly shaped empty arrays instead.
    if features:
        X = np.vstack(features)
    else:
        X = np.empty((0, coauthor_width), dtype=int)

    if have_prolific:
        y = np.array(targets) if targets else np.empty((0, prolific_width), dtype=int)
        return X, y
    return X

In [None]:
class AuthorDataset(Dataset):
    """Map-style dataset over feature rows ``X`` and, optionally, targets ``y``.

    In training mode (``istrain`` truthy) each item is the pair
    ``(X[index], y[index])``; otherwise each item is ``X[index]`` alone.

    Args:
        X: indexable collection of feature rows.
        y: indexable collection of targets aligned with ``X``. Required in
            training mode and ignored otherwise, so it may be omitted for
            unlabeled data.
        istrain: whether items should include their target.

    Raises:
        ValueError: in training mode, when ``y`` is missing or its length
            differs from the length of ``X``.
    """

    def __init__(self, X, y=None, istrain=True):
        if istrain:
            if y is None:
                raise ValueError("y is required when istrain is True")
            if len(X) != len(y):
                raise ValueError(
                    "X and y must have the same length, got %d and %d" % (len(X), len(y))
                )
        self.X = X
        self.istrain = istrain
        # Always define the attribute so test-mode instances never hit AttributeError.
        self.y = y if istrain else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        if self.istrain:
            return self.X[index], self.y[index]
        return self.X[index]

In [None]:
# Encode both frames, then wrap the arrays as torch datasets.
# NOTE: despite the `_df` suffix these are AuthorDataset objects, not DataFrames;
# the names are kept because the DataLoader cell refers to them.
X_train, y_train = combine_features_torch(train_df, have_prolific=True)
X_test = combine_features_torch(test_df, have_prolific=False)
training_df = AuthorDataset(X_train, y_train, istrain=True)
# The test set has no labels: pass None rather than the unrelated training
# targets (the dataset ignores `y` when istrain is False).
testing_df = AuthorDataset(X_test, None, istrain=False)

In [None]:
def my_collate(batch):
    """Collate dataset samples into a dict of float tensors.

    Args:
        batch: non-empty list of samples. Training samples are
            ``(x, target)`` tuples; test samples are bare feature vectors.

    Returns:
        ``{"x": tensor, "target": tensor}`` for training batches and
        ``{"x": tensor}`` for test batches, all with dtype ``torch.float``.
    """
    first = batch[0]
    # Detect training samples by type, not just by length: a bare feature
    # vector that happens to have exactly two elements must still be treated
    # as a test sample.
    if isinstance(first, tuple) and len(first) == 2:
        xs, targets = zip(*batch)
        # Stack into one ndarray first; building a tensor from a list of
        # ndarrays element by element is much slower.
        return {
            "x": torch.as_tensor(np.stack(xs), dtype=torch.float),
            "target": torch.as_tensor(np.stack(targets), dtype=torch.float),
        }

    # for testing set
    return {"x": torch.as_tensor(np.stack(batch), dtype=torch.float)}

In [None]:
# Training batches are drawn with shuffle=True; test batches keep dataset order.
train_dataloader = DataLoader(
    training_df,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=my_collate,
)
# TODO: validation_df is still a DataFrame here; it has to be encoded and wrapped
# in a dataset before a loader like this one can be enabled.
# val_dataloader = DataLoader(dataset = validation_df, batch_size = batch_size, shuffle=False, collate_fn = my_collate)
test_dataloader = DataLoader(
    testing_df,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=my_collate,
)
# Pull one collated training batch to inspect it.
dataiter = next(iter(train_dataloader))
dataiter

In [None]:
# Rebinds `model` to a fresh ElmoModel instance (same constructor call as the
# earlier ELMo cell). NOTE(review): no weights are loaded in this cell — confirm
# that happens before the model is used.
model = ElmoModel()