In [None]:
# basics 
from os.path import exists
import math
import logging
import time
import sys
import argparse
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
import xgboost as xgb
# from sklearn.metrics import average_precision_score, roc_auc_score

# local
from prepare_data_bsl import yearly_authors
from prepare_dataset_bsl import CollabDataset
import utils_bsl as ut

In [None]:
    # Configuration for the baseline link-prediction run, expressed as argparse options
    # so the same definitions work from a script or a notebook.
    # NOTE: the string is passed positionally, so argparse stores it as the program name.
    parser = argparse.ArgumentParser('baseline link predictions')

    # data
    parser.add_argument('--data', type=str, default='collab',
                        help='collab for our own experiments')
    # nargs='+' collects one or more ints into a list, so a value given on the command
    # line has the same shape as the list-valued default (previously it became a bare int).
    parser.add_argument('--yrs', nargs='+', type=int, default=[2010, 2011],
                        help='years to work on')
    parser.add_argument('--authfile', default='../../DLrec/newdata/processed_pubs.pickle',
                        help='crawed pubmed database')
    parser.add_argument('--inpath', default='../sage/data/mesh/20102011/',
                        help="since we are using the same dataset as the SAGE, we can reuse the processed dataset")
    parser.add_argument('--node_options', default='mesh',
                        help="node feature options, choose from mesh/pubs")
    parser.add_argument('--savepath', type=str, default='20102011_mesh/',
                        help='path to save the data')

    # model
    parser.add_argument('--max_depth', type=int, default=2, help='max depth of the tree')  # 25/10 for SAGE
    parser.add_argument('--lr', type=float, default=0.05, help='Learning rate')
    parser.add_argument('--patience_trees', type=int, default=10,
                        help=' number of trees to wait before stop building')
    parser.add_argument('--gpu', type=int, default=0, help='GPU index to use if built trees on GPU')

    # training
    # parser.add_argument('--n_epoch', type=int, default= 100, help='Number of epochs')
    parser.add_argument('--seed', type=int, default=2021, help='One seed that rules them all')

    # An explicit empty argv is parsed, so every option takes its default and the
    # kernel's own sys.argv is never consulted.
    args = parser.parse_args([])
    

In [None]:
    ## make sure the processed inputs are on disk, then load them
    masks_csv = args.inpath + 'collabs_masks.csv'
    feats_npy = args.inpath + 'node_feats.npy'

    if not exists(masks_csv) or not exists(feats_npy):
        # At least one of the two files is missing: run the preprocessing helpers.
        # NOTE(review): presumably these write the files read below — confirm in
        # prepare_data_bsl / prepare_dataset_bsl.
        yearly_authors(authfile=args.authfile,
                       years=args.yrs,
                       savepath=args.inpath,
                       options=args.node_options)
        # dataset processing (graph)
        dataset = CollabDataset(raw_dir=args.inpath)

    # original data: the collaboration table with mask columns and the node feature array
    df = pd.read_csv(masks_csv)
    node_feats = np.load(feats_npy)

    # logger is configured from the full argument namespace
    logger = ut.create_log(args)
    
    # data preparations; split, get negatives, and merge
    df_train, df_val, df_test = ut.split_mask(df, masks=['train_mask', 'val_mask', 'test_mask'])
    n_train, n_val, n_test = len(df_train), len(df_val), len(df_test)

    # One pool of negative pairs, requested with as many entries as there are rows in df.
    # NOTE(review): presumably returns two aligned sequences of sampled non-edges —
    # confirm in utils_bsl.create_negatives.
    neg_src, neg_dst = ut.create_negatives(src_list=df['new_author'].to_numpy(),
                                           dst_list=df['new_coauthor'].to_numpy(),
                                           size=len(df))

    # Carve the pool up: train takes the head, validation the next n_val entries,
    # test the last n_test entries.
    val_end = n_train + n_val
    neg_train_src, neg_train_dst = neg_src[:n_train], neg_dst[:n_train]
    neg_val_src, neg_val_dst = neg_src[n_train:val_end], neg_dst[n_train:val_end]

    # Use an explicit, non-negative start offset for the tail. The previous
    # `neg_src[-n_test:]` turns into `neg_src[-0:]` (i.e. the WHOLE pool) when the
    # test split is empty; this form yields an empty slice in that case and is
    # otherwise identical.
    test_start_src = max(len(neg_src) - n_test, 0)
    test_start_dst = max(len(neg_dst) - n_test, 0)
    neg_test_src, neg_test_dst = neg_src[test_start_src:], neg_dst[test_start_dst:]

    # Positive pairs are read directly from each split's endpoint columns.
    pos_train_src, pos_train_dst = df_train['new_author'].to_numpy(), df_train['new_coauthor'].to_numpy()
    pos_val_src, pos_val_dst = df_val['new_author'].to_numpy(), df_val['new_coauthor'].to_numpy()
    pos_test_src, pos_test_dst = df_test['new_author'].to_numpy(), df_test['new_coauthor'].to_numpy()
    
    # Merge positives with negatives per split (each split gets its own seed offset),
    # then look up the features for the merged endpoint lists.
    # merge_pn hands back (merged sources, labels, merged destinations) in that order.

    # --- train ---
    merged_train_src, y_train, merged_train_dst = ut.merge_pn(
        pos_ls=pos_train_src, neg_ls=neg_train_src,
        pos_ls2=pos_train_dst, neg_ls2=neg_train_dst,
        seed=args.seed,
    )

    # --- validation ---
    merged_val_src, y_val, merged_val_dst = ut.merge_pn(
        pos_ls=pos_val_src, neg_ls=neg_val_src,
        pos_ls2=pos_val_dst, neg_ls2=neg_val_dst,
        seed=args.seed + 1,
    )

    # --- test ---
    merged_test_src, y_test, merged_test_dst = ut.merge_pn(
        pos_ls=pos_test_src, neg_ls=neg_test_src,
        pos_ls2=pos_test_dst, neg_ls2=neg_test_dst,
        seed=args.seed + 2,
    )

    # Feature matrices, one per split, built from the shared node feature array.
    X_train = ut.get_features(merged_train_src, feat_np=node_feats, author_ls2=merged_train_dst)
    X_val = ut.get_features(merged_val_src, feat_np=node_feats, author_ls2=merged_val_dst)
    X_test = ut.get_features(merged_test_src, feat_np=node_feats, author_ls2=merged_test_dst)
    
    # train & pred
    # Collect the classifier settings in one place; tunable values come from `args`.
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': ['logloss', 'auc'],
        'use_label_encoder': False,
        'verbosity': 0,
        'early_stopping_rounds': args.patience_trees,
        'max_depth': args.max_depth,
        'learning_rate': args.lr,
        'gpu_id': args.gpu,
        'random_state': args.seed,
    }
    clf = xgb.XGBClassifier(**xgb_params)

    # Fit with the validation split supplied alongside the training data; the
    # helper returns the classifier that is used for prediction below.
    clf = ut.easy_train(
        logger=logger,
        clf=clf,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        path=args.savepath,
    )

    # Evaluate on the held-out test split.
    ut.easy_predictions(logger=logger, clf=clf, X_test=X_test, y_test=y_test)