In [1]:
#general 
import os
import argparse
import pickle
import dill

#you cannot live without 
from tqdm import trange
import pandas as pd
import numpy as np
import time
#import matplotlib.pyplot as plt
import random
from termcolor import colored
from sklearn.feature_extraction.text import TfidfVectorizer

#pip install transformers
#pytorch related
import torch
import torch.nn as nn
import torch.nn.functional as F

#bert related
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

#self-defined
from dataProcessing_bertExtreme import DataProcess
import utils_bertExtreme as ut 
from clfbert import clfModel
from eval_metrics import Metrics


In [2]:
def main():
    """Train, validate and evaluate the BERT data-to-paper recommendation model.

    The function takes no arguments: every setting is hard-coded onto an
    ``argparse`` namespace so the same code can run inside a notebook, where
    there is no command line to parse. The steps are, in order:

    1. build the configuration namespace and seed the RNGs via ``ut.set_seed``;
    2. build the train/valid/test loaders with ``DataProcess``;
    3. pick the compute device and load the classifier from ``clfModel``;
    4. create the AdamW optimizer and a linear schedule without warmup;
    5. run ``ut.train`` and optionally ``ut.plot_train``;
    6. run ``ut.predictions`` on the test loader, build the similarity
       dictionary and print MRR, recall@k, precision@k and MAP.

    Returns:
        None. Results are printed; anything written to disk is done by the
        project helpers, which receive ``args.load_path``.
    """
    # Parse an empty argv so argparse never looks at the notebook kernel's own
    # command line; all options are attached to the namespace by hand below.
    parser = argparse.ArgumentParser(description = 'BERT model for data to paper recommendation')
    args = parser.parse_args([])

    # --- data / checkpoint locations -------------------------------------
    args.data_path1 = 'data/'
    args.data_path2 = 'IIdata/'
    args.subpath = 'sensitivity1vs5000_gene/'
    args.load_pretrained = False
    args.load_path= 'model_save_v5_sensitivity1vs5000_gene/'
    #args.tf_path = '../biobert_v1.1_pubmed'
    #args.from_tf= False

    # --- split flags, one pair per data source ---------------------------
    # The "*1" settings are not read in this function, but the whole namespace
    # is handed to ut.train, so they are kept in place.
    args.split1 = True
    args.newSplit1= True
    args.split2 = True
    args.newSplit2 = True

    # --- training hyper-parameters ---------------------------------------
    args.cuda_device = 1
    args.learning_rate = 2e-5
    args.epsilon = 1e-8
    args.train_epochs = 4
    args.plot_train = True
    args.names1 = []
    args.names2 = ['geo','srastudies'] #'immport', 'imspace', 'itnshare',
    args.train_ratio = 5000
    args.extra = True

    # Make sure results are replicable.
    seed_val = 1234
    ut.set_seed(seed_val)

    # Build the loaders for the second data source. Every option here is the
    # "*2" variant; the original passed args.split1, which only worked because
    # both split flags happen to hold the same value.
    dp2 =  DataProcess(path= args.data_path2,
                       subpath = args.subpath,
                       load_pretrained = args.load_pretrained,
                       load_path = args.load_path,
                       split = args.split2,
                       newSplit = args.newSplit2,
                       names = args.names2,
                       train_ratio = args.train_ratio,
                       extra = args.extra)
    dp2.dataframize_()
    train_loader,valid_loader, test_loader = dp2.dataloaderize_() #dataloader right here, len of records 9639005

    print(len(train_loader), len(valid_loader), len(test_loader)) #  should be less than (218788, 31365, 29))

    # Choose the device. The requested GPU index is validated against the
    # number of visible GPUs so a single-GPU host does not fail later with an
    # invalid device ordinal; in that case fall back to the first GPU.
    if torch.cuda.is_available():
        visible_gpus = torch.cuda.device_count()
        if not 0 <= args.cuda_device < visible_gpus:
            print('cuda device {} is not available ({} visible), falling back to cuda:0'.format(
                args.cuda_device, visible_gpus))
            # Keep the namespace in sync because it is passed on to ut.train.
            args.cuda_device = 0
        use_cuda = torch.device('cuda:' + str(args.cuda_device))
    else:
        use_cuda = torch.device('cpu')

    # Load the BERT classifier and move it to the selected device.
    model = clfModel(load_pretrained = args.load_pretrained, load_path = args.load_path).model
    model.to(use_cuda)

    # Optimizer and scheduler.
    optimizer = AdamW(model.parameters(),
                      lr = args.learning_rate,
                      eps = args.epsilon)

    # The schedule is stepped once per batch, so its horizon is
    # batches-per-epoch times the number of epochs.
    total_steps = len(train_loader) * args.train_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)

    # Train and validate.
    training_stats = ut.train(epochs = args.train_epochs,
                              dp2 = dp2,
                              model = model,
                              train_loader = train_loader,
                              valid_loader = valid_loader,
                              optimizer = optimizer,
                              scheduler = scheduler,
                              use_cuda = use_cuda,
                              args = args)

    # Plot the training statistics.
    if args.plot_train:
        ut.plot_train(training_stats, args.load_path)

    # Predict on the test split.
    combine_predictions, combine_true_labels = ut.predictions(dp2 = dp2,
                                                              model = model,
                                                              test_loader = test_loader,
                                                              use_cuda = use_cuda,
                                                              path = args.load_path)

    # Rows of the extra dataframe that belong to the test split.
    citation_df = dp2.extra_df.iloc[dp2.test_idx,:]
    similarity_dict, max_leng = ut.create_smilarity_dict(citation_df = citation_df,
                                                         combine_predictions = combine_predictions,
                                                         save_path = args.load_path)
    print(max_leng)

    def new_metrics():
        # A fresh Metrics object per score, exactly as before, in case the
        # class keeps state between calls (not visible from this file).
        return Metrics(dp2.citation, leng = max_leng)

    # Ranking metrics.
    print('MRR:')
    print(new_metrics().calculate_mrr(similarity_dict)) #mrr

    print('recall@1, recall@10:')
    print(new_metrics().calculate_recall_at_k(similarity_dict, 1))
    print(new_metrics().calculate_recall_at_k(similarity_dict, 10))

    print('precision@1, precision@10:')
    print(new_metrics().calculate_precision_at_k(similarity_dict, 1))
    print(new_metrics().calculate_precision_at_k(similarity_dict, 10))

    print('MAP:')
    print(new_metrics().calculate_MAP_at_k(similarity_dict))

mixed pairs dataframe loaded
length of the corpus 9514699
sample of the corpus ['Mineralocorticoid Receptor (MR) trans-Activation of Inflammatory AP-1 Signaling: DEPENDENCE ON DNA SEQUENCE, MR CONFORMATION, AND AP-1 FAMILY MEMBER EXPRESSION. Glucocorticoids are commonly used to treat inflammatory disorders. The glucocorticoid receptor (GR) can tether to inflammatory transcription factor complexes, such as NFκB and AP-1, and trans-repress the transcription of cytokines, chemokines, and adhesion molecules. In contrast, aldosterone and the mineralocorticoid receptor (MR) primarily promote cardiovascular inflammation by incompletely understood mechanisms. Although MR has been shown to weakly repress NFκB, its role in modulating AP-1 has not been established. Here, the effects of GR and MR on NFκB and AP-1 signaling were directly compared using a variety of ligands, two different AP-1 consensus sequences, GR and MR DNA-binding domain mutants, and siRNA knockdown or overexpression of core AP

In [None]:
# Run the full train/evaluate pipeline only when this cell or script is the
# entry point, not when the code is imported as a module.
if __name__ == "__main__":
    main()