## Load Packages and Data

In [1]:
from model.utils import *
from model.gnn import *
from model.node2vec import *
from model.bert_embeddings import *

# Seed the run once, up front, so that repeated executions are reproducible
SEED = 42
set_seeds(SEED)

In [2]:
# Read the PeerRead citation-context CSV and run it through the preprocessing helper
peerread_csv = 'data/full_context_PeerRead.csv'
df = data_preprocess(peerread_csv)

# Preview the first five rows
df.head(5)

Unnamed: 0,right_citated_text,left_citated_text,source_abstract,source_author,source_id,source_title,source_venue,source_year,target_id,target_author,target_abstract,target_year,target_title,target_venue,citated_text,citated_text_id
0,andsyntactic parsing .Because RNNs make very f...,We conducted additional experiments on artific...,Deep Neural Networks (DNNs) are powerful model...,ilya sutskever;oriol vinyals;quoc v le,1409.3215v1,Sequence to Sequence Learning with Neural Netw...,NIPS,2014.0,1606.03622v1,robin jia;percy liang,Modeling crisp logical regularities is crucial...,2016,Data Recombination for Neural Semantic Parsing,ACL,We conducted additional experiments on artific...,0
1,.Because RNNs make very few domain-specific as...,We conducted additional experiments on artific...,Syntactic parsing is a fundamental problem in ...,oriol vinyals;lukasz kaiser;terry koo;slav pet...,1412.7449v1,Grammar as a Foreign Language,NIPS,2014.0,1606.03622v1,robin jia;percy liang,Modeling crisp logical regularities is crucial...,2016,Data Recombination for Neural Semantic Parsing,ACL,We conducted additional experiments on artific...,0
2,"; in a Pointer Network,the only way to generat...","Reproducibility. All code, data, and experimen...",We introduce a new neural architecture to lear...,oriol vinyals;meire fortunato;navdeep jaitly,1506.03134v1,Pointer Networks,NIPS,2015.0,1606.03622v1,robin jia;percy liang,Modeling crisp logical regularities is crucial...,2016,Data Recombination for Neural Semantic Parsing,ACL,"Reproducibility. All code, data, and experimen...",1
3,". Recently, nsur . have shown superior perfor...","st like CWS and POS tagging, automatic prosody...",The recently introduced continuous Skip-gram m...,tomas mikolov;ilya sutskever;kai chen 0010;gre...,1310.4546v1,Distributed Representations of Words and Phras...,NIPS,2013.0,1511.00360v1,chuang ding;lei xie;jie yan;weini zhang;yang liu,Prosody affects the naturalness and intelligib...,2015,Automatic Prosody Prediction for Chinese Speec...,arxiv,"st like CWS and POS tagging, automatic prosody...",2
4,model trained on the Google News dataset3.In a...,We begin by considering a document as the set ...,The recently introduced continuous Skip-gram m...,tomas mikolov;ilya sutskever;kai chen 0010;gre...,1310.4546v1,Distributed Representations of Words and Phras...,NIPS,2013.0,1705.10900v1,paul michel;abhilasha ravichander;shruti rijhwani,We investigate the pertinence of methods from ...,2017,Does the Geometry of Word Embeddings Help Docu...,arxiv,We begin by considering a document as the set ...,3


## Node2Vec Baseline Model

In [3]:
# Fit the Node2Vec baseline and save the trained model to the checkpoint path below
node2vec_ckpt = 'model/node2vec_model.pth'
run_node2vec(df, 'train', node2vec_ckpt)

Epoch: 100, Train Loss: 0.001, Val AUC: 0.642
Epoch: 200, Train Loss: 0.001, Val AUC: 0.644
Epoch: 300, Train Loss: 0.001, Val AUC: 0.645
Epoch: 400, Train Loss: 0.001, Val AUC: 0.646
Epoch: 500, Train Loss: 0.001, Val AUC: 0.647
Epoch: 600, Train Loss: 0.001, Val AUC: 0.648
Epoch: 700, Train Loss: 0.001, Val AUC: 0.648
Epoch: 800, Train Loss: 0.001, Val AUC: 0.649
Epoch: 900, Train Loss: 0.001, Val AUC: 0.649
Epoch: 1000, Train Loss: 0.001, Val AUC: 0.650
Model saved to model/node2vec_model.pth


In [4]:
# Evaluate the Node2Vec model that the training cell saved at this checkpoint path.
# The returned metrics (MRR, MAP@k, Recall@k) are kept for the model comparison below.
node2vec_ckpt = 'model/node2vec_model.pth'
node2vec_result = run_node2vec(df, 'evaluate', node2vec_ckpt)
node2vec_result

{'MRR': 0.1879389495677682,
 'MAP@5': 0.062332882273342084,
 'MAP@10': 0.0579965099105345,
 'MAP@30': 0.05951454702935102,
 'MAP@50': 0.060283805284909224,
 'MAP@80': 0.060862072781913135,
 'Recall@5': 0.07121131709678721,
 'Recall@10': 0.08962863466492975,
 'Recall@30': 0.12709981918330793,
 'Recall@50': 0.1501050697410791,
 'Recall@80': 0.17726019423011827}

## BERT+GNN

In [5]:
# Process each cited text through a BERT model to obtain embeddings.
# Note: Running `generate_embeddings(df)` is resource-intensive and time-consuming.
# If you have pre-generated embeddings and saved them as a .pkl file, you can skip this step.
# Uncomment the line below if you need to generate embeddings.
# generate_embeddings(df)

In [6]:
# Fit the GNN model and save the trained model to the checkpoint path below
gnn_ckpt = 'model/gnn_model.pth'
run_gnn(df, 'train', gnn_ckpt)

Epoch: 100, Train Loss: 0.592, Val AUC: 0.823
Epoch: 200, Train Loss: 0.568, Val AUC: 0.832
Epoch: 300, Train Loss: 0.550, Val AUC: 0.840
Epoch: 400, Train Loss: 0.532, Val AUC: 0.844
Epoch: 500, Train Loss: 0.522, Val AUC: 0.852
Epoch: 600, Train Loss: 0.514, Val AUC: 0.858
Epoch: 700, Train Loss: 0.503, Val AUC: 0.864
Epoch: 800, Train Loss: 0.494, Val AUC: 0.871
Epoch: 900, Train Loss: 0.486, Val AUC: 0.876
Epoch: 1000, Train Loss: 0.480, Val AUC: 0.878
Model saved to model/gnn_model.pth


In [7]:
# Evaluate the GNN model that the training cell saved at this checkpoint path.
# The returned metrics (MRR, MAP@k, Recall@k) are kept for the model comparison below.
gnn_ckpt = 'model/gnn_model.pth'
gnn_result = run_gnn(df, 'evaluate', gnn_ckpt)
gnn_result

{'MRR': 0.21987128506003825,
 'MAP@5': 0.07586393023605455,
 'MAP@10': 0.07415244890117617,
 'MAP@30': 0.08168203471244725,
 'MAP@50': 0.08451303560611,
 'MAP@80': 0.08637684553743262,
 'Recall@5': 0.10321673127555345,
 'Recall@10': 0.15107529303517533,
 'Recall@30': 0.24542951936823074,
 'Recall@50': 0.29673323168396737,
 'Recall@80': 0.3490031253977236}

## Model Comparison

In [8]:
# Map each model's display label to the metrics dict produced by its evaluation cell
models = {
    'node2vec': node2vec_result,  # metrics returned by run_node2vec(df, 'evaluate', ...)
    'BERT+GNN': gnn_result,  # metrics returned by run_gnn(df, 'evaluate', ...)
}

# Build the comparison table (one row per model, one column per metric)
evaluation_table = metric_evaluation_table(models)

# Leave the table as the cell's last expression so Jupyter renders it with its
# rich display; print() falls back to the plain-text repr, which wraps the
# metric columns across two separate blocks.
evaluation_table

               MRR     MAP@5    MAP@10    MAP@30    MAP@50    MAP@80  \
Model                                                                  
node2vec  0.187939  0.062333  0.057997  0.059515  0.060284  0.060862   
BERT+GNN  0.219871  0.075864  0.074152  0.081682  0.084513  0.086377   

          Recall@5  Recall@10  Recall@30  Recall@50  Recall@80  
Model                                                           
node2vec  0.071211   0.089629    0.12710   0.150105   0.177260  
BERT+GNN  0.103217   0.151075    0.24543   0.296733   0.349003  
