In [3]:
# One-time environment setup (uncomment and run if the packages are missing):
# !pip install ogb
# !pip install torch torchvision torchaudio
# !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric


from ogb.nodeproppred import PygNodePropPredDataset

# Load the ogbn-arxiv node-property-prediction dataset in PyG format.
dataset = PygNodePropPredDataset(name="ogbn-arxiv")

# First element of the dataset: the PyG graph object.
graph = dataset[0]

# Node-id tensors for the official train / valid / test split.
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]



  _torch_pytree._register_pytree_node(


# Train / validation / test split

In [5]:
import pandas as pd
import torch

# One id per node: 0 .. graph.num_nodes - 1.
node_ids = torch.arange(graph.num_nodes)

# Tag every node with the split it belongs to.
#
# The previous implementation evaluated `i in train_idx` (and valid/test) for
# each node; every such test scans the whole index tensor, which makes the
# comprehension quadratic in the number of nodes. Writing integer codes through
# index assignment produces the same tags in a single pass per split.
SPLIT_NAMES = ['unlabeled', 'train', 'valid', 'test']
split_codes = torch.zeros(graph.num_nodes, dtype=torch.long)  # 0 -> 'unlabeled'

# Assign in reverse priority so that, if an id ever appears in more than one
# split, 'train' wins over 'valid', which wins over 'test' -- the same
# precedence as the original chained conditional.
for split_name, split_ids in (('test', test_idx), ('valid', valid_idx), ('train', train_idx)):
    split_codes[torch.as_tensor(split_ids, dtype=torch.long)] = SPLIT_NAMES.index(split_name)

labels = [SPLIT_NAMES[code] for code in split_codes.tolist()]

# Convert to DataFrame: one row per node with its split name in 'label'.
df_traintest = pd.DataFrame({'node_id': node_ids.tolist(), 'label': labels})

# Show the last few rows of the DataFrame
print(df_traintest.tail())


        node_id label
169338   169338  test
169339   169339  test
169340   169340  test
169341   169341  test
169342   169342  test


# Node labels

In [7]:
# graph.y is used as-is (it is NOT squeezed here), so every entry of the
# 'label' column below is a one-element list such as [4].
node_labels = graph.y

# One row per node: its id and its (list-wrapped) class label.
df_labels = pd.DataFrame(
    dict(
        node_id=node_ids.tolist(),
        label=node_labels.tolist(),
    )
)

# Preview the first rows.
print(df_labels.head())

   node_id label
0        0   [4]
1        1   [5]
2        2  [28]
3        3   [8]
4        4  [27]


In [18]:
# Basic size of the graph.
num_nodes, num_edges = graph.num_nodes, graph.num_edges
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")

# Optional: inspect the node feature matrix stored in graph.x.
# node_features = graph.x
# print("Node features shape:", node_features.shape)
# print("First 5 node features:\n", node_features[:5])

# Node labels (graph.y) and the edge list (graph.edge_index).
node_labels = graph.y
edge_index = graph.edge_index

print("Node labels shape:", node_labels.shape)
print("First 5 node labels:\n", node_labels[:5])

print("Edge index shape:", edge_index.shape)
# Each column of edge_index is one edge, so slice columns to preview edges.
print("First 5 edges:\n", edge_index[:, :5])


# Optional: list the distinct label values.
# unique_labels = torch.unique(node_labels)
# print("Unique node labels:", unique_labels)

Number of nodes: 169343
Number of edges: 1166243
Node labels shape: torch.Size([169343, 1])
First 5 node labels:
 tensor([[ 4],
        [ 5],
        [28],
        [ 8],
        [27]])
Edge index shape: torch.Size([2, 1166243])
First 5 edges:
 tensor([[104447,  15858, 107156, 107156, 107156],
        [ 13091,  47283,  69161, 136440, 107366]])


# Title and abstract

In [13]:
# Node index -> paper id mapping, read from the dataset's mapping folder.
node_mapping_csv = 'dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv'
df_node_mapping = pd.read_csv(node_mapping_csv)

# Tab-separated table with one row per paper (paper id, title, abstract).
paper_data_tsv = 'new_titles_abstracts.tsv'
df_paper_data = pd.read_csv(paper_data_tsv, sep='\t')

# Preview the paper table.
df_paper_data.head()

Unnamed: 0,paper id,title,abstract
0,200971,ontology as a source for rule generation,This paper discloses the potential of OWL (Web...
1,549074,a novel methodology for thermal analysis a 3 d...,The semiconductor industry is reaching a fasci...
2,630234,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...
3,803423,multi view metric learning for multi view vide...,Traditional methods on video summarization are...
4,1102481,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...


In [15]:
# Preview the node-index -> paper-id mapping (first five rows).
df_node_mapping.head(n=5)

Unnamed: 0,node idx,paper id
0,0,9657784
1,1,39886162
2,2,116214155
3,3,121432379
4,4,231147053


In [16]:

# Attach title and abstract to every node: inner-join the node mapping with the
# paper table on their shared 'paper id' column, then expose the node index
# under the name 'node_id' used by the other frames in this notebook.
df_titleabs = (
    df_node_mapping
    .merge(df_paper_data, on='paper id', how='inner')
    .rename(columns={'node idx': 'node_id'})
)

# Preview the columns used downstream.
preview_columns = ['node_id', 'title', 'abstract']
print(df_titleabs[preview_columns].head())



   node_id                                              title  \
0        0  evasion attacks against machine learning at te...   
1        1  how hard is computing parity with noisy commun...   
2        2  on the absence of the rip in real world applic...   
3        3      a promise theory perspective on data networks   
4        4  analysis of asymptotically optimal sampling ba...   

                                            abstract  
0  In security-sensitive applications, the succes...  
1  We show a tight lower bound of $\Omega(N \log\...  
2  The purpose of this paper is twofold. The firs...  
3  Networking is undergoing a transformation thro...  
4  Over the last 20 years significant effort has ...  


# Prepare data: export the text corpus and per-node metadata

In [20]:
from pathlib import Path

# Export two line-aligned text files:
#   data/corpus/graph_node_corpus.txt  -- one "<title> <abstract>" document per line
#   data/graph_node_labels.txt         -- one "<node_id>\t<train|test>\t<class label>" per line
#
# Text, class label and split live in three different frames. They are joined
# on node_id so that each exported row describes a single node even if the
# inner merge that built df_titleabs dropped or reordered papers (pairing the
# frames purely by position would silently misalign them in that case).
df_export = (
    df_titleabs[['node_id', 'title', 'abstract']]
    .merge(
        df_labels[['node_id', 'label']].rename(columns={'label': 'class_label'}),
        on='node_id',
        how='inner',
    )
    .merge(
        df_traintest[['node_id', 'label']].rename(columns={'label': 'split'}),
        on='node_id',
        how='inner',
    )
    .sort_values('node_id', kind='stable')
    .reset_index(drop=True)
)
if len(df_export) != len(df_labels):
    print(f"Warning: exporting {len(df_export)} rows for {len(df_labels)} labelled nodes")


def _single_line(text):
    """Return the Series as strings, with missing values blanked and line breaks turned into spaces."""
    # A raw line break inside a title/abstract would split one document over
    # several corpus lines and shift every following line against the metadata.
    return text.fillna('').astype(str).str.replace(r'[\r\n]+', ' ', regex=True)


# One document per node: title followed by abstract.
sentences = (
    _single_line(df_export['title']) + ' ' + _single_line(df_export['abstract'])
).tolist()

# Class label as a string; each stored label is a one-element list such as [4].
labels = [str(wrapped_label[0]) for wrapped_label in df_export['class_label']]

# Use the real split computed in df_traintest instead of a hard-coded node-id
# threshold (node ids are not ordered by split, so a threshold tags most test
# nodes as 'train'). The file keeps its two-valued train/test vocabulary:
# nodes of the official test split are 'test', everything else is 'train'.
# NOTE(review): this folds the 'valid' split into 'train' -- change the mapping
# here if the consumer of this file understands a separate validation tag.
train_or_test_list = [
    'test' if split_name == 'test' else 'train'
    for split_name in df_export['split']
]

assert len(sentences) == len(labels) == len(train_or_test_list)

# Prepare meta data: "<node_id>\t<train|test>\t<class label>", same order as sentences.
# node_id equals the line position whenever every node has a title/abstract.
meta_data_list = [
    f"{node_id}\t{split_tag}\t{class_label}"
    for node_id, split_tag, class_label in zip(
        df_export['node_id'].tolist(), train_or_test_list, labels
    )
]
meta_data_str = '\n'.join(meta_data_list)
corpus_str = '\n'.join(sentences)

# Create the output folders if needed and write both files as UTF-8 so the
# result does not depend on the platform's default encoding.
labels_path = Path('data/graph_node_labels.txt')
corpus_path = Path('data/corpus/graph_node_corpus.txt')
labels_path.parent.mkdir(parents=True, exist_ok=True)
corpus_path.parent.mkdir(parents=True, exist_ok=True)

# Write meta data to file
labels_path.write_text(meta_data_str, encoding='utf-8')

# Write sentences (corpus) to file
corpus_path.write_text(corpus_str, encoding='utf-8')

In [21]:
# Sanity check: first five metadata lines (printed) and the matching documents
# (shown as the cell's result).
print(meta_data_list[:5])
sentences[:5]

['0\ttrain\t4', '1\ttrain\t5', '2\ttrain\t28', '3\ttrain\t8', '4\ttrain\t27']


["evasion attacks against machine learning at test time In security-sensitive applications, the success of machine learning depends on a thorough vetting of their resistance to adversarial data. In one pertinent, well-motivated attack scenario, an adversary may attempt to evade a deployed system at test time by carefully manipulating attack samples. In this work, we present a simple but effective gradient-based approach that can be exploited to systematically assess the security of several, widely-used classification algorithms against evasion attacks. Following a recently proposed framework for security evaluation, we simulate attack scenarios that exhibit different risk levels for the classifier by increasing the attacker's knowledge of the system and her ability to manipulate attack samples. This gives the classifier designer a better picture of the classifier performance under evasion attacks, and allows him to perform a more informed model selection (or parameter setting). We eval

# Class labels: label index to arXiv category mapping

In [17]:
import pandas as pd

# Label-index -> arXiv-category lookup table from the dataset's mapping folder.
label_mapping_csv = 'dataset/ogbn_arxiv/mapping/labelidx2arxivcategeory.csv'
df = pd.read_csv(label_mapping_csv)

# Preview the first rows, then list the column names.
print(df.head())
print(df.columns)

   label idx arxiv category
0          0    arxiv cs na
1          1    arxiv cs mm
2          2    arxiv cs lo
3          3    arxiv cs cy
4          4    arxiv cs cr
Index(['label idx', 'arxiv category'], dtype='object')
