In [1]:
# !pip install ogb
# !pip install torch torchvision torchaudio
# !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric


from ogb.nodeproppred import PygNodePropPredDataset

# Load the ogbn-arxiv node-property-prediction dataset. In the recorded run
# the archive was downloaded, extracted and processed under dataset/.
dataset = PygNodePropPredDataset(name="ogbn-arxiv")

# Node-index tensors for the predefined train / valid / test partition.
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

# The first (and, per the progress bars, only) entry is the PyG graph object.
graph = dataset[0]

  _torch_pytree._register_pytree_node(


Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|███████████████████████| 81/81 [08:05<00:00,  6.00s/it]


Extracting dataset/arxiv.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 7570.95it/s]


Converting graphs into PyG objects...


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 808.46it/s]

Saving...



Done!


In [2]:
# One line per object: the three split index tensors and the graph summary.
print(
    *(
        f"{caption} {value}"
        for caption, value in (
            ("Train Indices:", train_idx),
            ("Validation Indices:", valid_idx),
            ("Test Indices:", test_idx),
            ("Graph Object:", graph),
        )
    ),
    sep="\n",
)

Train Indices: tensor([     0,      1,      2,  ..., 169145, 169148, 169251])
Validation Indices: tensor([   349,    357,    366,  ..., 169185, 169261, 169296])
Test Indices: tensor([   346,    398,    451,  ..., 169340, 169341, 169342])
Graph Object: Data(num_nodes=169343, edge_index=[2, 1166243], x=[169343, 128], node_year=[169343, 1], y=[169343, 1])


# Official train / validation / test split

In [4]:
import pandas as pd
import torch

# Create a DataFrame with all node IDs
node_ids = torch.arange(graph.num_nodes)

# Map every node that appears in a split to that split's name.
# A membership test such as `i in train_idx` scans the whole tensor for each
# node (quadratic overall); a single dictionary pass gives the same labels.
# Splits are inserted in reverse priority so that, should a node ever appear
# in more than one split, the original precedence train > valid > test holds.
split_of_node = {
    node: split_name
    for split_name, split_nodes in (('test', test_idx), ('valid', valid_idx), ('train', train_idx))
    for node in split_nodes.tolist()
}

# Create labels for train/valid/test; nodes in no split are 'unlabeled'
labels = [split_of_node.get(node, 'unlabeled') for node in node_ids.tolist()]

# Convert to DataFrame
df_traintest = pd.DataFrame({'node_id': node_ids.tolist(), 'label': labels})

# Show the last rows of the DataFrame (this is what the recorded output displays)
df_traintest.tail(10)


        node_id label
169333   169333  test
169334   169334  test
169335   169335  test
169336   169336  test
169337   169337  test
169338   169338  test
169339   169339  test
169340   169340  test
169341   169341  test
169342   169342  test


In [5]:
# Tally how many nodes carry each split label, then show the tally.
unique_values = df_traintest.loc[:, 'label'].value_counts()
print(unique_values)

label
train    90941
test     48603
valid    29799
Name: count, dtype: int64


# labels

In [6]:
# Class labels are taken from graph.y as-is. NOTE: no squeeze is applied here;
# the graph summary reports y=[169343, 1], so .tolist() below produces one
# single-element list per node (e.g. [4]). The later label-extraction cell
# relies on that shape when it reads row['label'][0].
node_labels = graph.y

# Convert to DataFrame: one row per node, 'label' holding the single-element list
df_labels = pd.DataFrame({'node_id': node_ids.tolist(), 'label': node_labels.tolist()})

# Show the first few rows of the DataFrame
print(df_labels.head())
# Count how many nodes fall into each class
# (this rebinds `unique_values`, previously the split counts)
unique_values = df_labels['label'].value_counts()

# Print the unique values and their counts
print(unique_values)

   node_id label
0        0   [4]
1        1   [5]
2        2  [28]
3        3   [8]
4        4  [27]
label
[16]    27321
[24]    22187
[28]    21406
[30]    11814
[10]     7869
[34]     7867
[8]      6232
[4]      5862
[5]      4958
[2]      4839
[27]     4801
[26]     4605
[36]     3524
[19]     2877
[23]     2834
[31]     2828
[9]      2820
[37]     2369
[13]     2358
[3]      2080
[20]     2076
[39]     2029
[22]     1903
[6]      1618
[38]     1507
[33]     1271
[25]     1257
[11]      750
[18]      749
[1]       687
[14]      597
[7]       589
[0]       565
[17]      515
[29]      416
[32]      411
[15]      403
[21]      393
[35]      127
[12]       29
Name: count, dtype: int64


In [7]:
# Report the size of the graph.
num_nodes, num_edges = graph.num_nodes, graph.num_edges
print(
    f"Number of nodes: {num_nodes}",
    f"Number of edges: {num_edges}",
    sep="\n",
)

# node_features = graph.x  # 'x' typically contains the node features
# print("Node features shape:", node_features.shape)
# print("First 5 node features:\n", node_features[:5])

# Connectivity tensor: two rows, one column per edge.
edge_index = graph.edge_index
first_edges = edge_index[:, :5]
print("Edge index shape:", edge_index.shape)
print("First 5 edges:\n", first_edges)


Number of nodes: 169343
Number of edges: 1166243
Edge index shape: torch.Size([2, 1166243])
First 5 edges:
 tensor([[104447,  15858, 107156, 107156, 107156],
        [ 13091,  47283,  69161, 136440, 107366]])


# Title and abstract

In [10]:
# Node index -> paper id mapping, read from the dataset's mapping folder.
df_node_mapping = pd.read_csv(
    'dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv',
)
# Tab-separated table with one row per paper: paper id, title, abstract.
df_paper_data = pd.read_csv(
    'new_titles_abstracts.tsv',
    sep='\t',
)
df_paper_data.head(n=5)

Unnamed: 0,paper id,title,abstract
0,200971,ontology as a source for rule generation,This paper discloses the potential of OWL (Web...
1,549074,a novel methodology for thermal analysis a 3 d...,The semiconductor industry is reaching a fasci...
2,630234,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...
3,803423,multi view metric learning for multi view vide...,Traditional methods on video summarization are...
4,1102481,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...


In [11]:
# Preview the node index -> paper id mapping (first five rows).
df_node_mapping.head(n=5)

Unnamed: 0,node idx,paper id
0,0,9657784
1,1,39886162
2,2,116214155
3,3,121432379
4,4,231147053


In [13]:

# Step 2: Merge the two dataframes on 'paper id'.
# validate='one_to_one' makes pandas raise if a paper id is duplicated on
# either side, which would otherwise silently multiply rows.
df_titleabs = pd.merge(
    df_node_mapping,
    df_paper_data,
    on='paper id',
    how='inner',
    validate='one_to_one',
)

# Step 3: Rename the node index column for clarity, then order rows by node id
# so that row position i corresponds to node i.
df_titleabs = (
    df_titleabs
    .rename(columns={'node idx': 'node_id'})
    .sort_values('node_id')
    .reset_index(drop=True)
)

# Later cells pair sentences, labels and split tags purely by row position and
# write that position out as the node id. An inner merge drops every node
# without a title/abstract row, so fail loudly instead of misaligning the data.
if len(df_titleabs) != len(df_node_mapping) or not (df_titleabs['node_id'] == df_titleabs.index).all():
    raise ValueError(
        f"Title/abstract table is not aligned with the node mapping: "
        f"{len(df_titleabs)} merged rows for {len(df_node_mapping)} nodes"
    )

# Step 4: Show the first and last row of the new dataframe
print(df_titleabs[['node_id', 'title', 'abstract']].head(1))

print(df_titleabs[['node_id', 'title', 'abstract']].tail(1))

   node_id                                              title  \
0        0  evasion attacks against machine learning at te...   

                                            abstract  
0  In security-sensitive applications, the succes...  
        node_id                                              title  \
169342   169342  fauras a proxy based framework for ensuring th...   

                                                 abstract  
169342  HTTP/2 video streaming has caught a lot of att...  


# Prepare the corpus and label files

In [14]:
# One document per node: the title and the abstract joined by a single space.
# Zipping the two columns yields the same strings as iterating with
# DataFrame.iterrows(), without building a Series object for every row.
sentences = [
    f"{title} {abstract}"
    for title, abstract in zip(df_titleabs['title'], df_titleabs['abstract'])
]

In [15]:
# Each 'label' cell is a single-element list (see the df_labels preview, e.g.
# [4]); take that element and store it as a string. Iterating the column
# directly avoids the per-row Series that DataFrame.iterrows() constructs.
# Note: this rebinds `labels`, which earlier held the split name of each node.
labels = [str(label_cell[0]) for label_cell in df_labels['label']]

In [16]:
import random

import math

# Define the percentages for train, test, and validation.
# NOTE: this is a fresh random split; the official OGB split stored in
# df_traintest is only used here for its row count.
train_percentage = 0.70  # 70% of the data in the training set
val_percentage = 0.15    # 15% of the data in the validation set
test_percentage = 0.15   # 15% of the data in the test set

# Ensure percentages sum to 1. Compare with a tolerance: exact float equality
# can fail for perfectly valid fractions (e.g. 0.1 + 0.2 + 0.7).
assert math.isclose(train_percentage + val_percentage + test_percentage, 1.0), "Percentages must sum to 1"

# Get the total number of rows in the data
num_rows = len(df_traintest)

# Generate a shuffled list of indices. A dedicated, seeded generator makes the
# split identical on every Restart & Run All without touching the global
# `random` state.
SPLIT_SEED = 42
indices = list(range(num_rows))
random.Random(SPLIT_SEED).shuffle(indices)

# Calculate the number of rows for each split
num_train = int(train_percentage * num_rows)
num_val = int(val_percentage * num_rows)
num_test = num_rows - num_train - num_val  # Remaining rows go to test

# Assign each row to train, validation, or test
train_indices = indices[:num_train]
val_indices = indices[num_train:num_train + num_val]
test_indices = indices[num_train + num_val:]

# Tag every row with its split. Membership is checked against sets: testing
# `i in some_list` for every row is quadratic in the number of rows.
train_index_set = set(train_indices)
val_index_set = set(val_indices)
train_or_test_list = [
    'train' if i in train_index_set else 'val' if i in val_index_set else 'test'
    for i in range(num_rows)
]



In [18]:
# Spot-check every 50,000th entry of the three parallel lists, one list per line.
print(
    sentences[::50000],
    labels[::50000],
    train_or_test_list[::50000],
    sep="\n",
)

["evasion attacks against machine learning at test time In security-sensitive applications, the success of machine learning depends on a thorough vetting of their resistance to adversarial data. In one pertinent, well-motivated attack scenario, an adversary may attempt to evade a deployed system at test time by carefully manipulating attack samples. In this work, we present a simple but effective gradient-based approach that can be exploited to systematically assess the security of several, widely-used classification algorithms against evasion attacks. Following a recently proposed framework for security evaluation, we simulate attack scenarios that exhibit different risk levels for the classifier by increasing the attacker's knowledge of the system and her ability to manipulate attack samples. This gives the classifier designer a better picture of the classifier performance under evasion attacks, and allows him to perform a more informed model selection (or parameter setting). We eval

In [19]:
# sentences = [
#     f"{row['title']} {row['abstract']}"
#     for index, row in df_titleabs.iterrows()
# ]

# # Prepare labels based on the df_labels
# labels = [
#     str(row['label'][0])  # Assuming the label is in a list and you want the first element
#     for index, row in df_labels.iterrows()
# ]

# # Prepare train/test list based on df_traintest
# train_or_test_list = [
#     'train' if row['node_id'] < 169338 else 'test'  # Change this logic as per your requirements
#     for index, row in df_traintest.iterrows()
# ]

import os

# The three lists are paired purely by position, so they must be equally long;
# otherwise the label file and the corpus file would describe different rows.
if not (len(sentences) == len(labels) == len(train_or_test_list)):
    raise ValueError(
        f"Length mismatch: {len(sentences)} sentences, {len(labels)} labels, "
        f"{len(train_or_test_list)} split tags"
    )

# Prepare meta data: one "<row index>\t<split>\t<label>" line per document
meta_data_list = [
    f"{i}\t{train_or_test_list[i]}\t{labels[i]}"
    for i in range(len(sentences))
]

meta_data_str = '\n'.join(meta_data_list)

# Make sure the output folders exist (creating data/corpus also creates data/)
os.makedirs('data/corpus', exist_ok=True)

# Write meta data to file. The encoding is fixed so the result does not depend
# on the platform's default locale.
with open('data/all_labels.txt', 'w', encoding='utf-8') as f:
    f.write(meta_data_str)

# Write sentences (corpus) to file, one document per line. Line breaks inside
# a document are replaced by spaces; otherwise line N of the corpus would stop
# matching line N of the label file.
corpus_str = '\n'.join(
    sentence.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
    for sentence in sentences
)
with open('data/corpus/all_corpus.txt', 'w', encoding='utf-8') as f:
    f.write(corpus_str)

In [20]:
# Spot-check every 50,000th metadata line ("<row index>\t<split>\t<label>").
print(meta_data_list[0::50000])


['0\ttrain\t4', '50000\ttrain\t23', '100000\ttrain\t13', '150000\ttest\t16']


# class labels

In [21]:
import pandas as pd

# Read the label index -> arXiv category table from the dataset's mapping folder.
df = pd.read_csv('dataset/ogbn_arxiv/mapping/labelidx2arxivcategeory.csv')

# Preview the first rows, then list the column names.
print(df.head(), df.columns, sep="\n")

   label idx arxiv category
0          0    arxiv cs na
1          1    arxiv cs mm
2          2    arxiv cs lo
3          3    arxiv cs cy
4          4    arxiv cs cr
Index(['label idx', 'arxiv category'], dtype='object')
