<a href="https://colab.research.google.com/github/bruhCarrotz/hw3-109006234/blob/main/hw3_datascience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing the DGL Package

In [None]:
!pip install --upgrade dgl

Collecting dgl
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2->torchdata>=0.5.0->dgl)
  Using cached nvidia_cublas_cu12-12.1.3.1

### Replacing Traditional Command-Line Argument Parsing

In [None]:
def parse_arguments(argv=None):
    """
    Build the run configuration from command-line style flags.

    Recognised flags:
        --epochs N      number of training epochs (default 200)
        --es_iters N    early-stopping patience (default 10)
        --use-gpu       request the GPU (default behaviour)
        --no-gpu        force the CPU

    Tokens that are not recognised are skipped instead of rejected, so the
    launcher arguments a Jupyter kernel places in ``sys.argv`` do not break
    the notebook.

    Args:
        argv: Full argument list including the program name at index 0.
            Defaults to ``sys.argv``.

    Returns:
        dict with the keys 'epochs' (int), 'es_iters' (int) and
        'use_gpu' (bool).

    Raises:
        ValueError: if a value flag is the last token, or its value is not
            an integer.
    """
    # Imported locally so this cell does not depend on a later cell's imports.
    import sys

    if argv is None:
        argv = sys.argv

    args = {
        'epochs': 200,
        'es_iters': 10,
        'use_gpu': True
    }

    # Flags that consume the following token as an integer value.
    int_flags = {'--epochs': 'epochs', '--es_iters': 'es_iters'}

    # Index 0 is the program name, so scanning starts at 1.
    i = 1
    while i < len(argv):
        token = argv[i]
        if token in int_flags:
            if i + 1 >= len(argv):
                raise ValueError(f"{token} expects an integer value")
            args[int_flags[token]] = int(argv[i + 1])
            i += 1  # skip the value that was just consumed
        elif token == '--use-gpu':
            args['use_gpu'] = True
        elif token == '--no-gpu':
            args['use_gpu'] = False
        i += 1

    return args

### Load Data from the Provided Dataset

In [None]:
import pickle as pkl
import sys

def load_data():
    """
    Read the nine pickled dataset pieces from the `dataset` folder.

    * Files are named `dataset/private_<name>.pkl`, one per entry below.
    * test_labels is an array of length 1000 with each element being -1.
    * train_mask, val_mask, and test_mask are used to indicate the index of
      each set of nodes.

    Returns a 9-tuple in this order: features, graph, num_classes,
    train_labels, val_labels, test_labels, train_mask, val_mask, test_mask.
    """
    keys = (
        'features', 'graph', 'num_classes',
        'train_labels', 'val_labels', 'test_labels',
        'train_mask', 'val_mask', 'test_mask',
    )

    # On Python 3 the pickles are decoded with latin1; otherwise the
    # default pickle.load call is used.
    extra = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}

    # NOTE: pickle.load can execute code embedded in the file; only point
    # this at dataset files from a trusted source.
    loaded = []
    for key in keys:
        with open("dataset/private_{}.pkl".format(key), 'rb') as handle:
            loaded.append(pkl.load(handle, **extra))

    (features, graph, num_classes,
     train_labels, val_labels, test_labels,
     train_mask, val_mask, test_mask) = loaded

    return (features, graph, num_classes,
            train_labels, val_labels, test_labels,
            train_mask, val_mask, test_mask)

### GCN Baseline and GAT Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
from dgl.nn.pytorch import GATConv, GraphConv

class GCN(nn.Module):
    """
    Baseline two-layer graph convolutional network built on DGL.

    - Same layout as the reference model at https://github.com/tkipf/pygcn
    - First layer: GraphConv with ReLU; second layer: plain GraphConv
    - Dropout (p=0.5) is applied to the input of every layer except the first
    """
    def __init__(self, in_size, hid_size, out_size):
        super().__init__()
        # Hidden layer first, then the output layer, kept in a ModuleList.
        self.layers = nn.ModuleList([
            GraphConv(in_size, hid_size, activation=F.relu),
            GraphConv(hid_size, out_size),
        ])
        self.dropout = nn.Dropout(0.5)

    def forward(self, g, features):
        """Run `features` through the layers on graph `g` and return the last layer's output."""
        first_conv, *later_convs = self.layers
        h = first_conv(g, features)
        for conv in later_convs:
            # Dropout goes between layers, never on the raw input features.
            h = conv(g, self.dropout(h))
        return h

class GAT(nn.Module):
    """
    Graph Attention Network (GAT) built from DGL `GATConv` layers.

    Every layer except the last concatenates its attention heads and is
    followed by ELU and dropout. The last layer averages its heads and
    returns raw logits, so the output has exactly `out_size` columns and can
    be fed directly to a cross-entropy loss or argmax.

    Args:
        in_size: Number of input features per node.
        hid_size: Output features per head in the non-final layers.
        out_size: Number of output classes.
        num_heads: Attention heads used by every layer.
        num_layers: Total number of GATConv layers; values below 2 still
            produce the input layer plus the output layer.
        dropout: Dropout probability applied after each non-final layer.
    """
    def __init__(self, in_size, hid_size, out_size, num_heads=1, num_layers=2, dropout=0.5):
        super(GAT, self).__init__()
        self.layers = nn.ModuleList()
        # Input layer
        self.layers.append(GATConv(in_size, hid_size, num_heads=num_heads))
        # Hidden layers: heads are concatenated, hence hid_size * num_heads inputs
        for _ in range(num_layers - 2):
            self.layers.append(GATConv(hid_size * num_heads, hid_size, num_heads=num_heads))
        # Output layer
        self.layers.append(GATConv(hid_size * num_heads, out_size, num_heads=num_heads))
        self.dropout = nn.Dropout(dropout)

    def forward(self, g, features):
        """Return per-node class logits with `out_size` columns for graph `g`."""
        h = features
        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            # GATConv yields one output slice per head along dimension 1.
            h = layer(g, h)
            if i == last:
                # Average the heads and skip the activation: concatenating
                # here would produce out_size * num_heads pseudo-classes, and
                # ELU would clamp negative logits at -1.
                h = h.mean(1)
            else:
                h = self.dropout(F.elu(h.flatten(1)))
        return h


DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


### Evaluating and Training the Model

In [None]:
import torch
import torch.nn as nn

import dgl.function as fn

import os
import warnings
warnings.filterwarnings("ignore")

def evaluate(g, features, labels, mask, model):
    """Return the fraction of masked nodes whose predicted class equals `labels`.

    The model is switched to evaluation mode and run without gradient
    tracking; the prediction for each node is the index of its largest logit.
    """
    model.eval()
    with torch.no_grad():
        masked_logits = model(g, features)[mask]
        # torch.max over dim=1 returns (values, indices); only indices matter.
        predicted = torch.max(masked_logits, dim=1)[1]
        n_hits = torch.sum(predicted == labels).item()
    return n_hits * 1.0 / len(labels)

def train(g, features, train_labels, val_labels, train_mask, val_mask, model, epochs, es_iters=None):
    """
    Train `model` full-batch with Adam and cross-entropy, with optional early stopping.

    After every optimizer step the model is run once in evaluation mode
    without gradients; that single pass provides both the validation accuracy
    that is printed and the validation loss that early stopping monitors.
    The model is left in evaluation mode after the last epoch.

    Args:
        g: Graph object forwarded to `model`.
        features: Node features forwarded to `model`.
        train_labels: Targets for the nodes selected by `train_mask`.
        val_labels: Targets for the nodes selected by `val_mask`.
        train_mask: Index/mask selecting the training rows of the logits.
        val_mask: Index/mask selecting the validation rows of the logits.
        model: Module called as `model(g, features)`.
        epochs: Maximum number of epochs.
        es_iters: Stop once the validation loss has not improved for this
            many consecutive epochs; falsy disables early stopping.
    """
    # define loss function and optimizer
    loss_fcn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=5e-4)

    # If early stopping criteria, initialize relevant parameters
    if es_iters:
        print("Early stopping monitoring on")
        loss_min = 1e8
        es_i = 0

    # training loop
    for epoch in range(epochs):
        model.train()
        logits = model(g, features)
        loss = loss_fcn(logits[train_mask], train_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation metrics must reflect the updated weights with dropout
        # disabled, so they come from a fresh eval-mode pass rather than from
        # the training-mode logits computed before the optimizer step.
        model.eval()
        with torch.no_grad():
            val_logits = model(g, features)[val_mask]
            val_loss = loss_fcn(val_logits, val_labels).item()
            _, val_pred = torch.max(val_logits, dim=1)
            acc = torch.sum(val_pred == val_labels).item() * 1.0 / len(val_labels)

        print(
            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
                epoch, loss.item(), acc
            )
        )

        if es_iters:
            if val_loss < loss_min:
                loss_min = val_loss
                es_i = 0
            else:
                es_i += 1

            if es_i >= es_iters:
                print(f"Early stopping at epoch={epoch+1}")
                break

### Driver Function

In [None]:
if __name__ == '__main__':
    # Driver: configure the run, load the dataset, train the GAT model and
    # write the test-set predictions to output.csv.
    args = parse_arguments()

    if args['use_gpu']:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device("cpu")

    # Load data
    features, graph, num_classes, \
    train_labels, val_labels, test_labels, \
    train_mask, val_mask, test_mask = load_data()

    # The model is moved to `device` below, so its inputs must live there too;
    # otherwise a CUDA runtime fails with a device-mismatch error. Objects
    # without a `.to` method (e.g. plain index arrays) are left untouched.
    def _to_device(obj):
        return obj.to(device) if hasattr(obj, "to") else obj

    graph = _to_device(graph)
    features = _to_device(features)
    train_labels = _to_device(train_labels)
    val_labels = _to_device(val_labels)
    train_mask = _to_device(train_mask)
    val_mask = _to_device(val_mask)
    test_mask = _to_device(test_mask)

    # Initialize the model (GAT replaces the GCN baseline)
    in_size = features.shape[1]
    out_size = num_classes
    model = GAT(in_size, 32, out_size, num_heads=4, num_layers=2, dropout=0.1)
    model.to(device)

    # model training
    print("Training...")
    train(graph, features, train_labels, val_labels, train_mask, val_mask, model, args['epochs'], args['es_iters'])

    print("Testing...")
    model.eval()
    with torch.no_grad():
        logits = model(graph, features)
        logits = logits[test_mask]
        _, indices = torch.max(logits, dim=1)

    # Export predictions as csv file
    print("Export predictions as csv file.")
    with open('output.csv', 'w') as f:
        f.write('ID,Predict\n')
        # tolist() yields plain Python ints regardless of the tensor's device.
        for idx, pred in enumerate(indices.tolist()):
            f.write(f'{idx},{pred}\n')
    # Please remember to upload your output.csv file to Kaggle for scoring

Training...
Early stopping monitoring on
Epoch 00000 | Loss 2.4880 | Accuracy 0.5717 
Epoch 00001 | Loss 2.2966 | Accuracy 0.6233 
Epoch 00002 | Loss 2.0900 | Accuracy 0.6350 
Epoch 00003 | Loss 1.8709 | Accuracy 0.6267 
Epoch 00004 | Loss 1.6597 | Accuracy 0.6350 
Epoch 00005 | Loss 1.4717 | Accuracy 0.6483 
Epoch 00006 | Loss 1.2886 | Accuracy 0.6783 
Epoch 00007 | Loss 1.1590 | Accuracy 0.7150 
Epoch 00008 | Loss 1.0651 | Accuracy 0.7300 
Epoch 00009 | Loss 0.9981 | Accuracy 0.7300 
Epoch 00010 | Loss 0.9253 | Accuracy 0.7217 
Epoch 00011 | Loss 0.8796 | Accuracy 0.7167 
Epoch 00012 | Loss 0.8292 | Accuracy 0.7200 
Epoch 00013 | Loss 0.8017 | Accuracy 0.7183 
Epoch 00014 | Loss 0.7582 | Accuracy 0.7183 
Epoch 00015 | Loss 0.7378 | Accuracy 0.7300 
Epoch 00016 | Loss 0.7087 | Accuracy 0.7317 
Epoch 00017 | Loss 0.6539 | Accuracy 0.7383 
Epoch 00018 | Loss 0.6519 | Accuracy 0.7383 
Epoch 00019 | Loss 0.6152 | Accuracy 0.7400 
Epoch 00020 | Loss 0.5808 | Accuracy 0.7433 
Epoch 00021 | 

### Finding the Best Hyperparameters for the GAT Model

In [None]:
import random

# Define the search space for hyperparameters
search_space = {
    'hid_size': [8, 16, 32],
    'num_heads': [2, 4, 8],
    'num_layers': [2, 3, 4],
    'dropout': [0.1, 0.01, 0.001]
}

# Set the number of trials
num_trials = 10

# Fixed seed so the sampled configurations, and each trial's weight
# initialisation and dropout masks, are repeatable across
# Restart & Run All. A dedicated generator leaves the global `random`
# state untouched.
SEARCH_SEED = 42
search_rng = random.Random(SEARCH_SEED)

best_accuracy = 0
best_hyperparameters = {}

# Perform random search
for i in range(num_trials):
    # Randomly sample hyperparameters
    hyperparameters = {
        'in_size': features.shape[1],
        'hid_size': search_rng.choice(search_space['hid_size']),
        'out_size': num_classes,
        'num_heads': search_rng.choice(search_space['num_heads']),
        'num_layers': search_rng.choice(search_space['num_layers']),
        'dropout': search_rng.choice(search_space['dropout'])
    }

    print(f"Trial {i+1}/{num_trials}: Hyperparameters - {hyperparameters}")

    # Seed torch per trial so a given trial is reproducible on its own.
    torch.manual_seed(SEARCH_SEED + i)

    # Initialize and train the model with the sampled hyperparameters
    model = GAT(**hyperparameters)
    model.to(device)
    train(graph, features, train_labels, val_labels, train_mask, val_mask, model, epochs=args['epochs'], es_iters=args['es_iters'])

    # Evaluate the model on the validation set
    accuracy = evaluate(graph, features, val_labels, val_mask, model)

    print(f"Validation Accuracy: {accuracy}")

    # Keep track of the best hyperparameters (strict '>' keeps the earliest
    # trial when several tie)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_hyperparameters = dict(hyperparameters)

print("Best Hyperparameters:", best_hyperparameters)
print("Best Accuracy:", best_accuracy)


Trial 1/10: Hyperparameters - {'in_size': 478, 'hid_size': 8, 'out_size': 3, 'num_heads': 4, 'num_layers': 2, 'dropout': 0.1}
Early stopping monitoring on
Epoch 00000 | Loss 2.4919 | Accuracy 0.4317 
Epoch 00001 | Loss 2.3950 | Accuracy 0.4050 
Epoch 00002 | Loss 2.2990 | Accuracy 0.4050 
Epoch 00003 | Loss 2.1962 | Accuracy 0.4050 
Epoch 00004 | Loss 2.0853 | Accuracy 0.4050 
Epoch 00005 | Loss 1.9813 | Accuracy 0.4050 
Epoch 00006 | Loss 1.8685 | Accuracy 0.4050 
Epoch 00007 | Loss 1.7666 | Accuracy 0.4050 
Epoch 00008 | Loss 1.6686 | Accuracy 0.4050 
Epoch 00009 | Loss 1.5667 | Accuracy 0.4050 
Epoch 00010 | Loss 1.4890 | Accuracy 0.4050 
Epoch 00011 | Loss 1.4056 | Accuracy 0.4050 
Epoch 00012 | Loss 1.3172 | Accuracy 0.4050 
Epoch 00013 | Loss 1.2786 | Accuracy 0.4100 
Epoch 00014 | Loss 1.2107 | Accuracy 0.4100 
Epoch 00015 | Loss 1.1576 | Accuracy 0.4200 
Epoch 00016 | Loss 1.1020 | Accuracy 0.4700 
Epoch 00017 | Loss 1.0721 | Accuracy 0.6433 
Epoch 00018 | Loss 1.0457 | Accurac