In [18]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from morphopy.computation import file_manager as fm
from morphopy.neurontree import NeuronTree as nt
import os
import re
import torch
from torch_geometric.utils.convert import from_networkx
from torch_geometric.data import Dataset, Data
import pandas as pd
from sklearn.preprocessing import LabelEncoder


## Directory for Gouwens Dataset

In [None]:
# Directory that is scanned for "*_transformed.swc" morphology files (see get_graphs).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
GOUWENS_DIR = "/external/rprshnas01/netdata_kcni/stlab/Public/AIBS_patchseq_2020/mouse/morphology/download.brainlib.org+8811/biccn/zeng/pseq/morph/200526/"

## Preprocessing

In [None]:
def extract_id_from_file(filename):
    """Return the leading id token of a file name.

    The id is everything in the last "/"-separated path component that
    precedes the first underscore (the whole component if it has none).

    Parameters
    ----------
    filename : str
        File name or "/"-separated path.

    Returns
    -------
    str
        The id token, still as a string.
    """
    # Keep only the final path component, then cut at the first underscore.
    base_name = filename.rsplit("/", 1)[-1]
    id_token, _, _ = base_name.partition("_")
    return id_token

In [None]:
def extract_graph(filename):
    """Load one SWC file and return its id, loaded object and graph.

    Parameters
    ----------
    filename : str
        Path of the SWC file to load.

    Returns
    -------
    tuple or int
        ``(id, loaded_object, graph)`` where ``id`` is the integer parsed
        from the file name, ``loaded_object`` is whatever
        ``fm.load_swc_file`` returns (presumably a morphopy NeuronTree —
        confirm) and ``graph`` is its ``get_graph()`` result.
        Returns ``-1`` if any of these steps raises ``ValueError``.
    """
    try:
        neuron_tree = fm.load_swc_file(filename)
        # int() raises ValueError for a non-numeric prefix, which is
        # reported through the same -1 sentinel as a loading failure.
        specimen_id = int(extract_id_from_file(filename))
        graph = neuron_tree.get_graph()
    except ValueError:
        return -1
    return (specimen_id, neuron_tree, graph)

In [None]:
def get_id(filename):
    """Return the first run of decimal digits found in ``filename``.

    The search covers the whole string, so digits in a directory part of a
    path are matched before digits in the file name itself.

    Returns
    -------
    str or None
        The matched digits as a string, or ``None`` when there are none.
    """
    first_digits = re.search(r'\d+', filename)
    return first_digits.group() if first_digits else None

In [None]:
def get_graphs(directory):
    """Return the paths of all ``*_transformed.swc`` files in ``directory``.

    Parameters
    ----------
    directory : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        ``directory`` joined with each matching file name, sorted
        lexicographically. ``os.listdir`` returns entries in arbitrary
        order, so sorting keeps the order of the resulting dataset — and
        therefore any seeded shuffle/split applied to it — reproducible
        across runs and machines.
    """
    return sorted(
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith("_transformed.swc")
    )

In [None]:
# Paths of every "*_transformed.swc" file found directly inside GOUWENS_DIR.
samples = get_graphs(GOUWENS_DIR)

## Load Gouwens Metadata

In [None]:
# Per-cell metadata table; later cells read its 'cell_specimen_id' and
# 'cluster_label' columns and add an 'encoded_labels' column to it.
metadata_df = pd.read_csv('/nethome/kcni/aaulakh/morphology/metadata_gouwens.csv')

In [None]:
# Inspect which metadata columns are available.
metadata_df.columns

In [None]:
# Number of rows per 'corresponding_AIT2.3.1_alias' value.
metadata_df['corresponding_AIT2.3.1_alias'].value_counts()

In [None]:
# Number of rows per 'cluster_label' value; this is the column that is
# label-encoded below and used as the classification target.
metadata_df['cluster_label'].value_counts()

## Graph Neural Network

In [None]:
# Encode the string cluster labels as integer class ids. The fitted encoder
# is kept so predictions can be mapped back to cluster names later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(metadata_df['cluster_label'])

# Attach the integer ids to the metadata table (adds/overwrites a column).
metadata_df['encoded_labels'] = encoded_labels

# Lookup table: original cluster label -> encoded integer id.
label_mapping = {
    original_label: encoded_value
    for original_label, encoded_value in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}



In [None]:
def create_pytorch_object(df, _id, g, neuron_tree):
    """Convert one neuron graph into a labelled PyG ``Data`` object.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata table with 'cell_specimen_id' and 'encoded_labels' columns.
    _id : int
        Specimen id used to look up the label in ``df``.
    g : networkx graph
        Graph to convert.
    neuron_tree : object
        Provides ``get_node_attribute_names()`` and
        ``get_edge_attribute_names()``; the returned names are passed to
        ``from_networkx`` as the node and edge attributes to group.

    Returns
    -------
    torch_geometric.data.Data
        The converted graph with ``y`` set to a one-element int64 tensor
        holding the encoded label of the first metadata row matching ``_id``.

    Raises
    ------
    IndexError
        If ``df`` has no row for ``_id`` (same exception type as before, but
        with a message naming the missing id).
    """
    data = from_networkx(g, group_node_attrs=neuron_tree.get_node_attribute_names(), group_edge_attrs=neuron_tree.get_edge_attribute_names())

    matching_labels = df.loc[df['cell_specimen_id'] == _id, 'encoded_labels'].values
    if len(matching_labels) == 0:
        # Fail with an explicit message instead of numpy's generic
        # "index 0 is out of bounds" error.
        raise IndexError(f"cell_specimen_id {_id} has no row in the metadata table")

    # If several rows share the id, the first one wins (as before).
    subclass_label = np.array([matching_labels[0]], dtype=np.int64)
    data.y = torch.from_numpy(subclass_label)

    return data

In [None]:
# Collects one PyG Data object per successfully converted neuron graph.
# NOTE: the conversion loop appends to this list, so re-run this cell before
# re-running the loop to avoid duplicated graphs.
data_objects = []

### Create Graph Objects

In [None]:
# Convert every SWC file into a PyG Data object and collect the results in
# `data_objects`. A file is skipped (with a message stating why) when it
# cannot be loaded, when its graph is not a single weakly connected
# component, or when the conversion raises ValueError.
for s in samples:
    g_info = extract_graph(s)
    if g_info == -1:
        # extract_graph returns -1 when loading/parsing raised ValueError.
        print(f"Skipping file {s}: could not be loaded as a graph")
        continue

    _id, neuron_tree, g = g_info

    # Only graphs consisting of exactly one weakly connected component are kept.
    n_components = nx.number_weakly_connected_components(g)
    if n_components != 1:
        print(f"Skipping graph {_id}: expected 1 weakly connected component, found {n_components}")
        continue

    try:
        data_object = create_pytorch_object(metadata_df, _id, g, neuron_tree)
    except ValueError as err:
        # Report the real error. The previous handler called
        # nx.node_connected_component, which is not implemented for directed
        # graphs and would itself raise here; connectivity has also already
        # been verified above, so "disconnected nodes" was never the cause.
        print(f"Skipping graph {_id}: could not build a PyG Data object ({err})")
        continue

    data_objects.append(data_object)



### Define Dataset Class

In [None]:
class MorphologyDataset(Dataset):
    """In-memory PyG dataset backed by a pre-built sequence of graph objects.

    Parameters
    ----------
    data_objects : sequence
        Graph objects served by this dataset; stored as given (not copied).
    transform : callable, optional
        Forwarded unchanged to the ``Dataset`` base class.
    """

    def __init__(self, data_objects, transform=None):
        # No root directory is passed: everything lives in `data_objects`.
        super().__init__(root=None, transform=transform)
        self.data_objects = data_objects

    def len(self):
        """Return the number of stored graph objects."""
        n_graphs = len(self.data_objects)
        return n_graphs

    def get(self, idx):
        """Return the stored graph object at position ``idx``."""
        graph = self.data_objects[idx]
        return graph

In [None]:
# Wrap the converted graphs in a Dataset so PyG utilities (shuffle, slicing,
# DataLoader) can be used on them; no transform is applied.
dataset = MorphologyDataset(data_objects, transform=None)

In [None]:
# Cache the whole Dataset object on disk so the SWC conversion above does not
# have to be repeated. NOTE(review): this serialises the MorphologyDataset
# instance itself, so the class must be defined before loading — confirm.
torch.save(dataset, '/nethome/kcni/aaulakh/morphology/gouwens_pyg_clusters_dataset.pt')

In [None]:
# Reload the cached Dataset object written by the previous cell.
# NOTE(review): this restores a full Python object, not just tensors; newer
# PyTorch releases default torch.load to weights_only=True, which may reject
# such a file — confirm against the installed version. Only load trusted files.
dataset = torch.load('/nethome/kcni/aaulakh/morphology/gouwens_pyg_clusters_dataset.pt')

In [None]:
# Dataset-level overview.
dataset_summary = (
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
)
print(*dataset_summary, sep='\n')

data = dataset[0]  # First graph object, used as a representative example.

print()
print(data)
print('=============================================================')

# Basic statistics of that first graph.
first_graph_stats = (
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
)
print(*first_graph_stats, sep='\n')

In [None]:
# Seed the global RNG right before shuffling so the permutation, and with it
# the train/test split, is reproducible.
# NOTE: `dataset` is rebound to the shuffled dataset, so re-running this cell
# shuffles an already shuffled dataset; reload `dataset` first to get the
# same split again.
torch.manual_seed(12345)
dataset = dataset.shuffle()

# Number of graphs used for training; the remaining graphs form the test set.
n_train_graphs = 376
if not 0 < n_train_graphs < len(dataset):
    # Fail here rather than later with an empty split (accuracy is computed
    # as correct / len(split), which would divide by zero).
    raise ValueError(
        f"Cannot put {n_train_graphs} graphs in the training set: "
        f"the dataset only has {len(dataset)} graphs"
    )

train_dataset = dataset[:n_train_graphs]
test_dataset = dataset[n_train_graphs:]

In [None]:
# Display the repr of the (now shuffled) dataset.
dataset

### Define DataLoader

In [None]:
from torch_geometric.loader import DataLoader

# Mini-batch loaders: training batches are reshuffled on every pass, test
# batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Preview one pass over the training loader to see the batch layout.
for step, data in enumerate(train_loader):
    print(
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        sep='\n',
    )
    print(data)
    print()

### Define Network

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Graph-level classifier: three GCNConv layers, mean pooling, linear head.

    Parameters
    ----------
    hidden_channels : int
        Width of every graph-convolution layer.
    dropout_rate : float, default 0.5
        Dropout probability applied to the pooled graph embedding.
    l2_regularization : float, default 0.01
        Weight of the penalty term added by :meth:`loss`.
    in_channels : int, optional
        Number of input node features. Defaults to
        ``dataset.num_node_features`` of the notebook-level ``dataset``.
    num_classes : int, optional
        Number of output classes. Defaults to ``dataset.num_classes`` of the
        notebook-level ``dataset``.
    """

    def __init__(self, hidden_channels, dropout_rate=0.5, l2_regularization=0.01,
                 in_channels=None, num_classes=None):
        super().__init__()
        # Fixed seed so the layers are initialised identically on every
        # construction (this reseeds the global torch RNG).
        torch.manual_seed(12345)
        if in_channels is None:
            in_channels = dataset.num_node_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        if num_classes is None:
            num_classes = dataset.num_classes
        self.lin = Linear(hidden_channels, num_classes)
        self.dropout_rate = dropout_rate
        self.l2_regularization = l2_regularization

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        ``x``, ``edge_index`` and ``batch`` are passed straight to the
        GCNConv layers and ``global_mean_pool``.
        """
        # 1. Obtain node embeddings (no activation after the last convolution)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier (dropout is only active in training mode)
        x = F.dropout(x, p=self.dropout_rate, training=self.training)
        x = self.lin(x)

        return x

    def l2_loss(self):
        """Return the sum of the L2 norms of all parameter tensors.

        NOTE(review): the norms are not squared, which differs from the usual
        weight-decay penalty — confirm this is intended.
        """
        l2_loss = 0
        for param in self.parameters():
            l2_loss += torch.norm(param, p=2)
        return l2_loss

    def loss(self, pred, target):
        """Return cross-entropy plus ``l2_regularization`` times :meth:`l2_loss`."""
        return F.cross_entropy(pred, target) + self.l2_regularization * self.l2_loss()

### Set up GPU access

In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


### Train the Model

In [None]:
# Build the network on the selected device and cast its parameters to float64.
# NOTE(review): presumably the node features are float64 — confirm.
model = GCN(hidden_channels=64).to(device).double()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Plain cross-entropy is the training objective used below.
criterion = torch.nn.CrossEntropyLoss()

# Per-epoch history, filled by the training loop and used for plotting.
epochs, train_accuracies, test_accuracies = [], [], []
def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``device``; updates the model parameters in place and returns nothing.
    """
    model.double()
    model.train()

    for batch in train_loader:
        batch = batch.to(device)
        # Forward pass: one row of class logits per graph in the batch.
        logits = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion(logits, batch.y)
        batch_loss.backward()
        optimizer.step()
        # Gradients are cleared after the update so the next batch starts from zero.
        optimizer.zero_grad()

def test(loader):
    model.eval()

    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)  
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


# Train for 100 epochs, recording accuracy on both splits after each one.
for epoch in range(1, 101):
    train()
    # Training accuracy is evaluated first, then test accuracy.
    train_acc, test_acc = test(train_loader), test(test_loader)

    epochs.append(epoch)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

In [None]:
# Plot both accuracy curves against the epoch number.
for accuracy_curve, curve_label in (
    (train_accuracies, 'Train Accuracy'),
    (test_accuracies, 'Test Accuracy'),
):
    plt.plot(epochs, accuracy_curve, label=curve_label)

plt.title('Train and Test Accuracy vs. Epochs using Clusters')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
# NOTE(review): the file name says "500_epochs" but the training loop in this
# notebook runs range(1, 101), i.e. 100 epochs — confirm which is intended.
plt.savefig('/nethome/kcni/aaulakh/morphology/500_epochs_gnn_clusters.png')