In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import csv
import copy
from torch import nn
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_mean_pool

## Load data

In [2]:
gene_name = np.loadtxt('gene_name.txt', dtype=str)
gene_edge = np.loadtxt('gene_edge.txt', dtype=str)

In [3]:
# create mapping table from genes to nodes number
node_map = {}
for i, j in enumerate(gene_name): 
    node_map.update({j:i})

In [None]:
edge_list = []
for k, i in enumerate(gene_edge): 
    
    letter_count = 0
    while i[letter_count] is not '|': 
        letter_count += 1
        
    first_gene = i[:letter_count]
    second_gene = i[letter_count+1:]
    
    try: 
        node_map[first_gene]
        node_map[second_gene]
        edge_list.append([node_map[first_gene], node_map[second_gene]])
        edge_list.append([node_map[second_gene], node_map[first_gene]])
    except: 
        print('could not fine gene name at %dth line of gene edge' %k)

In [5]:
node_map['Gusb']

1423

In [6]:
edge_list = torch.tensor(np.array(edge_list).T, dtype=torch.long)

In [7]:
edge_list.size()

torch.Size([2, 492746])

In [8]:
# load training data
train_loader = []
with open('input_train_cat.csv') as csvfile: 
    reader = csv.reader(csvfile, quoting=csv.QUOTE_NONNUMERIC)
    for row in reader: 
        
        # index 1423 is Gusb gene
        target_expression_level = torch.tensor(copy.copy(row[1423]), dtype=torch.float).view(1, 1)
        row[1423] = 0
        
        # create graph
        data = Data(x=torch.tensor(row, dtype=torch.float).view(1431, 1), 
                    y=target_expression_level, edge_index=edge_list)
        train_loader.append(data)

In [9]:
from torch_geometric.data import Batch

In [11]:
# load validation data
validate_loader = []
with open('input_test_cat.csv') as csvfile: 
    reader = csv.reader(csvfile, quoting=csv.QUOTE_NONNUMERIC)
    for row in reader: 
        
        # index 1423 is Gusb gene
        target_expression_level = torch.tensor(copy.copy(row[1423]), dtype=torch.float).view(1, 1)
        row[1423] = 0
        
        # create graph
        data = Data(x=torch.tensor(row, dtype=torch.float).view(1431, 1), 
                    y=target_expression_level, edge_index=edge_list)
        validate_loader.append(data)

In [12]:
print(len(train_loader))
print(len(validate_loader))

18542
4636


In [13]:
print(train_loader[0].x.size())
print(train_loader[0].y.size())
print(train_loader[0].edge_index.size())

torch.Size([1431, 1])
torch.Size([1, 1])
torch.Size([2, 492746])


## Define GCN

In [10]:
class GCNInferenceNetwork(nn.Module): 
    def __init__(self, d=1): 
        super(GCNInferenceNetwork, self).__init__()
        self.conv1 = GCNConv(d, 16)
        self.conv2 = GCNConv(16, 16)
        self.fc    = nn.Linear(16, 1)
    
    def forward(self, graph): 
        x, edges, y = graph.x, graph.edge_index, graph.y
        
        x = self.conv1(x, edges)
        x = F.relu(x)
        x = self.conv2(x, edges)
        x = F.relu(x)
        # x = global_mean_pool(x, torch.zeros(graph.num_nodes, dtype=torch.long))
        x = global_mean_pool(x, graph.batch)
        
        x = self.fc(x)
        
        return x

In [101]:
for parameter in model.parameters():
    print(parameter.size())

torch.Size([1, 32])
torch.Size([32])
torch.Size([32, 32])
torch.Size([32])
torch.Size([1, 32])
torch.Size([1])


## Train GCN

In [11]:
batch_size = 32
def train(model, train_loader): 
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)
    loss_fn = nn.MSELoss()
    loss_all = 0
    
    def new_fun(): 
        sample_ids = np.random.randint(len(train_loader), size=batch_size)
        batch_1 = [train_loader[j].to(torch.device('cuda')) for j in sample_ids]
        batch = Batch.from_data_list(batch_1).to(torch.device('cuda'))
        x, edges, y = batch.x, batch.edge_index, batch.y
        pred_ = model(batch)
        loss = loss_fn(pred_, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        del pred_
        del batch
        for exp in batch_1:
            del exp
        del batch_1
        del x
        del edges
        del y
        
        return loss.item()

        if i%2 == 0: 
            print('%d batches trained' %i)
    
    for i in range(len(train_loader) // batch_size):
        loss_all += new_fun()
    
    return loss_all / len(train_loader)

In [16]:
def validate(model, validate_loader): 
    model.evaluate()
    loss_fn = nn.MSELoss()
    loss_all = 0
    
    for data in validate_loader: 
        x, edges, y = data.x, data.edge_index, data.y
        pred_ = model(data)
        loss = loss_fn(pred_, y)
        loss_all += loss.item()
    
    return loss_all / len(validate_loader)

In [None]:
model = GCNInferenceNetwork().to(torch.device('cuda'))

epoches = 1
for i in range(epoches): 
    train_loss = train(model, train_loader)
    # val_loss = validate(model, validate_loader)
    # print ('epoches: %d || train_losses: %f || val_loss: %f' %(i, train_loss, val_loss))
    print ('epoches: %d || train_losses: %f' %(i, train_loss))