In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import wandb


import os
import shutil
import pickle

# The 13 gene symbols of the panel. Nothing in the visible cells reads this
# list; later cells take their gene names from config['genes'] instead.
genes = [
    "C6orf150", "CCL5", "CXCL10", "TMEM173",
    "CXCL9", "CXCL11", "NFKB1", "IKBKE",
    "IRF3", "TREX1", "ATM", "IL6", "IL8",
]



# W&B entity and project whose runs hold the adjacency matrices to export.
USERNAME = 'borna-personal'
PROJECT = 'GENIE-Nextflow-v2'

# Root directory for the exported pickles. This was previously misspelled as
# 'adj-mtrix', while the training cell reads from 'adj-matrix/...', so the
# export never landed where it was later read from.
BASE_PATH = 'adj-matrix'

api = wandb.Api()
runs = api.runs(f"{USERNAME}/{PROJECT}")

# Always start from an empty export directory so files from runs that no
# longer exist do not linger. WARNING: this deletes everything under BASE_PATH.
if os.path.exists(BASE_PATH):
    shutil.rmtree(BASE_PATH)
os.mkdir(BASE_PATH)

# Export one pickle per run to <BASE_PATH>/<cancer>/<target>-adj-matrix.obj.
# If several runs share the same cancer/target pair, the last one iterated wins.
for run in runs:
    cancer = run.config.get('cancers')
    target = run.config.get('variable')
    adj_matrix = run.summary.get('adjacency_matrix')

    # Runs that lack any of these values would otherwise be written as a
    # pickled None under 'None/None-adj-matrix.obj' and only fail later when
    # loaded as a matrix, so skip them here.
    if cancer is None or target is None or adj_matrix is None:
        print(f'skipping run {run.id}: missing cancers/variable/adjacency_matrix')
        continue

    # exist_ok avoids the separate existence check and also creates BASE_PATH
    # itself if it is missing.
    cancer_dir = f'{BASE_PATH}/{cancer}'
    os.makedirs(cancer_dir, exist_ok=True)

    with open(f'{cancer_dir}/{target}-adj-matrix.obj', 'wb') as fh:
        pickle.dump(adj_matrix, fh)

In [None]:
import pickle

from captum.attr import IntegratedGradients
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import torch
from torch_geometric.loader import DataListLoader

from scripts.genie_utils import load_config
from scripts.gcn import GraphConvolutionalNetwork, GCNModelTrainer

# Training hyper-parameters.
LR = 0.0001        # learning rate passed to Adam
WD = 1e-1          # weight decay passed to Adam
HIDDEN_SIZE = 64   # hidden_channels of the GCN
EPOCHS = 100       # number of epochs handed to the trainer
SEED = 42          # fixed seed so weight init and batch shuffling are repeatable

# Inside a notebook kernel __name__ is "__main__", so this guard always runs
# and every name assigned below ends up in the notebook's global namespace.
if __name__ == "__main__":
    torch.manual_seed(SEED)

    # NOTE(security): pickle.load can execute arbitrary code; only load dumps
    # that were produced by this project.
    with open('dumps/BLCA_dataset.obj', 'rb') as fh:
        dataset = pickle.load(fh)

    with open('adj-matrix/BLCA/DSS-adj-matrix.obj', 'rb') as fh:
        adj_matrix = pickle.load(fh)

    config = load_config('scripts/genie_config.json')
    list_of_genes = config['genes']
    # Sizes are read off the first sample: last dimension of x and of y.
    NODES = dataset[0].x.shape[-1]
    ADJ_MATRIX_SHAPE = (NODES, NODES)
    NUM_CLASSES = dataset[0].y.shape[-1]

    # Ordered 75/25 split: the first 75% of samples train, the rest evaluate.
    split_idx = round(0.75 * len(dataset))
    train_dataset = dataset[:split_idx]
    test_dataset = dataset[split_idx:]

    train_loader = DataListLoader(train_dataset, batch_size=64, shuffle=True)
    # Evaluation does not need shuffling; a fixed order keeps the per-epoch
    # test batches identical from epoch to epoch.
    test_loader = DataListLoader(test_dataset, batch_size=64, shuffle=False)

    def epoch_finished(epoch, tr_f1, tr_loss, f1, loss):
        """Per-epoch callback: print the train and test F1/loss for this epoch."""
        print({"epoch": epoch, "train_f1": tr_f1, "train_loss": tr_loss, "f1": f1, "loss": loss})

    model = GraphConvolutionalNetwork(
        num_node_features=1,
        num_nodes=NODES,
        num_classes=NUM_CLASSES,
        hidden_channels=HIDDEN_SIZE,
        adj_matrix=adj_matrix,
    )
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD)
    trainer = GCNModelTrainer(model=model, optimizer=optimizer, criterion=criterion,
                              num_classes=NUM_CLASSES, epochs=EPOCHS)
    print('training started')
    final_f1 = trainer.train(train_loader=train_loader, test_loader=test_loader, on_epoch_finished=epoch_finished)
    print('\nfinished')

In [9]:
import wandb

# Fetch the logged history of the first run in the IG re-run project and keep
# only the last two rows of the 'attributions' and 'class' columns.
runs = wandb.Api().runs('borna-personal/GENIE-Nextflow-IG-Reruns')
first_run_history = runs[0].history()
df = first_run_history[['attributions', 'class']].tail(2)

In [14]:
# Print each record as "<class> : <attributions>". Column 1 is read as the
# class name and column 0 as the attribution values.
for record in df.values:
    class_name, attribution_values = record[1], record[0]
    print(class_name, ':', attribution_values)

DSS_1 : [0.008060641231686096, 5.964795482662625e-05, 0.03289338381167286, 0.01583801060180481, 0.04500872355916577, 0.06437164859009172, 0.01217993710528911, 0.008160756662641353, 0.002002890985192178, 0.0062118510120335695, 0.0001485172278512529, 0.0065553948709312845, 0.013543413918366352]
DSS_0 : [0.005900392706390255, 4.280109699305358e-05, 0.024195390482158925, 0.011129337242501028, 0.0320670176473587, 0.04726706123090146, 0.009406753726771435, 0.006475167829103952, 0.001719187297999782, 0.004879879928410557, 0.00011139768564012367, 0.004528087286163789, 0.009994458465569818]


In [None]:
import math
from tqdm import tqdm

import pandas as pd

# For every prediction target, the names of its class columns listed in
# class-index order (the attribution loop below indexes these by class index
# and selects rows where that column equals 1).
outputs: dict = dict(
    STAGE=['early', 'late'],
    DSS=['DSS_1', 'DSS_0'],
    OS=['OS_1', 'OS_0'],
    GENDER=['gender_female', 'gender_male'],
)

cancer = 'LUSC'
target = 'DSS'

# NOTE(review): the visible training cell loads the BLCA dataset and the BLCA
# adjacency matrix, while this reads the LUSC dump - confirm that the model in
# scope was trained for this cancer before interpreting the attributions.
df = pd.read_csv(f'dumps/{cancer}_dataset_dump.csv')

def draw_graph(attrs, normalize=True):
    """Draw the gene graph with nodes coloured by their attribution values.

    The graph is built from the global ``adj_matrix`` and labelled with the
    global ``config['genes']``; a new 10x8 figure is created on every call.

    Args:
        attrs: One numeric value per node, in node order.
        normalize: When True (default) the values are min-max rescaled to
            [0, 1] before the colormap lookup. A colormap called with floats
            expects values in [0, 1]; raw importances are typically tiny, so
            without rescaling every node lands on almost the same colour.
            Pass False to feed the values to the colormap unchanged.
    """
    cmap = plt.cm.RdPu
    values = np.asarray(attrs, dtype=float)
    if normalize and values.size:
        low = values.min()
        span = values.max() - low
        # All-equal input has no spread to show; map it to the low end.
        values = (values - low) / span if span > 0 else np.zeros_like(values)
    colors = [cmap(val) for val in values]

    fig = plt.figure(figsize=(10, 8))
    G = nx.from_numpy_array(np.array(adj_matrix))
    labels = {i: gene for i, gene in enumerate(config['genes'])}
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos, labels=labels, with_labels=True, node_color=colors, node_size=500, font_size=12)


# Local import: `Data` is used below but is not imported in any visible cell.
from torch_geometric.data import Data

# `num_genes` is not defined in any visible cell either. NODES is the model's
# input width, so the first NODES columns of the dump are taken as gene values
# and the remaining ones as labels - TODO confirm against the dump's layout.
num_genes = NODES

feature_importance = np.array([])

# Attribute in eval mode so train-only layers (e.g. dropout) cannot make the
# attributions stochastic; the explainer itself is stateless and is built once.
model.eval()
integrated_gradients = IntegratedGradients(model)

for class_index in range(NUM_CLASSES):
    _class = outputs[target][class_index]

    # One single-sample Data object per row that belongs to this class.
    temp_dataset = []
    for _, row in df[df[_class] == 1].iterrows():
        gene_data = torch.tensor(row.iloc[:num_genes].values, dtype=torch.float).view(1, -1)
        label_values = row.iloc[num_genes:].values
        label = torch.tensor(label_values, dtype=torch.float).view(-1, len(label_values))
        temp_dataset.append(Data(x=gene_data, y=label))

    # Without samples there is nothing to average (np.mean of an empty list is
    # NaN, which then breaks the plotting call), so report and move on.
    if not temp_dataset:
        print(_class, ':', 'no samples for this class, skipped')
        continue

    total_attributions = []
    for sample in tqdm(temp_dataset, total=len(temp_dataset), position=0, leave=False):
        attributions = integrated_gradients.attribute(inputs=sample.x.unsqueeze(-1), target=class_index)
        # detach/cpu make the conversion safe regardless of device or autograd state.
        attributions = attributions.squeeze().detach().cpu().numpy()
        total_attributions.append(np.abs(attributions))

    # Mean absolute attribution per gene across all samples of this class.
    feature_importance = np.mean(total_attributions, axis=0)

    print(_class, ':', feature_importance)
    draw_graph([math.log10(val + 1) for val in feature_importance])
    # Label the figure that was just drawn so the plots can be told apart.
    plt.suptitle(_class)