# Data Modeling & Model

## Data definition

In [1]:
import torch
from torch import nn

# Both names are bound to 3265. Neither is read by load_patient_files below,
# which takes its own `patient` argument.
# NOTE(review): presumably a patient index and the cohort size - confirm.
max_patients = 3265
patient = max_patients

def load_patient_files(patient, dir_path):
    """Load the saved per-modality tensors for one patient.

    Each file is read from ``f"{dir_path}{patient}_{suffix}.pt"``, so
    ``dir_path`` must already end with a path separator.

    Args:
        patient: Patient identifier used as the file-name prefix.
        dir_path: Directory prefix (including the trailing separator).

    Returns:
        dict with the keys 'svs', 'clinical', 'copy_segment', 'protein',
        'miRNA', 'isoform', 'gene' and 'allele', each holding whatever
        ``torch.load`` returns for the corresponding file.
    """
    # Result key -> file-name suffix. 'clinical' previously re-read the svs
    # file; it now reads the Clinical file.
    suffix_by_key = {
        'svs': 'svs',
        'clinical': 'Clinical',
        'copy_segment': 'Copy_Number_Segment',
        'protein': 'Protein_Expression_Quantification',
        'miRNA': 'miRNA_Expression_Quantification',
        'isoform': 'Isoform_Expression_Quantification',
        'gene': 'Gene_Expression_Quantification',
        'allele': 'Allele-specific_Copy_Number_Segment',
    }

    # torch.load unpickles its input: only point this at trusted files.
    return {
        key: torch.load(f"{dir_path}{patient}_{suffix}.pt")
        for key, suffix in suffix_by_key.items()
    }


# One row per modality: (key, size, file).
# 'file' is the suffix in f"{dir_path}{patient}_{file}.pt" when the tensors are
# loaded; 'size' is presumably the per-patient tensor shape (only its length is
# read by the z-scoring cell) - TODO confirm against the stored tensors.
_MODALITY_TABLE = [
    ('svs', (3, 2400, 2400), 'svs'),
    ('masked_somatic', (1024, 135), 'Masked_Somatic_Mutation'),
    ('gene_copy', (60623, 6), 'Gene_Level_Copy_Number'),
    ('clinical', (768,), 'Clinical'),
    ('segment_copy', (1024, 5), 'Copy_Number_Segment'),
    ('protein', (530,), 'Protein_Expression_Quantification'),
    ('miRNA', (1881, 2), 'miRNA_Expression_Quantification'),
    ('isoform', (1881, 2), 'Isoform_Expression_Quantification'),
    ('allele_copy', (1024, 6), 'Allele-specific_Copy_Number_Segment'),
    ('gene_expr', (60664, 6), 'Gene_Expression_Quantification'),
]

# Every modality gets its own mutable entry so later cells can attach 'data'.
data_dict = {
    name: {'size': shape, 'file': stem}
    for name, shape, stem in _MODALITY_TABLE
}

## Z-scoring of data

In [2]:
import torch
import numpy as np

dir_path = "/mnt/Cancer_2/processed_data/"  # replace with your directory path


# For every modality in data_dict: load the tensors of patients 0-4, stack them
# along a new leading dimension, and store a z-scored copy in data_dict[k]['data'].
for k, v in data_dict.items():
    tensors = []
    for patient in range(5):
        tensor = torch.load(f"{dir_path}{patient}_{v['file']}.pt").detach()
        tensors.append(tensor)

    # Stacking tensors along a new dimension
    X = torch.stack(tensors, dim=0)

    # 5th / 95th percentile per entry of the last dimension, NaNs ignored.
    flat_X = X.reshape(-1, X.size(-1))
    lower_clip = torch.nanquantile(flat_X, 0.05, dim=0)
    upper_clip = torch.nanquantile(flat_X, 0.95, dim=0)

    clipped_X = torch.clamp(X, lower_clip, upper_clip)

    # Dimensions of the stacked tensor to reduce over for the statistics.
    # For 1-D modalities range(len(size) - 1) is empty, and an empty list means
    # "reduce everything" to torch.nanmean but "reduce nothing" to np.nanstd:
    # the std came out as 0 and the z-scores were scaled by 1 / 1e-7.
    # Always reduce over at least the stacked (patient) dimension instead.
    reduce_dims = tuple(range(max(1, len(v['size']) - 1)))

    # NaN-aware statistics of the clipped data. (A previous nan_to_num result
    # was assigned to a misspelled, never-read name and has been dropped.)
    mean = clipped_X.nanmean(dim=reduce_dims, keepdim=True)
    std = torch.tensor(np.nanstd(clipped_X.numpy(), axis=reduce_dims, keepdims=True))

    # TODO: Use smarter imputation, albeit a lot of it is handled in the embedding layer
    # NOTE(review): the z-score is applied to the unclipped X (clipped_X only
    # feeds the statistics) - confirm this is intended.
    z_scores_tensor = (torch.nan_to_num(X, 0) - mean) / (std + 1e-7)  # adding a small value to prevent division by zero
    data_dict[k]['data'] = z_scores_tensor


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


## Model

- gene_expr passes through a GRU to produce a 512 one dimensional embedding
- allele_copy passes through a GRU to produce a 512 one dimensional embedding
- isoform passes through a FF to produce a 512 one dimensional embedding
- miRNA passes through a FF to produce a 512 one dimensional embedding
- protein passes through a FF to produce a 512 one dimensional embedding
- segment_copy passes through a GRU to produce a 512 one dimensional embedding
- clinical passes through a FF to produce a 512 one dimensional embedding
- gene_copy passes through a GRU to produce a 512 one dimensional embedding
- masked_somatic passes through a GRU to produce a 512 one dimensional embedding
- svs passes through a CNN (choose whatever architecture you want) to produce a 512 one dimensional embedding

masked_somatic, allele_copy, gene_copy, gene_expr, and segment_copy are combined to produce a 512 one dimensional embedding using a FF.

Then, this combined genomics embedding is used to determine the attention for the combination of isoform and miRNA (itself produced using a FF), yielding a 512 one dimensional embedding.

The two embeddings are used to produce a 1024 one dimensional embedding using a FF. 

This embedding is used for attention to produce a 512 dimension embedding using protein.

Then the 1024 embedding, the newly produced protein embedding, the svs embedding, and the clinical embedding are all combined using a FF to produce a 512 embedding, which is then used to predict one category (true / false)

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """Head-wise attention over flat embeddings.

    Each input of width ``embed_size`` is viewed as ``heads`` chunks of
    ``head_dim`` features. One score per head is the dot product of the
    projected query and key chunks; the scores are softmaxed across the heads
    and used to scale the projected value chunks before a final linear layer.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert embed_size % heads == 0, "Embedding size needs to be divisible by heads"

        # The same per-head projection is shared by all heads.
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=True)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=True)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=True)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask=None):
        """Return a ``(N, embed_size)`` tensor, where N is ``query.shape[0]``.

        ``mask`` (optional) is squeezed and compared with 0; matching head
        scores are replaced by -1e20 before the softmax.
        """
        batch = query.shape[0]
        per_head_shape = (batch, self.heads, self.head_dim)

        # Chunk every input into heads, then project the chunks.
        value_proj = self.values(values.view(per_head_shape))
        key_proj = self.keys(keys.view(per_head_shape))
        query_proj = self.queries(query.view(per_head_shape))

        # One similarity score per (sample, head).
        scores = torch.einsum("nhd,nhd->nh", [query_proj, key_proj])
        if mask is not None:
            scores = scores.masked_fill(mask.squeeze() == 0, float("-1e20"))

        # Normalise across heads; the scale uses the full embedding width.
        weights = torch.softmax(scores / (self.embed_size ** (1 / 2)), dim=1)

        # Scale each head's values by its weight and merge the heads again.
        weighted = torch.einsum("nh,nhd->nhd", [weights, value_proj])
        merged = weighted.reshape(batch, self.heads * self.head_dim)
        return self.fc_out(merged)


    
class Net(nn.Module):
    """Three-layer feed-forward encoder: size_last -> 512 -> 1024 -> 512.

    A ReLU follows every linear layer, including the last one.
    """

    def __init__(self, size_last):
        super().__init__()

        self.layer1 = nn.Linear(size_last, 512)
        self.layer2 = nn.Linear(512, 1024)
        self.layer3 = nn.Linear(1024, 512)

        self.relu = nn.ReLU()  # shared activation

    def forward(self, x):
        """Flatten everything after the first dimension and return a (N, 512) tensor."""
        hidden = x.reshape(x.shape[0], -1)
        for layer in (self.layer1, self.layer2, self.layer3):
            hidden = self.relu(layer(hidden))
        return hidden


class CombinedDataModel(nn.Module):
    """Fuse the per-modality tensors of a batch into a single output per sample.

    Sequence-like modalities (gene_expr, allele_copy, masked_somatic,
    segment_copy, gene_copy) go through 2-layer GRUs; isoform, miRNA, protein
    and clinical go through `Net` feed-forward encoders. The 512-wide
    embeddings are then fused with feed-forward blocks and two `SelfAttention`
    steps, and a final linear layer maps the fused embedding to one value.

    The 'svs' entry of the input is not read by `forward`.
    """

    def __init__(self):
        super(CombinedDataModel, self).__init__()
        # batch_first=True: inputs arrive as (batch, sequence, features) and
        # forward() takes the output at the last sequence step via [:, -1, :].
        # With the default layout the GRU would treat the batch dimension as
        # time, so the recurrence would run across samples.
        self.gru_gene_expr = nn.GRU(6, 512, 2, batch_first=True)
        self.gru_allele_copy = nn.GRU(6, 512, 2, batch_first=True)
        self.gru_masked_somatic = nn.GRU(135, 512, 2, batch_first=True)
        self.gru_segment_copy = nn.GRU(5, 512, 2, batch_first=True)
        self.gru_gene_copy = nn.GRU(6, 512, 2, batch_first=True)

        self.isoform = Net(1881 * 2)
        self.miRNA = Net(1881 * 2)
        self.protein = Net(530)
        self.clinical = Net(768)

        self.transcriptomics_combined = Net(512*2)
        self.genomics_combined_ff = Net(512*5)
        self.gen_and_transcriptomics = Net(512*2)
        self.final_linear = Net(512*4)

        self.transcriptomics_combined_attention = SelfAttention(512, 8)
        self.protein_attention = SelfAttention(512, 8)

        self.output_layer = nn.Linear(512, 1)

    @staticmethod
    def _last_step(gru, sequence):
        """Run `sequence` (NaNs zeroed) through `gru`; return the output at the last step."""
        outputs, _ = gru(torch.nan_to_num(sequence, 0.0))
        return outputs[:, -1, :]

    @staticmethod
    def _flat_embed(net, tensor):
        """Zero NaNs, flatten all but the first dimension, and encode with `net`."""
        cleaned = torch.nan_to_num(tensor, 0.0)
        return net(torch.reshape(cleaned, (tensor.size()[0], -1)))

    def forward(self, x):
        """Compute one output value per sample.

        Args:
            x: Mapping from modality name to a mapping whose 'data' entry is a
                tensor with the batch as its first dimension. The keys read
                are gene_expr, allele_copy, masked_somatic, segment_copy,
                gene_copy, isoform, miRNA, protein and clinical.

        Returns:
            Tensor of shape (batch, 1): the raw output of the final linear
            layer (no activation is applied here).
        """
        gene_exp = self._last_step(self.gru_gene_expr, x["gene_expr"]["data"])
        allele_copy = self._last_step(self.gru_allele_copy, x["allele_copy"]["data"])
        masked_somatic = self._last_step(self.gru_masked_somatic, x["masked_somatic"]["data"])
        segment_copy = self._last_step(self.gru_segment_copy, x["segment_copy"]["data"])
        gene_copy = self._last_step(self.gru_gene_copy, x["gene_copy"]["data"])

        isoform = self._flat_embed(self.isoform, x["isoform"]["data"])
        miRNA = self._flat_embed(self.miRNA, x["miRNA"]["data"])
        protein = self._flat_embed(self.protein, x["protein"]["data"])
        clinical = self._flat_embed(self.clinical, x["clinical"]["data"])

        # All five genomic embeddings take part; gene_copy was previously
        # computed but replaced by a second copy of allele_copy here.
        combined_genomics = self.genomics_combined_ff(
            torch.cat([gene_exp, allele_copy, masked_somatic, gene_copy, segment_copy], 1)
        )
        combined_transcriptomics = self.transcriptomics_combined(torch.cat([miRNA, isoform], 1))

        # Genomics acts as the query over the transcriptomics embedding.
        attention_transcriptomics = self.transcriptomics_combined_attention(
            values=combined_transcriptomics,
            keys=combined_transcriptomics,
            query=combined_genomics,
        )

        gen_and_transcriptomics = self.gen_and_transcriptomics(
            torch.cat([combined_genomics, attention_transcriptomics], 1)
        )

        # The fused genomics/transcriptomics embedding queries the protein embedding.
        attention_proteomics = self.protein_attention(
            values=protein, keys=protein, query=gen_and_transcriptomics
        )

        final_embedding = self.final_linear(
            torch.cat([attention_proteomics, combined_genomics, combined_transcriptomics, clinical], 1)
        )
        out = self.output_layer(final_embedding)

        return out

    
model = CombinedDataModel()

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(1)

# Cast every modality's 'data' tensor to float32 and move it to the same
# device as the model, printing each key as it is processed.
for key in data_dict:
    print(key)
    data_dict[key]['data'] = data_dict[key]['data'].float().to(device)

# Single forward pass over the prepared batch.
output = model(data_dict)


1
svs
masked_somatic
gene_copy
clinical
segment_copy
protein
miRNA
isoform
allele_copy
gene_expr
