# Preparing the data for training

## Import necessary packages

In [1]:
import pandas as pd
import csv
import numpy as np
import os
import glob
import pickle
import gzip

## Load protein--protein interaction data from STRING DB (version 11.0)

In [2]:
proteins = {}
interactions1 = []
interactions2 = []
with gzip.open('data/9606.protein.links.v11.0.txt.gz', 'rt') as f:
    next(f) # Ignore the header
    for line in f:
        protein1, protein2, score = line.strip().split()
        score = int(score)
        if score >= 700: # Filter interactions with more confidence
            protein1 = protein1.split('.')[1]
            protein2 = protein2.split('.')[1]
            if protein1 not in proteins:
                proteins[protein1] = len(proteins) # Assign an ID to protein
            if protein2 not in proteins:
                proteins[protein2] = len(proteins)
            interactions1.append(proteins[protein1])
            interactions2.append(proteins[protein2])

edge_index = [interactions1, interactions2]

# Save proteins and interactions for future use
proteins_df = pd.DataFrame({'proteins': list(proteins.keys()), 'ids': proteins.values()})
proteins_df.to_pickle('data/proteins.pkl')
interactions_df = pd.DataFrame({'protein1': interactions1, 'protein2': interactions2})
interactions_df.to_pickle('data/interactions.pkl')
proteins_df, interactions_df

(              proteins    ids
 0      ENSP00000000233      0
 1      ENSP00000432568      1
 2      ENSP00000427900      2
 3      ENSP00000350199      3
 4      ENSP00000354878      4
 ...                ...    ...
 17180  ENSP00000485615  17180
 17181  ENSP00000484618  17181
 17182  ENSP00000478591  17182
 17183  ENSP00000478796  17183
 17184  ENSP00000484443  17184
 
 [17185 rows x 2 columns],
         protein1  protein2
 0              0         1
 1              0         2
 2              0         3
 3              0         4
 4              0         5
 ...          ...       ...
 841063     10457      5817
 841064     10457      5823
 841065     10457       730
 841066     10457      5818
 841067     10457      5824
 
 [841068 rows x 2 columns])

## Ensemble Gene ID mappings

In [3]:
ensembl_df = pd.read_csv('data/ensembl.tsv', sep='\t')
ensembl_df

Unnamed: 0,Gene stable ID,Transcript stable ID,Protein stable ID
0,ENSG00000261657,ENST00000566782,ENSP00000456546
1,ENSG00000261657,ENST00000562780,ENSP00000457004
2,ENSG00000261657,ENST00000569579,ENSP00000456312
3,ENSG00000261657,ENST00000568242,ENSP00000456306
4,ENSG00000261657,ENST00000565530,
...,...,...,...
215442,ENSG00000263267,ENST00000572372,
215443,ENSG00000263267,ENST00000572960,
215444,ENSG00000263267,ENST00000574432,
215445,ENSG00000262336,ENST00000577132,


## Generate mappings from gene to proteins and transcript to protein

In [4]:
gene2prot = {} # one to many mapping
trans2prot = {} # one to one mapping

for i, row in ensembl_df.iterrows():
    protein_id = row['Protein stable ID']
    if not pd.isna(protein_id):
        gene_id = row['Gene stable ID']
        trans_id = row['Transcript stable ID']
        if gene_id not in gene2prot:
            gene2prot[gene_id] = set()
        gene2prot[gene_id].add(protein_id)
        trans2prot[trans_id] = protein_id
len(gene2prot), len(trans2prot)


(23393, 104763)

## Function for loading methylation data

In [5]:
def load_meth_data(filename):
    data = {}
    with open(filename) as f:
        for line in f:
            transcripts, meth_val = line.strip().split('\t')
            meth_val = float(meth_val)
            transcripts = transcripts.split(';')
            for tid in transcripts:
                tid = tid.split('.')[0] # Remove transcript version
                if tid in trans2prot:
                    pid = trans2prot[tid]
                    data[pid] = meth_val
    return data
# Example
load_meth_data('cancer_types/TCGA-BRCA/myth/jhu-usc.edu_BRCA.HumanMethylation27.1.lvl-3.TCGA-A8-A081-01A-11D-A00Y-05.gdc_hg38.txt')

{'ENSP00000349595': 0.907436396867636,
 'ENSP00000378879': 0.907436396867636,
 'ENSP00000443101': 0.806026586506443,
 'ENSP00000457798': 0.806026586506443,
 'ENSP00000458035': 0.907436396867636,
 'ENSP00000295951': 0.0198578363714635,
 'ENSP00000295952': 0.0198578363714635,
 'ENSP00000373224': 0.0198578363714635,
 'ENSP00000398661': 0.0198578363714635,
 'ENSP00000412945': 0.0198578363714635,
 'ENSP00000262041': 0.0331624403423048,
 'ENSP00000424734': 0.359350475404284,
 'ENSP00000249440': 0.499272133494444,
 'ENSP00000386498': 0.499272133494444,
 'ENSP00000392615': 0.359350475404284,
 'ENSP00000227638': 0.11302520778643402,
 'ENSP00000411461': 0.11302520778643402,
 'ENSP00000340568': 0.766327263232795,
 'ENSP00000256339': 0.766327263232795,
 'ENSP00000269159': 0.0137876162072788,
 'ENSP00000372867': 0.0137876162072788,
 'ENSP00000464767': 0.0137876162072788,
 'ENSP00000465416': 0.0137876162072788,
 'ENSP00000466059': 0.0137876162072788,
 'ENSP00000465938': 0.0137876162072788,
 'ENSP000

## Function for loading gene data

In [6]:
def load_gene_data(filename, is_gzip=False):
    mode = 'r'
    fopen = open
    if is_gzip:
        mode = 'rt'
        fopen = gzip.open
    data = {}
    with fopen(filename, mode) as f:
        for line in f:
            try:
                gene_id, val = line.strip().split('\t')
                gene_id = gene_id.split('.')[0]
                val = float(val)
                if gene_id in gene2prot:
                    for prot_id in gene2prot[gene_id]:
                        data[prot_id] = val
            except Exception as e:
                print(e)
                print(filename)
                print(line)
    return data

load_gene_data('cancer_types/TCGA-ACC/diff_exp/diff_807147a0-dadd-4df3-bb2f-ac7515346efb.FPKM-UQ.txt.gz', is_gzip=True)

{'ENSP00000471803': 24792.83031854361,
 'ENSP00000367557': 24792.83031854361,
 'ENSP00000471504': 24792.83031854361,
 'ENSP00000349560': 24792.83031854361,
 'ENSP00000470246': 24792.83031854361,
 'ENSP00000179259': 266.8177924280659,
 'ENSP00000425253': 330.9708997577152,
 'ENSP00000424699': 330.9708997577152,
 'ENSP00000427604': 330.9708997577152,
 'ENSP00000274811': 330.9708997577152,
 'ENSP00000440352': 330.9708997577152,
 'ENSP00000394245': 290.88461338050644,
 'ENSP00000261383': 290.88461338050644,
 'ENSP00000389103': 289.82062668814024,
 'ENSP00000378396': 289.82062668814024,
 'ENSP00000468402': 289.82062668814024,
 'ENSP00000347969': 289.82062668814024,
 'ENSP00000465342': 289.82062668814024,
 'ENSP00000466425': 289.82062668814024,
 'ENSP00000468172': 289.82062668814024,
 'ENSP00000402996': 961.110635599542,
 'ENSP00000400831': 961.110635599542,
 'ENSP00000256496': 961.110635599542,
 'ENSP00000405196': 961.110635599542,
 'ENSP00000307508': 2610.4885970059545,
 'ENSP00000454639':

# Reading preprocessed data into a dataframe

In [7]:
cancer_types = [
    "TCGA-ACC", "TCGA-BLCA", "TCGA-BRCA", "TCGA-CESC",
    "TCGA-CHOL", "TCGA-COAD", "TCGA-DLBC", "TCGA-ESCA",
    "TCGA-GBM", "TCGA-HNSC", "TCGA-KICH", "TCGA-KIRC",
    "TCGA-KIRP", "TCGA-LAML","TCGA-LGG","TCGA-LIHC",
    "TCGA-LUAD","TCGA-LUSC","TCGA-MESO","TCGA-OV",
    "TCGA-PAAD","TCGA-PCPG","TCGA-PRAD","TCGA-READ",
    "TCGA-SARC","TCGA-SKCM","TCGA-STAD","TCGA-TGCT",
    "TCGA-THCA","TCGA-THYM","TCGA-UCEC","TCGA-UCS","TCGA-UVM"]

for i, cancer_type in enumerate(cancer_types):
    clinical_duration = [] # for clinical data (i.e. number of days to survive, days to death for dead patients and days to last followup for alive patients)
    feature_vecs = [] # list of lists ([[patient1],[patient2],.....[patientN]]) -- [patientX] = [gene_expression_value, diff_gene_expression_value, methylation_value, diff_methylation_value, VCF_value, CNV_value]
    survival = [] # list that include wheather a patient is alive or dead (i.e. 0 for dead and 1 for alive)
    # if os.path.exists('data/' + cancer_type + '.pkl'):
    #     continue
    with open('mapping_files/' + cancer_type + '.tsv') as f:
        next(f) # Ignore headers
        for line in f:
            it = line.strip().split('\t')
            duration = int(it[6])
            surv = int(it[2])
            meth_data = load_meth_data(f'cancer_types/{cancer_type}/myth/' + it[3])
            diff_meth_data = load_meth_data(f'cancer_types/{cancer_type}/diff_myth/' + it[1])
            exp_norm_data = load_gene_data(f'cancer_types/{cancer_type}/exp_upper/' + it[-1], is_gzip=True)
            diff_exp_norm_data = load_gene_data(f'cancer_types/{cancer_type}/diff_exp/' + it[0], is_gzip=True)
            cnv_data = load_gene_data(f'cancer_types/{cancer_type}/cnv/' + it[4] + '.txt')
            vcf_data = load_gene_data(f'cancer_types/{cancer_type}/vcf/output/' + 'OutputAnnoFile_' + it[5] + '.hg38_multianno.txt.dat')
            features = np.zeros((len(proteins), 6), dtype=np.float32)
            # print(len(meth_data), len(diff_meth_data), len(exp_norm_data), len(diff_exp_norm_data), len(cnv_data), len(vcf_data))
            for protein, pid in proteins.items():
                if protein in meth_data:
                    features[pid, 0] = meth_data[protein]
                if protein in diff_meth_data:
                    features[pid, 1] = diff_meth_data[protein]
                if protein in exp_norm_data:
                    features[pid, 2] = exp_norm_data[protein]
                if protein in diff_exp_norm_data:
                    features[pid, 3] = diff_exp_norm_data[protein]
                if protein in cnv_data:
                    features[pid, 4] = cnv_data[protein]
                if protein in vcf_data:
                    features[pid, 5] = vcf_data[protein]
            clinical_duration.append(duration)
            survival.append(surv)
            feature_vecs.append(features)
    df = pd.DataFrame({'survival': survival, 'duration': clinical_duration, 'features': feature_vecs})
    df.to_pickle('data/' + cancer_type + '.pkl')
    print('Saved data for ' + cancer_type)
    

Saved data for TCGA-ACC
not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-BLCA/vcf/output/OutputAnnoFile_8378fb20-7bbd-4ecb-ab35-805a9a6ca541.vep.hg38_multianno.txt.dat
	0.31

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-BLCA/vcf/output/OutputAnnoFile_40e20600-3ced-4182-80ba-ef6acfd67a53.vep.hg38_multianno.txt.dat
	0.34

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-BLCA/vcf/output/OutputAnnoFile_999cdb6d-e130-428c-8257-c0b9cd84ac5b.vep.hg38_multianno.txt.dat
	0.5

Saved data for TCGA-BLCA
not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-BRCA/vcf/output/OutputAnnoFile_2c91bbc6-1287-448c-bc13-1a13af6cd497.vep.hg38_multianno.txt.dat
	0.25

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-BRCA/vcf/output/OutputAnnoFile_0f666429-2cef-4868-82c8-d31bc2b9d09e.vep.hg38_multianno.txt.dat
	-0.29

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-BRCA/vcf/output/OutputAnnoFile_1faa7dab-340f-

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-SKCM/vcf/output/OutputAnnoFile_6f141884-53d7-4f35-b956-9fd6e50e876c.vep.hg38_multianno.txt.dat
	0.53

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-SKCM/vcf/output/OutputAnnoFile_6f141884-53d7-4f35-b956-9fd6e50e876c.vep.hg38_multianno.txt.dat
	0.53

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-SKCM/vcf/output/OutputAnnoFile_0cc74a73-2111-4d43-80bf-83626048a437.vep.hg38_multianno.txt.dat
	-0.19

Saved data for TCGA-SKCM
not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-STAD/vcf/output/OutputAnnoFile_34787987-e997-4952-99f1-e2de353272fe.vep.hg38_multianno.txt.dat
	0.51

not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-STAD/vcf/output/OutputAnnoFile_372f867b-08d6-4aa2-bc53-142a994e0e06.vep.hg38_multianno.txt.dat
	0.45

Saved data for TCGA-STAD
Saved data for TCGA-TGCT
not enough values to unpack (expected 2, got 1)
cancer_types/TCGA-THCA/vcf/output/Ou