In [1]:
import pandas as pd
import numpy as np
import time
import pickle as pkl

In [4]:

def get_healthy_tissue_gene_exp_df(sample_attr_DS_path,gtex_tcga_path,tissue_str):
    r'''
    Load the expression columns of every sample annotated with one tissue type.

    Inputs:
        sample_attr_DS_path: path to a tab-separated file that looks like
            'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'. Only its 'SAMPID'
            (sample ID) and 'SMTS' (tissue type) columns are read.
        gtex_tcga_path: path to a tab-separated file that looks like
            'D:\Downloads\TcgaTargetGtex_RSEM_isoform_fpkm\TcgaTargetGtex_RSEM_isoform_fpkm'.
            Its header holds a 'sample' column followed by one column per sample ID.
        tissue_str: value of the 'SMTS' column to select (exact match), e.g. 'Breast'.
    Output:
        gtex_tcga_df: data frame with different genes as rows, and different samples as
            columns. It keeps the 'sample' identifier column plus every column whose name
            is a sample ID of the requested tissue; columns appear in file order.
            Sample IDs absent from the expression file are silently skipped, so an
            unknown tissue_str yields a frame holding only the 'sample' column.

    Note: this docstring is a raw string because the example Windows path above contains
    backslashes that would otherwise be parsed as (invalid) escape sequences.
    '''

    # The attributes file maps each sample ID to the tissue type it was taken from.
    attributes_df = pd.read_csv(sample_attr_DS_path, sep='\t', usecols=['SAMPID', 'SMTS'])
    is_requested_tissue = attributes_df['SMTS'] == tissue_str
    wanted_columns = set(attributes_df.loc[is_requested_tissue, 'SAMPID'])
    # 'sample' is the identifier column of the expression file; always ask for it.
    wanted_columns.add('sample')

    # Not every annotated sample is present in the combined GTEX+TCGA file, so keep only
    # the names that really occur in its header. nrows=0 parses the header line alone
    # instead of a full (very wide) data row.
    header_columns = pd.read_csv(gtex_tcga_path, sep='\t', nrows=0).columns
    cols_to_read = [column for column in header_columns if column in wanted_columns]

    # Read the expression values of just the selected columns; this is the slow step.
    print("starting, this takes a while")
    start_time = time.perf_counter()  # monotonic clock, unaffected by system clock changes
    gtex_tcga_df = pd.read_csv(gtex_tcga_path, sep='\t', usecols=cols_to_read)
    print('time elapsed:', time.perf_counter() - start_time)

    return gtex_tcga_df

In [75]:
# Input files, given relative to the directory this notebook runs in.
# The gene-level FPKM table is used here; an isoform-level table was previously read
# from 'D:\Downloads\TcgaTargetGtex_RSEM_isoform_fpkm\TcgaTargetGtex_RSEM_isoform_fpkm'.
sampleAttributesDS_path = '../GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'
gtexTCGA_path = '../TcgaTargetGtex_rsem_gene_fpkm'

# Pull the expression columns of every sample whose tissue type (SMTS) is 'Breast'.
# The helper prints its own elapsed time, since the read is slow.
breast_gene_exp_df = get_healthy_tissue_gene_exp_df(
    sampleAttributesDS_path,
    gtexTCGA_path,
    'Breast',
)

# Preview the first rows as the cell output.
breast_gene_exp_df.head()

starting, this takes a while
time elapsed: 62.54576086997986


Unnamed: 0,sample,GTEX-ZA64-1526-SM-5CVMD,GTEX-X4EP-2926-SM-3P5YQ,GTEX-S7SE-0826-SM-4AT4D,GTEX-XMD1-0826-SM-4AT52,GTEX-11P81-1926-SM-5BC53,GTEX-13FTY-2226-SM-5J1ND,GTEX-Y5V6-2126-SM-4WWFX,GTEX-X261-0626-SM-3NMD9,GTEX-S32W-2026-SM-4AD6E,...,GTEX-ZAJG-0626-SM-5HL8X,GTEX-ZXES-0826-SM-5E43C,GTEX-TML8-1226-SM-32QON,GTEX-13NZ9-1026-SM-5MR5K,GTEX-139T8-0826-SM-5L3DE,GTEX-12WSN-1326-SM-5GCNT,GTEX-REY6-2426-SM-48FF5,GTEX-U3ZH-1426-SM-4DXSR,GTEX-11P7K-0726-SM-5EGKX,GTEX-1117F-2826-SM-5GZXL
0,ENSG00000242268.2,-1.9379,-3.6259,-1.0262,-9.9658,-0.9686,-3.0469,0.0014,-9.9658,-1.5105,...,-9.9658,-9.9658,-0.8863,-3.1714,-9.9658,-2.5479,-1.9379,-9.9658,-9.9658,-3.816
1,ENSG00000259041.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
2,ENSG00000270112.3,-3.6259,-9.9658,-9.9658,-5.5735,-6.5064,-6.5064,-4.035,-9.9658,-5.5735,...,-9.9658,-6.5064,-9.9658,-6.5064,-5.0116,-9.9658,-3.0469,-5.5735,-6.5064,-9.9658
3,ENSG00000167578.16,4.5589,4.6877,5.3045,5.3176,4.1684,5.1441,4.9117,4.4881,5.124,...,4.9561,4.4135,4.7005,5.1065,5.1831,5.2612,4.5608,4.9855,4.6554,4.7274
4,ENSG00000278814.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658


In [76]:
# Save the breast expression table next to the notebook. With to_csv's defaults the
# row index is written too, as the first (unnamed) column of the file.
breast_gene_exp_df.to_csv(path_or_buf='breast_gene_exp_healthy.csv')

In [None]:
# Show which columns were actually read from the expression file. The name
# `cols_to_read` only exists as a local inside get_healthy_tissue_gene_exp_df, so
# referencing it here raises NameError on a fresh kernel; the columns of the returned
# frame carry the same information.
breast_gene_exp_df.columns