In [62]:
import pandas as pd
import numpy as np
import time

In [74]:

def get_healthy_tissue_gene_exp_df(sample_attr_DS_path,gtex_tcga_path,tissue_str):
    r'''
    Load the expression columns of every sample annotated with one tissue type.

    Inputs:
        sample_attr_DS_path: path to a tab-delimited file that looks like
            'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'. Only its
            SAMPID (sample ID) and SMTS (tissue type) columns are read.
        gtex_tcga_path: path to a tab-delimited file that looks like
            'D:\Downloads\TcgaTargetGtex_RSEM_isoform_fpkm\TcgaTargetGtex_RSEM_isoform_fpkm'.
            Its header row is expected to hold a 'sample' column followed by
            one column per sample ID.
        tissue_str: value of the SMTS column to select (exact match), e.g. 'Lung'.
    Output:
        gtex_tcga_df: data frame with one row per row of the expression file
            and one column per selected sample, plus the 'sample' identifier
            column when the file has one. Columns keep the order they have in
            the file. Sample IDs that are absent from the expression file are
            silently skipped.
    '''

    #Read in attributes file. This file contains a map from Sample ID to Tissue type of sample.
    #We read in two columns only, SAMPID (Sample ID) and SMTS
    attributesDS_df = pd.read_csv(sample_attr_DS_path,delimiter='\t',usecols=['SAMPID','SMTS'])

    #Sample ids annotated with the requested tissue type
    is_requested_tissue = attributesDS_df['SMTS']==tissue_str
    sample_ids_to_use = attributesDS_df.loc[is_requested_tissue,'SAMPID'].to_numpy()

    #We'll prepend 'sample' so that wanted_cols names every column we would like to read
    #from the GTEX+TCGA combined file.
    wanted_cols = np.append(np.array(['sample']),sample_ids_to_use)

    #Not all of the GTEX samples are present in the GTEX+TCGA combined file, so we need the set intersection
    #between the sample ids in the sample_attr file and the columns of the gtex_tcga file.
    #nrows=0 parses the header line only; no data row is needed to learn the column names.
    gtex_tcga_header = pd.read_csv(gtex_tcga_path,delimiter='\t',nrows=0)
    cols_to_read = np.intersect1d(wanted_cols,gtex_tcga_header.columns).tolist()

    #Now, we'll read the TCGA+GTEX file for the expression of only the specific samples we need.
    #The read must use the caller-supplied path -- the same file whose header was inspected above.
    print("starting, this takes a while")
    start_time = time.time()
    gtex_tcga_df = pd.read_csv(gtex_tcga_path,delimiter='\t',usecols=cols_to_read)
    print('time elapsed:', time.time() - start_time)

    return gtex_tcga_df

In [75]:
# Input locations for this run. The sample-attributes file is expected next to the notebook;
# the combined expression file lives at an absolute path on the local machine.
sampleAttributesDS_path = './GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'
# Raw string: '\D' and '\T' are not valid escape sequences, so a plain literal triggers a
# SyntaxWarning on current Python. The resulting path value is unchanged.
gtexTCGA_path = r'D:\Downloads\TcgaTargetGtex_RSEM_isoform_fpkm\TcgaTargetGtex_RSEM_isoform_fpkm'


# Load the expression columns for samples whose tissue type is 'Lung'.
# The recorded run of this cell took about six minutes.
lung_gene_exp_df = get_healthy_tissue_gene_exp_df(sample_attr_DS_path=sampleAttributesDS_path,
                                                  gtex_tcga_path=gtexTCGA_path,
                                                  tissue_str='Lung')


# Preview the first rows as the cell's output.
lung_gene_exp_df.head()

starting, this takes a while
time elapsed: 365.52924728393555


Unnamed: 0,sample,GTEX-1399S-1726-SM-5L3DI,GTEX-11EI6-0826-SM-5985V,GTEX-117YW-0526-SM-5H11C,GTEX-1122O-0126-SM-5GICA,GTEX-1211K-0826-SM-5FQUP,GTEX-11EQ9-0226-SM-5A5JX,GTEX-13111-0426-SM-5DUXR,GTEX-145LT-0326-SM-5LUAD,GTEX-11EMC-0126-SM-5EGKV,...,GTEX-132QS-0726-SM-5IJE9,GTEX-13QJC-0526-SM-5RQKB,GTEX-12BJ1-1026-SM-5EGJA,GTEX-131XW-1126-SM-5EGK4,GTEX-11NSD-0326-SM-5A5LS,GTEX-13YAN-1026-SM-5O9CF,GTEX-13O61-0726-SM-5J2MD,GTEX-13PVQ-0926-SM-5IJFD,GTEX-132NY-1226-SM-5PNVF,GTEX-131XE-0726-SM-5HL9K
0,ENST00000548312.5,-3.3076,-1.9379,-2.7274,-9.9658,-2.4659,-3.1714,-9.9658,-9.9658,-4.035,...,-3.3076,-9.9658,-3.3076,-4.035,-2.4659,-2.3147,-3.816,-1.7809,-9.9658,-4.035
1,ENST00000483781.5,-0.2328,1.4281,0.6332,-1.7322,-9.9658,-0.4719,-4.2934,-1.3921,0.537,...,-2.9324,1.7702,-1.9942,1.6649,0.4125,0.1124,-0.3383,-0.5543,-1.8836,0.688
2,ENST00000535093.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
3,ENST00000338863.11,3.9166,4.0269,4.5699,3.8905,4.1563,3.9819,3.5887,4.5374,4.3876,...,4.4881,3.6077,4.284,4.3618,3.5754,4.4628,4.3477,4.3772,3.4648,3.728
4,ENST00000570899.1,-2.4659,-9.9658,-9.9658,-2.5479,-9.9658,-9.9658,-1.4699,-9.9658,-9.9658,...,-2.9324,-9.9658,-9.9658,-9.9658,-2.5479,-9.9658,-9.9658,-9.9658,-9.9658,-2.3147
