# info
This script generates a file with filtered DEGs derived from patient-specific DEG calling.

Filters are:
1. Filter on P < 0.05 and Q < 0.1 for DESeq2.
2. Filter out genes that in the comparison do not cross the 2 TPM barrier.
3. Discrete abs log2 FC > 1

Coherent DEGs are the ones that pass all three filters in n-1 patients.

In [1]:
import pandas, os, numpy, pybiomart

# 0. user-defined variables

In [2]:
expression_threshold = 2
discrete_fc_threshold = 1
noise_threshold = 1/2

In [3]:
hypotheses = ['hypothesis_A', 'hypothesis_B', 'hypothesis_E']
trends = ['up', 'down']

In [4]:
metadata_file = '/home/adrian/projects/hegoi/metadata/hegoi metadata - hypotheses formatted for filter.tsv'
#####!!!! the following line needs to change
tpm_file = '/home/adrian/projects/hegoi/results/tpm/DESeq2_TPM_values.tsv'
sleuth_folder = '/home/adrian/projects/hegoi/results/subsamples/sleuth/'

output_dir = '/home/adrian/projects/hegoi/results/subsamples/sleuth/filtered/'
filtered_DEGs_file = '/home/adrian/projects/hegoi/results/subsamples/sleuth/filtered/filtered_DEGs.tsv'

# 1. build annotation

In [5]:
dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
annotation = dataset.query(attributes=['ensembl_gene_id', 'entrezgene_id', 'gene_biotype', 'external_gene_name', 'description'])
annotation.fillna(value={'NCBI gene (formerly Entrezgene) ID':-1}, inplace=True)
annotation = annotation.astype({'NCBI gene (formerly Entrezgene) ID':'int'})
annotation.set_index('Gene stable ID', inplace=True)
annotation

Unnamed: 0_level_0,NCBI gene (formerly Entrezgene) ID,Gene type,Gene name,Gene description
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000210049,-1,Mt_tRNA,MT-TF,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...
ENSG00000211459,-1,Mt_rRNA,MT-RNR1,mitochondrially encoded 12S rRNA [Source:HGNC ...
ENSG00000210077,-1,Mt_tRNA,MT-TV,mitochondrially encoded tRNA-Val (GUN) [Source...
ENSG00000210082,-1,Mt_rRNA,MT-RNR2,mitochondrially encoded 16S rRNA [Source:HGNC ...
ENSG00000209082,-1,Mt_tRNA,MT-TL1,mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...
...,...,...,...,...
ENSG00000162543,127733,protein_coding,UBXN10,UBX domain protein 10 [Source:HGNC Symbol;Acc:...
ENSG00000134686,1912,protein_coding,PHC2,polyhomeotic homolog 2 [Source:HGNC Symbol;Acc...
ENSG00000159023,2035,protein_coding,EPB41,erythrocyte membrane protein band 4.1 [Source:...
ENSG00000198216,777,protein_coding,CACNA1E,calcium voltage-gated channel subunit alpha1 E...


# 1. read input files

## 1.1 read expression data

In [6]:
expression = pandas.read_csv(tpm_file, sep='\t', index_col=0)
sample_names = expression.columns.to_list()

expression.shape

(40173, 26)

## 1.2. read metadata

In [7]:
metadata = pandas.read_csv(metadata_file, sep='\t', index_col=0)
metadata.head()

Unnamed: 0_level_0,patient,sampleA,sampleB
hypothesis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hypothesis_A,46,Stat46,Lami46
hypothesis_A,153,Stat153,Lam153
hypothesis_A,154,Stat154,Lami154
hypothesis_A,94,Stat94,Lami94
hypothesis_B,46,Lami46,Osci46


# 2. iterate over hypotheses and patients

In [8]:
annotation.columns

Index(['NCBI gene (formerly Entrezgene) ID', 'Gene type', 'Gene name',
       'Gene description'],
      dtype='object')

In [12]:
elements

['hypothesis_A_153.csv',
 'hypothesis_D_154.csv',
 'hypothesis_A_154.csv',
 'hypothesis_B_153.csv',
 'hypothesis_C_154.csv',
 'hypothesis_B_154.csv',
 'hypothesis_D_46.csv',
 'hypothesis_D_94.csv',
 'hypothesis_D_153.csv',
 'hypothesis_B_46.csv',
 'hypothesis_E_46.csv',
 'hypothesis_C_109.csv',
 'hypothesis_A_94.csv',
 'hypothesis_E_154.csv',
 'hypothesis_C_153.csv',
 'hypothesis_E_153.csv',
 'hypothesis_E_109.csv',
 'hypothesis_C_46.csv',
 'hypothesis_A_46.csv']

In [23]:
### 1. iterate over hypothesis
for hypothesis in hypotheses:
    
    print('working with {}'.format(hypothesis))
    box = {}; predf = {}

    ### 3. gather patients
    elements = os.listdir(sleuth_folder)
    working_elements = [element for element in elements if hypothesis in element]
    number_of_patients = len(working_elements)

    print(working_elements, len(working_elements))

    ### 4. gather information

    ## 4.1. get adjusted P values and log2FC
    for working_element in working_elements:

        ## 4.1. define reference and sample expression labels
        patient = int(working_element.split('_')[2].split('.csv')[0])
        ref_label = None; sam_label = None
        ref_label = metadata[metadata['patient'] == patient].loc[hypothesis]['sampleA']
        sam_label = metadata[metadata['patient'] == patient].loc[hypothesis]['sampleB']

        ## 4.2. get adjusted P values and log2FC
        f = open(sleuth_folder + working_element, 'r')
        print(sleuth_folder + working_element)
        next(f)
        for line in f:
            v = line.split(',')
            ensembl = v[1].replace('"', '')
            pvalue = float(v[-2].replace('"', ''))
            qvalue = float(v[-1].replace('"', ''))
#             print(v)
#             print(ensembl)
#             print(pvalue)
#             print(qvalue)
#             asdf

            if ensembl not in box:
                box[ensembl]=[[pvalue], [qvalue], [], [], []]
            else:
                box[ensembl][0].append(pvalue); box[ensembl][1].append(qvalue)   

            ### 4.3. get reference and sample expression   
            tpm_ref = expression[ref_label].loc[ensembl]
            tpm_sam = expression[sam_label].loc[ensembl]

            num = numpy.around(tpm_sam) + 1
            den = numpy.around(tpm_ref) + 1
            fc = num/den
            abs_discrete_log2FC = numpy.abs(numpy.log2(fc))

            box[ensembl][2].append(tpm_ref)
            box[ensembl][3].append(tpm_sam)
            box[ensembl][4].append(abs_discrete_log2FC)

        # late file closure
        f.close()

    ### 5. perform filters
    calls = len(box.keys())
    consistency_filter_count = 0
    low_expression_filter_count = 0
    discrete_filter_count = 0
    noise_filter_count = 0

    for gene in box:

        selected = True

        ## F1 | consistency across patients, DEG called at least n - 1
        if len(box[gene][0]) < number_of_patients - 1:
            selected = False
            consistency_filter_count = consistency_filter_count + 1

        ## F2 | filter on low expression
        if selected == True:
            a = numpy.median(box[gene][2])
            b = numpy.median(box[gene][3])
            top = numpy.max([a, b])
            if top < expression_threshold:
                selected = False
                low_expression_filter_count = low_expression_filter_count + 1

        ## F3 | filter on abs discrete log2 FC
        if selected == True:
            if numpy.median(box[gene][4]) < discrete_fc_threshold:
                selected = False
                discrete_filter_count = discrete_filter_count + 1

        ## F4 | filter on noise 
        if selected == True:

            a = box[gene][2]; b = box[gene][3]
            a = numpy.around(a); b = numpy.around(b)

            if numpy.mean(a) > 2:
                sem_ref = numpy.std(a) / numpy.sqrt(len(a))
                rsem_ref = sem_ref / numpy.mean(a)
            else:
                rsem_ref = 0

            if numpy.mean(b) > 2:
                sem_sam = numpy.std(b) / numpy.sqrt(len(b))
                rsem_sam = sem_sam / numpy.mean(b)
            else:
                rsem_sam = 0

            noise = numpy.max([rsem_ref, rsem_sam])

            if noise > noise_threshold:
                selected = False
                noise_filter_count = noise_filter_count + 1 
                print('\t', 'WARNING: removing due to noise {}'.format(noise))
                print('\t', gene, annotation.loc[gene, 'Gene name'], annotation.loc[gene, 'Gene description'])
                print('\t', 'ref', a, rsem_ref)
                print('\t', 'sam', b, rsem_sam)
                print()
        ###
        if selected == True:
            predf[gene] = []
            predf[gene].append(numpy.median(box[gene][0])) # pvalue
            predf[gene].append(numpy.median(box[gene][1])) # qvalue
            predf[gene].append(numpy.median(box[gene][2])) # ref
            predf[gene].append(numpy.median(box[gene][3])) # sam
            predf[gene].append(numpy.median(box[gene][4])) # abs discrete log2FC                  

    ### final print        
    print('DEGs union across patients \t {}'.format(calls))
    print('working set after n - 1 consistency \t {}'.format(calls-consistency_filter_count))
    print('working set after low-expression filter \t {}'.format(calls-consistency_filter_count-low_expression_filter_count))
    print('working set after discrete FC filter \t {}'.format(calls-consistency_filter_count-low_expression_filter_count-discrete_filter_count))
    print('working set after noise filter \t {}'.format(calls-consistency_filter_count-low_expression_filter_count-discrete_filter_count-noise_filter_count))

    ### generate a dataframe and store
    outputfile = output_dir + 'filtered_' + hypothesis + '_' + trend + '.tsv'

    df = pandas.DataFrame.from_dict(predf)
    dft = df.transpose()
    dft.index.name = 'ENSEMBL'
    dft.columns = ['P value', 'Q value', 'Reference expression (TPM)', 'Sample expression (TPM)', 'Discrete abs(log2FC)']
    dft.sort_values(by='Discrete abs(log2FC)', axis=0, inplace=True, ascending=False)
    print(dft.shape)

    fdf = pandas.merge(dft, annotation.loc[dft.index, ], left_index=True, right_index=True)
    fdf['Gene description'] = fdf['Gene description'].str.split(' \[Source').str.get(0)
    fdf = fdf[['NCBI gene (formerly Entrezgene) ID', 'Gene type', 'Gene name', 'Gene description', 'P value', 'Q value', 'Reference expression (TPM)', 'Sample expression (TPM)', 'Discrete abs(log2FC)']]
    print('about to store a dataframe of shape {}'.format(fdf.shape))

    fdf.to_csv(outputfile)

    print('--------------------------------------')

working with hypothesis_A
['hypothesis_A_153.csv', 'hypothesis_A_154.csv', 'hypothesis_A_94.csv', 'hypothesis_A_46.csv'] 4
/home/adrian/projects/hegoi/results/subsamples/sleuth/hypothesis_A_153.csv


KeyError: 'ENSG00000291237'

In [11]:
dft

