# info
This script generates a file with filtered DEGs derived from patient-specific DEG calling.

Filters are:
1. Filter on P < 0.05 and Q < 0.1 for DESeq2; Q < 0.05 in sleuth.
2. Filter out genes whose expression does not reach 2 TPM in either condition of the comparison.
3. Discrete abs log2 FC > 1

Coherent DEGs are the ones that pass all three filters in at least n-1 patients.

In [1]:
import pandas, os, numpy

# 0. user-defined variables

In [2]:
# A gene is dropped when the larger of its median reference / median sample
# TPM is below this value.
expression_threshold = 2

# A gene is dropped when its median abs(log2 FC), computed on rounded TPM + 1,
# is below this value.
discrete_fc_threshold = 1

# A gene is dropped when the relative standard error of the mean of either
# condition exceeds this value.
noise_threshold = 0.5

In [3]:
# Direction labels; each one is matched as a substring of the DESeq2 file names.
trends = ['up', 'down']

# Hypothesis labels; used both to select DESeq2 files and to index the metadata.
hypotheses = [
    'hypothesis_A',
    'hypothesis_B',
    'hypothesis_C',
    'hypothesis_D',
    'hypothesis_E',
]

In [4]:
# outputs: folder that receives one annotated table per hypothesis and trend
output_dir = '/home/adrian/projects/hegoi/results/subsamples/DEG_filtered/'
# NOTE(review): not referenced in the visible part of this notebook
filtered_DEGs_file = '/home/adrian/projects/hegoi/results/subsamples/DEG_filtered/filtered_DEGs.tsv'

# inputs: per-patient DESeq2 result files (trailing slash required, the file
# name is appended by plain string concatenation), TPM table and metadata
deseq2_folder = '/home/adrian/projects/hegoi/results/subsamples/DESeq2/'
tpm_file = '/home/adrian/projects/hegoi/results/tpm/DESeq2_TPM_values.tsv'
metadata_file = '/home/adrian/projects/hegoi/metadata/hegoi metadata - hypotheses formatted for filter.tsv'

# 1. read input files

## 1.1. read expression data

In [5]:
# Load the tab-separated TPM table; the first column becomes the row index
# and every remaining column is one sample.
expression = pandas.read_csv(tpm_file, index_col=0, sep='\t')

# Sample labels, in file order.
sample_names = list(expression.columns)

print(sample_names)
expression.head()

['Lam153', 'Lami154', 'Lami46', 'Lami94', 'LamiP109', 'LamiP153', 'LamiP154', 'LamiP176', 'LamiPi46', 'LamiPi94', 'Osci109', 'Osci153', 'Osci154', 'Osci46', 'OsciP109', 'OsciP153', 'OsciP154', 'OsciP175', 'OsciP178', 'OsciPi46', 'Stat109', 'Stat153', 'Stat154', 'Stat176', 'Stat46', 'Stat94']


Unnamed: 0,Lam153,Lami154,Lami46,Lami94,LamiP109,LamiP153,LamiP154,LamiP176,LamiPi46,LamiPi94,...,OsciP154,OsciP175,OsciP178,OsciPi46,Stat109,Stat153,Stat154,Stat176,Stat46,Stat94
ENSG00000000003,24.240953,20.242011,22.919919,15.438227,18.098457,29.000777,14.784503,11.903633,26.217884,18.455471,...,38.756393,15.976121,21.649331,28.426877,29.688663,29.207002,28.989493,28.83238,34.600089,27.971771
ENSG00000000005,0.06296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013958,0.0,0.139231
ENSG00000000419,59.982918,62.984443,120.871295,138.059565,115.707396,84.892,172.643218,147.901176,145.183145,135.750959,...,98.473591,161.024012,86.023654,115.321222,64.46468,81.686544,84.388415,88.740844,81.992995,97.987628
ENSG00000000457,4.680221,3.684855,1.989328,3.244174,4.417648,2.189278,6.217483,3.998143,4.786793,2.949599,...,3.012943,1.024624,2.255165,2.967113,1.825466,2.371161,3.513856,2.617785,2.546863,2.668982
ENSG00000000460,1.98269,1.344508,1.422365,1.667403,0.570064,2.856698,0.416009,0.0,1.310974,1.812507,...,5.510741,3.805323,1.489214,0.957885,4.467578,5.425887,7.680067,4.002537,3.650609,4.056977


## 1.2. read metadata

In [6]:
# Load the tab-separated metadata table; the first column (the hypothesis
# label) becomes the row index, so rows are later selected with .loc[hypothesis].
metadata = pandas.read_csv(
    metadata_file,
    index_col=0,
    sep='\t',
)
metadata.head()

Unnamed: 0_level_0,patient,sampleA,sampleB
hypothesis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hypothesis_A,46,Stat46,Lami46
hypothesis_A,153,Stat153,Lam153
hypothesis_A,154,Stat154,Lami154
hypothesis_A,94,Stat94,Lami94
hypothesis_B,46,Lami46,Osci46


# 2. iterate over hypotheses and patients

In [12]:
def _relative_sem(values, minimum_mean=2):
    """Return the standard error of the mean of `values` divided by their mean.

    Returns 0 when the mean does not exceed `minimum_mean`, so a condition
    with low expression never triggers the noise filter.
    """
    mean = numpy.mean(values)
    if mean > minimum_mean:
        sem = numpy.std(values) / numpy.sqrt(len(values))
        return sem / mean
    return 0


### 0. list the DESeq2 result files once; this loop only writes into output_dir
elements = os.listdir(deseq2_folder)

# columns in the order in which values are accumulated for each gene
predf_columns = ['log2FC', 'adjusted P', 'Reference expression (TPM)', 'Sample expression (TPM)', 'Discrete abs(log2FC)', 'Biotype', 'Description', 'Symbol']
# columns in the order in which they are stored
ordered_columns = ['Biotype', 'Symbol', 'Description', 'log2FC', 'adjusted P', 'Reference expression (TPM)', 'Sample expression (TPM)', 'Discrete abs(log2FC)']

### 1. iterate over hypothesis
for hypothesis in hypotheses:

    ### 2. iterate over trends
    for trend in trends:

        print('working with {} {}'.format(hypothesis, trend))
        # box: per-gene values pooled across patients; predf: genes passing all filters
        box = {}; predf = {}

        ### 3. gather patients (one DESeq2 file per patient)
        working_elements = [element for element in elements if hypothesis in element and trend in element]
        number_of_patients = len(working_elements)

        print(working_elements, len(working_elements))

        ### 4. gather information
        for working_element in working_elements:

            ## 4.1. define reference and sample expression labels
            # file names look like hypothesis_A_154_up.tsv, the third field is the patient
            patient = int(working_element.split('_')[2])
            patient_row = metadata[metadata['patient'] == patient].loc[hypothesis]
            ref_label = patient_row['sampleA']
            sam_label = patient_row['sampleB']

            # pick the two expression columns once per file instead of once per line
            ref_expression = expression[ref_label]
            sam_expression = expression[sam_label]

            ## 4.2. get adjusted P values and log2FC
            # the context manager closes the file even if a line fails to parse
            with open(deseq2_folder + working_element, 'r') as f:
                next(f)  # skip the header line
                for line in f:
                    v = line.split('\t')
                    ensembl = v[0]
                    log2FC = float(v[2])
                    adjusted = float(v[6])
                    # biotype, description (without the trailing ' [...' part), symbol
                    annotation = (v[7], v[8].split(' [')[0], v[9].replace('\n', ''))

                    if ensembl not in box:
                        # the annotation is taken from the first file that mentions the gene
                        box[ensembl] = {'log2FC': [], 'adjusted': [], 'ref': [], 'sam': [], 'discrete': [], 'annotation': annotation}
                    record = box[ensembl]
                    record['log2FC'].append(log2FC)
                    record['adjusted'].append(adjusted)

                    ### 4.3. get reference and sample expression
                    tpm_ref = ref_expression.loc[ensembl]
                    tpm_sam = sam_expression.loc[ensembl]

                    # discrete fold change: rounded TPM plus a pseudocount of 1
                    num = numpy.around(tpm_sam) + 1
                    den = numpy.around(tpm_ref) + 1
                    abs_discrete_log2FC = numpy.abs(numpy.log2(num / den))

                    record['ref'].append(tpm_ref)
                    record['sam'].append(tpm_sam)
                    record['discrete'].append(abs_discrete_log2FC)

        ### 5. perform filters
        calls = len(box)
        consistency_filter_count = 0
        low_expression_filter_count = 0
        discrete_filter_count = 0
        noise_filter_count = 0

        for gene, record in box.items():

            selected = True

            ## F1 | consistency across patients, DEG called at least n - 1
            if len(record['log2FC']) < number_of_patients - 1:
                selected = False
                consistency_filter_count = consistency_filter_count + 1

            ## F2 | filter on low expression
            if selected:
                top = numpy.max([numpy.median(record['ref']), numpy.median(record['sam'])])
                if top < expression_threshold:
                    selected = False
                    low_expression_filter_count = low_expression_filter_count + 1

            ## F3 | filter on abs discrete log2 FC
            if selected:
                if numpy.median(record['discrete']) < discrete_fc_threshold:
                    selected = False
                    discrete_filter_count = discrete_filter_count + 1

            ## F4 | filter on noise
            if selected:

                rounded_ref = numpy.around(record['ref'])
                rounded_sam = numpy.around(record['sam'])

                # NOTE(review): the minimum mean of 2 used here is hard-coded and
                # independent of expression_threshold; confirm that is intended
                rsem_ref = _relative_sem(rounded_ref)
                rsem_sam = _relative_sem(rounded_sam)

                noise = numpy.max([rsem_ref, rsem_sam])

                if noise > noise_threshold:
                    selected = False
                    noise_filter_count = noise_filter_count + 1
                    print('\t', 'WARNING: removing due to noise {}'.format(noise))
                    print('\t', gene, record['annotation'])
                    print('\t', 'ref', rounded_ref, rsem_ref)
                    print('\t', 'sam', rounded_sam, rsem_sam)
                    print()

            ###
            if selected:
                # checking that FCs have the same sign across all patients
                if min(record['log2FC']) < 0 < max(record['log2FC']):
                    raise ValueError('ERROR: Not all patients have the same trend.')

                # medians across patients, in the order of predf_columns
                predf[gene] = [numpy.median(record[key]) for key in ('log2FC', 'adjusted', 'ref', 'sam', 'discrete')]
                predf[gene].extend(record['annotation'])

            # debugging aid: dump everything gathered for one gene of interest
            if gene == 'ENSG00000187957':
                for element in record.values():
                    print(element)

        ### final print
        remaining = calls
        print('DEGs union across patients \t {}'.format(remaining))
        remaining = remaining - consistency_filter_count
        print('working set after n - 1 consistency \t {}'.format(remaining))
        remaining = remaining - low_expression_filter_count
        print('working set after low-expression filter \t {}'.format(remaining))
        remaining = remaining - discrete_filter_count
        print('working set after discrete FC filter \t {}'.format(remaining))
        remaining = remaining - noise_filter_count
        print('working set after noise filter \t {}'.format(remaining))

        ### generate a dataframe and store
        outputfile = output_dir + 'annotateddf_' + hypothesis + '_' + trend + '.tsv'

        # one row per gene; naming the columns at construction time keeps the
        # frame well-formed (0 rows, 8 columns) when no gene passes the filters,
        # and dtype=object keeps the values exactly as they were accumulated
        dft = pandas.DataFrame.from_dict(predf, orient='index', columns=predf_columns, dtype=object)
        print('about to store a dataframe of shape {}'.format(dft.shape))

        dft = dft[ordered_columns]
        dft.index.name = 'ENSEMBL'
        dft = dft.sort_values(by='Discrete abs(log2FC)', axis=0, ascending=False)
        # remove duplicates: for each symbol keep the row with the largest discrete FC
        # NOTE(review): rows sharing an identical (e.g. blank) symbol collapse into one
        dft = dft.drop_duplicates(subset=['Symbol'], keep='first')

        # NOTE(review): to_csv is called without sep, so the file is comma-separated
        # despite its .tsv extension; left unchanged because the readers of these
        # files are not visible here
        dft.to_csv(outputfile)

        print('--------------------------------------')

working with hypothesis_A up
['hypothesis_A_154_up.tsv', 'hypothesis_A_46_up.tsv', 'hypothesis_A_94_up.tsv', 'hypothesis_A_153_up.tsv'] 4
	 ENSG00000115461 ('protein_coding', 'insulin like growth factor binding protein 5', 'IGFBP5')
	 ref [0. 0. 0. 1.] 0
	 sam [423.   7.   4. 105.] 0.6356594003488406

	 ENSG00000187922 ('protein_coding', 'lipocalin 10', 'LCN10')
	 ref [0. 0. 0. 0.] 0
	 sam [130.   1.   3.  49.] 0.5715217447490105

	 ENSG00000198643 ('protein_coding', 'family with sequence similarity 3 member D', 'FAM3D')
	 ref [0. 0. 0.] 0
	 sam [41.  1. 12.] 0.5411718220815724

	 ENSG00000159167 ('protein_coding', 'stanniocalcin 1', 'STC1')
	 ref [1. 1. 1. 0.] 0
	 sam [85.  5.  7. 11.] 0.6214059902060192

[1.95227257407968, 4.84352418008654, 2.27817707497821, 2.31620311503187]
[0.0144980013881868, 3.13732002297953e-06, 0.00219315819912681, 0.00322259628009699]
[0.6378924, 0.04994954, 0.4161218, 0.602106]
[2.31244, 1.031262, 1.981396, 3.165025]
[0.5849625007211562, 1.0, 1.5849625007211

	 ENSG00000115457 ('protein_coding', 'insulin like growth factor binding protein 2', 'IGFBP2')
	 ref [0. 2. 7.] 0.5665577237325317
	 sam [ 0. 13. 34.] 0.516222422271712

	 ENSG00000273622 ('protein_coding', 'CDC42 effector protein 5', 'CDC42EP5')
	 ref [6. 2. 0.] 0.5400617248673217
	 sam [75. 38. 34.] 0.21747438589313992

	 ENSG00000100321 ('protein_coding', 'synaptogyrin 1', 'SYNGR1')
	 ref [5. 0. 2.] 0.5084322977157767
	 sam [15.  3.  9.] 0.31426968052735443

	 ENSG00000276409 ('protein_coding', 'C-C motif chemokine ligand 14', 'CCL14')
	 ref [ 0. 15.  1.] 0.7412686197773832
	 sam [217. 116. 130.] 0.16714141683225217

DEGs union across patients 	 5038
working set after n - 1 consistency 	 164
working set after low-expression filter 	 91
working set after discrete FC filter 	 89
working set after noise filter 	 85
about to store a dataframe of shape (85, 8)
--------------------------------------
working with hypothesis_C down
['hypothesis_C_46_down.tsv', 'hypothesis_C_154_down.tsv', '