This pipeline retrieves significant DEGs and perform appropriate filters to generate biologically relevant expression changing gene sets.

Filter set is:
- log2 FC > 1.
- min max expression > 2 TPM
- noise threshold > 0.5


In [27]:
import pandas, numpy

# 0. user-defined variables

In [49]:
DEG_folder = '/home/adrian/projects/reynisfjara/results/DEGs_DESeq2/'
expression_file = '/home/adrian/projects/reynisfjara/results/tpm/DESeq2_TPM_values.tsv'
annotation_file = '/home/adrian/projects/reynisfjara/results/annotation/annotation.csv'

mice = ['a3922', 'a4774', 'a4775', 'a4776']
trends = ['up', 'down']

expression_threshold = 2
discrete_fc_threshold = 1
noise_threshold = 1/2

# 1. read data

## 1.1. expression data

In [3]:
expression = pandas.read_csv(expression_file, sep='\t', index_col=0)
expression.head()

Unnamed: 0,a3922_0h_1,a3922_0h_2,a3922_0h_3,a3922_48h_1,a3922_48h_2,a3922_48h_3,a3922_72h_1,a3922_72h_2,a3922_72h_3,a4774_0h_1,...,a4775_72h_3,a4776_0h_1,a4776_0h_2,a4776_0h_3,a4776_48h_1,a4776_48h_2,a4776_48h_3,a4776_72h_1,a4776_72h_2,a4776_72h_3
ENSMUSG00000000001,65.244411,65.953285,64.396929,75.289955,71.147817,72.146832,67.786762,70.217188,70.219265,61.227221,...,68.440778,64.806231,65.619286,66.749396,64.425407,68.665136,68.419059,69.705546,70.717428,71.017699
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,5.945436,5.46424,4.924612,20.245428,19.780706,22.747363,19.239938,22.062649,20.547492,7.843197,...,14.565935,4.712688,3.690099,4.120889,7.220914,8.37785,9.611563,9.465468,10.235803,9.808747
ENSMUSG00000000037,0.220972,0.959207,0.25827,0.969948,1.149452,2.212842,0.989879,2.354492,1.813624,0.545637,...,0.989086,0.451844,0.523639,0.679725,2.862086,0.865126,1.97357,1.612622,2.148935,5.445061
ENSMUSG00000000049,0.061451,0.061879,0.096945,0.0,0.071373,0.0,0.059768,0.0,0.0,0.214027,...,0.761343,0.0,0.125876,0.0,0.0,0.096454,0.130578,0.0,0.0,0.0


## 1.2. annotation

In [58]:
annotation = pandas.read_csv(annotation_file, sep=',', index_col='ens_gene')
annotation.drop(columns=['Unnamed: 0', 'target_id'], inplace=True)
annotation.drop_duplicates(inplace=True)
print(annotation.shape)
annotation.head()

(53115, 1)


Unnamed: 0_level_0,ext_gene
ens_gene,Unnamed: 1_level_1
ENSMUSG00000064336,mt-Tf
ENSMUSG00000064337,mt-Rnr1
ENSMUSG00000064338,mt-Tv
ENSMUSG00000064339,mt-Rnr2
ENSMUSG00000064340,mt-Tl1


In [56]:
annotation.head()

Unnamed: 0_level_0,target_id,ext_gene
ens_gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSMUSG00000064336,ENSMUST00000082387,mt-Tf
ENSMUSG00000064337,ENSMUST00000082388,mt-Rnr1
ENSMUSG00000064338,ENSMUST00000082389,mt-Tv
ENSMUSG00000064339,ENSMUST00000082390,mt-Rnr2
ENSMUSG00000064340,ENSMUST00000082391,mt-Tl1


# 2. define gene sets

In [43]:
for mouse in mice[:1]:
    condition_labels0 = ['{}_0h_{}'.format(mouse, i+1) for i in range(3)]
    condition_labels48 = ['{}_48h_{}'.format(mouse, i+1) for i in range(3)]
    condition_labels72 = ['{}_72h_{}'.format(mouse, i+1) for i in range(3)]
    
    for trend in trends[:1]:
        print('working with mouse {} and trend {}'.format(mouse, trend))
        
        DEG_file = DEG_folder + mouse + '_' + trend + '.tsv'
        DEGs = pandas.read_csv(DEG_file, sep='\t', index_col=0)
        print('\t detected {} DEGs'.format(DEGs.shape[0]))
        
        rules = (DEGs['pvalue'] < 0.05) & (DEGs['padj'] < 0.1)
        sub = DEGs[rules]
        print('\t passed tests {}'.format(DEGs.shape[0]))
        
        for ensembl in DEGs.index[:10]:
            print('\t\t working with {}'.format(ensembl))
            including = True
        
            #
            # filter 1 --- expression
            #
        
            tpm0 = numpy.median([expression[label][ensembl] for label in condition_labels0])
            tpm48 = numpy.median([expression[label][ensembl] for label in condition_labels48])
            tpm72 = numpy.median([expression[label][ensembl] for label in condition_labels72])
            top = numpy.max([tpm0, tpm48, tpm72])
            #print(top)
            
            #
            # filter 2 --- identify fold-changes using discrete values
            #        
            ###
            ###            [round(x, epsilon)/epsilon ] + 1
            ###  FC = abs  -------------------------------- > 1
            ###            [round(y, epsilon)/epsilon ] + 1
            ###
            ###
            ###  epsilon = 1
            sam48 = numpy.around(tpm48) + 1
            sam72 = numpy.around(tpm72) + 1
            ref = numpy.around(tpm0) + 1
            
            fc48 = sam48/ref; fc72 = sam72/ref
            log2FC48 = numpy.log2(fc48); log2FC72 = numpy.log2(fc72)
            
            #
            # selection
            #
            if top < expression_threshold:
                including = False
                info = '\t\t\t WARNING: low-expression gene discarded. Expression changes from {:.3f} to {:.3f} and {:.3f}'.format(tpm0, tpm48, tpm72)
                print(info)
                
            if (including == True):
                # check that trend is consistent in both timepoints
                if (log2FC48*log2FC72) < 0:
                    including = False
                    info = '\t\t\t WARNING: inconsistent trend: {:.3f} and {:.3f}'.format(log2FC48, log2FC72)
                    print(info)
                # check that you are abs log2FC > 1 at both conditions
                if numpy.min([numpy.abs(log2FC48), numpy.abs(log2FC72)]) < 1:
                    including = False
                    info = '\t\t\t WARNING: small change gene discarded'
                    # info = '\t WARNING: small change gene discarded. Expression changes from {:.3f} ({}) to {:.3f} ({}), resulting in abs_log2FC {:.3f}. {}, {}'.format(r, den, s, num, abs_log2FC, case[1], case[3])
                    
            
        
        

working with mouse a3922 and trend up
	 detected 854 DEGs
	 passed tests 854
		 working with ENSMUSG00000089739
		 working with ENSMUSG00000040715
		 working with ENSMUSG00000038963
		 working with ENSMUSG00000056481
		 working with ENSMUSG00000041216
		 working with ENSMUSG00000095845
		 working with ENSMUSG00000059901
		 working with ENSMUSG00000004540
		 working with ENSMUSG00000108900
		 working with ENSMUSG00000053279


In [29]:
tpm0 = numpy.median([expression[label][ensembl] for label in condition_labels0])
tpm48 = numpy.median([expression[label][ensembl] for label in condition_labels48])
tpm72 = numpy.median([expression[label][ensembl] for label in condition_labels72])

top = numpy.max([tpm0, tpm48, tpm72])

print(tpm0)
print(tpm48)
print(tpm72)
print(top)

0.0
0.7661801
1.65427
1.65427


In [32]:
numpy.around(3.4)

3.0