This script is the same as in relaxed pattern analysis with the difference of using only start and end time points instead of the three.

In [1]:
import pandas, numpy, termcolor, seaborn
import scipy, scipy.stats

In [2]:
import matplotlib, matplotlib.pyplot
matplotlib.rcParams.update({'font.size':20, 'font.family':'sans-serif', 'xtick.labelsize':30, 'ytick.labelsize':30, 'figure.figsize':(16, 9), 'axes.labelsize':40})

# 0. user-defined variables

In [3]:
DEG_folder = '/home/adrian/projects/reynisfjara/results/DEGs_DESeq2/start_end/'
expression_file = '/home/adrian/projects/reynisfjara/results/tpm/DESeq2_TPM_values.tsv'

annotation_file = '/home/adrian/projects/reynisfjara/results/annotation/annotation.csv'
dorothea_file = '/home/adrian/software/dorothea/mmusculus/mmusculus.dorothea.txt'

mice = ['a3922', 'a4774', 'a4775', 'a4776']
times = ['0h', '48h', '72h']
numerical_times = [0, 48, 72]

# 1. read data

## 1.1. read expression

In [4]:
expression = pandas.read_csv(expression_file, sep='\t', index_col=0)
expression.head()

Unnamed: 0,a3922_0h_1,a3922_0h_2,a3922_0h_3,a3922_48h_1,a3922_48h_2,a3922_48h_3,a3922_72h_1,a3922_72h_2,a3922_72h_3,a4774_0h_1,...,a4775_72h_3,a4776_0h_1,a4776_0h_2,a4776_0h_3,a4776_48h_1,a4776_48h_2,a4776_48h_3,a4776_72h_1,a4776_72h_2,a4776_72h_3
ENSMUSG00000000001,65.244411,65.953285,64.396929,75.289955,71.147817,72.146832,67.786762,70.217188,70.219265,61.227221,...,68.440778,64.806231,65.619286,66.749396,64.425407,68.665136,68.419059,69.705546,70.717428,71.017699
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,5.945436,5.46424,4.924612,20.245428,19.780706,22.747363,19.239938,22.062649,20.547492,7.843197,...,14.565935,4.712688,3.690099,4.120889,7.220914,8.37785,9.611563,9.465468,10.235803,9.808747
ENSMUSG00000000037,0.220972,0.959207,0.25827,0.969948,1.149452,2.212842,0.989879,2.354492,1.813624,0.545637,...,0.989086,0.451844,0.523639,0.679725,2.862086,0.865126,1.97357,1.612622,2.148935,5.445061
ENSMUSG00000000049,0.061451,0.061879,0.096945,0.0,0.071373,0.0,0.059768,0.0,0.0,0.214027,...,0.761343,0.0,0.125876,0.0,0.0,0.096454,0.130578,0.0,0.0,0.0


## 1.2. read annotation

In [5]:
annotation = pandas.read_csv(annotation_file, sep=',', index_col='ens_gene')
annotation.drop(columns=['Unnamed: 0', 'target_id'], inplace=True)
annotation.drop_duplicates(inplace=True)
print(annotation.shape)
annotation.head()

(53193, 3)


Unnamed: 0_level_0,gene_biotype,description,ext_gene
ens_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000064336,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,mt-Tf
ENSMUSG00000064337,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:MGI S...,mt-Rnr1
ENSMUSG00000064338,Mt_tRNA,mitochondrially encoded tRNA valine [Source:MG...,mt-Tv
ENSMUSG00000064339,Mt_rRNA,mitochondrially encoded 16S rRNA [Source:MGI S...,mt-Rnr2
ENSMUSG00000064340,Mt_tRNA,mitochondrially encoded tRNA leucine 1 [Source...,mt-Tl1


# 2. transform expression to be more amenable to downstream analysis

In [6]:
df = pandas.DataFrame()
for mouse in mice:
    for time in times:
        condition_labels = [label for label in expression.columns if mouse in label and time in label]
        df[mouse + '_' + time] = expression.loc[:, condition_labels].median(axis=1)
df.head()

Unnamed: 0,a3922_0h,a3922_48h,a3922_72h,a4774_0h,a4774_48h,a4774_72h,a4775_0h,a4775_48h,a4775_72h,a4776_0h,a4776_48h,a4776_72h
ENSMUSG00000000001,65.244411,72.146832,70.217188,61.227221,65.197716,64.804086,55.869249,65.371837,68.440778,65.619286,68.419059,70.717428
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,5.46424,20.245428,20.547492,7.843197,8.262456,11.467644,8.258439,15.989181,16.179836,4.120889,8.37785,9.808747
ENSMUSG00000000037,0.25827,1.149452,1.813624,0.567393,1.344796,1.122762,0.472029,0.958435,1.713961,0.523639,1.97357,2.148935
ENSMUSG00000000049,0.061879,0.0,0.0,0.214027,0.0,0.0,0.149524,0.0,0.441745,0.0,0.096454,0.0


In [7]:
for label in df.columns:
    if 'a3922' in label:
        new_label = 'WT_' + label.split('_')[1]
        df.rename(columns = {label:new_label}, inplace=True)
df.head()

Unnamed: 0,WT_0h,WT_48h,WT_72h,a4774_0h,a4774_48h,a4774_72h,a4775_0h,a4775_48h,a4775_72h,a4776_0h,a4776_48h,a4776_72h
ENSMUSG00000000001,65.244411,72.146832,70.217188,61.227221,65.197716,64.804086,55.869249,65.371837,68.440778,65.619286,68.419059,70.717428
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,5.46424,20.245428,20.547492,7.843197,8.262456,11.467644,8.258439,15.989181,16.179836,4.120889,8.37785,9.808747
ENSMUSG00000000037,0.25827,1.149452,1.813624,0.567393,1.344796,1.122762,0.472029,0.958435,1.713961,0.523639,1.97357,2.148935
ENSMUSG00000000049,0.061879,0.0,0.0,0.214027,0.0,0.0,0.149524,0.0,0.441745,0.0,0.096454,0.0


In [8]:
simple_expression = df.iloc[:, :3]
simple_expression.head()

Unnamed: 0,WT_0h,WT_48h,WT_72h
ENSMUSG00000000001,65.244411,72.146832,70.217188
ENSMUSG00000000003,0.0,0.0,0.0
ENSMUSG00000000028,5.46424,20.245428,20.547492
ENSMUSG00000000037,0.25827,1.149452,1.813624
ENSMUSG00000000049,0.061879,0.0,0.0


In [9]:
simple_expression = df.iloc[:, :3]

for time in times:
    condition_labels = [label for label in df.columns if time in label and 'WT' not in label]
    simple_expression['MUT_' + time] = df.loc[:, condition_labels].median(axis=1)

simple_expression.head()

Unnamed: 0,WT_0h,WT_48h,WT_72h,MUT_0h,MUT_48h,MUT_72h
ENSMUSG00000000001,65.244411,72.146832,70.217188,61.227221,65.371837,68.440778
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,5.46424,20.245428,20.547492,7.843197,8.37785,11.467644
ENSMUSG00000000037,0.25827,1.149452,1.813624,0.523639,1.344796,1.713961
ENSMUSG00000000049,0.061879,0.0,0.0,0.149524,0.0,0.0


# 3. search for pattern

## 3.1. retrieve genes that change with time in MUT phenotype

In [10]:
DEGs = []
for mouse in mice[1:]:
    path = DEG_folder + 'start_end_MUTt72_over_MUTt0_' + mouse + '.tsv'
    df = pandas.read_csv(path, sep='\t', index_col=0)
    print(df.shape)
    DEGs.append(df.index.to_list())
    
    # checking for Snai2 ENSMUSG00000022676
    if 'ENSMUSG00000022676' in df.index:
        print('Snai2 found')
    if 'ENSMUSG00000016128' in df.index:
        print('Stard13 found')
    
list_one = list(set(DEGs[0]) & set(DEGs[1]) & set(DEGs[2]))
print(len(list_one))

(479, 6)
Snai2 found
(1041, 6)
Snai2 found
(1030, 6)
Snai2 found
224


In [11]:
# filter out genes that do not cross the abs log2FC > 1 and the max. expr. > 2
list_one_a = []
for ensembl in list_one:
    working_expression = simple_expression.loc[ensembl, :]
    start = numpy.round(simple_expression.loc[ensembl, 'MUT_0h']) + 1
    end = numpy.round(simple_expression.loc[ensembl, 'MUT_72h']) + 1
    abs_log2FC = numpy.abs(numpy.log2(end/start))    
    max_expr = numpy.max([start, end])
    
    if abs_log2FC > 1 and max_expr > 2+1:
        list_one_a.append(ensembl)
        
print(len(list_one_a))

143


## 3.2. check that genes are WT flat

In [12]:
path = DEG_folder + 'start_end_WTt72_over_WTt0' + '.tsv'
df = pandas.read_csv(path, sep='\t', index_col=0)
print(df.shape)
DEGs = df.index.to_list()
list_four = list(set(DEGs))
print(len(list_four))

(1389, 6)
1389


In [13]:
# a flat gene is a gene that:
# if has low expression (TPM < 2)
# elif abs log2FC < log(1.5) and not in significant group

list_four_a = []
for ensembl in simple_expression.index:
    start = numpy.round(simple_expression.loc[ensembl, 'WT_0h']) + 1
    end = numpy.round(simple_expression.loc[ensembl, 'WT_72h']) + 1
    max_expr = numpy.max([start, end])
    abs_log2FC = numpy.abs(numpy.log2(end/start))
    
    if ensembl == 'ENSMUSG00000022676':
        print('Snai2 abs log2FC for WT is {:.5} and the threshold is {:.5}'.format(abs_log2FC, numpy.log2(1.5)))
    
    if max_expr < 2+1:
        list_four_a.append(ensembl)
    if abs_log2FC < numpy.log2(1.5) and ensembl not in list_four:
        list_four_a.append(ensembl)
        
list_four_b = list(set(list_four_a))
print(len(simple_expression.index), len(list_four_a), len(list_four_b))

Snai2 abs log2FC for WT is 0.75207 and the threshold is 0.58496
35938 56793 33226


## 3.3. define gene set such that MUT changes and WT does not

In [14]:
list_five = []
for ensembl in list_one_a:
    if ensembl in list_four_b:
        list_five.append(ensembl)
print(len(list_five))

31


# 4. plot and print identified genes

## 4.1. print selected set of genes

In [15]:
for ensembl in list_five:
    print(ensembl)

ENSMUSG00000020190
ENSMUSG00000029510
ENSMUSG00000006235
ENSMUSG00000032564
ENSMUSG00000036698
ENSMUSG00000015766
ENSMUSG00000037049
ENSMUSG00000029861
ENSMUSG00000107874
ENSMUSG00000014444
ENSMUSG00000028713
ENSMUSG00000030796
ENSMUSG00000036853
ENSMUSG00000047793
ENSMUSG00000025511
ENSMUSG00000059013
ENSMUSG00000003617
ENSMUSG00000009687
ENSMUSG00000021696
ENSMUSG00000026857
ENSMUSG00000022243
ENSMUSG00000030084
ENSMUSG00000072941
ENSMUSG00000074892
ENSMUSG00000028194
ENSMUSG00000023009
ENSMUSG00000001751
ENSMUSG00000035273
ENSMUSG00000068220
ENSMUSG00000046727
ENSMUSG00000016496


In [16]:
index = 0
for ensembl in list_five:
    index = index + 1
    gene_name = annotation.loc[ensembl]['ext_gene']
    description = annotation.loc[ensembl]['description'].split(' [')[0]
    if simple_expression.loc[ensembl, 'MUT_72h'] > simple_expression.loc[ensembl, 'MUT_0h']:
        trend = 'up'
        message = '\t'.join([str(index), ensembl, gene_name, trend, description])
        print(termcolor.colored(message, 'red'))
    else:
        trend = 'down'
        message = '\t'.join([str(index), ensembl, gene_name, trend, description])
        print(termcolor.colored(message, 'blue'))

[34m1	ENSMUSG00000020190	Mknk2	down	MAP kinase-interacting serine/threonine kinase 2[0m
[31m2	ENSMUSG00000029510	Gpc2	up	glypican 2 (cerebroglycan)[0m
[31m3	ENSMUSG00000006235	Epor	up	erythropoietin receptor[0m
[34m4	ENSMUSG00000032564	Cpne4	down	copine IV[0m
[34m5	ENSMUSG00000036698	Ago2	down	argonaute RISC catalytic subunit 2[0m
[34m6	ENSMUSG00000015766	Eps8	down	epidermal growth factor receptor pathway substrate 8[0m
[34m7	ENSMUSG00000037049	Smpd1	down	sphingomyelin phosphodiesterase 1, acid lysosomal[0m
[31m8	ENSMUSG00000029861	Fam131b	up	family with sequence similarity 131, member B[0m
[34m9	ENSMUSG00000107874	Prpmp5	down	proline-rich protein MP5[0m
[34m10	ENSMUSG00000014444	Piezo1	down	piezo-type mechanosensitive ion channel component 1[0m
[31m11	ENSMUSG00000028713	Cyp4b1	up	cytochrome P450, family 4, subfamily b, polypeptide 1[0m
[31m12	ENSMUSG00000030796	Tead2	up	TEA domain family member 2[0m
[34m13	ENSMUSG00000036853	Mcoln3	down	mucolipin 3[0m
[31m14

Yeah, the same list as in relax analysis.