# info

This script will generate a heatmap of hypotheses A, B and E.

The hypotheses correspond to the following contrasts:
* Hypothesis A: laminar over static
* Hypothesis B: oscillatory over laminar
* Hypothesis C: oscillatory Pi over laminar Pi
* Hypothesis D: laminar Pi over laminar no Pi
* Hypothesis E: oscillatory Pi over oscillatory no Pi

In [1]:
import os, pandas, numpy, seaborn
import scipy, scipy.stats

In [2]:
import matplotlib, matplotlib.pyplot
# global plotting defaults: one font size for text and tick labels, 16:9 figures
plot_defaults = {
    'font.size': 20,
    'font.family': 'sans-serif',
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    'figure.figsize': (16, 9),
}
matplotlib.rcParams.update(plot_defaults)

In [3]:
# blue -> white -> white -> red colormap: the two white anchors, placed
# symmetrically around the midpoint, create a flat white band in the centre
colors = [
    (0, "blue"),
    (0.5-0.125, "white"),
    (0.5+0.125, "white"),
    (1, "red"),
]
bwwr = matplotlib.colors.LinearSegmentedColormap.from_list("bwwr", colors)

# 0. user-defined variables

In [4]:
# folder listed in section 1.1; each file in it is read as a CSV indexed by 'target_id'
# (keep the trailing slash: file names are appended to this string directly)
DEG_called_folder = '/home/adrian/projects/hegoi/results/subsamples/sleuth/'
# tab-separated expression table, read in section 1.2 with its first column as index
expression_file = '/home/adrian/projects/hegoi/results/tpm/DESeq2_TPM_values.tsv'
# tab-separated table read in section 1.3; its 'hypothesis', 'patient', 'sampleA' and 'sampleB' columns are used below
metadata_file = '/home/adrian/projects/hegoi/metadata/hegoi metadata - hypotheses formatted for filter.tsv'

In [5]:
# hypothesis letters analysed in this notebook; each one is expanded to 'hypothesis_<letter>'
# when filtering the metadata and is matched against the DEG file labels
working_hypotheses = ['A', 'B', 'E']

# 1. read data

## 1.1. read DEGs

In [24]:
# Collect the identifiers listed in each DEG file, keyed by the file name
# up to its '.csv' suffix.
# Only '.csv' files are read: auxiliary files such as 'messages.txt' are
# skipped without requiring them to be present in the folder.
file_names = sorted(
    file_name for file_name in os.listdir(DEG_called_folder)
    if file_name.endswith('.csv')
)

degs = {}
for file_name in file_names:
    label = file_name.split('.csv')[0]
    df = pandas.read_csv(os.path.join(DEG_called_folder, file_name), index_col='target_id')
    # the index holds the identifiers called as DEGs in this file
    degs[label] = df.index.to_list()

In [34]:
# detect n-1 DEGs
# For each working hypothesis, keep the identifiers present in all, or in all
# but one, of the DEG lists whose label matches the hypothesis.
nminus1_DEGs = []
for hypo in working_hypotheses:
    # one set per matching DEG list, so each membership test is a hash lookup
    # instead of a scan over a list of thousands of identifiers
    # NOTE(review): labels are matched by substring (`hypo in label`); confirm
    # that no other part of a label can contain the hypothesis letter
    sets_of_degs = [set(degs[label]) for label in degs if hypo in label]
    union = set().union(*sets_of_degs)
    required_hits = len(sets_of_degs) - 1

    # sorted so that the gene order is reproducible between runs
    robust_list = sorted(
        ensembl for ensembl in union
        if sum(ensembl in deg_set for deg_set in sets_of_degs) >= required_hits
    )

    print('{} detected {} DEGs in n - 1 patients'.format(hypo, len(robust_list)))
    nminus1_DEGs.append(robust_list)

A detected 12170 DEGs in n - 1 patients
B detected 11152 DEGs in n - 1 patients
E detected 11419 DEGs in n - 1 patients


## 1.2. read expression

In [35]:
# expression table: tab-separated, first column used as the row index
expression = pandas.read_table(expression_file, index_col=0)
print(expression.shape)
expression.head()

(40320, 26)


Unnamed: 0,Lam153,Lami154,Lami46,Lami94,LamiP109,LamiP153,LamiP154,LamiP176,LamiPi46,LamiPi94,...,OsciP154,OsciP175,OsciP178,OsciPi46,Stat109,Stat153,Stat154,Stat176,Stat46,Stat94
ENSG00000000003,24.240953,20.242011,22.919919,15.438227,18.098457,29.000777,14.784503,11.903633,26.217884,18.455471,...,38.756393,15.976121,21.649331,28.426877,29.688663,29.207002,28.989493,28.83238,34.600089,27.971771
ENSG00000000005,0.06296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013958,0.0,0.139231
ENSG00000000419,59.982918,62.984443,120.871295,138.059565,115.707396,84.892,172.643218,147.901176,145.183145,135.750959,...,98.473591,161.024012,86.023654,115.321222,64.46468,81.686544,84.388415,88.740844,81.992995,97.987628
ENSG00000000457,4.680221,3.684855,1.989328,3.244174,4.417648,2.189278,6.217483,3.998143,4.786793,2.949599,...,3.012943,1.024624,2.255165,2.967113,1.825466,2.371161,3.513856,2.617785,2.546863,2.668982
ENSG00000000460,1.98269,1.344508,1.422365,1.667403,0.570064,2.856698,0.416009,0.0,1.310974,1.812507,...,5.510741,3.805323,1.489214,0.957885,4.467578,5.425887,7.680067,4.002537,3.650609,4.056977


## 1.3. read metadata

In [36]:
# metadata table: tab-separated, default integer row index
metadata = pandas.read_table(metadata_file)
metadata.head()

Unnamed: 0,hypothesis,patient,sampleA,sampleB
0,hypothesis_A,46,Stat46,Lami46
1,hypothesis_A,153,Stat153,Lam153
2,hypothesis_A,154,Stat154,Lami154
3,hypothesis_A,94,Stat94,Lami94
4,hypothesis_B,46,Lami46,Osci46


In [38]:
# gather the samples on both sides of every contrast of the working hypotheses
samples = []
for hypothesis_letter in working_hypotheses:
    sub = metadata[metadata['hypothesis'] == 'hypothesis_{}'.format(hypothesis_letter)]
    samples.extend(sub['sampleA'].to_list() + sub['sampleB'].to_list())

# collapse repeated sample names (a set does not keep the insertion order)
working_samples = list(set(samples))
print(working_samples)

['Osci46', 'Osci153', 'Lam153', 'Lami94', 'Osci154', 'OsciP153', 'OsciPi46', 'Stat94', 'Osci109', 'Lami154', 'OsciP109', 'Stat153', 'Stat154', 'OsciP154', 'Lami46', 'Stat46']


# 2. select response genes

Response genes should pass the log2FC > 1 threshold and have P < 0.05 in at least n - 1 patients.

In [50]:
# Scaffold for the response-gene filter: for now it only selects, for each
# working hypothesis, the metadata rows and the patients of that hypothesis.
for hypo in working_hypotheses:
    hypo_label = 'hypothesis_{}'.format(hypo)
    sub = metadata[metadata['hypothesis'] == hypo_label]

    # the patients depend only on the hypothesis, so they are looked up once
    # here rather than once per gene
    patients = sub['patient']

    # TODO: iterate over the genes in expression.index and check whether the
    # filters are passed in n - 1 patients:
    #   - significance
    #   - low expression
    #   - fold-change
    #   - noise

0     46
1    153
2    154
3     94
Name: patient, dtype: int64
4     46
5    153
6    154
Name: patient, dtype: int64
15     46
16    109
17    153
18    154
Name: patient, dtype: int64


In [45]:
# expression columns of the 'sampleA' samples in the current metadata subset
expression.loc[:, sub['sampleA']]

Unnamed: 0,Osci46,Osci109,Osci153,Osci154
ENSG00000000003,33.095156,25.768010,45.398562,40.621696
ENSG00000000005,0.000000,0.000000,0.000000,0.000000
ENSG00000000419,74.222028,73.583736,96.396265,75.565869
ENSG00000000457,3.157596,2.790279,3.046453,4.134886
ENSG00000000460,2.999867,4.918747,3.494749,4.051209
...,...,...,...,...
ENSG00000286255,0.000000,0.000000,0.000000,0.000000
ENSG00000286261,3.679277,2.468771,2.652880,2.619689
ENSG00000286264,9.122854,10.225990,15.331520,14.764960
ENSG00000286265,5.117352,0.000000,4.305551,4.100239


In [46]:
# expression columns of the 'sampleB' samples in the current metadata subset
expression.loc[:, sub['sampleB']]

Unnamed: 0,OsciPi46,OsciP109,OsciP153,OsciP154
ENSG00000000003,28.426877,36.830702,26.074857,38.756393
ENSG00000000005,0.000000,0.000000,0.000000,0.000000
ENSG00000000419,115.321222,48.739804,152.121655,98.473591
ENSG00000000457,2.967113,2.594576,4.168530,3.012943
ENSG00000000460,0.957885,4.047710,4.144545,5.510741
...,...,...,...,...
ENSG00000286255,0.000000,0.000000,0.000000,0.000000
ENSG00000286261,4.283648,2.741635,3.503773,3.467271
ENSG00000286264,18.978610,9.451626,9.304829,10.181760
ENSG00000286265,6.612776,0.000000,10.250660,2.094471


# 3. build the heatmap

In [None]:
# NOTE(review): this cell used to read `log2tpmPO` and `DEGs`, neither of which
# is assigned in this notebook. Until the response-gene selection of section 2
# is implemented, the heatmap genes are taken as the union of the n - 1 DEGs of
# the working hypotheses -- confirm this is the intended gene set.
# Identifiers absent from the expression table are left out so that the
# label-based selection below cannot fail on them.
DEGs = sorted(set().union(*nminus1_DEGs) & set(expression.index))

print(expression.shape)
block = expression.loc[DEGs, working_samples]
print(block.shape)
block.head()

In [None]:
# mean of each row of the block, i.e. one value per gene across the selected samples
gene_means = block.mean(axis='columns')
gene_means

In [None]:
# Row-wise z-score with the population standard deviation (ddof=0, the
# scipy.stats.zscore default). It is computed with pandas so that the result
# is always a DataFrame carrying the gene and sample labels.
row_means = block.mean(axis=1)
row_sds = block.std(axis=1, ddof=0)
zscore_df = block.sub(row_means, axis=0).div(row_sds, axis=0)

# a gene that is constant across the samples has zero standard deviation and
# an undefined z-score; such rows are removed because the hierarchical
# clustering below needs finite values
undefined_rows = zscore_df.isna().any(axis=1)
if undefined_rows.any():
    print('dropped {} genes with undefined z-score'.format(undefined_rows.sum()))
    zscore_df = zscore_df[~undefined_rows]

zscore_df.head()

In [None]:
# extreme z-scores of the whole matrix: reduce each row first, then the row results
top = zscore_df.max(axis=1).max()
bottom = zscore_df.min(axis=1).min()

print(top, bottom)

In [None]:
# Draw one clustered heatmap per (linkage method, distance metric) pair and
# save each of them as 'figures/<method>.<metric>.pdf'.
linkage_methods = ['complete', 'single', 'average', 'ward', 'weighted', 'centroid', 'median']
distance_metrics = ['cosine', 'euclidean', 'correlation', 'braycurtis', 'chebyshev', 'seuclidean', 'minkowski', 'sqeuclidean']

# scipy documents these linkage methods as correctly defined only for the
# Euclidean metric, so they are not combined with any other metric
euclidean_only_methods = {'ward', 'centroid', 'median'}

# make sure the output folder exists before the first figure is saved
os.makedirs('figures', exist_ok=True)

for linkage_method in linkage_methods:
    for distance_metric in distance_metrics:

        # combinations excluded in the original analysis
        if linkage_method == 'single' and distance_metric in ('chebyshev', 'minkowski'):
            continue
        if linkage_method in euclidean_only_methods and distance_metric != 'euclidean':
            continue

        print('working with {} and {}...'.format(linkage_method, distance_metric))

        grid = seaborn.clustermap(zscore_df, cmap=bwwr, method=linkage_method, metric=distance_metric, vmin=-2, vmax=2)

        # the title is attached to the column dendrogram axes, which sit above
        # the heatmap, instead of to whichever axes happen to be current
        grid.ax_col_dendrogram.set_title('{} {}'.format(linkage_method, distance_metric))

        figure = grid.ax_heatmap.figure
        figure.tight_layout()
        figure.savefig('figures/{}.{}.pdf'.format(linkage_method, distance_metric))

        # the figure is already on disk; closing it keeps dozens of figures
        # from staying open at the same time
        matplotlib.pyplot.close(figure)