In [1]:
import pandas, seaborn, numpy

import matplotlib, matplotlib.pyplot
# global plot styling applied to every figure in this notebook:
# FreeSans text at size 20, with tick labels on both axes also at size 20
matplotlib.rcParams.update({'font.size':20, 'font.family':'FreeSans', 'xtick.labelsize':20, 'ytick.labelsize':20})

import pyensembl
# Ensembl release 100 annotation object; later cells call
# annotation.gene_name_of_gene_id() on it to turn index entries into gene names
annotation = pyensembl.EnsemblRelease(100)

# 1. read data

In [2]:
# Read the tab-separated results table, index it by the 'gene_name' column and
# discard the columns that are not analysed in this notebook.
data_file = '/home/adrian/projects/vigur/results/transcriptomics/deseq2_filtered/strict_union_experiment_three.tsv'
dropped_columns = ['zero4', 'zero24', 'half24', 'five24', 'fifty24']
df = (
    pandas.read_csv(data_file, sep='\t')
    .set_index('gene_name', drop=True)
    .drop(columns=dropped_columns)
)

# keep only the rows whose largest absolute value reaches 0.5
largest_magnitude = df.abs().max(axis=1)
df = df[largest_magnitude >= 0.5]

df.shape

FileNotFoundError: [Errno 2] No such file or directory: '/home/adrian/projects/vigur/results/deseq2_filtered/strict_union_experiment_three.tsv'

# 2. plot data

In [None]:
# Heatmap of the filtered table in its original row order.
fig, ax = matplotlib.pyplot.subplots(figsize=(1.4 * 6.4, 1.4 * 4.8))
o = ax.pcolor(df, cmap='bwr', vmin=-3.5, vmax=3.5)

# flip the y-axis so the first row of the table is drawn at the top
ax.invert_yaxis()

fig.colorbar(o, ax=ax)
fig.tight_layout()

fig.savefig('original.svg')

# 3. cluster data using seaborn

In [None]:
# Clustered heatmap of the table drawn by seaborn, using Ward linkage.
clustermap_options = {'cmap': 'bwr', 'vmin': -3, 'vmax': 3, 'method': 'ward'}
cluster_grid = seaborn.clustermap(df, **clustermap_options)

# save whatever figure pyplot currently regards as active
matplotlib.pyplot.savefig('seaborn_clustered.svg')

# 4. qualitative clustering

In [None]:
# Rows of df whose values sum to a positive number across all columns.
up = df[df.sum(axis=1) > 0]
print(up.shape)

# List the index entry of every selected row, one per line.
# (The annotation lookup that used to run here produced a value that was never
# printed or stored, so it has been dropped; the printed output is unchanged.)
for ensembl in up.index:
    print(ensembl)

## 4.1. group selection

In [None]:
# Group pos1: rows that are positive at half4, five4 and fifty4.
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
pos1 = df.loc[(df['half4'] > 0) & (df['five4'] > 0) & (df['fifty4'] > 0)].copy()

# rank the rows by their row-wise total, largest first
pos1['sum'] = pos1.sum(axis=1)
pos1 = pos1.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in pos1.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
pos1['names'] = gene_names

print(pos1.shape)
print(list(pos1['names']))
pos1.head(50)

In [None]:
# Group pos2: rows that are positive at half4 and five4 but not at fifty4 (<= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
pos2 = df.loc[(df['half4'] > 0) & (df['five4'] > 0) & (df['fifty4'] <= 0)].copy()

# rank the rows by their row-wise total, largest first
pos2['sum'] = pos2.sum(axis=1)
pos2 = pos2.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in pos2.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
pos2['names'] = gene_names

print(pos2.shape)
pos2.head()

In [None]:
# Group pos3: rows that are positive at five4 and fifty4 but not at half4 (<= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
pos3 = df.loc[(df['half4'] <= 0) & (df['five4'] > 0) & (df['fifty4'] > 0)].copy()

# rank the rows by their row-wise total, largest first
pos3['sum'] = pos3.sum(axis=1)
pos3 = pos3.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in pos3.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
pos3['names'] = gene_names

print(pos3.shape)
pos3.head()

In [None]:
# Group pos4: rows that are positive at half4 and fifty4 but not at five4 (<= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
pos4 = df.loc[(df['half4'] > 0) & (df['five4'] <= 0) & (df['fifty4'] > 0)].copy()

# rank the rows by their row-wise total, largest first
pos4['sum'] = pos4.sum(axis=1)
pos4 = pos4.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in pos4.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
pos4['names'] = gene_names

print(pos4.shape)
pos4.head(10)

In [None]:
# Group pos5: rows that are positive at half4 only (five4 and fifty4 are <= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
pos5 = df.loc[(df['half4'] > 0) & (df['five4'] <= 0) & (df['fifty4'] <= 0)].copy()

# rank the rows by their row-wise total, largest first
pos5['sum'] = pos5.sum(axis=1)
pos5 = pos5.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in pos5.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
pos5['names'] = gene_names

print(pos5.shape)
print(list(pos5.index))
pos5.head(20)

In [None]:
# Group pos6: rows that are positive at five4 only (half4 and fifty4 are <= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
pos6 = df.loc[(df['half4'] <= 0) & (df['five4'] > 0) & (df['fifty4'] <= 0)].copy()

# rank the rows by their row-wise total, largest first
pos6['sum'] = pos6.sum(axis=1)
pos6 = pos6.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in pos6.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
pos6['names'] = gene_names

print(list(pos6.index))
print(pos6.shape)
pos6.head(25)

In [None]:
# Group pos7: rows that are positive at fifty4 only (half4 and five4 are <= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
pos7 = df.loc[(df['half4'] <= 0) & (df['five4'] <= 0) & (df['fifty4'] > 0)].copy()

# rank the rows by their row-wise total, largest first
pos7['sum'] = pos7.sum(axis=1)
pos7 = pos7.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in pos7.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
pos7['names'] = gene_names

print(pos7.shape)
print(list(pos7.index))
pos7.head(20)

In [None]:
# Group neg1: rows that are negative at fifty4 only (half4 and five4 are >= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
neg1 = df.loc[(df['half4'] >= 0) & (df['five4'] >= 0) & (df['fifty4'] < 0)].copy()

# order the rows by their row-wise total, largest first
neg1['sum'] = neg1.sum(axis=1)
neg1 = neg1.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in neg1.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
neg1['names'] = gene_names

print(neg1.shape)
neg1.head(20)

In [None]:
# Group neg2: rows that are negative at five4 only (half4 and fifty4 are >= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
neg2 = df.loc[(df['half4'] >= 0) & (df['five4'] < 0) & (df['fifty4'] >= 0)].copy()

# order the rows by their row-wise total, largest first
neg2['sum'] = neg2.sum(axis=1)
neg2 = neg2.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in neg2.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
neg2['names'] = gene_names

print(neg2.shape)
neg2.head(15)

In [None]:
# Group neg3: rows that are negative at half4 only (five4 and fifty4 are >= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
neg3 = df.loc[(df['half4'] < 0) & (df['five4'] >= 0) & (df['fifty4'] >= 0)].copy()

# order the rows by their row-wise total, largest first
neg3['sum'] = neg3.sum(axis=1)
neg3 = neg3.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in neg3.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
neg3['names'] = gene_names

print(neg3.shape)
neg3.head(10)

In [None]:
# Group neg4: rows that are negative at half4 and fifty4 but not at five4 (>= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
neg4 = df.loc[(df['half4'] < 0) & (df['five4'] >= 0) & (df['fifty4'] < 0)].copy()

# order the rows by their row-wise total, largest first
neg4['sum'] = neg4.sum(axis=1)
neg4 = neg4.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in neg4.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
neg4['names'] = gene_names

print(neg4.shape)
neg4.head()

In [None]:
# Group neg5: rows that are negative at five4 and fifty4 but not at half4 (>= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
neg5 = df.loc[(df['half4'] >= 0) & (df['five4'] < 0) & (df['fifty4'] < 0)].copy()

# order the rows by their row-wise total, largest first
neg5['sum'] = neg5.sum(axis=1)
neg5 = neg5.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in neg5.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
neg5['names'] = gene_names

print(neg5.shape)
# one gene name per line
for name in gene_names:
    print(name)
neg5.head(10)

In [None]:
# Group neg6: rows that are negative at half4 and five4 but not at fifty4 (>= 0).
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
neg6 = df.loc[(df['half4'] < 0) & (df['five4'] < 0) & (df['fifty4'] >= 0)].copy()

# order the rows by their row-wise total, largest first
neg6['sum'] = neg6.sum(axis=1)
neg6 = neg6.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in neg6.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
neg6['names'] = gene_names

print(neg6.shape)
print(', '.join(gene_names))
neg6.head(10)

In [None]:
# Group neg7: rows that are negative at half4, five4 and fifty4.
# .copy() makes an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning.
neg7 = df.loc[(df['half4'] < 0) & (df['five4'] < 0) & (df['fifty4'] < 0)].copy()

# order the rows by their row-wise total, largest first
neg7['sum'] = neg7.sum(axis=1)
neg7 = neg7.sort_values(by='sum', ascending=False)

# look up a gene name for every index entry; keep the raw entry if the lookup fails
gene_names = []
for ensembl in neg7.index:
    try:
        name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        name = ensembl
    gene_names.append(name)
neg7['names'] = gene_names

print(neg7.shape)
# one gene name per line
for name in gene_names:
    print(name)
neg7.head(10)

## 4.2. re-order and plot

In [None]:
# Concatenate the index entries of the fourteen groups (pos1..pos7, then
# neg1..neg7) and pull the matching rows out of df in that order.
# NOTE(review): the pos* masks test `> 0` / `<= 0` while the neg* masks test
# `>= 0` / `< 0`, so a row with mixed signs (e.g. positive half4 and five4,
# negative fifty4) satisfies both a pos* and a neg* condition and is listed
# twice in `order` -- confirm that the duplication is intended.
positive_groups = [pos1, pos2, pos3, pos4, pos5, pos6, pos7]
negative_groups = [neg1, neg2, neg3, neg4, neg5, neg6, neg7]

order_pos = [entry for group in positive_groups for entry in group.index]
order_neg = [entry for group in negative_groups for entry in group.index]
order = order_pos + order_neg

ordered = df.loc[order, :]

# per-column extremes of the unordered table
print(df.min(), df.max())

In [None]:
# Heatmap of the re-ordered table, with column labels on x and gene names on y.
fig = matplotlib.pyplot.figure(figsize=1.5*numpy.array([6.4, 4.8]))
ax = fig.add_subplot(1,1,1)

heatmap = ax.pcolor(ordered, cmap='bwr', vmin=-5, vmax=5)

# invert y-axis so the first row of `ordered` is drawn at the top
ax.invert_yaxis()

# scale bar
cbar = fig.colorbar(heatmap, ax=ax)
cbar.set_label('log2FC', rotation=90)

# shift the x major ticks to the middle of each cell
ax.set_xticks(numpy.arange(ordered.shape[1]) + 0.5)

# define x labels
ax.set_xticklabels(ordered.columns)

# define y labels from the rows that are actually plotted (`ordered`);
# `df` holds the same rows in a different order, so labels taken from df.index
# would not line up with the heatmap rows
gene_names = []
for ensembl in ordered.index:
    try:
        gene_name = annotation.gene_name_of_gene_id(ensembl)
    except Exception:
        # lookup failed: fall back to the raw index entry
        gene_name = ensembl
    gene_names.append(gene_name)

# label every third row to keep the axis readable
selected_positions = numpy.arange(0, len(gene_names), 3)
selected_labels = [gene_names[i] for i in selected_positions]

# + 0.5 centres each tick on its cell, matching the x-axis ticks
ax.set_yticks(selected_positions + 0.5)
ax.set_yticklabels(selected_labels, fontsize=8)

# tight layout
fig.tight_layout()

# save
fig.savefig('heatmap.svg')

## 4.3. make the group bar

In [None]:
# Build a one-column table holding a group label for every row of the group bar:
# labels 1-7 stand for pos1..pos7 and 8-14 for neg1..neg7, each repeated once per
# row of the corresponding group.
positive_sizes = [len(group.index) for group in (pos1, pos2, pos3, pos4, pos5, pos6, pos7)]
negative_sizes = [len(group.index) for group in (neg1, neg2, neg3, neg4, neg5, neg6, neg7)]
sizes = positive_sizes + negative_sizes

group_data = []
for label, size in enumerate(sizes, start=1):
    group_data.extend([label] * size)
print(group_data, len(group_data))

groups = {'types': group_data}
gp = pandas.DataFrame(groups)

In [None]:
# Draw the group labels as a single-column colour bar and save it without axes.
fig, ax = matplotlib.pyplot.subplots(figsize=(1.5 * 6.4, 1.5 * 4.8))
heatmap = ax.pcolor(gp, cmap='tab20')

# invert y-axis so the first row is at the top
ax.invert_yaxis()

# hide ticks, labels and frame
ax.axis('off')
fig.savefig('bar.svg')