In [1]:
# import all required packages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [18]:
# Merge datasets of up- and downregulated gene datasets
def merge_datasets(path_down, path_up, merged_df_path, rows, merged_rows):
    """Combine the down- and upregulated tables and collapse replicates to medians.

    Both tab-separated files are read, their columns are renamed to ``rows``,
    and the two tables are stacked (downregulated first). Columns 1-3 and
    columns 4-6 of ``rows`` are each replaced by one column holding their
    row-wise median. The result is written tab-separated, without the index,
    to ``merged_df_path``.

    Parameters
    ----------
    path_down, path_up : str
        Paths of the downregulated and upregulated tab-separated tables.
    merged_df_path : str
        Path the merged table is written to.
    rows : sequence of str
        Column names assigned to both input tables. Position 0 is kept as is;
        positions 1-3 form the first replicate group and 4-6 the second.
    merged_rows : sequence of str
        Names of the two median columns (first group, second group).

    Returns
    -------
    pandas.DataFrame
        The merged table that was written to ``merged_df_path``.
    """
    # NOTE(review): read_csv consumes the first line of each file as a header,
    # which is then replaced by `rows` - confirm the inputs really start with a
    # header line, otherwise the first record of each file is dropped.
    down = pd.read_csv(path_down, sep="\t")
    down.columns = rows
    up = pd.read_csv(path_up, sep="\t")
    up.columns = rows

    # ignore_index gives the stacked table a clean 0..n-1 index instead of
    # repeating the row labels of the two inputs.
    merged = pd.concat([down, up], ignore_index=True)

    first_group = list(rows[1:4])
    second_group = list(rows[4:7])
    merged[merged_rows[0]] = merged[first_group].median(axis=1)
    merged[merged_rows[1]] = merged[second_group].median(axis=1)

    # Only the position-0 column and the two median columns are kept.
    merged = merged.drop(columns=first_group + second_group)
    merged.to_csv(merged_df_path, sep="\t", index=False)
    return merged

# Separate gene names
def prepare_data(file_path, gene_ids_path="data/gene_ids.csv"):
    """Load a merged table and split the gene identifiers from the counts.

    Parameters
    ----------
    file_path : str
        Path of a tab-separated table that contains a "Genes" column.
    gene_ids_path : str, optional
        Where the gene identifiers are written, one per line with no header
        and no index. Defaults to "data/gene_ids.csv".

    Returns
    -------
    tuple
        ``(counts, gene_ids)``: the table without its "Genes" column, and the
        "Genes" values as a list in file order.

    Raises
    ------
    KeyError
        If the table has no "Genes" column.
    """
    norm_genes = pd.read_csv(file_path, sep="\t")
    gene_ids = norm_genes["Genes"]
    # Side effect: the identifiers are also saved to disk.
    gene_ids.to_csv(gene_ids_path, index=False, header=False)
    return norm_genes.drop(columns=["Genes"]), gene_ids.tolist()

In [19]:
# Mapping to Galaxy history
# htseq-count on data 343 and data 405 - BB1-PonA-9107
# htseq-count on data 343 and data 417 - BB2-PonA-9107
# htseq-count on data 343 and data 429 - J-PonA-9107
# htseq-count on data 343 and data 411 - BB1-PonA-91418
# htseq-count on data 343 and data 423 - BB2-PonA-91418
# htseq-count on data 343 and data 435 - J-PonA-91418

# The column names below should be changed according to the dataset being used.
# Position 0 is the gene identifier; positions 1-3 and 4-6 are the two
# replicate groups listed in the Galaxy history mapping above.
rows = [
    "Genes",
    "BB1-PonA-9107", "BB2-PonA-9107", "J-PonA-9107",
    "BB1-PonA-91418", "BB2-PonA-91418", "J-PonA-91418",
]
# Names of the two median columns that replace the replicate columns
merged_rows = ["PonA-9107", "PonA-91418"]
merged_path = "data/merged_df_pona_9107_91418.csv"
merge_datasets(
    path_down="data/cut_downregulated_pona_9107_91418.tabular",
    path_up="data/cut_upregulated_pona_9107_91418.tabular",
    merged_df_path=merged_path,
    rows=rows,
    merged_rows=merged_rows,
)

In [20]:
def plot_cluster_genes(dataframe, file_name, fig_title, y_labels):
    """Draw a clustered heatmap of ``dataframe`` and save it as a PDF.

    Rows and columns are both clustered with the cosine metric on a
    25 x 250 inch figure. The figure is saved to ``<file_name>.pdf``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric table passed to ``sns.clustermap``.
    file_name : str
        Output path without extension; ".pdf" is appended.
    fig_title : str
        Title placed on the figure.
    y_labels : list
        Labels passed as ``yticklabels``.
    """
    # Changes seaborn's global settings (font scale) for the whole session.
    sns.set(font_scale=2.00)

    cluster_grid = sns.clustermap(
        dataframe,
        row_cluster=True,
        col_cluster=True,
        cmap="RdBu_r",
        xticklabels='auto',
        yticklabels=y_labels,
        figsize=(25, 250),
        metric='cosine',
        standard_scale=0,
    )
    cluster_grid.fig.suptitle(fig_title)

    heatmap_ax = cluster_grid.ax_heatmap
    heatmap_ax.set_xlabel("Samples")
    heatmap_ax.set_ylabel("Genes")

    # Thicken the dendrogram lines: column dendrogram first, then rows.
    for dendrogram_ax in (cluster_grid.ax_col_dendrogram, cluster_grid.ax_row_dendrogram):
        for line_collection in dendrogram_ax.collections:
            line_collection.set_linewidth(3)

    pdf_path = '{}.pdf'.format(file_name)
    cluster_grid.savefig(pdf_path, dpi=150)
    # PNG export is currently disabled:
    #cluster_grid.savefig('{}.png'.format(file_name), dpi=150)

In [21]:
# get gene names
# prepare_data returns the table without its "Genes" column together with the
# list of gene identifiers; it also writes the identifiers to disk.
# NOTE(review): the saved output under this cell (a list of ENST ids followed by
# a TypeError about the '%.18e' format specifier) is not produced by the code
# shown here - it looks stale; re-run the cell to confirm.
gene_counts, y_labels = prepare_data(merged_path)

# plot cluster heatmap for PonA
# (the call is currently disabled)
#plot_cluster_genes(gene_counts, "plots/clustered_heatmap_PonA-9107-91418", 'DE genes', y_labels)

(['ENST00000011653.9',
  'ENST00000078429.9',
  'ENST00000217740.4',
  'ENST00000220669.10',
  'ENST00000220751.5',
  'ENST00000221132.8',
  'ENST00000225171.7',
  'ENST00000241125.4',
  'ENST00000241453.12',
  'ENST00000245903.4',
  'ENST00000245912.7',
  'ENST00000246841.3',
  'ENST00000251203.14',
  'ENST00000255198.3',
  'ENST00000257575.8',
  'ENST00000258499.8',
  'ENST00000261267.7',
  'ENST00000261783.4',
  'ENST00000269593.5',
  'ENST00000269740.9',
  'ENST00000271732.8',
  'ENST00000274306.7',
  'ENST00000274629.9',
  'ENST00000281589.4',
  'ENST00000285419.8',
  'ENST00000286234.6',
  'ENST00000288135.5',
  'ENST00000289013.11',
  'ENST00000290868.7',
  'ENST00000296327.10',
  'ENST00000296503.10',
  'ENST00000297205.7',
  'ENST00000297632.7',
  'ENST00000300056.8',
  'ENST00000301327.5',
  'ENST00000301522.3',
  'ENST00000303460.5',
  'ENST00000305988.5',
  'ENST00000307746.9',
  'ENST00000311915.12',
  'ENST00000314531.5',
  'ENST00000315576.8',
  'ENST00000316418.9',
  'E

TypeError: Mismatch between array dtype ('<U18') and format specifier ('%.18e')

In [None]:
# TODO : Plot for ETOH
# htseq-count on data 343 and data 408 - BB1-EtoH-91418
# htseq-count on data 343 and data 420 - BB2-EtoH-91418
# htseq-count on data 343 and data 432 - J-EtoH-91418
# htseq-count on data 343 and data 411 - BB1-PonA-91418
# htseq-count on data 343 and data 423 - BB2-PonA-91418
# htseq-count on data 343 and data 435 - J-PonA-91418

# Output of the merge step, read back by prepare_data below
path = "data/merged_df_pona_etoh_91418.csv"
# merge_datasets takes five arguments: the last one names the two median
# columns (EtoH replicates first, PonA replicates second) that replace the six
# replicate columns. The merged table is written to `path`.
merge_datasets("data/cut_downregulated_pona_etoh_91418.tabular",
               "data/cut_upregulated_pona_etoh_91418.tabular",
               path,
               ["Genes", "BB1-EtoH-91418", "BB2-EtoH-91418", "J-EtoH-91418", "BB1-PonA-91418", "BB2-PonA-91418", "J-PonA-91418"],
               ["EtoH-91418", "PonA-91418"])
normalised_counts_all_genes, y_labels = prepare_data(path)
plot_cluster_genes(normalised_counts_all_genes, "plots/clustered_heatmap_PonA-EtoH-91418", 'DE genes', y_labels)