In [None]:
import json
import datetime
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

from bioinf_common.plotting import corrplot

In [None]:
# Use seaborn's 'talk' plotting context for every figure in this notebook.
sns.set_context('talk')

# Parameters

In [None]:
# `snakemake` is not defined in this notebook; it is expected to be provided
# by Snakemake when the notebook is executed as part of a rule.
result_files = snakemake.input.result_files
meta_files = snakemake.input.meta_files

out_dir = Path(snakemake.output.out_dir)
# Every figure/table below is written into `out_dir`, so make sure it exists
# up front. This is a no-op when the directory is already present.
out_dir.mkdir(parents=True, exist_ok=True)

# Load data

In [None]:
# Read every result table and tag its rows with the tool that produced it.
result_frames = []
for result_path in result_files:
    table = pd.read_csv(result_path)

    # Paths must consist of exactly four '/'-separated components, the second
    # of which is the tool name; any other layout raises a ValueError here.
    _, tool_name, _, _ = result_path.split('/')

    result_frames.append(table.assign(tool=tool_name))

df = pd.concat(result_frames, ignore_index=True)
df.head()

In [None]:
# Collect one (tool, runtime) record per meta file.
meta_records = []
for meta_path in meta_files:
    meta_data = json.loads(Path(meta_path).read_text())

    # Same four-component path layout as the result files: the second
    # component is the tool name.
    _, tool_name, _, _ = meta_path.split('/')

    meta_records.append(dict(tool=tool_name, runtime=meta_data['exec_time']))

df_meta = pd.DataFrame(meta_records)
df_meta.head()

# Runtime comparison

In [None]:
@FuncFormatter
def format_seconds(x, pos):
    """Render a tick value given in seconds as a human-readable duration.

    The value is rounded to whole seconds and formatted the way
    ``str(datetime.timedelta)`` does. ``pos`` is the tick position passed in
    by the formatter machinery; it is not used.
    """
    whole_seconds = round(x)
    elapsed = datetime.timedelta(seconds=whole_seconds)
    return str(elapsed)

In [None]:
# Unique tool names in case-insensitive alphabetical order (used to order the
# x-axis of the runtime plot).
# `argsort` yields integer *positions*, so they must be applied with the
# positional indexer `.iloc`; label-based `.loc` would only give the right
# rows when the index happens to be a default 0..n-1 range.
sort_positions = df_meta['tool'].str.lower().argsort().values
sorted_tool_list = df_meta['tool'].iloc[sort_positions].unique()

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# Box plot and raw points are drawn from the same data, column mapping and
# tool ordering, so share those arguments between the two layers.
shared_kws = dict(x='tool', y='runtime', data=df_meta, order=sorted_tool_list)
sns.boxplot(ax=ax, **shared_kws)
sns.stripplot(ax=ax, linewidth=1, **shared_kws)

ax.set_xlabel('Tool')
ax.set_ylabel('Runtime [h:m:s]')

ax.tick_params(axis='x', labelrotation=90)
ax.set_yscale('log')

# Show the (log-scaled) runtimes as durations instead of raw seconds.
ax.yaxis.set_major_formatter(format_seconds)

fig.tight_layout()
fig.savefig(out_dir / 'runtime.pdf')

# Enrichment correlation between tools

In [None]:
# A p-value of exactly 0 would become infinite under -log10, so such entries
# are marked as missing. (Substituting the smallest positive p-value instead
# was considered as an alternative.)
df['p_value'] = df['p_value'].mask(df['p_value'] == 0)
df['pvalue_trans'] = -np.log10(df['p_value'])
df.head()

In [None]:
# Reshape to wide format: one row per term, one column per tool, holding the
# -log10 transformed p-values. Then persist that table next to the figures.
enrichment_csv = out_dir / 'enrichments.csv'
df_wide = df.pivot(
    index='term',
    columns='tool',
    values='pvalue_trans',
)
df_wide.to_csv(enrichment_csv)

In [None]:
# Turn the 'term' index back into an ordinary column (rebinding the name
# rather than mutating the frame in place).
df_wide = df_wide.reset_index()
df_wide.head()

In [None]:
# One common set of 50 bin edges spanning the full range of all numeric
# (i.e. per-tool) columns, ignoring NaNs; handed to corrplot via `diag_kws`.
numeric_values = df_wide.select_dtypes(include=np.number).values
shared_bins = np.linspace(np.nanmin(numeric_values), np.nanmax(numeric_values), 50)

# NOTE: colouring by term (hue='term' plus g.add_legend()) was sketched here
# but is currently disabled.
g = corrplot(
    df_wide,
    corr_method='spearman',
    diag_kws={'kde': False, 'bins': shared_bins},
    lower_kws={'alpha': .5, 'rasterized': True, 'edgecolor': 'none'},
    diag_sharey=False,
    height=3,
)

g.savefig(out_dir / 'pvalue_scatterplots.pdf', dpi=200)