In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the table that the plotting cells below read from.
# NOTE(review): the path is an empty string, presumably a placeholder --
# point it at the CSV to analyse before running the notebook.
data = pd.read_csv(filepath_or_buffer='')

#### Barplots for organism match

In [None]:
# Two-panel bar plot: share (in %) of each organism-match category among the
# records of the ten listed organisms, once for 'match_org_all' and once for
# 'match_org_100'. The figure is written to org_test_results.png.

# Keep only the records of the ten organisms named in the figure titles.
data_most_common = data.loc[data['org'].isin([
    'hsapiens', 'mmusculus', 'athaliana', 'dmelanogaster', 'drerio',
    'rnorvegicus', 'zmays', 'mmulatta', 'scerevisiae', 'osativa',
])].copy()

# np.unique(..., return_counts=True) returns (sorted categories, counts);
# dividing the counts by the number of selected records gives percentages.
category_counts_all = np.unique(data_most_common['match_org_all'], return_counts=True)
percentages_all = category_counts_all[1] / len(data_most_common) * 100

category_counts_100 = np.unique(data_most_common['match_org_100'], return_counts=True)
percentages_100 = category_counts_100[1] / len(data_most_common) * 100

fig, ax = plt.subplots(1, 2, figsize=(10, 5), gridspec_kw={'width_ratios': [.5, .5]})

# Title tail shared by both panels.
threshold_note = ('\nLibrary source min-match percentage: 2'
                  + '\nLibrary source min frequency ratio: 2')

# One entry per panel: (categories, percentages, subtitle line).
panel_specs = [
    (category_counts_all[0], percentages_all, '\nAll organism transcripts'),
    (category_counts_100[0], percentages_100, '\n100 most frequent organism transcripts'),
]
panels = []
for panel_ax, (categories, percentages, subtitle) in zip(ax, panel_specs):
    # x and y are standalone per-category arrays, so `data=` is deliberately
    # not passed: their length differs from the row count of
    # data_most_common, and seaborn checks un-indexed vectors against the
    # length of `data` when both are supplied.
    panel = sns.barplot(x=categories, y=percentages, palette="Blues", ax=panel_ax)
    panel.set(xlabel="Match Category", ylabel="Percentage")
    # Annotate each bar with its rounded percentage.
    panel.bar_label(panel.containers[0], fmt='%.f%%')
    panel.set(title='Matched organism %, 10 most frequent organisms' + subtitle
              + threshold_note)
    panels.append(panel)

# Keep the original cell-level handles for the left (g) and right (f) panels.
g, f = panels
plt.savefig("org_test_results.png", dpi=250, bbox_inches='tight')

#### Histograms for predicted adapters

In [None]:
# Three histograms describing the predicted adapters, each saved as a PNG
# under <RESULTS_HTS_DIR>/<results_folder>/.

# Rows to discard: '1_percent_1' is 0 and '2_percent_1' is either 0 as well
# or missing (i.e. 0 percentages of both SE and PE reads).
first_is_zero = result_final['1_percent_1'] == 0
second_is_zero_or_nan = (
    (result_final['2_percent_1'] == 0)
    | result_final['2_percent_1'].isna())
zero_adapter_rows = result_final[first_is_zero & second_is_zero_or_nan].index

# Title tail shared by all three figures, and the folder they are saved to.
run_details = (
    str(results_folder)
    + '\nNo. of records: ' + str(records)
    + '\nRead layout min-match percentage: ' + str(min_match)
    + '\nRead layout min frequency ratio: ' + str(min_freq))
figure_dir = str(RESULTS_HTS_DIR) + '/' + str(results_folder)

# Histogram of 1st predicted adapter percent #1 (x axis limited to 0-100)
result_final_n = result_final.drop(zero_adapter_rows)
all_percent = pd.concat(
    [result_final_n[column] for column in ('1_percent_1', '2_percent_1')])
# Remaining individual zero entries are left out of the histogram as well.
all_percent = all_percent[all_percent != 0]
percent_title = ('Fraction of reads containing most '
                 + 'prevalent adapter\nID: ' + run_details)
fig, axs = plt.subplots(1, figsize=[8, 8])
sns.histplot(data=all_percent, binwidth=2, ax=axs).set(title=percent_title)
axs.set_xlim(0, 100)
fig.savefig(figure_dir + '/2_Hist_1st_pred_adapter_full.png', dpi=100)

# Histogram of 1st predicted adapter percent #2 (finer bins, x axis 0-10)
fig, axs = plt.subplots(1, figsize=[8, 8])
sns.histplot(data=all_percent, binwidth=0.2, ax=axs).set(title=percent_title)
axs.set_xlim(0, 10)
fig.savefig(figure_dir + '/3_Hist_1st_pred_adapter_10.png', dpi=100)

# Histogram of 1st vs 2nd predicted adapter ratio
# The 0.01 added to each denominator presumably guards against division by
# zero when the second adapter's percentage is 0 -- TODO confirm.
ratio_first = result_final_n['1_percent_1'] / (
    result_final_n['1_percent_2'] + 0.01)
ratio_second = result_final_n['2_percent_1'] / (
    result_final_n['2_percent_2'] + 0.01)
result_final_n = result_final_n.assign(
    **{'1_ratio': ratio_first, '2_ratio': ratio_second})
all_ratios = pd.concat(
    [result_final_n['1_ratio'], result_final_n['2_ratio']])
ratio_title = ('Fraction of reads with most prevalent adapter '
               + 'vs. second most prevalent\nID: ' + run_details)
fig, axs = plt.subplots(1, figsize=[8, 8])
# The x axis is switched to a log scale after the bars have been drawn.
sns.histplot(data=all_ratios, ax=axs).set(title=ratio_title, xscale="log")
fig.savefig(figure_dir + '/4_Hist_1st_vs_2nd_pred_adapter_ratio.png', dpi=100)

#### Barplots for performance times

In [None]:
# Two-panel bar plot of run times (mean with standard-error bars), grouped by
# 'perf_group': the five individual steps on the left, the total run on the
# right. The figure is written to perf_test_results.png.

# Step identifiers in the order their bars are drawn. The hard-coded tick
# labels further down follow this same order.
process_steps = ['processing_dur', 'extracting_dur', 'kallisto_quant_dur',
                 'alignment_dur', 'cutadapt_dur']
data_5 = data.loc[data['process'].isin(process_steps)].copy()
data_total = data.loc[data['process'] == 'total_dur'].copy()

# Replace NaN values in 'duration' column with zeros.
# Both frames use plain assignment: fillna(..., inplace=True) on a column
# selection acts on an intermediate object, which pandas warns about and
# which does not update the frame once Copy-on-Write is enabled.
data_5['duration'] = data_5['duration'].fillna('00:00:00')
data_total['duration'] = data_total['duration'].fillna('00:00:00')

# Convert 'duration' column to timedelta format
data_5['duration'] = pd.to_timedelta(data_5['duration'])
data_total['duration'] = pd.to_timedelta(data_total['duration'])

# Extract total seconds from timedelta values and convert to numeric
data_5['duration'] = data_5['duration'].dt.total_seconds()
data_total['duration'] = data_total['duration'].dt.total_seconds()

fig, ax = plt.subplots(1, 2, figsize=(9, 5), gridspec_kw={'width_ratios': [.80, .20]})

# `order` pins the bar order to process_steps; without it the order follows
# the rows in the data and the fixed tick labels could end up on the wrong
# bars.
g = sns.barplot(data=data_5, x='process', y='duration', hue='perf_group',
                order=process_steps, errorbar='se', palette="bright",
                alpha=.6, ax=ax[0])
g.set(xlabel="", ylabel="Duration (seconds)")
g.set_xticklabels(["Process records", "Extract read length", "Kallisto quantification", "STAR Alignment", "Cutadapt"], rotation=45)
g.legend(title="No. of records")

f = sns.barplot(data=data_total, x='process', y='duration', hue='perf_group', errorbar='se', palette="bright", alpha=.6, ax=ax[1])
f.set(xlabel="", ylabel="")
f.set_xticklabels(["Total HTSinfer run"], rotation=45)
# The legend on the left panel already covers both panels.
f.legend_.remove()
sns.despine()
plt.savefig("perf_test_results.png", dpi=250, bbox_inches='tight')