feat: Add Build_Upset module (#204)

* feat: Add module for Build_Upset.py script * refact: Change genome to genome_id * feat: Add build upset to annotation subwf * docs: Update param docs * docs: Update schema and param docs * feat: Add dummy groups to test_full table
beiko-lab · Jul 25, 2024 · 9c16708 · 9c16708
1 parent 5d1250e
commit 9c16708
Show file tree

Hide file tree

Showing 9 changed files with 213 additions and 19 deletions.
diff --git a/bin/Build_Upset.py b/bin/Build_Upset.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+
+import argparse
+import pandas as pd
+from matplotlib import cm
+from matplotlib import pyplot as plt
+from upsetplot import UpSet, from_memberships
+import warnings
+
+# Suppress the UserWarning for tight_layout
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def load_data(samplesheet_file, feature_profile_file):
+    """
+    Load samplesheet and feature profile data from TSV files.
+
+    Parameters:
+        samplesheet_file (str): Path to the samplesheet file.
+        feature_profile_file (str): Path to the feature profile file.
+
+    Returns:
+        pd.DataFrame: Samplesheet dataframe.
+        pd.DataFrame: Feature profile dataframe.
+    """
+    try:
+        samplesheet_df = pd.read_csv(samplesheet_file)
+        feature_df = pd.read_csv(feature_profile_file, sep='\t')
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+        raise
+
+    # Convert all columns in samplesheet_df to strings
+    samplesheet_df = samplesheet_df.astype(str)
+
+    return samplesheet_df, feature_df
+
+def process_data(samplesheet_df, feature_df, column_name):
+    """
+    Process the data to merge samplesheet and feature dataframes and create presence/absence matrix.
+
+    Parameters:
+        samplesheet_df (pd.DataFrame): Samplesheet dataframe.
+        feature_df (pd.DataFrame): Feature profile dataframe.
+        column_name (str): Column name for the feature.
+
+    Returns:
+        pd.DataFrame: Transposed presence/absence dataframe with groups column.
+    """
+    merged_df = feature_df.merge(samplesheet_df, left_on='genome_id', right_on='sample', how='left')
+
+    if column_name not in merged_df.columns:
+        print(f"Warning: Column '{column_name}' not found in samplesheet. Skipping.")
+        return None
+
+    if merged_df[column_name].nunique() <= 1:
+        print(f"Warning: Column '{column_name}' contains only a single unique value. Skipping.")
+        return None
+
+    selected_columns = ['genome_id', column_name] + list(feature_df.columns[1:])
+    feature_df = merged_df[selected_columns]
+    presence = feature_df.groupby(column_name).any().astype(int)
+    presence.drop('genome_id', axis=1, inplace=True, errors='ignore')
+    presence_transposed = presence.transpose()
+    presence_transposed.reset_index(inplace=True)
+    presence_transposed.rename(columns={'index': 'feature'}, inplace=True)
+    presence_transposed['category'] = presence_transposed['feature'].str.split('_', expand=True)[0]
+
+    presence_transposed['groups'] = presence_transposed.apply(
+        lambda row: ','.join([col for col in presence_transposed.columns[2:] if row[col] == 1]), axis=1)
+
+    presence_transposed = presence_transposed.loc[:, ['feature', 'category', 'groups']]
+
+    # print("Presence Transposed DataFrame with 'groups' column:\n", presence_transposed.head())  # Debug print to check the dataframe state
+
+    return presence_transposed
+
+def create_upset_plot(samplesheet_file, feature_profile_file, column_names, output_file_prefix):
+    """
+    Create an Upset plot from samplesheet and feature profile files.
+
+    Parameters:
+        samplesheet_file (str): Path to the samplesheet file.
+        feature_profile_file (str): Path to the feature profile file.
+        column_names (list of str): List of column names for features.
+        output_file_prefix (str): Prefix for the output Upset plot files.
+    """
+    samplesheet_df, feature_df = load_data(samplesheet_file, feature_profile_file)
+
+    for column_name in column_names:
+        presence_transposed = process_data(samplesheet_df, feature_df, column_name)
+
+        if presence_transposed is None:
+            continue
+
+        if 'groups' not in presence_transposed.columns:
+            print(f"Error: 'groups' column was not created for {column_name}. Skipping.")
+            continue
+
+        # Ensure 'groups' column is processed correctly
+        memberships = presence_transposed['groups'].str.split(",").apply(lambda x: frozenset(x))
+        rows_by_group = from_memberships(memberships, data=presence_transposed.set_index('feature'))
+
+        # print("Rows by group:\n", rows_by_group.head())  # Debug print to check rows_by_group structure
+
+        # Create UpSet plot
+        fig, ax = plt.subplots(figsize=(24, 12))  # Increased width for better layout
+        upset = UpSet(rows_by_group, show_counts=True, intersection_plot_elements=0, min_subset_size=1)
+        upset.add_stacked_bars(by="category", colors=cm.Pastel1, title="category", elements=10)
+        upset.plot(fig=fig)
+
+        # Remove the black frame by turning off the top and right spines
+        ax.spines[['top','right','left','bottom']].set_visible(False)
+        ax.tick_params(left=False)
+        ax.tick_params(bottom=False)
+        ax.tick_params(labelleft=False)
+        ax.tick_params(labelbottom=False)
+
+        # Adjust layout to move legend outside the figure
+        plt.tight_layout(rect=[0.1, 0.1, 0.75, 1])  # Adjust rect to leave space on the right for legend
+        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
+
+        output_file = f"{output_file_prefix}_{column_name}.png"
+        # Overwrite the existing file if it exists
+        plt.savefig(output_file)
+        plt.close(fig)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Create Upset plot from samplesheet and feature profile files.')
+    parser.add_argument('-s', '--samplesheet_file', type=str, required=True, help='Path to the samplesheet file in TSV format')
+    parser.add_argument('-f', '--feature_profile_file', type=str, required=True, help='Path to the feature profile file in TSV format')
+    parser.add_argument('-c', '--columns', type=str, required=True, help='Comma-separated list of column names for features')
+    parser.add_argument('-o', '--output_file_prefix', type=str, required=True, help='Prefix for the output Upset plot files (e.g., upset_plot)')
+    args = parser.parse_args()
+
+    column_names = args.columns.split(',')
+
+    create_upset_plot(args.samplesheet_file, args.feature_profile_file, column_names, args.output_file_prefix)
diff --git a/conf/modules.config b/conf/modules.config
@@ -466,6 +466,13 @@ process {
         ]
     }
 
+    withName: BUILD_UPSET {
+        publishDir = [
+            path: { "${params.outdir}/annotation/upsets" },
+            mode: params.publish_dir_mode,
+        ]
+    }
+
     // Recombination
     withName: VERTICALL_PAIRWISE {
         ext.prefix = { "cluster_${cluster}" }

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -20,6 +20,7 @@ params {
     use_prokka = true
     use_ppanggolin = true
     skip_poppunk = true
+    upset_plot_columns = "group_one,group_two"
 }
 
 process {

diff --git a/docs/params.md b/docs/params.md
@@ -48,6 +48,7 @@ Parameters for the annotation subworkflow
 | `min_qcover` | Minimum coverage of each match for filtering | `number` | 0.6 |
 | `skip_profile_creation` | Skip annotation feature profile creation | `boolean` |  |
 | `feature_profile_columns` | Columns to include in the feature profile | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet |
+| `upset_plot_columns` | Columns to use for making Upset plots of genome features | `string` |  |
 
 ## Phylogenomics
 
@@ -71,8 +72,8 @@ Parameters for the lineage subworkflow
 | `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` |  |
 | `run_poppunk_qc` | Whether to run the QC step for PopPunk | `boolean` |  |
 | `enable_subsetting` | Enable subsetting workflow based on genome similarity | `boolean` |  |
-| `core_similarity` | Similarity threshold for core genomes | `number` | 99.99 |
-| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99 |
+| `core_similarity` | Similarity threshold for core genomes | `number` | 99.9 |
+| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99.0 |
 
 ## Gene Order
 
@@ -140,9 +141,9 @@ Set the top limit for requested resources for any single job.
 
 | Parameter | Description | Type | Default |
 |-----------|-----------|-----------|-----------|
-| `max_cpus` | Maximum number of CPUs that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`</small></details>| `integer` | 16 |
-| `max_memory` | Maximum amount of memory that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`</small></details>| `string` | 128.GB |
-| `max_time` | Maximum amount of time that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`</small></details>| `string` | 240.h |
+| `max_cpus` | Maximum number of CPUs that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`</small></details>| `integer` | 72 |
+| `max_memory` | Maximum amount of memory that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`</small></details>| `string` | 125.GB |
+| `max_time` | Maximum amount of time that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`</small></details>| `string` | 168.h |
 
 ## Generic options
 

diff --git a/modules/local/build_upset/main.nf b/modules/local/build_upset/main.nf
@@ -0,0 +1,32 @@
+process BUILD_UPSET {
+    label 'process_single'
+
+    conda "conda-forge::upsetplot=0.9.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'docker://docker.io/jvfe/arete_upset:v0.1.0':
+        'docker.io/jvfe/arete_upset:v0.1.0' }"
+
+    input:
+    path feature_profile
+    path samplesheet
+    val samplesheet_columns
+
+    output:
+    path "BuildUpset*png", emit: pngs
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    Build_Upset.py \\
+        --output_file_prefix BuildUpset \\
+        --feature_profile_file $feature_profile \\
+        --samplesheet_file $samplesheet \\
+        --columns $samplesheet_columns
+    """
+    stub:
+    """
+    touch BuildUpset_dummy.png
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -37,6 +37,7 @@ params {
     annotation_tools           = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report'
     feature_profile_columns    = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet'
     feature_dispersion_columns = null
+    upset_plot_columns         = null
     min_pident                 = 60
     min_qcover                 = 0.6
     skip_profile_creation      = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -148,6 +148,11 @@
                     "fa_icon": "fas fa-columns",
                     "description": "Columns to include in the feature profile",
                     "pattern": "^((rgi|mobsuite|vfdb|cazy|bacmet|iceberg)?,?)*(?<!,)$"
+                },
+                "upset_plot_columns": {
+                    "type": "string",
+                    "fa_icon": "fas fa-columns",
+                    "description": "Columns to use for making Upset plots of genome features"
                 }
             }
         },
@@ -215,13 +220,13 @@
                 },
                 "core_similarity": {
                     "type": "number",
-                    "default": 99.99,
+                    "default": 99.9,
                     "fa_icon": "fas fa-clone",
                     "description": "Similarity threshold for core genomes"
                 },
                 "accessory_similarity": {
                     "type": "number",
-                    "default": 99,
+                    "default": 99.0,
                     "fa_icon": "far fa-clone",
                     "description": "Similarity threshold for accessory genes"
                 }
@@ -435,15 +440,15 @@
                 "max_cpus": {
                     "type": "integer",
                     "description": "Maximum number of CPUs that can be requested for any single job.",
-                    "default": 16,
+                    "default": 72,
                     "fa_icon": "fas fa-microchip",
                     "hidden": true,
                     "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`"
                 },
                 "max_memory": {
                     "type": "string",
                     "description": "Maximum amount of memory that can be requested for any single job.",
-                    "default": "128.GB",
+                    "default": "125.GB",
                     "fa_icon": "fas fa-memory",
                     "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$",
                     "hidden": true,
@@ -452,7 +457,7 @@
                 "max_time": {
                     "type": "string",
                     "description": "Maximum amount of time that can be requested for any single job.",
-                    "default": "240.h",
+                    "default": "168.h",
                     "fa_icon": "far fa-clock",
                     "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$",
                     "hidden": true,

diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf
@@ -35,6 +35,7 @@ include { CONCAT_OUTPUT as CONCAT_MOBSUITE;
           CONCAT_OUTPUT as CONCAT_INTEGRONS } from '../../modules/local/concat_output.nf'
 include { FILTER_AND_CONCAT_MATCHES as FILTER_RGI } from '../../modules/local/filter_matches'
 include { CREATE_REPORT } from '../../modules/local/create_report'
+include { BUILD_UPSET } from '../../modules/local/build_upset/main'
 
 //
 // SUBWORKFLOWS
@@ -356,6 +357,15 @@ workflow ANNOTATE_ASSEMBLIES {
             profile = (params.skip_profile_creation) ? profile : CREATE_REPORT.out.profile
         }
 
+        if (params.upset_plot_columns) {
+            BUILD_UPSET(
+                profile,
+                file(params.input_sample_table),
+                params.upset_plot_columns
+            )
+        }
+
+
     emit:
         annotation = concat_annotation
         annotation_software = ch_software_versions

diff --git a/test/test_full_samplesheet.csv b/test/test_full_samplesheet.csv
@@ -1,9 +1,9 @@
-sample,fna_file_path
-ED073,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED073.fa
-ED142,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED142.fa
-ED195,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED195.fa
-ED287,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED287.fa
-ED413,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED413.fa
-ED423,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED423.fa
-ED603,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED603.fa
-ED644,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED644.fa
+sample,fna_file_path,group_one,group_two
+ED073,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED073.fa,test2,group1
+ED142,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED142.fa,test2,group1
+ED195,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED195.fa,test2,group1
+ED287,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED287.fa,test1,group1
+ED413,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED413.fa,test3,group2
+ED423,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED423.fa,test3,group2
+ED603,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED603.fa,test1,group2
+ED644,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED644.fa,test1,group2