<a href="https://colab.research.google.com/github/ccdmb/catastrophy/blob/master/CATAStropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CATAStrophy

This is a simple GUI for the CATAStrophy pipeline.

- Paper: https://doi.org/10.3389/fmicb.2019.03088
- GitHub: https://github.com/ccdmb/catastrophy

To run the CATAStrophy pipeline, click `Runtime>Restart and run all` in the colab menu. Then:

1. In the file browser to the left of the screen, right click on the `fastas` folder and select upload.
2. Upload any fasta files that you want to run through CATAStrophy.
3. Click the `refresh` button below.
4. If you uploaded any undesired files or you wish to re-run the program with only a subset of the fastas, you can select/unselect the files using the select menu (control-click to select/deselect individuals, shift-click to select many files).
5. Select the version of dbCAN to search. We have trained different models for different versions, so the results may be slightly different.
6. Click "Run" to run the CATAStrophy pipeline.

The program will download the correct version of dbCAN, run HMMER, and run CATAStrophy for you. The program typically takes roughly 5 minutes per proteome per CPU. Google Colab notebooks have a default of 2 CPUs, so you can expect 1 or 2 proteomes to run in about 5 minutes, or 3 (or 4) proteomes to run in a bit over 10 minutes, etc.

If you wish, you can re-run steps 1-6 to add new proteomes. The notebook will cache already computed results for you.

**IMPORTANT: do not re-run any code cells, as it may result in odd behaviour. Please only use the buttons etc provided. If you do re-run code, please select `Runtime>Restart and run all` as before. You can hit the Run button and your previous results should still be there.**

We include a basic interactive table and scatterplot to browse your results.
To change the axes or nomenclatures in the plot, select the changes you want to make and click the `render` button. Your new proteomes are labelled with the "nomenclature" `user`. You can highlight specific nomenclatures by clicking the circles in the legend (Shift-click to highlight multiple). Double clicking any circle in the legend will return the default plot. 

You can download the results by clicking the `Download` button, or by selecting the files you want from the file browser on the left.
Note that the download button compresses the results into a zip file, and can take some time to download. Scroll to the bottom of the output to see a small progress bar. Occasionally it won't work and you may have to download from the file browser at the left yourself (tip: right click in the browser and select `refresh`).

**Note: Google colab environments will automatically delete the "Runtime" (i.e. all of the saved files and installed software) after a few hours of non-use. Please make sure you download your results when you are finished.**

Troubleshooting:

- If you get an error about "Runtimes" or the notebook doesn't appear to be doing anything, select the dropdown menu to the top right of the page and select `connect to runtime`.

In [None]:
#@title
import os
import shutil
from os.path import join as pjoin
from subprocess import run


def install_hmmer():
    """Install HMMER with ``apt-get`` unless it is already available.

    Nothing is run when both ``hmmscan`` and ``hmmpress`` are found on the
    PATH by ``shutil.which``.

    Raises:
        subprocess.CalledProcessError: If the install command exits with a
            non-zero status. The command's captured stdout and stderr are
            printed before the exception is re-raised.
    """
    # Imported locally: the module only imports ``run`` from subprocess.
    from subprocess import CalledProcessError

    hmmscan_absent = shutil.which("hmmscan") is None
    hmmpress_absent = shutil.which("hmmpress") is None

    if hmmscan_absent or hmmpress_absent:
        try:
            run(
                "apt-get install hmmer",
                shell=True,
                check=True,
                capture_output=True
            )
        except CalledProcessError as err:
            # When ``check=True`` fails, ``run`` raises before returning, so
            # there is no result object to inspect; the captured output is
            # carried on the exception instead.
            print((err.stdout or b"").decode(errors="replace"))
            print((err.stderr or b"").decode(errors="replace"))
            raise

    return


def install_catastrophy():
    """Install the ``catastrophy`` package with pip if ``catas`` is missing.

    Nothing is run when ``import catas`` already succeeds.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status. The command's captured stdout and stderr are
            printed before the exception is re-raised.
    """
    # Imported locally: the module only imports ``run`` from subprocess and
    # does not import importlib.
    import importlib
    from subprocess import CalledProcessError

    try:
        import catas  # noqa: W0611
    except ModuleNotFoundError:
        try:
            run(
                "pip install catastrophy",
                shell=True,
                check=True,
                capture_output=True
            )
        except CalledProcessError as err:
            # When ``check=True`` fails, ``run`` raises before returning, so
            # there is no result object to inspect; the captured output is
            # carried on the exception instead.
            print((err.stdout or b"").decode(errors="replace"))
            print((err.stderr or b"").decode(errors="replace"))
            raise

        # The package was installed while this interpreter was running, so
        # make sure the import system notices it on the next import.
        importlib.invalidate_caches()
    return


# Install the external dependencies before anything else: the ``catas``
# imports below need the package to be importable, and the HMMER executables
# are looked up with ``shutil.which`` when the pipeline is run.
install_hmmer()
install_catastrophy()


# These imports deliberately come after the install calls above, which is why
# each one carries an E402 (module-level import not at top) suppression.
from catas.pipeline.main import runner as pipeline  # noqa: E402
from catas.data import Version, nomenclatures  # noqa: E402
from google.colab import files  # noqa: E402
from google.colab import data_table  # noqa: E402
# NOTE(review): presumably switches Colab to its interactive DataFrame
# rendering for the rest of the session -- confirm against the Colab docs.
data_table.enable_dataframe_formatter()

import ipywidgets as widgets  # noqa: E402
from IPython.display import display  # noqa: E402

import pandas as pd  # noqa: E402
import numpy as np  # noqa: E402
import altair as alt  # noqa: E402

# Proteomes are read from ./fastas; create the folder up front so it exists
# for the initial listing below.
os.makedirs("fastas", exist_ok=True)
fastas = os.listdir("fastas")

# --- Input controls -------------------------------------------------------
# Every fasta found at start-up is selected by default.
fasta_select = widgets.SelectMultiple(
    description="Select files",
    options=fastas,
    value=tuple(fastas),
)

version = widgets.Dropdown(
    description="Version:",
    options=Version,
    value=Version.latest(),
)

# --- Action buttons -------------------------------------------------------
refresh_button = widgets.Button(description="Refresh", disabled=False)
# Run is only available once there is at least one fasta to process.
run_button = widgets.Button(description="Run", disabled=not fastas)
# Download starts disabled; it is enabled by the Run handler.
download_button = widgets.Button(description="Download", disabled=True)
render_button = widgets.Button(description="Render", disabled=False)

# --- Plot controls --------------------------------------------------------
nomenclature_select = widgets.Select(
    description="Select nomenclature",
    options=["nomenclature1", "nomenclature2", "nomenclature3"],
    value="nomenclature3",
)

# Axis choices are the zero-padded principal component names pc01..pc16.
xaxis_select = widgets.Select(
    description="Select X-axis",
    options=[f"pc{p:0>2}" for p in range(1, 17)],
    value="pc01",
)

yaxis_select = widgets.Select(
    description="Select Y-axis",
    options=[f"pc{p:0>2}" for p in range(1, 17)],
    value="pc02",
)

# --- Output areas ---------------------------------------------------------
run_output = widgets.Output()
table_output = widgets.Output()
plot_output = widgets.Output()


# Not used: it wasn't reliable. Kept below as a string literal for reference.
"""
def upload_fastas():
    old_dir = os.getcwd()
    os.chdir(pjoin(old_dir, "fastas"))
    uploaded = files.upload()
    os.chdir(old_dir)
    return
"""


def on_refresh(change):
    """Re-scan the ``fastas`` folder and update the file selector.

    Files that were listed but deselected stay deselected. Every other file
    currently in the folder (previously selected files that still exist and
    newly added files) ends up selected.

    Args:
        change: Argument supplied by the button click callback; unused.
    """
    os.makedirs("fastas", exist_ok=True)

    previously_selected = fasta_select.get_interact_value()
    deselected = set(fasta_select.options).difference(previously_selected)

    # List the folder once so the options and the selection are built from
    # the same snapshot of its contents.
    available = os.listdir("fastas")

    fasta_select.options = available
    # Build the selection from the current listing only. Prepending the old
    # selection would list surviving files twice and would also include files
    # that have since been removed, which are no longer valid options.
    fasta_select.value = tuple(
        name for name in available if name not in deselected
    )
    return


def on_select_value_change(change):
    """Keep the Run button disabled while no fasta file is selected.

    Args:
        change: Change notification from the observed widget; unused.
    """
    nothing_selected = len(fasta_select.get_interact_value()) < 1
    run_button.disabled = nothing_selected


def get_results_df():
    """Load ``results/pca.tsv`` and mark rows without a genome as ``user``.

    The file is read as tab-separated text with ``.`` treated as missing.
    The first column is renamed to ``label``. For every row whose ``genome``
    is missing, the three nomenclature columns are set to ``"user"``.

    Returns:
        pandas.DataFrame: The relabelled table.
    """
    table = pd.read_csv("results/pca.tsv", sep="\t", na_values=".")

    # Only the first header changes; the remaining ones are kept as read.
    table.columns = ["label", *table.columns[1:]]

    # Index labels of the rows that have no genome recorded.
    user_rows = table.index[table["genome"].isnull()]
    table.loc[
        user_rows,
        ["nomenclature1", "nomenclature2", "nomenclature3"]
    ] = "user"
    return table


def display_results_df(pca):
    """Show the non-PC columns of ``pca`` as a paged table in ``table_output``.

    Rows whose ``genome`` is null are listed first, followed by all remaining
    rows. Columns whose name starts with ``pc`` are left out of the table.

    Args:
        pca: DataFrame with a ``genome`` column, e.g. as returned by
            ``get_results_df``.
    """
    new_indices = pca.loc[pca.genome.isnull(), ].index.values
    training_indices = np.setdiff1d(pca.index.values, new_indices)

    # Remove the table left by a previous call; otherwise every call appends
    # another table to the same output widget.
    table_output.clear_output()

    with table_output:
        display(data_table.DataTable(
            pca.loc[
                np.concatenate([new_indices, training_indices]),
                [c for c in pca.columns if not c.startswith("pc")]
            ],
            include_index=False,
            num_rows_per_page=10,
        ))


def on_render(b):
    """Redraw the plot using the currently selected axes and nomenclature.

    Args:
        b: Argument supplied by the button click callback; unused.
    """
    plot_it()


def plot_it():
    """Draw the scatterplot and its clickable legend into ``plot_output``.

    The axes and the nomenclature are read from the selector widgets and the
    data from the module-level ``pca`` table. Any earlier plot in
    ``plot_output`` is cleared before the new one is shown.
    """
    scheme = nomenclature_select.value
    x_field = xaxis_select.value
    y_field = yaxis_select.value

    # NOTE(review): ``selection_multi`` / ``add_selection`` appear to be
    # deprecated in newer Altair releases in favour of ``selection_point`` /
    # ``add_params`` -- confirm against the installed Altair version.
    picked = alt.selection_multi(fields=[scheme])

    # Selected groups keep their own colour; everything else is greyed out.
    fill = alt.condition(
        picked,
        alt.Color(f'{scheme}:N', legend=None),
        alt.value('lightgray')
    )

    points = alt.Chart(pca).mark_circle(size=50)
    points = points.encode(
        x=f"{x_field}:Q",
        y=f"{y_field}:Q",
        color=fill,
        tooltip=["label:N"]
    )
    points = points.properties(width=800, height=500).interactive()

    # "user" is placed first in the legend, ahead of the entries that
    # ``nomenclatures()`` provides for the chosen scheme.
    legend_order = ["user"] + nomenclatures()[scheme]

    # The legend is a second small chart; the selection is attached to it.
    key = alt.Chart(pca).mark_circle(size=100)
    key = key.encode(
        y=alt.Y(
            f'{scheme}:O',
            axis=alt.Axis(orient='right'),
            sort=legend_order
        ),
        color=fill
    )
    key = key.properties(width=50, height=200).add_selection(picked)

    plot_output.clear_output()

    combined = (points | key).configure_axis(
        labelFontSize=14,
        titleFontSize=16,
    )
    with plot_output:
        display(combined)
    return


def on_run(change):
    """Run the pipeline on the selected fastas and display the results.

    Output produced while running is captured in ``run_output``. Once the
    pipeline has finished, the results are loaded into the module-level
    ``pca`` table, the Download button is enabled, and the results table,
    plot controls and plot are displayed.

    Args:
        change: Argument supplied by the button click callback; unused.
    """
    global pca

    run_output.clear_output()
    selected = fasta_select.get_interact_value()
    run_fastas = [pjoin("fastas", s) for s in selected]

    with run_output:
        pipeline(
            infiles=run_fastas,
            version=version.value,
            outdir="results",
            hmms=None,
            hmmscan_path=shutil.which("hmmscan"),
            hmmpress_path=shutil.which("hmmpress"),
            ncpu=-1,
            correct=False,
            quiet=False,
        )

        # Load the results only after the pipeline has written them: reading
        # beforehand fails on a first run (no results file yet) and shows the
        # previous run's data afterwards. This stays inside the ``with``
        # block so it is skipped if the pipeline raises.
        pca = get_results_df()
        # There is something to download only once results exist.
        download_button.disabled = False

        render_button.on_click(on_render)
        selectors = widgets.HBox([
            nomenclature_select,
            xaxis_select,
            yaxis_select
        ])
        layout = widgets.VBox([
            table_output,
            selectors,
            render_button,
            plot_output,
        ])
        display(layout)
        display_results_df(pca)
        plot_it()
    return


def on_download(change):
    """Zip the ``results`` folder under a timestamped name and download it.

    The archive base name (without ``.zip``) is printed before the download
    is started.

    Args:
        change: Argument supplied by the button click callback; unused.
    """
    from datetime import datetime

    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    basename = f"{stamp}-catastrophy_results"

    shutil.make_archive(basename, "zip", "results")
    print(basename)
    files.download(f"{basename}.zip")


# Keep the Run button's enabled state in sync with the file selector.
fasta_select.observe(on_select_value_change)

# Hook each button up to its handler.
download_button.on_click(on_download)
run_button.on_click(on_run)
refresh_button.on_click(on_refresh)

# Lay out the controls: the button row on top, then the file selector, the
# version dropdown and the area that receives the pipeline output.
buttons = widgets.HBox([refresh_button, run_button, download_button])
vbox = widgets.VBox([buttons, fasta_select, version, run_output])
display(vbox)