# Counting CAZymes in preparation for PCA.

The next step after running HMMER and the dbCAN script is to count the number of occurrences of each CAZyme.
We'll use exactly the same counting functions and parsers to do this as we will use in the distributed version of CATAStrophy.

In [1]:
try:
    from importlib import reload
except ImportError:
    # We're probably using python2
    pass

import os
from os import listdir
from os.path import join as pjoin
from os import makedirs

from collections import defaultdict
import datetime

import re

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Labels of the dbCAN database releases processed in this notebook.
VERSIONS = ["v4", "v5", "v6", "v7"]

# Date stamp embedded in the names of the data files written and read below.
# It is pinned to a fixed value (rather than taken from the clock) so that
# re-running the notebook keeps addressing the same files.
TODAY = "20190311"

The first thing to do is to grab the HMM names and order them so that we can make sure the model works for user input (if the columns are all jumbled up we'll get the wrong answer).

In [4]:
# Splits a "NAME <model>" record into its whitespace-separated fields.
regex = re.compile(r"\s+")
# Removes a trailing ".hmm" from the record before it is split.
# NOTE(review): this does not match ".phmm", so names such as "AA14.phmm"
# keep their suffix -- confirm that is intended.
hmm_regex = re.compile(r"\.hmm$")

# dbCAN HMM database file for each release label.
VERSIONS = {
    "v4": "data/dbCAN-fam-HMMs.txt.v4",
    "v5": "data/dbCAN-fam-HMMs.txt.v5",
    "v6": "data/dbCAN-fam-HMMs.txt.v6",
    "v7": "data/dbCAN-HMMdb-V7.txt",
}

# version label -> list of model names found in that database file.
hmms = defaultdict(list)

for version, db in VERSIONS.items():
    with open(db, "r") as handle:
        # Only the lines starting with "NAME" carry a model name.
        name_records = (raw.strip() for raw in handle if raw.startswith("NAME"))
        for record in name_records:
            fields = regex.split(hmm_regex.sub("", record, count=1))
            # Field 0 is the "NAME" tag itself; field 1 is the model name.
            hmms[version].append(fields[1])

# Plain string sort, so "AA10" orders before "AA2".
for version in VERSIONS:
    hmms[version] = sorted(hmms[version])
hmms["v5"][:5]

['AA1', 'AA10', 'AA11', 'AA12', 'AA13']

Here we have all of the possible HMM/CAZY names from the dbCAN database in a list.
I'll save this in a JSON file in the CATAS data directory to reuse later.

In [5]:
import json

# Store each version's sorted model-name list as JSON in the CATAS data
# directory, with the version label and date stamp in the file name.
for version in VERSIONS:
    out_path = "../catas/data/{}-{}-cazy_list.json".format(version, TODAY)
    with open(out_path, "w") as handle:
        json.dump(hmms[version], handle)

The next thing to do is count the occurrences of each hmm in each dbCAN result file.
We have some functions in the CATAS module to do most of this for us.

In [6]:
# Used further down to drop the extension from the species file names.
from os.path import splitext

from catas import count
# Reload catas.count in case the module has changed since it was first
# imported in this session. `reload` comes from importlib when available
# (see the first cell); otherwise the Python 2 builtin is used.
reload(count)
# Parser entry point and the enum naming the supported input formats.
from catas.parsers import FileType
from catas.parsers import parse
# Builds one count matrix from several parsed result files.
from catas.count import cazy_counts_multi
from catas.data import cazy_list
from catas.data import Version

`cazy_counts` loops through the file and returns a list of tuples containing counts.

Just to keep things pleasant, we'll also use the filename (sans extension) as the label for the row.

First, though, I'd like to see how fast each of the parsers is.

In [7]:
%%timeit -n 5 -r 3
# Time the parser on the HMMER domain-table output of one sample.
# NOTE(review): if `parse` returns a lazy iterator (a later cell wraps its
# result in `list(...)`), this times only the creation of that iterator and
# not the parsing itself -- confirm before comparing the numbers below.
with open("01-run_hmms/v5/Abisporus_varbisporusH97.v2.FilteredModels3.proteins_hmmer.csv") as handle:
    parse(handle, format=FileType.hmmer_domtab, version="v5")

The slowest run took 8.88 times longer than the fastest. This could mean that an intermediate result is being cached.
66.2 µs ± 67.6 µs per loop (mean ± std. dev. of 3 runs, 5 loops each)


In [8]:
%%timeit -n 5 -r 3
# Time the parser on the plain-text HMMER output of the same sample.
# NOTE(review): if `parse` returns a lazy iterator (a later cell wraps its
# result in `list(...)`), this times only the creation of that iterator and
# not the parsing itself -- confirm before comparing the numbers below.
with open("01-run_hmms/v5/Abisporus_varbisporusH97.v2.FilteredModels3.proteins_hmmer.txt") as handle:
    parse(handle, format=FileType.hmmer_text, version="v5")

The slowest run took 5.66 times longer than the fastest. This could mean that an intermediate result is being cached.
43.9 µs ± 35 µs per loop (mean ± std. dev. of 3 runs, 5 loops each)


In [9]:
%%timeit -n 5 -r 3
# Time the parser on the dbCAN-processed output of the same sample.
# NOTE(review): if `parse` returns a lazy iterator (a later cell wraps its
# result in `list(...)`), this times only the creation of that iterator and
# not the parsing itself -- confirm before comparing the numbers below.
with open("01-run_hmms/v5/Abisporus_varbisporusH97.v2.FilteredModels3.proteins_hmmer_dbcan.csv") as handle:
    parse(handle, format=FileType.dbcan, version="v5")

The slowest run took 4.43 times longer than the fastest. This could mean that an intermediate result is being cached.
60.1 µs ± 45.2 µs per loop (mean ± std. dev. of 3 runs, 5 loops each)


OK so clearly our reimplementation of the dbcan parser with the hmmer text output is a bit slow.
I think that the domain table processing is probably on par with the dbcan output when you consider that the dbcan input is already processed.

The bottom line is, if you're going to run CATAStrophy on a lot of proteomes, you should probably use the domain tables or the dbcan processed output.

OK, so let's find the counts of CAZymes that we found.
I'll use the dbcan output because it's quicker.

In [10]:
import re

# Suffix removed from each result file name to obtain its row label. The dot
# is escaped and the pattern anchored so that only a literal, trailing
# "_hmmer_dbcan.csv" is stripped.
regex = re.compile(r"_hmmer_dbcan\.csv$")

# version label -> count matrix returned by cazy_counts_multi.
dfs = dict()

for version in VERSIONS:
    files = [f for f in listdir(pjoin("01-run_hmms", version)) if f.endswith("_dbcan.csv")]
    labels = [regex.sub("", f) for f in files]

    # Bind `handles` before the `try` and fill it one file at a time. If an
    # `open` fails part-way through, the `finally` clause then closes exactly
    # the files opened for this version, instead of hitting an unbound name
    # or the previous version's handles.
    handles = []
    try:
        for f in files:
            handles.append(open(pjoin("01-run_hmms", version, f), "r"))

        parsed = [parse(h, format=FileType.dbcan, version=version) for h in handles]

        # The files are kept open until counting has finished.
        # NOTE(review): presumably because `parse` reads lazily -- confirm.
        cnts = cazy_counts_multi(parsed, labels=labels, required_cols=hmms[version])
        dfs[version] = cnts
    finally:
        for h in handles:
            h.close()
dfs["v5"]

<catas.matrix.Matrix at 0x7f8f96cc7b00>

In [11]:
# Preview the first rows of the v4 counts via the matrix's `as_df()` method.
dfs["v4"].as_df().head()

Unnamed: 0,AA1,AA10,AA11,AA12,AA13,AA2,AA3,AA4,AA5,AA6,...,PL3,PL4,PL5,PL6,PL7,PL8,PL9,SLH,cohesin,dockerin
Sacce1_GeneCatalog_proteins_20101210.aa,1,0,0,0,0,1,0,3,0,3,...,0,0,0,0,0,0,0,0,0,0
pythium_ultimum_proteins,0,0,0,0,0,2,3,3,0,9,...,15,2,0,0,0,0,0,2,0,1
Sporisorium_reilianum.GCA_000230245.1.27.pep.all,4,1,0,0,0,3,9,1,3,1,...,0,0,0,0,0,0,0,0,0,0
Macph1_GeneCatalog_proteins_20131211.aa,3,0,8,0,0,9,34,4,2,1,...,9,6,0,0,0,0,1,0,0,0
Clafu1_GeneCatalog_proteins_20110826.aa,4,0,5,2,0,5,21,1,3,1,...,3,2,0,0,1,0,0,0,0,0


We'll just convert the matrices to dataframes for now.

In [12]:
# Replace every count matrix in `dfs` with the result of its `as_df()` call,
# naming the row index "label". The items are snapshotted first because the
# dict is updated inside the loop.
for version, matrix in list(dfs.items()):
    df = matrix.as_df()
    df.index.name = "label"
    dfs[version] = df
dfs["v4"].head()

Unnamed: 0_level_0,AA1,AA10,AA11,AA12,AA13,AA2,AA3,AA4,AA5,AA6,...,PL3,PL4,PL5,PL6,PL7,PL8,PL9,SLH,cohesin,dockerin
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sacce1_GeneCatalog_proteins_20101210.aa,1,0,0,0,0,1,0,3,0,3,...,0,0,0,0,0,0,0,0,0,0
pythium_ultimum_proteins,0,0,0,0,0,2,3,3,0,9,...,15,2,0,0,0,0,0,2,0,1
Sporisorium_reilianum.GCA_000230245.1.27.pep.all,4,1,0,0,0,3,9,1,3,1,...,0,0,0,0,0,0,0,0,0,0
Macph1_GeneCatalog_proteins_20131211.aa,3,0,8,0,0,9,34,4,2,1,...,9,6,0,0,0,0,1,0,0,0
Clafu1_GeneCatalog_proteins_20110826.aa,4,0,5,2,0,5,21,1,3,1,...,3,2,0,0,1,0,0,0,0,0


Now I'll read the labels from our species file, which contains lifestyle columns as well as proper names.

In [13]:
from os.path import splitext

species = pd.read_csv("20170531-trophic_prediction_fastas/species_for_catastrophy.csv")

# Rows that actually name a file; every other row is dropped below.
has_file = ~species["File"].isnull()

# "BaseFile" is the file name with its final extension removed. It becomes
# the index so the rows can be matched against the labels of the count tables.
base_names = [splitext(name)[0] for name in species.loc[has_file, "File"]]
species.loc[has_file, "BaseFile"] = base_names

species = species[has_file].set_index("BaseFile", drop=True)

# Keep only the species name and the three nomenclature columns.
keep_columns = ["Species", "nomenclature1", "nomenclature2", "nomenclature3"]
species = species[keep_columns]
species[:5]

Unnamed: 0_level_0,Species,nomenclature1,nomenclature2,nomenclature3
BaseFile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
atakeA7993,Aciculosporium take A7993,symbiont,biotroph,biotroph 1
Abisporus_varbisporusH97.v2.FilteredModels3.proteins,Agaricus bisporus,saprotroph,saprotroph,saprotroph
candida_albugo_ncbi,Albugo candida,biotroph,biotroph,biotroph 2
Albugo_laibachii.ENA1.27.pep.all,Albugo laibachii,biotroph,biotroph,biotroph 2
Alternaria_brassicicola_proteins,Alternaria brassicicola,necrotroph,necrotroph,necrotroph - narrow host range


And I merge the two dataframes for ease of use.

In [14]:
# Make sure the output directory for this step exists.
out_dir = "02-count_cazymes"
if not os.path.exists(out_dir):
    makedirs(out_dir)

# version label -> species annotations joined to that version's counts.
counts = dict()
for version in VERSIONS:
    # Index-on-index merge with pandas' default join type, so only labels
    # present in both tables are kept.
    merged = pd.merge(left=species, right=dfs[version], left_index=True, right_index=True)
    # The file-name index is discarded once the rows have been matched.
    merged = merged.reset_index(drop=True)
    counts[version] = merged
    # The table is written tab-separated despite the ".csv" extension.
    merged.to_csv("02-count_cazymes/{}-cazy_counts.csv".format(version), sep="\t", index=False)

# `version` still holds the last key iterated above.
counts[version][:5]

Unnamed: 0,Species,nomenclature1,nomenclature2,nomenclature3,AA1,AA10,AA11,AA12,AA13,AA14.phmm,...,PL8_2,PL8_3,PL9,PL9_1,PL9_2,PL9_3,PL9_4,SLH,cohesin,dockerin
0,Agaricus bisporus,saprotroph,saprotroph,saprotroph,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Albugo laibachii,biotroph,biotroph,biotroph 2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Alternaria brassicicola,necrotroph,necrotroph,necrotroph - narrow host range,0,0,4,3,1,0,...,0,0,0,0,0,1,0,0,0,0
3,Armillaria mellea,necrotroph,necrotroph,necrotroph - broad host range,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Aspergillus fumigatus,saprotroph,saprotroph,saprotroph,0,0,3,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Great! Now we can start the PCA.

## Preparing test data

In [15]:
# Build the reference count matrix for each version's bundled test file.
for version in VERSIONS:
    file_ = "{}-{}-test_dbcan.csv".format(version, TODAY)

    with open(pjoin("../catas/data", file_), "r") as handle:
        # Materialise the records while the file is still open. The version
        # is passed explicitly, as in every other `parse` call in this
        # notebook.
        parsed = list(parse(handle, format=FileType.dbcan, version=version))

    # Count from the parsed records. The previous code passed the raw file
    # handle here, after `list(...)` had already read it to the end.
    cnts = cazy_counts_multi([parsed], labels=[file_], required_cols=hmms[version])

    cnts.write("../catas/data/{}-{}-test_counts.npz".format(version, TODAY))

In [16]:
from catas.matrix import Matrix

# Read the v5 test counts back from disk and show columns 1-4 of the first
# row as a quick check of what was written.
test_counts_path = "../catas/data/v5-{}-test_counts.npz".format(TODAY)
x = Matrix.read(test_counts_path)
x.arr[0, 1:5]

array([0, 0, 0, 0])