## Preamble

### Project Template

In [None]:
# Load the IPython autoreload extension.
# NOTE(review): no `%autoreload <mode>` call is visible in this section —
# confirm whether a reload mode is selected elsewhere.
%load_ext autoreload

In [None]:
# Aliased with a leading underscore; the plain `os` name is imported
# separately in the imports cell below.
import os as _os

# Make the project root (taken from the PROJECT_ROOT environment variable;
# raises KeyError when it is unset) the working directory, so the relative
# data/ and ref/ paths used below resolve.
_os.chdir(_os.environ["PROJECT_ROOT"])
# Display the resolved working directory as the cell output.
_os.path.realpath(_os.path.curdir)

### Imports

In [None]:
import os
import subprocess
import sys
import time
from datetime import datetime
from glob import glob
from itertools import chain, product
from tempfile import mkstemp

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import sfacts as sf
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xarray as xr
from mpl_toolkits.axes_grid1 import make_axes_locatable
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm

import lib.plot
from lib.dissimilarity import load_dmat_as_pickle
from lib.pandas_util import align_indexes, aligned_index, idxwhere, invert_mapping

In [None]:
import lib.thisproject.data

### Set Style

In [None]:
# Plot appearance used by every figure in this notebook: seaborn's "talk"
# context and a figure resolution of 100 dpi.
sns.set_context(context="talk")
plt.rcParams.update({"figure.dpi": 100})

## Load Metadata

In [None]:
# File paths
# All inputs for the first half of the notebook. Every path except
# `pdist_inpath` is derived from `species`.
species = "103694"
# Gene-cluster assignments (read below as columns centroid_75, cluster).
c75_clust_inpath = f"data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.gene_clust-t10.tsv"
# Per-gene table from the reference pangenome (indexed by gene_id below).
gene_family_inpath = f"ref/midasdb_uhgg_v15/pangenomes/{species}/gene_info.txt"
# Per-gene annotation tables for phage, AMR and plasmid hits.
gene_phage_inpath = f"ref/midasdb_uhgg_v15/pangenomes/{species}/genomad_virus.tsv"
gene_amr_inpath = f"ref/midasdb_uhgg_v15/pangenomes/{species}/resfinder.tsv"
gene_plasmid_inpath = f"ref/midasdb_uhgg_v15/pangenomes/{species}/genomad_plasmid.tsv"
# Per-gene Moran's I statistic (read below as columns centroid_75, stat).
morans_inpath = f"data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.morans_i.tsv"

# NOTE(review): this path is pinned to sp-102506 and does not follow
# `species`; the trailing comment keeps an f-string variant that does —
# confirm which one is intended.
pdist_inpath = 'data/group/hmp2/species/sp-102506/r.proc.gtpro.filt-poly05-cvrg05.ss-g10000-block0-seed0.fit-sfacts37-s85-seed0.spgc_ss-all.geno_uhgg-v15_pdist-mask10-pseudo10.pkl'  # f"data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.spgc_ss-all.geno_uhgg-v15_pdist-mask10-pseudo10.pkl"
# Strain-by-gene table (indexed by gene_id when loaded below).
strain_gene_inpath = f"data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.tsv"

In [None]:
# Display the resolved Moran's I path as a sanity check.
morans_inpath

In [None]:
c75_by_clust = pd.read_table(c75_clust_inpath, names=["centroid_75", "cluster"])
c75_by_clust

In [None]:
c75_by_clust.cluster.value_counts().head(20)

In [None]:
gene_family = pd.read_table(gene_family_inpath, index_col="gene_id")
centroid75_num_members = gene_family.centroid_75.value_counts()
gene_family

In [None]:
len(set(gene_family.centroid_75))

### AMR Annotations

In [None]:
amr_annotations = pd.read_table(gene_amr_inpath, index_col=["centroid_99"]).rename_axis(
    index="gene_id"
)

In [None]:
gene_by_amr = amr_annotations.accession_no.reset_index().drop_duplicates()
gene_by_amr

In [None]:
len(gene_by_amr.gene_id.unique())

In [None]:
amr_annotations.loc[list(set(gene_by_amr.gene_id) - set(gene_family.index))]

In [None]:
gene_by_amr

In [None]:
c75_by_amr_votes = (
    gene_by_amr.join(
        gene_family.centroid_75.reindex(gene_by_amr.gene_id.unique()), on="gene_id"
    )[["centroid_75", "accession_no"]].value_counts()
    / centroid75_num_members
)
plt.hist(c75_by_amr_votes)

In [None]:
c75_by_amr = c75_by_amr_votes[lambda x: x >= 0.5].reset_index()[
    ["centroid_75", "accession_no"]
]
c75_by_amr

In [None]:
c75_is_amr_list = c75_by_amr.centroid_75.unique()
len(c75_is_amr_list)

### Phage Annotations

In [None]:
phage_annotations = pd.read_table(
    gene_phage_inpath, index_col="centroid_99"
).rename_axis("gene_id")

In [None]:
gene_by_phage = (
    phage_annotations[lambda x: x.annotation_accessions != "."]
    .annotation_accessions.str.split(";")
    .explode()
    .reset_index()
    .drop_duplicates()
)

In [None]:
# Features annotated as phage genes that don't show up in gene_info.txt (seems to be because they're <200 bp)
phage_annotations.loc[list(set(gene_by_phage.gene_id) - set(gene_family.index))]

In [None]:
c75_by_phage_votes = (
    gene_by_phage.join(
        gene_family.centroid_75.reindex(gene_by_phage.gene_id.unique()), on="gene_id"
    )[["centroid_75", "annotation_accessions"]].value_counts()
    / centroid75_num_members
)
plt.hist(c75_by_phage_votes)

In [None]:
c75_by_phage = c75_by_phage_votes[lambda x: x >= 0.5].reset_index()[
    ["centroid_75", "annotation_accessions"]
]
c75_by_phage

In [None]:
c75_is_phage_list = c75_by_phage.centroid_75.unique()
c75_is_phage_list

### Plasmid Annotations

In [None]:
plasmid_annotations = pd.read_table(
    gene_plasmid_inpath, index_col="centroid_99"
).rename_axis("gene_id")

In [None]:
plasmid_annotations.shape

In [None]:
gene_by_plasmid = (
    plasmid_annotations[lambda x: x.annotation_accessions != "."]
    .annotation_accessions.str.split(";")
    .explode()
    .reset_index()
    .drop_duplicates()
)

In [None]:
gene_by_plasmid.shape

In [None]:
# Features annotated as plasmid genes that don't show up in gene_info.txt (seems to be because they're <200 bp)
plasmid_annotations.loc[list(set(gene_by_plasmid.gene_id) - set(gene_family.index))]

In [None]:
c75_by_plasmid_votes = (
    gene_by_plasmid.join(
        gene_family.centroid_75.reindex(gene_by_plasmid.gene_id.unique()), on="gene_id"
    )[["centroid_75", "annotation_accessions"]].value_counts()
    / centroid75_num_members
)
plt.hist(c75_by_plasmid_votes, bins=100)
None

In [None]:
c75_by_plasmid = c75_by_plasmid_votes[lambda x: x >= 0.5].reset_index()[
    ["centroid_75", "annotation_accessions"]
]
c75_by_plasmid

In [None]:
c75_is_plasmid_list = c75_by_plasmid.centroid_75.unique()

c75_is_plasmid_list, len(c75_is_plasmid_list)

## Analyze Clusters

In [None]:
# Number of 75% centroids assigned to each cluster label.
clust_size = c75_by_clust.cluster.value_counts()

# Clusters with at least five members, excluding the negative labels -1..-4.
# `errors="ignore"` keeps this working when some of those labels are absent
# from the cluster table; a plain drop raises KeyError in that case.
# NOTE(review): the negative labels are presumably "unclustered" codes —
# confirm against the clustering step.
large_cluster_list = idxwhere(
    clust_size.drop([-1, -2, -3, -4], errors="ignore") >= 5
)
len(large_cluster_list)

In [None]:
# Per cluster: how many 75% centroids are / are not flagged as phage, and the
# flagged fraction, sorted from most to least phage-rich.
clust_phage_info = (
    c75_by_clust.assign(
        is_phage=lambda x: x.centroid_75.isin(c75_is_phage_list),
    )[["cluster", "is_phage"]]
    .value_counts()
    .unstack("is_phage", fill_value=0)
    # Guarantee both the False and True columns exist: when no centroid (or
    # every centroid) is flagged, unstack yields a single column and the
    # fraction below would fail on the missing one.
    .reindex(columns=[False, True], fill_value=0)
    .rename(columns={True: "count_phage_genes", False: "count_not_phage_genes"})
    .assign(frac_phage_genes=lambda x: x.count_phage_genes / x.sum(1))
    .sort_values("frac_phage_genes", ascending=False)
)

# Distribution of the phage fraction across large clusters; the first bin edge
# sits just above zero, so clusters with no phage genes are left out.
plt.hist(
    clust_phage_info.loc[large_cluster_list].frac_phage_genes,
    bins=np.linspace(1e-5, 1, num=20),
)

In [None]:
# Per cluster: how many 75% centroids are / are not flagged as plasmid, and
# the flagged fraction, sorted from most to least plasmid-rich.
clust_plasmid_info = (
    c75_by_clust.assign(
        is_plasmid=lambda x: x.centroid_75.isin(c75_is_plasmid_list),
    )[["cluster", "is_plasmid"]]
    .value_counts()
    .unstack("is_plasmid", fill_value=0)
    # Guarantee both the False and True columns exist: when no centroid (or
    # every centroid) is flagged, unstack yields a single column and the
    # fraction below would fail on the missing one.
    .reindex(columns=[False, True], fill_value=0)
    .rename(columns={True: "count_plasmid_genes", False: "count_not_plasmid_genes"})
    .assign(frac_plasmid_genes=lambda x: x.count_plasmid_genes / x.sum(1))
    .sort_values("frac_plasmid_genes", ascending=False)
)

# Distribution of the plasmid fraction across large clusters; the first bin
# edge sits just above zero, so clusters with no plasmid genes are left out.
plt.hist(
    clust_plasmid_info.loc[large_cluster_list].frac_plasmid_genes,
    bins=np.linspace(1e-5, 1, num=20),
)

In [None]:
# Per cluster: how many 75% centroids are / are not flagged as AMR, and the
# flagged fraction, sorted from most to least AMR-rich.
clust_amr_info = (
    c75_by_clust.assign(
        is_amr=lambda x: x.centroid_75.isin(c75_is_amr_list),
    )[["cluster", "is_amr"]]
    .value_counts()
    .unstack("is_amr", fill_value=0)
    # Guarantee both the False and True columns exist: when the AMR list is
    # empty, unstack yields only the False column and the fraction below
    # would fail on the missing `count_amr_genes`.
    .reindex(columns=[False, True], fill_value=0)
    .rename(columns={True: "count_amr_genes", False: "count_not_amr_genes"})
    .assign(frac_amr_genes=lambda x: x.count_amr_genes / x.sum(1))
    .sort_values("frac_amr_genes", ascending=False)
)

clust_amr_info

In [None]:
clust_amr_info.head(10)

In [None]:
clust_phage_info.loc[[198, 1855, 2199, 1925, 1868, 1869]]

In [None]:
clust_phage_info.join(clust_amr_info).join(clust_plasmid_info).assign(
    clust_size=clust_size
).sort_values("frac_amr_genes", ascending=False).head(10)

In [None]:
d = (
    clust_phage_info.join(clust_amr_info)
    .join(clust_plasmid_info)
    .assign(clust_size=clust_size)
)
plt.hist(d[d.clust_size >= 5].frac_plasmid_genes, bins=20)

In [None]:
d.sort_values("count_plasmid_genes", ascending=False).head(20)

In [None]:
# Plasmid vs. non-plasmid gene totals, split by whether the cluster is a
# singleton. The negative labels -1..-4 are excluded first;
# `errors="ignore"` avoids a KeyError when some of them are not present.
d.drop([-1, -2, -3, -4], errors="ignore").assign(
    singleton=lambda x: x.clust_size == 1
).groupby("singleton")[["count_not_plasmid_genes", "count_plasmid_genes"]].sum()

In [None]:
morans_i = pd.read_table(
    morans_inpath, names=["centroid_75", "stat"], index_col="centroid_75"
).stat
morans_i

In [None]:
plt.hist(morans_i.drop(c75_is_plasmid_list), bins=100)
plt.hist(morans_i.loc[c75_is_plasmid_list], bins=100)
plt.yscale("log")
None

In [None]:
# Per-gene table for centroids with a non-negative cluster label, annotated
# with Moran's I and the three mobile-element / AMR flags.
_clustered = c75_by_clust.set_index("centroid_75")
_clustered = _clustered[_clustered["cluster"] >= 0]
d = _clustered.assign(
    morans_i=morans_i,
    is_phage=_clustered.index.isin(c75_is_phage_list),
    is_amr=_clustered.index.isin(c75_is_amr_list),
    is_plasmid=_clustered.index.isin(c75_is_plasmid_list),
)

# Top 20 rows by Moran's I, ties broken by the plasmid flag (both descending).
d.sort_values(by=['morans_i', 'is_plasmid'], ascending=False).head(20)

### Cluster 2894

In [None]:
# Centroids assigned to cluster 2894, and their rows in the per-gene table.
_cluster_by_c75 = c75_by_clust.set_index('centroid_75')['cluster']
clust_2894_gene_list = idxwhere(_cluster_by_c75 == 2894)

d.loc[clust_2894_gene_list]

In [None]:
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform


geno_dmat = lib.dissimilarity.load_dmat_as_pickle(pdist_inpath)
geno_linkage = linkage(squareform(geno_dmat), optimal_ordering=True)
strain_gene = pd.read_table(strain_gene_inpath, index_col='gene_id')

In [None]:
ref_strain_gene = pd.read_table("data/species/sp-102506/midasdb.gene75_v15.uhgg-strain_gene.tsv", index_col='gene_id')

In [None]:
# Imported here as well so this cell does not depend on an earlier cell
# for the name.
from scipy.spatial.distance import squareform

# Restrict the genotype dissimilarity matrix to the strains present in
# `strain_gene`, then cluster them. `linkage` treats a 2-D input as a set of
# observation vectors, so the square distance matrix must be converted to
# condensed form with `squareform` first (as done for `geno_linkage`).
_strain_geno_dmat = geno_dmat.loc[strain_gene.columns, strain_gene.columns]
strain_geno_linkage = linkage(squareform(_strain_geno_dmat), optimal_ordering=True)
# Gene content of cluster 2894 with strains ordered by genotype similarity.
sns.clustermap(strain_gene.loc[clust_2894_gene_list], col_linkage=strain_geno_linkage)

In [None]:
# Cluster-2894 gene content for the reference strains and the inferred
# strains, concatenated column-wise and drawn with the genotype linkage.
# NOTE(review): `geno_linkage` was computed from the full `geno_dmat`;
# presumably the concatenated columns match its strains in number and order —
# confirm.
sns.clustermap(pd.concat([ref_strain_gene.loc[clust_2894_gene_list], strain_gene.loc[clust_2894_gene_list]], axis=1), col_linkage=geno_linkage)

In [None]:
# Switch the notebook to a different species. This rebinds `species` and
# overwrites `gene_family_inpath`, `pdist_inpath` and `strain_gene_inpath`
# defined in the first file-paths cell.
species = '102272'

# Per-centroid annotation tables (loaded in the cells below).
cog_category_inpath = f'data/species/sp-{species}/midasdb_v15.emapper.gene75_x_cog_category.tsv'
eggnog_inpath = f'data/species/sp-{species}/midasdb_v15.emapper.gene75_x_eggnog.tsv'
ko_inpath = f'data/species/sp-{species}/midasdb_v15.emapper.gene75_x_ko.tsv'
amr_inpath = f'data/species/sp-{species}/midasdb_v15.gene75_x_amr.tsv'
plasmid_inpath = f'data/species/sp-{species}/midasdb_v15.gene75_x_genomad_plasmid.tsv'
phage_inpath = f'data/species/sp-{species}/midasdb_v15.gene75_x_genomad_virus.tsv'
# Annotation table read below with '#query' as its index column.
emapper_inpath = f'ref/midasdb_uhgg_v15/pangenomes/{species}/eggnog.tsv'
# Gene-cluster assignments, pangenome gene table, and per-gene statistics.
clust_inpath = f'data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.gene_clust-t10.tsv'
gene_family_inpath = f'ref/midasdb_uhgg_v15/pangenomes/{species}/gene_info.txt'
morans_i_inpath = f'data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.morans_i.tsv'
prevalence_inpath = f'data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.prevalence.tsv'

# Genotype dissimilarity matrix and strain-by-gene table; unlike the first
# file-paths cell, both follow `species` here.
pdist_inpath = f'data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.spgc_ss-all.geno_uhgg-v15_pdist-mask10-pseudo10.pkl'
strain_gene_inpath = f"data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.tsv"

In [None]:
gene_family = pd.read_table(gene_family_inpath, index_col="gene_id")

In [None]:
emapper_all = pd.read_table(emapper_inpath, index_col='#query').rename_axis('centroid_99')
description = emapper_all.reindex(gene_family.centroid_75.unique()).dropna(subset=['seed_ortholog']).rename_axis('centroid_75')

In [None]:
phage = pd.read_table(phage_inpath)#.groupby('centroid_75')
plasmid = pd.read_table(plasmid_inpath)
amr = pd.read_table(amr_inpath)

clust = pd.read_table(clust_inpath, names=['centroid_75', 'clust'], index_col='centroid_75').clust
clust_size = clust.value_counts()
clust = clust[lambda x: (x >= 0) & x.isin(idxwhere(clust_size > 1))]

morans_i = pd.read_table(morans_i_inpath, names=['centroid_75', 'morans_i'], index_col='centroid_75').morans_i.dropna()
prevalence = pd.read_table(prevalence_inpath, names=['centroid_75', 'prevalence'], index_col='centroid_75').prevalence

In [None]:
eggnog = pd.read_table(eggnog_inpath)
cog_category = pd.read_table(cog_category_inpath)

In [None]:
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

strain_gene = pd.read_table(strain_gene_inpath, index_col='gene_id')
geno_dmat = lib.dissimilarity.load_dmat_as_pickle(pdist_inpath).loc[strain_gene.columns, strain_gene.columns]
geno_linkage = linkage(squareform(geno_dmat), optimal_ordering=False)

In [None]:
plt.hist(morans_i, bins=100)
None

In [None]:
# One row per 75% centroid: name/description plus Moran's I, functional
# categories, cluster label and size, mobile-element / AMR accessions, and
# prevalence. Multiple annotations per centroid are joined into one string.
d = (
    description[['Preferred_name', 'Description']]
    .reindex(gene_family.centroid_75.unique())
    .assign(morans_i=morans_i)
    .assign(
        cog_category=cog_category.groupby('centroid_75').cog_category.apply(''.join),
        eggnog=eggnog.groupby('centroid_75').eggnog.apply(';'.join),
        clust=clust,
        # `clust` must precede `csize`: the lambda reads the column assigned
        # just above.
        csize=lambda x: x.clust.map(clust_size),
        plasmid=plasmid.groupby('centroid_75').annotation_accessions.apply(';'.join),
        phage=phage.groupby('centroid_75').annotation_accessions.apply(';'.join),
        amr=amr.groupby('centroid_75').accession_no.apply(';'.join),
        prevalence=prevalence,
    )
)

# Clustered genes of intermediate prevalence, ranked by Moran's I; shows the
# last 50 of the top 100. The filter uses the frame's own `prevalence` column
# rather than the global Series, whose index is not aligned with `d`.
d[
    lambda x: x.clust.notna() & (x.prevalence > 0.1) & (x.prevalence < 0.9)
].sort_values('morans_i', ascending=False).head(100).tail(50)

In [None]:
d[lambda x: x.clust == 3669.0].sort_values('prevalence')

In [None]:
emapper_all[['contig_id', 'start', 'end', 'strand']].loc[['GUT_GENOME002517_01711', 'GUT_GENOME002517_01829', 'GUT_GENOME002517_01830', 'GUT_GENOME002517_01980']]

In [None]:
gene_family

In [None]:
gene_family[lambda x: x.centroid_75.isin(idxwhere(clust == 3669))].assign(
    genome_id=lambda x: x.index.to_series().str.rsplit("_", n=1).str[0]
)[["genome_id", "centroid_75"]].reset_index().groupby(
    ["genome_id", "centroid_75"]
).gene_id.apply(
    ",".join
).unstack(
    "centroid_75", fill_value=""
).T.sort_values(
    "GUT_GENOME005078"
)

In [None]:
sns.clustermap(strain_gene.loc[idxwhere(clust == 3669.0)], col_linkage=geno_linkage)

In [None]:
# Rebuild the per-centroid table (annotated centroids only) with the
# mobile-element / AMR accessions, cluster label, Moran's I and prevalence.
# The accession column is selected explicitly before joining, matching the
# earlier annotated table: aggregating the whole grouped frame returns a
# DataFrame, which cannot be assigned to a single column once the table has
# more than one non-key column.
d = description[['Description', 'Preferred_name']].assign(
    phage=phage.groupby('centroid_75').annotation_accessions.apply(';'.join),
    plasmid=plasmid.groupby('centroid_75').annotation_accessions.apply(';'.join),
    amr=amr.groupby('centroid_75').accession_no.apply(';'.join),
    clust=clust,
    morans_i=morans_i,
    prevalence=prevalence,
)

# Most prevalent genes that carry an AMR annotation.
d.dropna(subset=['amr']).sort_values('prevalence', ascending=False).head(20)

In [None]:
geno_dmat

In [None]:
plt.hist(d.prevalence)
plt.yscale('log')

In [None]:
# Strain-by-gene table for the current species.
# NOTE(review): despite its `_inpath` suffix, this name is bound to the loaded
# table, not to a path.
spgc_gene_content_inpath = pd.read_csv(
    f'data/group/hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_v15-v22-agg75.spgc-fit.uhgg-strain_gene.tsv',
    sep='\t',
    index_col='gene_id',
)
# Gene content of cluster 2443 across strains, clustered on both axes.
_clust_2443_gene_list = idxwhere(clust == 2443)
x = spgc_gene_content_inpath.loc[_clust_2443_gene_list]
sns.clustermap(x)

In [None]:
# Count, per genome, how many member genes fall in each centroid of cluster
# 1925. The result gets its own name instead of rebinding `d`: the cells that
# follow still read the `plasmid`, `phage` and `clust` columns of the
# annotated per-centroid table held in `d`.
# NOTE(review): the genome id here is the second "_"-separated field of the
# gene id, whereas the cluster-3669 cell strips only the last field — confirm
# which form is wanted.
genome_by_c75_count = (
    gene_family[lambda x: x.centroid_75.isin(idxwhere(clust == 1925))]
    .assign(genome_id=lambda x: x.index.to_series().str.split('_').str[1])[
        ['genome_id', 'centroid_75']
    ]
    .value_counts()
    .unstack('centroid_75', fill_value=0)
)

sns.clustermap(genome_by_c75_count)

In [None]:
d.assign(is_plasmid=lambda x: ~x.plasmid.isna())[
    ["clust", "is_plasmid"]
].value_counts().unstack("is_plasmid", fill_value=0).rename(
    index=int, columns={True: "count_plasmid", False: "count_not_plasmid"}
).sort_values(
    "count_plasmid", ascending=False
).head(
    20
)

In [None]:
d.loc[lambda x: x.clust == 4782]

In [None]:
x = 'morans_i'

phage_x = d[~d.phage.isna()][x].dropna()
plasmid_x = d[~d.plasmid.isna()][x].dropna()
mobile_x = d[~d.phage.isna() | ~d.plasmid.isna()][x].dropna()
not_mobile_x = d[d.phage.isna() & d.plasmid.isna()][x].dropna()

bins = np.linspace(np.min(d[x]), np.max(d[x]), num=10)
plt.hist(not_mobile_x, bins=bins, alpha=0.7, density=True)
plt.hist(plasmid_x, bins=bins, alpha=0.5, density=True)
plt.hist(phage_x, bins=bins, alpha=0.5, density=True)
plt.yscale('log')
sp.stats.mannwhitneyu(mobile_x, not_mobile_x)

In [None]:
plt.scatter('prevalence', 'morans_i', data=d.dropna(subset=['prevalence', 'morans_i']), s=1)