## Preamble

In [None]:
%load_ext autoreload

In [None]:
# Switch the working directory to the project root; the data and metadata
# paths used in the cells below are written relative to it.
import os as _os

# Raises KeyError if the PROJECT_ROOT environment variable is not set.
_os.chdir(_os.environ["PROJECT_ROOT"])
# Last expression of the cell: shows the resolved working directory.
_os.path.realpath(_os.path.curdir)

### Imports

In [None]:
import os
import subprocess
import time
from itertools import chain, product
from tempfile import mkstemp
from warnings import filterwarnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import sfacts as sf
import statsmodels.formula.api as smf
import xarray as xr
from mpl_toolkits.axes_grid1 import make_axes_locatable

# from fastcluster import linkage
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist, squareform
from statsmodels.graphics.regressionplots import influence_plot
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm

import lib.plot
import lib.thisproject.data
from lib.pandas_util import align_indexes, aligned_index, idxwhere, invert_mapping

In [None]:
# Use seaborn's "paper" plotting context for the figures in this notebook.
sns.set_context("paper")
# Set matplotlib's default figure resolution to 50 dpi.
plt.rcParams["figure.dpi"] = 50

In [None]:
def _calculate_2tailed_pvalue_from_perm(obs, perms):
    """Two-tailed permutation p-value for an observed statistic.

    Each one-sided p-value is estimated as ``(count + 1) / (n_perms + 1)``,
    where ``count`` is the number of permuted statistics strictly above
    (respectively strictly below) ``obs``. The two-tailed value is twice the
    smaller of the two, capped at 1.

    Parameters
    ----------
    obs : scalar
        Observed value of the test statistic.
    perms : array-like
        Values of the statistic under permutation (the null distribution).

    Returns
    -------
    float
        A p-value in the interval (0, 1].
    """
    # Accept plain sequences as well as arrays / Series.
    perms = np.asarray(perms)
    n_perms = len(perms)
    p_above = ((perms > obs).sum() + 1) / (n_perms + 1)
    p_below = ((perms < obs).sum() + 1) / (n_perms + 1)
    # Doubling the smaller tail can exceed 1 when `obs` sits near the middle
    # of the null distribution, so cap the result at 1.
    return np.minimum(np.minimum(p_above, p_below) * 2, 1.0)

In [None]:
def linkage_order(linkage, labels):
    """Return `labels` reordered to follow the leaves of a clustering tree.

    `linkage` is a SciPy linkage matrix. It is converted to a tree, and the
    leaf ids collected by `pre_order` are used to index `labels`, so `labels`
    must support indexing with a list of integers (e.g. a NumPy array or a
    pandas Index).
    """
    root = sp.cluster.hierarchy.to_tree(linkage)
    leaf_ids = root.pre_order(lambda node: node.id)
    return labels[leaf_ids]


def is_prime(n):
    """Return True if `n` is prime, using trial division.

    Values less than or equal to 1 are never prime. Candidate divisors run
    from 2 up to and including ``int(n**0.5)``.
    """
    if n <= 1:
        return False
    upper = int(n**0.5) + 1
    return all(n % divisor != 0 for divisor in range(2, upper))


def iterate_primes_up_to(n, return_index=False):
    """Yield the primes strictly below ``ceil(n)`` in increasing order.

    Note that the upper bound is exclusive: when `n` is itself an integer
    prime it is not yielded.

    Parameters
    ----------
    n : number
        Upper bound; rounded up to an integer before use.
    return_index : bool
        If True, yield ``(position, prime)`` pairs, where `position` counts
        the primes from 0; otherwise yield the primes alone.
    """
    upper = int(np.ceil(n))
    primes = (candidate for candidate in range(upper) if is_prime(candidate))
    for position, prime in enumerate(primes):
        yield (position, prime) if return_index else prime


def maximally_shuffled_order(sorted_order):
    """Return the items of `sorted_order` in a deterministically shuffled order.

    Each item's original position is reduced modulo every prime yielded by
    ``iterate_primes_up_to(sqrt(n))``. Positions are sorted lexicographically
    by those residues, and the resulting permutation is used as the sort key
    for the items.

    Parameters
    ----------
    sorted_order : sequence
        Labels in their original order; used as the index of a DataFrame.

    Returns
    -------
    list
        The same labels, reordered. When no primes are available (very short
        or empty input) the input order is returned unchanged.
    """
    n = len(sorted_order)
    primes_list = list(iterate_primes_up_to(np.sqrt(n)))
    table = pd.DataFrame(np.arange(n), index=sorted_order, columns=["original_order"])
    if not primes_list:
        # Nothing to sort by: keep the original order instead of depending on
        # how pandas treats an empty list of sort keys.
        return table.index.to_list()
    for prime in primes_list:
        table[prime] = table.original_order % prime
    # Residues contain many ties, so ask for a stable sort explicitly; this
    # keeps the single-key case from depending on the default algorithm's
    # tie-breaking.
    new_order = table.sort_values(primes_list, kind="mergesort").original_order.values
    table = table.assign(new_order=new_order)
    return table.sort_values("new_order").index.to_list()

## Construct Metadata

In [None]:
# Colors keyed by sample-pair category.
pair_type_palette = dict(Transition="plum", EEN="pink", PostEEN="lightblue")

# Colors keyed by diet / medium label.
diet_palette = dict(
    EEN="lightgreen",
    PostEEN="lightblue",
    InVitro="plum",
    PreEEN="lightpink",
)

# Display order of the single-letter subject ids (note that "H" follows "B").
subject_order = list("ABHCDEFGKLMNOPQRSTU")

# NOTE: The subject list is padded with dummy names up to exactly 20 entries
# before the palette is built from the "tab20" colormap.
subject_palette = lib.plot.construct_ordered_palette(
    subject_order + [f"dummy{i}" for i in range(20 - len(subject_order))], cm="tab20"
)
# Extra palette entry for the subject label "X".
subject_palette["X"] = "black"
# Display order, marker shape and line style for each sample-pair category.
pair_type_order = ["EEN", "Transition", "PostEEN"]
pair_type_marker_palette = {"EEN": "s", "Transition": ">", "PostEEN": "o"}
pair_type_linestyle_palette = {"EEN": ":", "Transition": "-.", "PostEEN": "-"}

In [None]:
# Per-sample metadata, indexed by sample_id. The added `label` column holds a
# (collection_date_relative_een_end, diet_or_media, sample_id) tuple per row;
# it is built before `sample_id` is moved into the index.
sample = (
    pd.read_table("meta/een-mgen/sample.tsv")
    .assign(
        label=lambda x: x[
            ["collection_date_relative_een_end", "diet_or_media", "sample_id"]
        ].apply(tuple, axis=1)
    )
    .set_index("sample_id")
)
# Per-subject metadata, indexed by subject_id.
subject = pd.read_table("meta/een-mgen/subject.tsv", index_col="subject_id")

In [None]:
# Load the OTU count table. As read, rows are OTUs ("#OTU ID") and columns are
# samples plus a `taxonomy` column.
rotu_counts = pd.read_table(
    "data/group/een/a.proc.zotu_counts.tsv", index_col="#OTU ID"
).rename_axis(index="zotu", columns="sample_id")
# Split off the taxonomy annotation, then transpose to samples-by-OTUs.
rotu_taxonomy = rotu_counts.taxonomy
rotu_counts = rotu_counts.drop(columns=["taxonomy"]).T
# Normalize each sample (row) to relative abundances.
rotu_rabund = rotu_counts.divide(rotu_counts.sum(1), axis=0)

# Average-linkage clustering of samples on Bray-Curtis dissimilarity.
sample_rotu_bc_linkage = sp.cluster.hierarchy.linkage(
    rotu_rabund, method="average", metric="braycurtis", optimal_ordering=True
)

In [None]:
# Samples present in the OTU table but absent from the sample metadata
# (assuming the project helper `idxwhere` returns the labels where the mask is
# True — confirm). Prints their count followed by their ids.
missing_samples = sorted(idxwhere(~rotu_counts.index.to_series().isin(sample.index)))
print(len(missing_samples), ", ".join(missing_samples))

In [None]:
# Clustermap of OTU relative abundances, with rows (samples) ordered by the
# Bray-Curtis linkage computed earlier.
# NOTE(review): `x` is assigned here but not used in this cell.
x = rotu_rabund
# Row annotations: the subject's palette color, and black for CF_11 / CF_15
# (grey for every other sample).
row_colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        swap=sample.index.to_series()
        .isin(["CF_11", "CF_15"])
        .replace({False: "grey", True: "black"}),
    )
)
row_linkage = sample_rotu_bc_linkage

# PowerNorm with exponent 1/5 compresses the color scale.
sns.clustermap(
    rotu_rabund,
    norm=mpl.colors.PowerNorm(1 / 5),
    row_colors=row_colors,
    row_linkage=row_linkage,
)

In [None]:
# Preview the raw species depth table (displayed as the cell output).
pd.read_table(
    "data/group/een/r.proc.gtpro.species_depth.tsv")

In [None]:
# Sample-by-species depth matrix built from the long-format species depth
# table; sample/species combinations absent from the file are filled with 0.
gtpro_depth = (pd.read_table(
    "data/group/een/r.proc.gtpro.species_depth.tsv",
    index_col=['sample', "species_id"],
    )
    .depth.unstack(fill_value=0)
    # Column labels become strings; sample ids are rebuilt as "CF_<integer>"
    # from the second "_"-separated field of the original sample name.
    .rename(columns=str, index=lambda x: "CF_" + str(int(x.split("_")[1])))
    .rename({'CF_15': 'CF_11', 'CF_11': 'CF_15'})  # Sample swap
)
# Normalize each sample (row) to relative abundances.
gtpro_rabund = gtpro_depth.divide(gtpro_depth.sum(1), axis=0)

gtpro_rabund

In [None]:
# Sample-by-species depth matrix from the species depth table. Column names
# are supplied explicitly here, unlike the table read in the previous cell.
motu_depth = (pd.read_table(
    "data/group/een/r.proc.gene99_new-v22-agg75.spgc_specgene-ref-t25-p95.species_depth.tsv",
    names=['sample', "species_id", 'depth'], index_col=['sample', "species_id"],
    )
    .depth.unstack(fill_value=0)
    # Column labels become strings; sample ids are rebuilt as "CF_<integer>"
    # from the second "_"-separated field of the original sample name.
    .rename(columns=str, index=lambda x: "CF_" + str(int(x.split("_")[1])))
    .rename({'CF_15': 'CF_11', 'CF_11': 'CF_15'})  # Sample swap
)
# Normalize each sample (row) to relative abundances.
motu_rabund = motu_depth.divide(motu_depth.sum(1), axis=0)

motu_rabund

In [None]:
# Compare sample clustering between the motu and rotu relative abundances.
# NOTE(review): `align_indexes` is a project helper; presumably it restricts
# both tables to their shared samples in the same order — confirm.
x, y = align_indexes(motu_rabund, rotu_rabund)


# Average-linkage Bray-Curtis clustering of the samples in each table.
x_linkage = linkage(x, method="average", metric="braycurtis", optimal_ordering=True)
y_linkage = linkage(y, method="average", metric="braycurtis", optimal_ordering=True)
# Sample annotations: subject color, and black for CF_11 / CF_15.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        swap=sample.index.to_series()
        .isin(["CF_11", "CF_15"])
        .replace({False: "grey", True: "black"}),
    )
)

# Pairwise Bray-Curtis dissimilarities between samples, computed from `x` only.
x_pdist = pd.DataFrame(
    squareform(pdist(x, metric="braycurtis")), index=x.index, columns=x.index
)
# Rows are ordered by the linkage of `y`, columns by the linkage of `x`.
sns.clustermap(
    x_pdist,
    row_linkage=y_linkage,
    col_linkage=x_linkage,
    row_colors=colors,
    col_colors=colors,
)

In [None]:
# Compare sample clustering between the motu and gtpro relative abundances.
# NOTE(review): `align_indexes` is a project helper; presumably it restricts
# both tables to their shared samples in the same order — confirm.
x, y = align_indexes(motu_rabund, gtpro_rabund)


# Average-linkage Bray-Curtis clustering of the samples in each table.
x_linkage = linkage(x, method="average", metric="braycurtis", optimal_ordering=True)
y_linkage = linkage(y, method="average", metric="braycurtis", optimal_ordering=True)
# Sample annotations: subject color, and black for CF_11 / CF_15.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        swap=sample.index.to_series()
        .isin(["CF_11", "CF_15"])
        .replace({False: "grey", True: "black"}),
    )
)

# Pairwise Bray-Curtis dissimilarities between samples, computed from `x` only.
x_pdist = pd.DataFrame(
    squareform(pdist(x, metric="braycurtis")), index=x.index, columns=x.index
)
# Rows are ordered by the linkage of `y`, columns by the linkage of `x`.
sns.clustermap(
    x_pdist,
    row_linkage=y_linkage,
    col_linkage=x_linkage,
    row_colors=colors,
    col_colors=colors,
)

In [None]:
# NOTE(review): this `bins` value is not used by the histograms below, which
# pass their own log-spaced bins.
bins = np.linspace(0, 30_000, num=200)

fig, axs = plt.subplots(2, sharex=True)

# One panel per summary: total depth per sample (row sums) and total depth per
# species (column sums), each on a log-scaled x axis.
for (title, x), ax in zip(
    dict(
        total_depth_by_sample=motu_depth.sum(1),
        total_depth_by_species=motu_depth.sum(0),
    ).items(),
    axs.flatten(),
):
    ax.hist(x, bins=np.logspace(-1, 5, num=100))
    ax.set_title(title)
    ax.set_xscale("log")
fig.tight_layout()

In [None]:
# The 20 species with the highest mean relative abundance across samples.
motu_rabund.mean().sort_values(ascending=False).head(20)

In [None]:
# Stacked histograms of relative abundance for the species detected (above
# 1e-5) in the largest number of samples.
n_species = 10
top_motus = (
    (motu_rabund > 1e-5).sum().sort_values(ascending=False).head(n_species).index
)

fig, axs = plt.subplots(
    n_species, figsize=(5, 0.3 * n_species), sharex=True, sharey=True
)

bins = np.logspace(-8, 1, num=51)

for species_id, ax in zip(top_motus, axs):
    # ax.hist(rabund_subset[species_id], bins=bins, alpha=0.7)
    ax.hist(motu_rabund[species_id], bins=bins, alpha=0.7)
    ax.set_xscale("log")
    # Fraction of samples in which this species exceeds the 1e-5 threshold.
    prevalence = (motu_rabund[species_id] > 1e-5).mean()
    ax.set_title("")
    # ax.set_xticks()
    # ax.set_yticks()
    # Strip axes, background and spines so each panel shows only the bars.
    ax.yaxis.set_visible(False)
    ax.xaxis.set_visible(False)
    ax.patch.set_alpha(0.0)
    for spine in ["left", "right", "top", "bottom"]:
        ax.spines[spine].set_visible(False)
    ax.annotate(
        f"{species_id} ({prevalence:0.0%})",
        xy=(0.05, 0.1),
        ha="left",
        xycoords="axes fraction",
    )
    ax.set_xlim(left=1e-9)
    ax.set_ylim(top=20)
    # Dotted vertical line at the 1e-5 detection threshold.
    ax.axvline(1e-5, lw=1, linestyle=":", color="k")

# `ax` is still the last (bottom) panel here: restore its x axis and label it.
ax.xaxis.set_visible(True)
ax.spines["bottom"].set_visible(True)
ax.set_xticks([1e-4, 1e-2, 1e-0])
ax.set_xticklabels(["0.01%", "1%", "100%"])
ax.set_xlabel("Relative Abundance")

# fig.subplots_adjust(hspace=-0.75)

In [None]:
def parse_taxonomy_string(taxonomy_string):
    """Split a ";"-separated lineage string into a Series of seven ranks.

    The fields are stored unchanged and indexed by the rank prefixes
    "d__", "p__", "c__", "o__", "f__", "g__", "s__", in that order. pandas
    raises a ValueError when the string does not contain exactly seven fields.
    """
    rank_prefixes = ["d__", "p__", "c__", "o__", "f__", "g__", "s__"]
    fields = taxonomy_string.split(";")
    return pd.Series(fields, index=rank_prefixes)

In [None]:
motu_taxonomy_inpath = "ref/uhgg_genomes_all_v2.tsv"

# Keep only the rows where the genome is its own species representative, and
# derive `species_id` as "1" followed by the third "-"-separated field of
# MGnify_accession.
_motu_taxonomy = (
    pd.read_table(motu_taxonomy_inpath)[lambda x: x.Genome == x.Species_rep]
    .assign(species_id=lambda x: "1" + x.MGnify_accession.str.split("-").str[2])
    .set_index("species_id")
)

# motu_lineage_string = _motu_taxonomy.Lineage

# One row per species, one column per taxonomic rank parsed from `Lineage`.
motu_taxonomy = _motu_taxonomy.Lineage.apply(
    parse_taxonomy_string
)  # .assign(taxonomy_string=motu_lineage_string)
motu_taxonomy

In [None]:
# Print the full ";"-joined lineage of each species currently in `top_motus`.
for _species_id in top_motus.astype(str):
    print(_species_id, ":", ";".join(motu_taxonomy.loc[_species_id].values))

In [None]:
# Taxonomy rows whose species-level entry ends with "hansenii".
motu_taxonomy[lambda x: x.s__.str.endswith("hansenii")]

In [None]:
# For each listed species: the fraction of samples above 0.01% and above 0.1%
# relative abundance, followed by its species-level taxonomy entry.
for _species_id in ["102544", "102506", "101303", "100150", "102330", "101704"]:
    print(
        _species_id,
        (motu_rabund[_species_id] > 0.0001).mean().round(2),
        (motu_rabund[_species_id] > 0.001).mean().round(2),
        motu_taxonomy.loc[_species_id].s__,
        sep="\t\t",
    )

In [None]:
# Same prevalence summary as the previous cell, for a second set of species:
# fraction of samples above 0.01% and above 0.1% relative abundance, followed
# by the species-level taxonomy entry.
for _species_id in ["100323", "101396", "101493", "102351"]:
    print(
        _species_id,
        (motu_rabund[_species_id] > 0.0001).mean().round(2),
        (motu_rabund[_species_id] > 0.001).mean().round(2),
        motu_taxonomy.loc[_species_id].s__,
        sep="\t\t",
    )

In [None]:
# Stacked histograms of relative abundance for the 20 species that exceed 1e-3
# relative abundance in the largest number of samples. This rebinds
# `top_motus`, which later cells read.
n_species = 20
top_motus = (
    (motu_rabund > 1e-3).sum().sort_values(ascending=False).head(n_species).index
)

fig, axs = plt.subplots(
    n_species, figsize=(5, 0.3 * n_species), sharex=True, sharey=True
)

bins = np.logspace(-8, 1, num=51)

for species_id, ax in zip(top_motus, axs):
    # ax.hist(rabund_subset[species_id], bins=bins, alpha=0.7)
    ax.hist(motu_rabund[species_id], bins=bins, alpha=0.7)
    ax.set_xscale("log")
    # Fraction of samples in which this species exceeds the 1e-3 threshold.
    prevalence = (motu_rabund[species_id] > 1e-3).mean()
    ax.set_title("")
    # ax.set_xticks()
    # ax.set_yticks()
    # Strip axes, background and spines so each panel shows only the bars.
    ax.yaxis.set_visible(False)
    ax.xaxis.set_visible(False)
    ax.patch.set_alpha(0.0)
    for spine in ["left", "right", "top", "bottom"]:
        ax.spines[spine].set_visible(False)
    ax.annotate(
        f"{species_id} ({prevalence:0.0%})",
        xy=(0.05, 0.1),
        ha="left",
        xycoords="axes fraction",
    )
    ax.set_xlim(left=1e-9)
    ax.set_ylim(top=20)
    # NOTE(review): the reference line is drawn at 1e-5 although this cell's
    # prevalence threshold is 1e-3 — confirm which value is intended.
    ax.axvline(1e-5, lw=1, linestyle=":", color="k")

# `ax` is still the last (bottom) panel here: restore its x axis and label it.
ax.xaxis.set_visible(True)
ax.spines["bottom"].set_visible(True)
ax.set_xticks([1e-4, 1e-2, 1e-0])
ax.set_xticklabels(["0.01%", "1%", "100%"])
ax.set_xlabel("Relative Abundance")

# fig.subplots_adjust(hspace=-0.75)

In [None]:
# Build a per-strain depth table: each species' depth is split across the
# strain fractions read from that species' "sfacts-fit.comm.tsv" file.
sotu_depth = []
missing_files = []
for species_id in motu_depth.columns:
    path = f"data/group/een/species/sp-{species_id}/r.proc.gtpro.sfacts-fit.comm.tsv"
    try:
        d = (
            pd.read_table(path, index_col=["sample", "strain"])
            .squeeze()
            .unstack()
            .rename(columns=str, index=lambda x: "CF_" + str(int(x.split("_")[1])))
            .rename({'CF_11': 'CF_15', 'CF_15': 'CF_11'})  # Sample swap.
        )
    except FileNotFoundError:
        # No strain table for this species: remember the path and continue
        # with an empty table, so only the remainder column is created below.
        missing_files.append(path)
        d = pd.DataFrame([])
    # Keep strains whose values, summed over samples, exceed 0.05
    # (`idxwhere` is a project helper; presumably returns the matching labels).
    _keep_strains = idxwhere(d.sum() > 0.05)
    assert d.index.isin(motu_depth.index).all()
    d = d.reindex(index=motu_depth.index, columns=_keep_strains, fill_value=0)
    # Add a remainder column (renamed to -1) holding 1 minus the kept strains.
    d = d.assign(__other=lambda x: 1 - x.sum(1)).rename(columns={"__other": -1})
    # Clip negatives, renormalize each row, then scale by the species depth.
    d[d < 0] = 0
    d = d.divide(d.sum(1), axis=0)
    d = d.multiply(motu_depth[species_id], axis=0)
    # Column names become "<species_id>_<strain>".
    d = d.rename(columns=lambda s: f"{species_id}_{s}")
    sotu_depth.append(d)
sotu_depth = pd.concat(sotu_depth, axis=1)
# Normalize each sample (row) to relative abundances.
sotu_rabund = sotu_depth.divide(sotu_depth.sum(1), axis=0)
# Cell output: (number of species, number of species with no strain file).
len(motu_depth.columns), len(missing_files)

In [None]:
# Log-log histogram of every strain depth value. 1e-10 is added so that zeros
# land in the lowest log-spaced bin; the bins cover 1e-10 to 1 only.
plt.hist(sotu_depth.values.flatten() + 1e-10, bins=np.logspace(-10, 0))
plt.yscale("log")
plt.xscale("log")

In [None]:
# Print the full ";"-joined lineage of each species currently in `top_motus`.
for _species_id in top_motus.astype(str):
    print(_species_id, ":", ";".join(motu_taxonomy.loc[_species_id].values))

In [None]:
# Compare sample clustering between strain-level (sotu) and species-level
# (motu) relative abundances.
# NOTE(review): `align_indexes` is a project helper; presumably it restricts
# both tables to their shared samples in the same order — confirm.
x, y = align_indexes(sotu_rabund, motu_rabund)

# Average-linkage Bray-Curtis clustering of the samples in each table.
x_linkage = linkage(x, method="average", metric="braycurtis", optimal_ordering=True)
y_linkage = linkage(y, method="average", metric="braycurtis", optimal_ordering=True)

# Sample annotations: subject color, and black for the four listed samples.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        swap=sample.index.to_series()
        .isin(["CF_11", "CF_15", "CF_431", "CF_427"])
        .replace({False: "grey", True: "black"}),
    )
)

# Pairwise Bray-Curtis dissimilarities between samples, computed from `x` only.
x_pdist = pd.DataFrame(
    squareform(pdist(x, metric="braycurtis")), index=x.index, columns=x.index
)
# Rows are ordered by the linkage of `y`, columns by the linkage of `x`.
sns.clustermap(
    x_pdist,
    row_linkage=y_linkage,
    col_linkage=x_linkage,
    row_colors=colors,
    col_colors=colors,
)

In [None]:
# Number of samples per subject and sample_type; shows the 20 subjects with
# the most "human" samples.
sample[["subject_id", "sample_type"]].value_counts().unstack(fill_value=0).sort_values(
    "human", ascending=False
).head(20)

In [None]:
# Number of samples per subject and diet_or_media; shows the 20 subjects with
# the most "EEN" samples.
sample[["subject_id", "diet_or_media"]].value_counts().unstack(
    fill_value=0
).sort_values("EEN", ascending=False).head(20)

## Understand Fermenter Experiment

In [None]:
# For subjects A, B and H, restricted to samples present in `motu_rabund`:
# count the samples for each combination of the listed metadata columns
# (missing values kept as their own category).
sample[lambda x: x.subject_id.isin(["A", "B", "H"]) & x.index.isin(motu_rabund.index)][
    ["subject_id", "source_samples", "sample_type", "diet_or_media", "status_mouse_inflamed"]
].value_counts(dropna=False).sort_index()

In [None]:
def _label_experiment_sample(x):
    """Build a short display label for one row of the sample table.

    Parameters
    ----------
    x : row-like
        Typically a row passed by ``DataFrame.apply(..., axis=1)``. `x.name`
        is used as the sample id. `sample_type` selects the label format, and
        the remaining fields are read only for the formats that need them:
        `collection_date_relative_een_end`, `diet_or_media`,
        `source_samples`, `mouse_genotype`, `status_mouse_inflamed`.

    Returns
    -------
    str
        A label starting with ``[<sample id>]``.

    Raises
    ------
    ValueError
        If `sample_type` is not one of "human", "Fermenter_inoculum",
        "Fermenter" or "mouse", or if a mouse sample's
        `status_mouse_inflamed` is neither "Inflamed" nor "not_Inflamed".
    """
    if x.sample_type == "human":
        return f"[{x.name}] {x.collection_date_relative_een_end} {x.diet_or_media}"
    if x.sample_type == "Fermenter_inoculum":
        return f"[{x.name}] {x.source_samples} inoc {x.diet_or_media}"
    if x.sample_type == "Fermenter":
        return f"[{x.name}] {x.source_samples} frmnt {x.diet_or_media}"
    if x.sample_type == "mouse":
        if x.status_mouse_inflamed == "Inflamed":
            status = "inflam"
        elif x.status_mouse_inflamed == "not_Inflamed":
            status = "not_inf"
        else:
            # Name the field that is actually invalid; the message used to
            # blame the sample type here.
            raise ValueError(
                f"mouse inflammation status {x.status_mouse_inflamed} not understood"
            )
        return (
            f"[{x.name}] {x.source_samples} 🐭 {x.mouse_genotype} {x.diet_or_media} {status}"
        )
    raise ValueError(f"sample type {x.sample_type} not understood")


# Sort all samples by the listed metadata columns and show the label that
# `_label_experiment_sample` generates for each row. The `label` column is
# replaced only in this derived table; `sample` itself is not modified.
sample.sort_values(
    [
        "subject_id",
        "collection_date_relative_een_end",
        "source_samples",
        "sample_type",
        "diet_or_media",
    ]
).assign(label=lambda d: d.apply(_label_experiment_sample, axis=1)).label

In [None]:
# Subject A samples that are present in `motu_rabund`, sorted and labelled
# with `_label_experiment_sample`.
# NOTE(review): here `sample_type` is sorted before `source_samples`, whereas
# the equivalent cells for subjects B and H use the opposite order — confirm
# this is intentional.
sample[
    lambda x: x.subject_id.isin(["A"]) & x.index.isin(motu_rabund.index)
].sort_values(
    [
        "subject_id",
        "collection_date_relative_een_end",
        "sample_type",
        "source_samples",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
).assign(
    label=lambda d: d.apply(_label_experiment_sample, axis=1)
)

In [None]:
# Subject B samples that are present in `motu_rabund`, sorted by the listed
# metadata columns and labelled with `_label_experiment_sample`.
sample[
    lambda x: x.subject_id.isin(["B"]) & x.index.isin(motu_rabund.index)
].sort_values(
    [
        "subject_id",
        "collection_date_relative_een_end",
        "source_samples",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
).assign(
    label=lambda d: d.apply(_label_experiment_sample, axis=1)
)

In [None]:
# Subject H samples that are present in `motu_rabund`, sorted by the listed
# metadata columns and labelled with `_label_experiment_sample`.
sample[
    lambda x: x.subject_id.isin(["H"]) & x.index.isin(motu_rabund.index)
].sort_values(
    [
        "subject_id",
        "collection_date_relative_een_end",
        "source_samples",
        "sample_type",
        "diet_or_media",
        "mouse_genotype",
        "status_mouse_inflamed",
    ]
).assign(
    label=lambda d: d.apply(_label_experiment_sample, axis=1)
)

## Confirm Sample Swap

In [None]:
# Strain-level check: keep strains exceeding 1e-5 relative abundance in more
# than one sample, then cluster the samples on Bray-Curtis dissimilarity.
d = sotu_rabund.loc[:, lambda x: (x > 1e-5).sum() > 1]
sample_linkage_strain_rabund = linkage(
    d,
    method="average",
    metric="braycurtis",
    optimal_ordering=True,
)
# Sample annotations: subject color, and black for the listed suspect samples.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        # type=sample.diet_or_media.map(diet_palette),
        swap=sample.index.to_series()
        .isin(["CF_11", "CF_15", "CF_431", "CF_427", "CF_402"])
        .map({True: "black", False: "grey"}),
    )
)

# Samples are the columns (ordered by the precomputed linkage); the rows are
# clustered by seaborn using the cosine metric.
cg = sns.clustermap(
    d.T,
    norm=mpl.colors.PowerNorm(1 / 5),
    col_linkage=sample_linkage_strain_rabund,
    metric="cosine",
    xticklabels=1,
    figsize=(18, 10),
    col_colors=colors,
    dendrogram_ratio=(0.05, 0.05),
    yticklabels=0,
)
cg.ax_cbar.set_visible(False)

In [None]:
# Species-level check: keep species exceeding 1e-5 relative abundance in more
# than one sample, then cluster the samples on Bray-Curtis dissimilarity.
# (The linkage variable keeps the "strain" name used in the previous cell.)
d = motu_rabund.loc[:, lambda x: (x > 1e-5).sum() > 1]
sample_linkage_strain_rabund = linkage(
    d,
    method="average",
    metric="braycurtis",
    optimal_ordering=True,
)
# Sample annotations: subject color, and black for the listed suspect samples.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        # type=sample.diet_or_media.map(diet_palette),
        swap=sample.index.to_series()
        .isin(["CF_11", "CF_15", "CF_431", "CF_427", "CF_402"])
        .map({True: "black", False: "grey"}),
    )
)

# Samples are the columns (ordered by the precomputed linkage); the rows are
# clustered by seaborn using the cosine metric.
cg = sns.clustermap(
    d.T,
    norm=mpl.colors.PowerNorm(1 / 5),
    col_linkage=sample_linkage_strain_rabund,
    metric="cosine",
    xticklabels=1,
    figsize=(15, 10),
    col_colors=colors,
    dendrogram_ratio=(0.05, 0.05),
    yticklabels=0,
)
cg.ax_cbar.set_visible(False)

In [None]:
# OTU-level check: keep OTUs exceeding 1e-5 relative abundance in more than
# one sample, then cluster the samples on Bray-Curtis dissimilarity.
# (The linkage variable keeps the "strain" name used in the earlier cells.)
d = rotu_rabund.loc[:, lambda x: (x > 1e-5).sum() > 1]
sample_linkage_strain_rabund = linkage(
    d,
    method="average",
    metric="braycurtis",
    optimal_ordering=True,
)
# Sample annotations: subject color, and black for the listed suspect samples.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        # type=sample.diet_or_media.map(diet_palette),
        swap=sample.index.to_series()
        .isin(["CF_11", "CF_15", "CF_431", "CF_427", "CF_402"])
        .map({True: "black", False: "grey"}),
    )
)

# Samples are the columns (ordered by the precomputed linkage); the figure is
# much wider than in the previous cells and every column tick is labelled.
cg = sns.clustermap(
    d.T,
    norm=mpl.colors.PowerNorm(1 / 5),
    col_linkage=sample_linkage_strain_rabund,
    metric="cosine",
    xticklabels=1,
    figsize=(85, 10),
    col_colors=colors,
    dendrogram_ratio=(0.05, 0.05),
    yticklabels=0,
)
cg.ax_cbar.set_visible(False)

# fig = plt.gcf()
# fig.savefig('fig/een_zotus_clustermap.png')

In [None]:
# Stand-alone legend for the subject colors: an empty scatter is drawn per
# palette entry purely so that it appears in the legend.
for k in subject_palette:
    plt.scatter([], [], color=subject_palette[k], label=k)
# NOTE(review): no "other" key is set on the palette in the visible cells, so
# this lookup relies on `construct_ordered_palette` providing that key or a
# default value — confirm.
k = "other"
plt.scatter([], [], color=subject_palette[k], label=k)
plt.legend(ncols=5)

In [None]:
# Strain-level clustermap restricted to the subjects that own a suspect sample.
suspect_labels = ["CF_11", "CF_15", "CF_431", "CF_427", "CF_402"]
focal_subjects = list(sample.loc[suspect_labels].subject_id.unique())
# All samples from those subjects that are present in the strain table
# (`idxwhere` is a project helper; presumably returns the matching labels).
focal_samples = idxwhere(
    sample.subject_id.isin(focal_subjects)
    & sample.index.to_series().isin(sotu_rabund.index)
)

# Keep strains exceeding 1e-5 in more than one of the focal samples.
d = sotu_rabund.loc[focal_samples].loc[:, lambda x: (x > 1e-5).sum() > 1]
sample_linkage_strain_rabund = linkage(
    d,
    method="average",
    metric="braycurtis",
    optimal_ordering=True,
)
# Sample annotations: subject color, and black for the suspect samples.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        # type=sample.diet_or_media.map(diet_palette),
        swap=sample.index.to_series()
        .isin(suspect_labels)
        .map({True: "black", False: "grey"}),
    )
)

# Samples are the columns, ordered by the precomputed linkage.
cg = sns.clustermap(
    d.T,
    norm=mpl.colors.PowerNorm(1 / 5),
    col_linkage=sample_linkage_strain_rabund,
    metric="cosine",
    xticklabels=1,
    figsize=(15, 10),
    col_colors=colors,
    dendrogram_ratio=(0.05, 0.05),
    yticklabels=0,
)
cg.ax_cbar.set_visible(False)

In [None]:
# OTU-level clustermap restricted to the subjects that own a suspect sample.
suspect_labels = ["CF_11", "CF_15", "CF_431", "CF_427", "CF_402"]
focal_subjects = list(sample.loc[suspect_labels].subject_id.unique())
# All samples from those subjects that are present in the OTU table
# (`idxwhere` is a project helper; presumably returns the matching labels).
focal_samples = idxwhere(
    sample.subject_id.isin(focal_subjects)
    & sample.index.to_series().isin(rotu_rabund.index)
)

# Keep OTUs exceeding 1e-5 in more than one of the focal samples.
d = rotu_rabund.loc[focal_samples].loc[:, lambda x: (x > 1e-5).sum() > 1]
sample_linkage_strain_rabund = linkage(
    d,
    method="average",
    metric="braycurtis",
    optimal_ordering=True,
)
# Sample annotations: subject color, and black for the suspect samples.
colors = pd.DataFrame(
    dict(
        subj=sample.subject_id.map(subject_palette),
        # type=sample.diet_or_media.map(diet_palette),
        swap=sample.index.to_series()
        .isin(suspect_labels)
        .map({True: "black", False: "grey"}),
    )
)

# Samples are the columns, ordered by the precomputed linkage.
cg = sns.clustermap(
    d.T,
    norm=mpl.colors.PowerNorm(1 / 5),
    col_linkage=sample_linkage_strain_rabund,
    metric="cosine",
    xticklabels=1,
    figsize=(15, 10),
    col_colors=colors,
    dendrogram_ratio=(0.05, 0.05),
    yticklabels=0,
)
cg.ax_cbar.set_visible(False)

In [None]:
# Metadata rows for the suspect samples (raises KeyError if an id is missing).
sample.loc[suspect_labels]

In [None]:
# Sample ids of the form "CF_<number>", kept in their original listing order.
# The numbers are laid out six per row purely for readability.
all_new_samples_list = [
    f"CF_{number}"
    for row in (
        (379, 380, 381, 384, 385, 386),
        (426, 427, 428, 429, 430, 431),
        (395, 397, 402, 406, 408, 409),
        (140, 141, 142, 149, 150, 151),
        (170, 171, 172, 173, 174, 175),
        (152, 153, 154, 155, 156, 157),
        (115, 116, 117, 118, 119, 120),
        (127, 128, 130, 131, 132, 133),
        (667, 668, 669, 670, 671, 672),
    )
    for number in row
]

# Metadata rows for the samples listed above (raises KeyError if an id is missing).
sample.loc[all_new_samples_list]