## Preamble

### Project Template

In [None]:
# Load the IPython autoreload extension.
# NOTE(review): no `%autoreload <mode>` is selected in this cell — confirm a mode is set elsewhere if live reloading of `lib` is expected.
%load_ext autoreload

In [None]:
import os as _os

# Every relative path in this notebook is resolved against the directory named
# by the PROJECT_ROOT environment variable (KeyError if it is not set).
_project_root = _os.environ["PROJECT_ROOT"]
_os.chdir(_project_root)

# Show the resolved working directory as the cell output.
_os.path.realpath(_os.getcwd())

### Imports

In [None]:
import os
import subprocess
import sys
import time
from datetime import datetime
from glob import glob
from itertools import chain, product
from tempfile import mkstemp

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xarray as xr
from mpl_toolkits.axes_grid1 import make_axes_locatable
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm

import sfacts as sf

import lib.plot
from lib.pandas_util import align_indexes, aligned_index, idxwhere, invert_mapping

## Style

In [None]:
# Seaborn "paper" context plus a fixed figure resolution for inline rendering.
sns.set_context("paper")
plt.rcParams.update({"figure.dpi": 100})

## Analysis Parameters

In [None]:
# Value intended for `.discretized(max_ambiguity=...)` on the estimated genotypes.
# NOTE(review): in the cells visible here it is only referenced from
# commented-out code — confirm whether it is still used further down.
ambiguity_threshold = 0.1

# Load and Check Data

## Select some GTDB genomes missing from UHGG

### Pick some example genomes and generate the necessary metadata files.

In [None]:
# Tab-separated GTDB metadata for species 102506, indexed by the 'accession' column.
gtdb_genomes = pd.read_csv(
    'ref/gtdb/species/102506/metadata.tsv',
    sep='\t',
    index_col='accession',
)

In [None]:
# One-column view: the NCBI biosample recorded for each accession.
gtdb_genomes.loc[:, ['ncbi_biosample']]

In [None]:
# List each column's position, name, and first-row value to inspect the schema.
gtdb_first_row = gtdb_genomes.iloc[0].values
for position, column in enumerate(gtdb_genomes.columns):
    print(position, column, gtdb_first_row[position], sep='\t')

In [None]:
# Assembly name and GenBank accession for the row(s) matching one specific accession.
gtdb_genomes.loc[
    gtdb_genomes.ncbi_genbank_assembly_accession == 'GCA_011601075.1',
    ['ncbi_assembly_name', 'ncbi_genbank_assembly_accession'],
]

In [None]:
# Existing genome metadata table; its columns define the output schema used
# when writing the GTDB genome metadata.
genome_meta = pd.read_csv('meta/genome.tsv', sep='\t')

# List each column's position, name, and first-row value to inspect the schema.
genome_meta_first_row = genome_meta.iloc[0].values
for position, column in enumerate(genome_meta.columns):
    print(position, column, genome_meta_first_row[position], sep='\t')

In [None]:
# Biosample for the assembly named 'ASM1160107v1'; `squeeze()` collapses a
# single matching row so the attribute access yields a scalar.
is_target_assembly = gtdb_genomes.ncbi_assembly_name == 'ASM1160107v1'
gtdb_genomes.loc[is_target_assembly].squeeze().ncbi_biosample

In [None]:
# Derive a metadata table for the GTDB genomes with the same columns (and
# column order) as `genome_meta`, then write it as a tab-separated file.
gtdb_genome_meta = gtdb_genomes.assign(
    # Genome ID embeds the GenBank accession with 'GCA' rewritten to 'GCF'.
    genome_id=lambda d: (
        'Escherichia-coli-'
        + d.ncbi_genbank_assembly_accession.str.replace('GCA', 'GCF')
    ),
    species_id='102506',
    # Later keyword arguments may refer to columns assigned above (genome_id).
    genome_path=lambda d: 'raw/genomes/gtdb/' + d.genome_id + '/assembly.fa',
    _old_genome_id='',
    ncbi_assembly_name=lambda d: d.ncbi_assembly_name,
    ncbi_assembly_biosample=lambda d: d.ncbi_biosample,
    # The frame is indexed by GTDB accession, so the index supplies the comment text.
    comments=lambda d: 'GTDB Accession ' + d.index + ';',
)
gtdb_genome_meta.loc[:, genome_meta.columns].to_csv(
    'meta/gtdb_genome.tsv',
    sep='\t',
    index=False,
)

## Compare Genotypes

In [None]:
# Load the metagenotype stored for the spike-in benchmark strain genomes and
# convert it to an estimated genotype.
spikein_assembly_inpath = 'data/group/spikein_benchmark/species/sp-102506/strain_genomes.gtpro.mgtp.nc'
spikein_assembly_geno = sf.Metagenotype.load(spikein_assembly_inpath).to_estimated_genotype()
# Optional follow-up step, currently disabled:
#   .discretized(max_ambiguity=ambiguity_threshold)

In [None]:
# Load the metagenotype stored for the MIDAS DB v15 path and convert it to an
# estimated genotype.
midas_assembly_inpath = 'data/species/sp-102506/midasdb_v15.gtpro.mgtp.nc'
midas_assembly_geno = sf.Metagenotype.load(midas_assembly_inpath).to_estimated_genotype()
# Optional follow-up step, currently disabled:
#   .discretized(max_ambiguity=ambiguity_threshold)

In [None]:
# Load the file at the HMP2 sfacts-fit path via `sf.Metagenotype.load` and
# convert it to an estimated genotype.
hmp2_inferred_inpath = 'data/group/hmp2/species/sp-102506/r.proc.gtpro.sfacts-fit.spgc_ss-all.mgtp.nc'
hmp2_inferred_geno = sf.Metagenotype.load(hmp2_inferred_inpath).to_estimated_genotype()
# Optional follow-up step, currently disabled:
#   .discretized(max_ambiguity=ambiguity_threshold)

In [None]:
# Cross-distance between the HMP2 inferred genotypes and the MIDAS reference genotypes.
# NOTE(review): presumably rows are HMP2 strains and columns are MIDAS genomes — confirm against `sfacts` `cdist`.
hmp2_to_midas_gdist = hmp2_inferred_geno.cdist(midas_assembly_geno)

In [None]:
# Cross-distance between the spike-in genotypes and the MIDAS reference genotypes.
# NOTE(review): presumably rows are spike-in strains and columns are MIDAS genomes — confirm against `sfacts` `cdist`.
spikein_to_midas_gdist = spikein_assembly_geno.cdist(midas_assembly_geno)

In [None]:
# For every row of the distance table: the column label with the smallest
# distance ('idxmin') and that smallest distance itself ('min').
nearest_midas_label = spikein_to_midas_gdist.idxmin(axis=1)
nearest_midas_distance = spikein_to_midas_gdist.min(axis=1)
pd.DataFrame({'idxmin': nearest_midas_label, 'min': nearest_midas_distance})