# Work out how to deal with taxonomy

Given contigs with taxonomy and KOs of interest, work out how to handle the taxonomy.

## Setup

In [1]:
import os 
import re
import glob
import math
import json
import itertools
import numpy as np
import xarray as xr
import pandas as pd
import seaborn as sns
from time import time
from tqdm import tqdm
from scipy import stats
import matplotlib as mpl
from collections import * 
from functools import reduce
import matplotlib.font_manager
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from scipy.spatial import distance
from scipy.cluster import hierarchy
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches



In [4]:
# rpy2 imports
import rpy2
from rpy2 import robjects as ro
from rpy2.robjects import pandas2ri
# from rpy2.ipython.ggplot import image_png
from rpy2.robjects.packages import importr

# activate rpy2's pandas2ri converter before any DataFrame is handed to R
pandas2ri.activate()
# load rpy2 extension for ipython (line magic: only valid inside an IPython/Jupyter session)
%load_ext rpy2.ipython

In [5]:
# install & import r package sctransform
# (relies on `ro` and `importr` from the rpy2 import cell)

# check if sctransform is installed
if not ro.packages.isinstalled('sctransform'):
    # select CRAN mirror (index 1 in R's mirror list)
    utils = importr('utils')
    utils.chooseCRANmirror(ind=1)
    # install sctransform
    utils.install_packages(ro.vectors.StrVector(['sctransform']))

# check if glmGamPoi is installed
# (only reported, never installed here: a missing package just prints a reminder)
if not ro.packages.isinstalled('glmGamPoi'):
    print('Please install glmGamPoi: https://github.com/const-ae/glmGamPoi')

# import sctransform
# `rmatrix` (R's Matrix package) is used by pandas_dataframe_to_r_matrix below
sctransform = importr('sctransform')
rmatrix = importr('Matrix')

# should be version 0.4.1
# (printed for manual comparison; nothing in this cell enforces the version)
print(sctransform.__version__)

Please install glmGamPoi: https://github.com/const-ae/glmGamPoi
0.4.1


In [6]:
# show the directory the kernel is running in before it is changed below
os.getcwd()

'/scratch/bgrodner/repo-barnacle-manuscript/containers'

In [7]:
# switch the process working directory to the project folder
# NOTE(review): absolute, machine-specific path; relative paths used later
# resolve against this directory, so the notebook is not portable as written
workdir = '/scratch/bgrodner/iron_ko_contigs'
os.chdir(workdir)


In [8]:
# confirm the working directory after os.chdir
os.getcwd()

'/scratch/bgrodner/iron_ko_contigs'

In [None]:
# list the entries of the current working directory (no argument = cwd)
os.listdir()

['config.yaml',
 'sc_run_snakemake.sh',
 '._.DS_Store',
 'file_table.240107.kofam_filt.csv',
 'file_table.240109.kofam_filt.csv',
 'file_table_new.csv',
 '._file_table.240109.kofam_filt.csv',
 'file_table.test.csv',
 '._file_table.240107.kofam_filt.csv',
 'ko00001.json',
 '._file_table_new.csv',
 'file_table.240114.kofam_filt.csv',
 'metat_search_results',
 '._file_table.csv',
 'file_table.240113.kofam_filt.csv',
 'file_table.240107.kofam_unfilt.csv',
 '._file_table.240108.kofam_filt.csv',
 'kofam_filenames.txt',
 '.etetoolkit',
 'iron_contigs.txt',
 '.DS_Store',
 'iron_KOs.txt',
 'file_table.240108.kofam_filt.csv',
 'file_table.csv',
 '._file_table.240107.kofam_unfilt.csv',
 '._plt_KO_contig_counts.png',
 '._file_table.test.csv',
 '._file_table.240113.kofam_filt.csv',
 '._file_table.240114.kofam_filt.csv',
 '.snakemake']


R[write to console]: 1: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages

R[write to console]: 2: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages

R[write to console]: 3: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages

R[write to console]: 4: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  library ‘/usr/lib/R/site-library’ contains no packages

R[write to console]: 5: 
R[write to console]: In (function (package, help, pos = 2,

Helper functions

In [None]:
def general_plot(
    xlabel="", ylabel="", ft=12, dims=(5, 3), col="k", lw=1, pad=0, tr_spines=True
):
    """
    Build a single-axes matplotlib figure with uniform styling.

    Parameters
    ----------
    xlabel, ylabel : str
        Axis label text.
    ft : number
        Font size used for the tick labels and both axis labels.
    dims : sequence
        ``(width, height)``; the first two entries are passed as ``figsize``.
    col : color
        Color applied to spines, ticks, tick labels and axis labels.
    lw : number
        Line width applied to every spine.
    pad : number
        Padding handed to ``tight_layout``.
    tr_spines : bool
        If True the top and right spines are kept and colored with ``col``;
        otherwise they are hidden.

    Returns
    -------
    tuple
        ``(fig, ax)``
    """
    figure_size = (dims[0], dims[1])
    fig, ax = plt.subplots(figsize=figure_size, tight_layout={"pad": pad})

    # Same line width on all spines, whether or not they stay visible.
    for spine in ax.spines.values():
        spine.set_linewidth(lw)

    # Top/right spines are either colored like the rest or switched off.
    for side in ("top", "right"):
        if tr_spines:
            ax.spines[side].set_color(col)
        else:
            ax.spines[side].set_visible(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_color(col)

    ax.tick_params(direction="in", labelsize=ft, color=col, labelcolor=col)
    ax.set_xlabel(xlabel, fontsize=ft, color=col)
    ax.set_ylabel(ylabel, fontsize=ft, color=col)

    # Fully transparent axes background.
    ax.patch.set_alpha(0)
    return (fig, ax)

# helper functions

# zero-tolerant geometric mean: shift by a pseudocount, average in log space, shift back
def geometric_mean(vector, pseudocount=1):
    """
    Geometric mean that tolerates zeros via a pseudocount.

    Computes ``exp(mean(log(vector + pseudocount))) - pseudocount``.

    Parameters
    ----------
    vector : array-like supporting ``+`` with a scalar (e.g. numpy array)
        Values to average.
    pseudocount : number, default 1
        Added before the log and subtracted again from the result.

    Returns
    -------
    The pseudocount-adjusted geometric mean.
    """
    shifted = vector + pseudocount
    mean_of_logs = np.mean(np.log(shifted))
    return np.exp(mean_of_logs) - pseudocount

# build an R matrix (via the R Matrix package) from a pandas dataframe
def pandas_dataframe_to_r_matrix(df, dtype=float):
    """
    Convert a pandas DataFrame into an R matrix object.

    The frame's values are flattened and handed to R's ``Matrix`` constructor
    with ``byrow=True`` and ``sparse=True``; the index and columns become the
    row and column dimnames.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert.
    dtype : type, default float
        One of ``float``, ``str`` or ``int``; selects the rpy2 vector type
        used to carry the values.

    Raises
    ------
    ValueError
        If ``dtype`` is not one of the three supported types.
    """
    if dtype is float or dtype is int:
        # NOTE(review): int values are carried in a FloatVector, same as
        # float. Presumably intentional; confirm.
        make_vector = ro.vectors.FloatVector
    elif dtype is str:
        make_vector = ro.vectors.StrVector
    else:
        raise ValueError('The dtype {} is not recognized'.format(dtype))

    flat_values = df.values.flatten().tolist()
    row_names = df.index.to_list()
    col_names = df.columns.to_list()
    return rmatrix.Matrix(
        data=make_vector(flat_values),
        nrow=df.shape[0],
        ncol=df.shape[1],
        byrow=True,
        dimnames=[row_names, col_names],
        sparse=True,
    )

def parse_fn_kallisto_sn(fn, sn_type='', get_columns=False):
    """
    Split a kallisto sample filename into its metadata fields.

    Parameters
    ----------
    fn : str
        Filename to parse.
    sn_type : str
        Naming scheme of ``fn``. Supported values: 'G1NS', 'G2NS', 'G3NS',
        'G5', 'D1', 'G1PA', 'G2PA', 'G3PA.UW', 'G3PA.diel', 'G3PA.PM'.
    get_columns : bool, default False
        If True, ignore ``fn``/``sn_type`` and return the field names instead.

    Returns
    -------
    list of str
        ``[assembly, sample, ammendment, timepoint, depth, size, rep]``.
        Fields a scheme does not encode are left as ``''``. With
        ``get_columns=True`` the list holds the column names in that order.

    Raises
    ------
    ValueError
        If ``sn_type`` is not a supported scheme, or if ``fn`` does not match
        the layout of the requested scheme (the underlying error is chained
        as ``__cause__``).
    """
    if not get_columns:
        known_type = True
        try:
            ass, sample, ammend, timep, depth, size, rep = [''] * 7
            if sn_type == 'G1NS':
                ass, sm_sz, rep, _ = fn.split('.')
                sample, sz = sm_sz.split('_',1)
                size = re.sub('_','.',sz)
            elif sn_type == 'G2NS':
                ass, sample, depth, sz, rep, _ = fn.split('.')
                size = re.sub('_','.',sz)
            elif sn_type == 'G3NS':
                ass_sm, dp_sz_rep = fn.split('_', 1)
                ass = re.match(r'.+NS', ass_sm)[0]
                sample = re.search(r'UW\d+', ass_sm)[0]
                dp1, dp2, sz, rep, _ = dp_sz_rep.split('.')
                depth = f'{dp1}.{dp2}'
                size = re.sub('_','.',sz)
            elif sn_type == 'G5':
                ass, sample, ammend, timep, rep, _ = fn.split('.')
            elif sn_type == 'D1':
                ass, sm_rep_tp, _, _ = fn.split('.')
                sample, rep, timep = sm_rep_tp.split('_')
            elif sn_type == 'G1PA':
                ass, fn_ = fn.split('.', 1)
                sample, fn_ = fn_.split('_', 1)
                size = re.search(r'.+um', fn_)[0]
                rep = re.search(r'(?<=um)\w+(?=\.)',fn_)[0]
            elif sn_type == 'G2PA':
                _, ass, sample, depth, sz, rep, _, _ = fn.split('.')
                size = re.sub('_','.',sz)
            elif sn_type == 'G3PA.UW':
                ass, sample, _, _, _, _ = fn.split('.')
            elif sn_type == 'G3PA.diel':
                ass1, ass2, sample, rep, _, _, _, _ = fn.split('.')
                ass = f'{ass1}.{ass2}'
            elif sn_type == 'G3PA.PM':
                ass_sm, dp_tp_sz_rp = fn.split('_', 1)
                ass = re.match(r'.+(?=.UW)', ass_sm)[0]
                sample = re.search(r'UW\d+$', ass_sm)[0]
                depth, tp_sz_rp = dp_tp_sz_rp.split('_',1)
                timep, sz_rp = tp_sz_rp.split('.',1)
                size = re.match(r'.+um(?=\.)', sz_rp)[0]
                rep = re.search(r'(?<=um\.)\w+', sz_rp)[0]
            else:
                # Only flag the problem here: raising inside the try would be
                # caught below and mislabeled as a filename parsing failure.
                known_type = False
        except Exception as err:
            # Unpacking mismatches (ValueError) and failed regex matches
            # (TypeError on None[0]) both mean the name does not fit the
            # scheme. `Exception` leaves KeyboardInterrupt/SystemExit alone,
            # and chaining keeps the root cause in the traceback.
            raise ValueError(
                f"""
                Failed to parse filename:
                {fn}
                Using type:
                {sn_type}
                """
            ) from err
        if not known_type:
            raise ValueError(
                f"""
                Sample name parse type not provided or not recognized: {sn_type!r}
                (sn_type_parse_kallisto column in file table)
                """
            )
        return [ass, sample, ammend, timep, depth, size, rep]
    else:
        return ['assembly', 'sample', 'ammendment', 'timepoint', 'depth', 'size', 'rep']


## Get example slab

Filenames

In [None]:
# Directory holding the taxon slab CSVs for one assembly/size fraction.
# NOTE(review): absolute, machine-specific path.
slab_dir = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/taxon_slabs/G1PA/3um'
# glob returns matches in arbitrary order; sort so that fns[0] (the example
# loaded in the next cell) is the same file on every run.
fns = sorted(glob.glob(f'{slab_dir}/*.csv'))
fns

Load example

In [None]:
# take the first slab file as the worked example
# (raises IndexError if the glob above matched nothing)
fn = fns[0]
df = pd.read_csv(fn)
# display the loaded table
df

## Normalize