# Spatial statistics on hsdm hiprfish images

## Setup

Imports

In [None]:
import os
import re
import gc
import sys
import glob
import yaml
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from time import time, sleep
import aicspylibczi as aplc
import matplotlib.pyplot as plt
import matplotlib.ticker as tck
from collections import defaultdict
# from scipy.cluster import hierarchy
from sklearn.neighbors import NearestNeighbors
import sklearn.cluster as clst
import hdbscan
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
import umap
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# from skimage.feature import peak_local_max
# from matplotlib_scalebar.scalebar import ScaleBar
# from sklearn.cluster import AgglomerativeClustering
# from scipy.spatial.distance import squareform, pdist
# from skimage.segmentation import watershed, relabel_sequential
# from skimage.measure import label, regionprops_table, regionprops

import cv2
import libpysal as ps
from esda.moran import Moran, Moran_BV
from libpysal.weights import W
import pointpats.quadrat_statistics as qs
from pointpats import PointPattern, RectangleM
from pointpats import distance_statistics as dst
from pointpats import PoissonPointProcess as csr
from pointpats.window import Window, poly_from_bbox, as_window, to_ccf
from scikit_posthocs import posthoc_dunn
from time import sleep
from scipy.spatial import distance

Get workdir

In [None]:
# Path prefix of the cluster mount and the project working directory under it.
# Both are empty strings here; set them before running the notebook.
cluster = ""
workdir = ""
# os.chdir("") raises FileNotFoundError, so only change directory when a path
# has actually been configured; otherwise keep the current directory.
if cluster + workdir:
    os.chdir(cluster + workdir)


In [None]:
os.getcwd()

In [None]:
# Reload imported modules automatically so edits to the helper modules below
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Directory containing the project helper modules, appended to `cluster`.
functions_path = 'functions'

sys.path.append(cluster + functions_path)

# Project-local helper modules found via the path added above.
import fn_general_use as fgu
import image_plots as ip
# import segmentation_func as sf
import fn_hiprfish_classifier as fhc
import fn_spectral_images as fsi
import fn_analysis_plots as apl

Get czi filenames

In [None]:
# Read the input table and keep its "filenames" column (the czi file paths
# that are grouped by date and sample name in the next cell).
input_table_fn = "input_table_all.csv"
input_table = pd.read_csv(input_table_fn)
filenames = input_table["filenames"]

In [None]:
# Group the czi filenames as dict_date_sn_fns[date][sample name] -> [filenames].
# Each basename is split after a leading "YYYY_MM_DD" date and again right
# after the "fov_NN" field; whatever follows "fov_NN" is discarded.
# Raw strings keep "\d" a regex token instead of an invalid string escape, and
# the patterns are compiled once rather than on every iteration.
_date_split_re = re.compile(r"(?<=^\d{4}_\d{2}_\d{2})_")
_sn_split_re = re.compile(r"(?<=fov_\d{2})")

dict_date_sn_fns = defaultdict(lambda: defaultdict(list))
for fn in filenames:
    bn = os.path.split(fn)[1]
    # maxsplit=1 guarantees exactly two parts whenever the pattern matches;
    # a basename that does not match still raises ValueError on unpacking.
    date, bn = _date_split_re.split(bn, maxsplit=1)
    sn, ext = _sn_split_re.split(bn, maxsplit=1)
    dict_date_sn_fns[date][sn].append(fn)

Get processed filename formats

In [None]:
# Path templates for processed outputs. The {date} and {sn} fields are left
# as placeholders and filled in later with str.format.
out_dir = "../outputs/{date}/{date}_{sn}"
# Classification outputs live in a "classif" folder under the sample folder.
out_fmt_classif = "".join((out_dir, "/classif"))
# Per-cell centroid / taxon table written by the classification step.
centroid_sciname_fmt = "".join(
    (out_fmt_classif, "/{date}_{sn}_centroid_sciname.csv")
)

Get color dict

In [None]:
# Earlier palette (tab20 colors, different taxon order), kept for reference:
# sciname_list = [
#     "Corynebacterium",
#     "Actinomyces",
#     "Rothia",
#     "Capnocytophaga",
#     "Prevotella",
#     "Porphyromonas",
#     "Streptococcus",
#     "Gemella",
#     "Veillonella",
#     "Selenomonas",
#     "Lautropia",
#     "Neisseriaceae",
#     "Pasteurellaceae",
#     "Campylobacter",
#     "Fusobacterium",
#     "Leptotrichia",
#     "Treponema",
#     "TM7",
# ]
# colors = plt.get_cmap("tab20").colors
# # colors = [c + (1,) for c in colors]
# dict_sciname_color = dict(zip(sciname_list, colors))
# dict_sciname_color["Neisseria"] = dict_sciname_color["Neisseriaceae"]
# dict_sciname_color["Saccharibacteria"] = dict_sciname_color["TM7"]
# dict_sciname_color["TM"] = dict_sciname_color["TM7"]

# Taxa to color; each name is paired by position with an entry of `colors`.
sciname_list = [
    'Pasteurellaceae',
    'Corynebacterium',
    'Veillonella',
    'Actinomyces',
    'Selenomonas',
    'Rothia',
    'Porphyromonas',
    'Capnocytophaga',
    'Prevotella',
    'Streptococcus',
    'Gemella',
    'Campylobacter',
    'Lautropia',
    'Leptotrichia',
    'Neisseriaceae',
    'Treponema',
    'Fusobacterium',
    'TM7'
]
# Start from evenly spaced gist_rainbow colors, keeping only the RGB channels.
cmap = plt.get_cmap('gist_rainbow')
colors = [cmap(i)[:3] for i in np.linspace(0,1,len(sciname_list))]
# Hand-picked replacements for individual palette slots (0-255 RGB scaled to 0-1).
colors[13] = [0.75,0.75,0.75]
colors[1] = np.array([176,2,104])/255
colors[8] = np.array([184,155,69])/255
colors[6] = np.array([245,164,159])/255
colors[5] = np.array([221,159,239])/255
colors[14] = np.array([54,148,161])/255
# Reorder the palette: each step removes one color and re-inserts it at a new
# index. The steps are order-dependent, since every pop/insert shifts the
# indices seen by the following steps.
colors.insert(17, colors.pop(1))
colors.insert(4, colors.pop(6))
colors.insert(9, colors.pop(12))
colors.insert(5, colors.pop(8))
# colors = [c + (1,) for c in colors]
# Map taxon name -> color, then add alias names that reuse an existing color.
dict_sciname_color = dict(zip(sciname_list, colors))
dict_sciname_color['Neisseria'] = dict_sciname_color['Neisseriaceae']
dict_sciname_color['Saccharibacteria'] = dict_sciname_color['TM7']


In [None]:
# Draw a legend from the taxon names and their colors (aliases included, since
# they are keys of the dict too) and save it next to the other outputs.
ip.taxon_legend(list(dict_sciname_color.keys()), list(dict_sciname_color.values()))
out_fn = "../outputs/taxon_legend.png"
ip.save_fig(out_fn)

Pick an image to test

In [None]:
# Acquisition date and sample name of the image analyzed in the cells below;
# both are keys of dict_date_sn_fns.
date = "2023_02_08"
sn = "hsdm_group_1_sample_12_fov_01"
# czi files recorded for that date/sample (an empty list if the key is absent,
# because dict_date_sn_fns is a nested defaultdict).
czi_fns = dict_date_sn_fns[date][sn]
czi_fns

Get coordinates and labels

In [None]:
import ast

# Load the per-cell table of centroid coordinates and taxon assignments.
centroid_sciname_fn = centroid_sciname_fmt.format(date=date, sn=sn)
centroid_sciname = pd.read_csv(centroid_sciname_fn)
# The "coord" column stores each centroid as text. ast.literal_eval parses
# Python literals (lists/tuples of numbers) without executing arbitrary code,
# unlike eval. Assumes every entry is a plain literal such as "[12.5, 40.0]"
# -- TODO confirm against the classifier output.
coords = np.array(
    [ast.literal_eval(c) for c in centroid_sciname["coord"].values]
)
# Taxon label of every cell and the sorted set of distinct taxa.
scinames = centroid_sciname["sciname"].values
scn_unq = np.unique(scinames)

## Global Spatial autocorrelation

Get adjacency  

In [None]:
# Neighborhood radius in microns.
radius_um = 3

# Convert the radius to pixels. fsi.get_resolution presumably returns meters
# per pixel (the value is scaled by 1e6 and named um/pixel) -- TODO confirm.
res_mpix = fsi.get_resolution(czi_fns[0])
res_umpix = res_mpix * 1e6
radius_pix = radius_um / res_umpix
# For every cell centroid, find all centroids within radius_pix.
neigh = NearestNeighbors(radius=radius_pix)
nbrs = neigh.fit(coords)
# nn_dists / nn_inds hold one array of distances / indices per query point.
nn_dists, nn_inds = nbrs.radius_neighbors(coords)

In [None]:
# Histogram of the number of indices returned per cell by the radius query.
print('Pixel radius', radius_pix)
_ = plt.hist([len(ni) for ni in nn_inds])
_ = plt.xlabel('Number of cells within ' + str(radius_um) + 'μm')

Build weights matrix

In [None]:
nn_inds[0]

In [None]:
# Adjacency lists keyed by cell index: every index returned by the radius
# query for a cell, except the cell's own index.
neighbors = {
    cell: [other for other in found if other != cell]
    for cell, found in enumerate(nn_inds)
}
neighbors[0]

In [None]:
w = W(neighbors)

Compute global Moran's I for each taxon (binary presence/absence indicator per cell)

In [None]:
# For each taxon: compute global Moran's I on a 0/1 indicator over all cells,
# save a plot of the statistic, save a scatter plot of that taxon's cells,
# and finally write one summary table for the image.
# Plot styling.
lw = 1
ft = 7
dims = (2, 1)
l_col = 'k'
# NOTE: rebinds `colors`, which earlier held the taxon palette list; the taxon
# colors used below come from dict_sciname_color, which is unaffected.
colors = apl.get_cmap_listed("tab10")
# NOTE(review): line_colors, xlabel, xticks, xlims, ylabel, yticks and h are
# assigned here but not referenced again in this cell.
line_colors = [colors[0], colors[1]]
xlabel = ""
xticks = (1, 1.7)
xlims = (0.65, 2.05)
ylabel = ""
yticks = []
pad = 0.2
h = 100
dpi=500

# Scatter plot settings.
spot_size = 3
dims_im = (4,4)

# Axis limits from the largest coordinates; column 1 is plotted on x and
# column 0 on y.
xlim = (0, np.max(coords[:, 1]))
ylim = (0, np.max(coords[:, 0]))

# Output filename templates ({date}, {sn} and {scn} are filled in with str.format).
moran_dir = out_dir + '/spatial_stats/morans_i'
moran_plot_fmt = moran_dir + "/{date}_{sn}_sciname_{scn}_moran_plot.png"
scatter_plot_fmt = moran_dir + "/{date}_{sn}_sciname_{scn}_scatter_plot.png"
moran_table_fmt = moran_dir + "/{date}_{sn}_moran_values.csv"

# Column name -> list of values, one entry per taxon.
moran_table = defaultdict(list)
for scn in scn_unq:
    # Calculate Moran's I on the indicator: 1 where the cell is this taxon, else 0
    col = dict_sciname_color[scn]
    bool_scn = scinames == scn
    y = bool_scn * 1
    mi = Moran(y, w)
    print(scn, mi.I, mi.p_sim)
    moran_table["sciname"].append(scn)
    moran_table["I_expected"].append(mi.EI)
    moran_table["I_measured"].append(mi.I)
    moran_table["p_simulation"].append(mi.p_sim)
    # Plot the simulation vs observed
    fig, ax = apl.general_plot(col=l_col, dims=dims, lw=lw, ft=ft, pad=pad)
    apl.plot_morans_i_sim_obj(ax, mi, lw=lw, ft=ft, col=col, l_col=l_col)
    moran_fn = moran_plot_fmt.format(date=date, sn=sn, scn=scn)
    ip.check_dir(moran_fn)
    ip.save_fig(moran_fn)
    plt.show()
    plt.close()
    # Plot the scatter locations of this taxon's cells
    coord_scn = coords[bool_scn]
    fig, ax = ip.general_plot(col='w', dims=dims_im, lw=lw, ft=ft)
    ax.scatter(coord_scn[:,1], coord_scn[:,0], s=spot_size, color=col)
    ax.set_xlim(xlim[0], xlim[1])
    ax.set_ylim(ylim[0], ylim[1])
    # Flip y so the plot follows image orientation (row 0 at the top)
    ax.invert_yaxis()
    ax.set_aspect('equal')
    scatter_fn = scatter_plot_fmt.format(date=date, sn=sn, scn=scn)
    ip.check_dir(scatter_fn)
    ip.save_fig(scatter_fn, dpi=dpi)
    plt.show()
    plt.close()

# Write the per-taxon summary table for this image.
moran_table_fn = moran_table_fmt.format(date=date, sn=sn)
ip.check_dir(moran_table_fn)
pd.DataFrame(moran_table).to_csv(moran_table_fn, index=False)

## Bivariate global correlation

In [None]:
# Bivariate global Moran's I for every unordered pair of taxa. Each taxon is
# encoded as a 0/1 indicator over all cells; for each pair the statistic is
# printed and recorded, and two figures are shown (statistic plot and a
# scatter plot of both taxa). Saving to disk is currently disabled.
# Plot styling.
lw = 1
ft = 7
dims = (2, 1)
l_col = 'k'
colors = apl.get_cmap_listed("tab10")
line_colors = [colors[0], colors[1]]
xlabel = ""
xticks = (1, 1.7)
xlims = (0.65, 2.05)
ylabel = ""
yticks = []
pad = 0.2
h = 100
dpi=500

spot_size = 1
dims_im = (4,4)

# Axis limits from the largest coordinates (column 1 on x, column 0 on y).
xlim = (0, np.max(coords[:, 1]))
ylim = (0, np.max(coords[:, 0]))

# Output filename templates.
moran_bv_dir = out_dir + '/spatial_stats/moran_bv'
# NOTE(review): this template only has a single {scn} field, so it cannot
# name a taxon pair; it is unused while the save code below stays disabled.
moran_bv_plot_fmt = moran_bv_dir + "/{date}_{sn}_sciname_{scn}_moran_plot.png"
scatter_plot_fmt = moran_bv_dir + "/{date}_{sn}_scinames_{scn0}_{scn1}_scatter_plot.png"
# Bound to its own name so the univariate `moran_table_fmt` is not overwritten;
# this is also the name the disabled save code at the bottom expects.
moran_bv_table_fmt = moran_bv_dir + "/{date}_{sn}_moran_bv_values.csv"

# Column name -> list of values, one entry per taxon pair.
moran_bv_table = defaultdict(list)
for i, scn0 in enumerate(scn_unq):
    # Only taxa after scn0, so each unordered pair is visited exactly once.
    for scn1 in scn_unq[i + 1:]:
        # Calculate bivariate Moran's I on the two indicator variables
        col0 = dict_sciname_color[scn0]
        col1 = dict_sciname_color[scn1]
        bool_scn0 = scinames == scn0
        bool_scn1 = scinames == scn1
        y0 = bool_scn0 * 1
        y1 = bool_scn1 * 1
        mbv = Moran_BV(y0, y1, w)
        print(scn0, scn1, mbv.EI_sim, mbv.I, mbv.p_sim)
        moran_bv_table["sciname0"].append(scn0)
        # Second member of the pair (previously scn0 was recorded twice).
        moran_bv_table["sciname1"].append(scn1)
        moran_bv_table["I_expected"].append(mbv.EI_sim)
        moran_bv_table["I_measured"].append(mbv.I)
        moran_bv_table["p_simulation"].append(mbv.p_sim)
        # Plot the simulation vs observed
        fig, ax = apl.general_plot(col=l_col, dims=dims, lw=lw, ft=ft, pad=pad)
        apl.plot_morans_i_sim_obj(ax, mbv, lw=lw, ft=ft, col=col0, l_col=l_col)
        # TODO: saving needs a plot template with {scn0} and {scn1} fields.
        # moran_fn = moran_bv_plot_fmt.format(date=date, sn=sn, scn0=scn0, scn1=scn1)
        # ip.check_dir(moran_fn)
        # ip.save_fig(moran_fn)
        plt.show()
        plt.close()
        # Plot the scatter locations of both taxa
        coord_scn0 = coords[bool_scn0]
        coord_scn1 = coords[bool_scn1]
        fig, ax = ip.general_plot(col='w', dims=dims_im, lw=lw, ft=ft)
        ax.scatter(coord_scn0[:,1], coord_scn0[:,0], s=spot_size, color=col0)
        ax.scatter(coord_scn1[:,1], coord_scn1[:,0], s=spot_size, color=col1)
        ax.set_xlim(xlim[0], xlim[1])
        ax.set_ylim(ylim[0], ylim[1])
        ax.invert_yaxis()
        ax.set_aspect('equal')
        # scatter_fn = scatter_plot_fmt.format(date=date, sn=sn, scn0=scn0, scn1=scn1)
        # ip.check_dir(scatter_fn)
        # ip.save_fig(scatter_fn, dpi=dpi)
        plt.show()
        plt.close()

# moran_bv_table_fn = moran_bv_table_fmt.format(date=date, sn=sn)
# ip.check_dir(moran_bv_table_fn)
# pd.DataFrame(moran_bv_table).to_csv(moran_bv_table_fn, index=False)

## Pair correlation

In [None]:
# Distance grid: np.arange(0, dmax, stepsize), converted from microns to
# pixels further down in this cell.
dmax = 205
stepsize=5

# Figure size, font size and line width for the plots in this cell.
dims=(10,5)
ft=12
lw=2

def deriv(K, h):
    """Forward finite-difference derivative of K with respect to h.

    Both inputs are differenced with np.diff, so the result is one element
    shorter than the inputs: entry i is (K[i+1] - K[i]) / (h[i+1] - h[i]).
    """
    rise = np.diff(K)
    run = np.diff(h)
    return rise / run


# Distances to evaluate, in microns (d) and converted to pixels (dpix)
res_mpix = fsi.get_resolution(czi_fns[0])
res_umpix = res_mpix * 1e6
d = np.arange(0, dmax, stepsize)
dpix = d / res_umpix
# Observation window: convex hull around all cell centroids
convex_hull = ps.cg.convex_hull(coords.tolist())
ch_arr = np.array(to_ccf(convex_hull))
plt.plot(ch_arr[:,1], ch_arr[:,0])
plt.gca().invert_yaxis()
window = Window([convex_hull])


def _plot_against_distance(x, y, color):
    """Plot y against pixel distance x on a new figure.

    Ticks are placed every 10 microns (labelled in microns, positioned in
    pixels), a black zero line is drawn and the x range is clipped to the
    ticks. Returns the figure, the axes and the tick labels/positions.
    """
    fig, ax = ip.general_plot(dims=dims, ft=ft, col='k', lw=lw)
    ax.plot(x, y, lw=lw, color=color)
    xlab = np.arange(0, dmax, 10)
    xticks = xlab / res_umpix
    _ = ax.set_xticks(xticks, labels=xlab)
    ax.plot([xticks[0], xticks[-1]],[0,0], 'k')
    ax.set_xlim([xticks[0], xticks[-1]])
    return fig, ax, xlab, xticks


for scn in scn_unq[3:5]:
    print(scn)
    # Point pattern of this taxon's cells inside the shared window
    col_obs = dict_sciname_color[scn]
    bool_scn = scinames == scn
    coord_scn = coords[bool_scn]
    pp = PointPattern(coord_scn, window=window)
    # L function evaluated on the pixel distance grid
    lenv = dst.L(pp, d=dpix)
    x = lenv.d
    fig, ax, xlab, xticks = _plot_against_distance(x, lenv.l, col_obs)
    # First derivative of L (one sample shorter than L)
    h = lenv.d
    Lpobs = deriv(lenv.l, h)
    x = h[:-1]
    fig, ax, xlab, xticks = _plot_against_distance(x, Lpobs, col_obs)
    # Second derivative of L, differentiating on the shortened grid
    h = x
    Lppobs = deriv(Lpobs, h)
    x = h[:-1]
    fig, ax, xlab, xticks = _plot_against_distance(x, Lppobs, col_obs)


Set up simulation envelope

In [None]:

# bbox=[0,0,crf_shp[0],crf_shp[1]]
# poly = poly_from_bbox(bbox)
# window = as_window(poly)

convex_hull = ps.cg.convex_hull(coords.tolist())

In [None]:
ch_arr = np.array(to_ccf(convex_hull))
plt.plot(ch_arr[:,1], ch_arr[:,0])
plt.gca().invert_yaxis()

In [None]:
window = Window([convex_hull])
window.bbox

Set up point pattern

In [None]:
scn = 'Gemella'
scn

In [None]:
# Point pattern of the selected taxon's cells, bounded by the convex-hull window.
bool_scn = scinames == scn
coord_scn = coords[bool_scn]
pp = PointPattern(coord_scn, window=window)
# mge_pp = PointPattern(mge_coords_adj_order, window=window)
pp.summary()

In [None]:
# Distance grid 0, 5, ..., 200 (dmax itself is excluded by np.arange).
dmax = 205
stepsize=5

# res_umpix is used as microns per pixel (presumably get_resolution returns
# meters per pixel -- TODO confirm), so dpix is the same grid in pixels.
res_mpix = fsi.get_resolution(czi_fns[0])
res_umpix = res_mpix * 1e6
d = np.arange(0, dmax, stepsize)
dpix = d / res_umpix
d

L function

In [None]:
lenv = dst.L(pp, d=dpix)
lenv.plot()

In [None]:
# Plot the observed L function of the selected taxon.
# NOTE(review): ls_sim, env_col and xt_max are only used by the commented-out
# envelope lines below.
ls_sim = '-.'
env_col = 'k'
xt_max = 100
dims=(10,5)
ft=12
lw=2
col_obs = dict_sciname_color[scn]


fig, ax = ip.general_plot(dims=dims, ft=ft, col='k', lw=lw)
x = lenv.d
ax.plot(x, lenv.l, lw=lw, color=col_obs)
# ax.plot(x, Lpmean, lw=lw, color='k', ls=ls_sim)
# ax.plot(x, Lplow, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)
# ax.plot(x, Lphigh, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)

# Remember the data-driven x range before ticks are changed, for the zero line.
xlim = ax.get_xlim()
# xt_max = int(xlim[1] * mge_umpix)
# Ticks every 10 microns: labels in microns, positions converted to pixels.
xlab = np.arange(0, dmax, 10)
xticks = xlab / res_umpix
_ = ax.set_xticks(xticks, labels=xlab)

# Horizontal reference line at zero.
ax.plot(xlim,[0,0], 'k')
# ax.set_ylim(0,10)

L derivative

In [None]:
def deriv(K, h):
    """Return the finite-difference slope of K over h between consecutive samples.

    The output has one element fewer than the inputs.
    """
    return np.divide(np.diff(K), np.diff(h))


In [None]:
# First derivative of the observed L function with respect to distance.
h = lenv.d
Lpobs = deriv(lenv.l, h)
# Lpmean = pcf(lenv.mean, h)
# Lplow = pcf(lenv.low, h)
# Lphigh = pcf(lenv.high, h)


ls_sim = '-.'
env_col = 'k'
xt_max = 100
dims=(10,5)
ft=12
lw=2
col_obs = dict_sciname_color[scn]


fig, ax = ip.general_plot(dims=dims, ft=ft, col='k', lw=lw)
# The derivative is one sample shorter than L, so drop the last distance.
x = h[:-1]
ax.plot(x, Lpobs, lw=lw, color=col_obs)
# ax.plot(x, Lpmean, lw=lw, color='k', ls=ls_sim)
# ax.plot(x, Lplow, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)
# ax.plot(x, Lphigh, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)

xlim = ax.get_xlim()
# xt_max = int(xlim[1] * mge_umpix)
# Ticks every 10 microns: labels in microns, positions converted to pixels.
xlab = np.arange(0, dmax, 10)
xticks = xlab / res_umpix
_ = ax.set_xticks(xticks, labels=xlab)

# Horizontal reference line at zero.
ax.plot(xlim,[0,0], 'k')
# ax.set_ylim(0,10)

L second derivative

In [None]:
# Second derivative of L: differentiate Lpobs on the distance grid with its
# last point dropped, which has the same length as Lpobs.
h = lenv.d[:-1]
Lppobs = deriv(Lpobs, h)
# Lpmean = pcf(lenv.mean, h)
# Lplow = pcf(lenv.low, h)
# Lphigh = pcf(lenv.high, h)


ls_sim = '-.'
env_col = 'k'
xt_max = 100
dims=(10,5)
ft=12
lw=2

fig, ax = ip.general_plot(dims=dims, ft=ft, col='k', lw=lw)
# One more sample is lost by the second differencing.
x = h[:-1]
# col_obs is not reassigned in this cell; it keeps its value from an earlier cell.
ax.plot(x, Lppobs, lw=lw, color=col_obs)
# ax.plot(x, Lpmean, lw=lw, color='k', ls=ls_sim)
# ax.plot(x, Lplow, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)
# ax.plot(x, Lphigh, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)

xlim = ax.get_xlim()
# xt_max = int(xlim[1] * mge_umpix)
xlab = np.arange(0, dmax, 10)
xticks = xlab / res_umpix
_ = ax.set_xticks(xticks, labels=xlab)
# Horizontal reference line at zero.
ax.plot(xlim,[0,0], 'k')

In [None]:
# realizations = csr(pp.window, pp.n, 100, asPP=True) # simulate CSR 
# reals = realizations.realizations

In [None]:
# Nearest-neighbor distances from the point pattern, converted to microns.
# NOTE: rebinds `nn_dists`, which the adjacency cell used for the
# radius-query distances.
nn_dists = pp.nnd.squeeze()
nn_dists_um = nn_dists * res_umpix
# Largest nearest-neighbor distance, truncated to whole microns.
n_ints_1um = int(np.max(nn_dists_um))

n_ints_1um

In [None]:
Kenv = dst.K(pp, d=dpix)
Kenv.plot()

In [None]:
# Plot Ripley's K against distance on log-log axes together with a power-law
# fit obtained by linear regression of ln(K) on ln(d).
ls_sim = '-.'
env_col = 'k'
xt_max = 100
dims=(10,5)
ft=12
lw=2
col_obs = dict_sciname_color[scn]


fig, ax = ip.general_plot(dims=dims, ft=ft, col='k', lw=lw)
# x is the distance grid `d`, while Kenv was evaluated at the pixel grid dpix.
x = d
ax.plot(x, Kenv.k, lw=lw, color=col_obs)
# ax.plot(x, Lpmean, lw=lw, color='k', ls=ls_sim)
# ax.plot(x, Lplow, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)
# ax.plot(x, Lphigh, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)

# Skip the first sample: the distance grid starts at 0, whose log is undefined.
lndist = np.log(d[1:])
lnk = np.log(Kenv.k[1:])
model = LinearRegression().fit(lndist[:, None], lnk)
# NOTE(review): the same regression is computed a second time with scipy; the
# unpacked values are not used later in this cell (only printed below).
slope, intercept, r_value, p_value, std_err = stats.linregress(lndist, lnk)
print('slope: ', model.coef_, 'intercept: ', model.intercept_)
print(stats.pearsonr(lndist, lnk))
print(stats.linregress(lndist, lnk))

# Fitted power law K = exp(intercept) * d**slope, drawn over the data.
y = x**(model.coef_[0]) * math.exp(model.intercept_)
ax.plot(x, y, 'k')

# The axes themselves are log-scaled, so tick values are the raw quantities.
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel('log10(distance (μm))')
ax.set_ylabel("log10(Ripley's K)")





In [None]:
y

In [None]:
def pcf(K, h):
    """Pair-correlation estimate from K: dK/dh divided by 2 * pi * h.

    The derivative is a forward finite difference, and each difference is
    divided by 2 * pi times the upper distance of its interval (h[1:]), so
    the result is one element shorter than the inputs. h must support
    slicing followed by arithmetic (e.g. a numpy array).
    """
    slope = np.diff(K) / np.diff(h)
    ring = 2 * h[1:] * np.pi
    return slope / ring

In [None]:
# Pair-correlation curve of the selected taxon, derived from Ripley's K.
col_obs = dict_sciname_color[scn]

Robs = pcf(Kenv.k, Kenv.d)
# Rmean = pcf(Kenv.mean, Kenv.d)
# Rlow = pcf(Kenv.low, Kenv.d)
# Rhigh = pcf(Kenv.high, Kenv.d)


ls_sim = '-.'
env_col = 'k'
xt_max = 100
dims=(10,5)
ft=12
lw=2

fig, ax = ip.general_plot(dims=dims, ft=ft, col='k', lw=lw)
# NOTE(review): values are plotted at the lower distance of each interval
# (d[:-1]) although pcf normalizes by the upper one (h[1:]) -- confirm intent.
x = Kenv.d[:-1]
ax.plot(x, Robs, lw=lw, color=col_obs)
# ax.plot(x, Rmean, lw=lw, color='k', ls=ls_sim)
# ax.plot(x, Rlow, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)
# ax.plot(x, Rhigh, lw=lw, color=(0.5,0.5,0.5), ls=ls_sim)

xlim = ax.get_xlim()
# xt_max = int(xlim[1] * mge_umpix)
# Ticks every 5 microns, placed only up to xt_max (labels in microns,
# positions converted to pixels).
xlab = np.arange(0, xt_max, 5)
xticks = xlab / res_umpix
_ = ax.set_xticks(xticks, labels=xlab)

# ax.set_ylim(0,10)

# ax.set_ylim(0,10)

## Lena pair correlation

In [None]:
# Load one image's centroid table and parse the "coord" strings into tuples
# of floats by stripping brackets and commas and splitting on whitespace.
data = pd.read_csv("data/2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_01_centroid_sciname.csv", index_col=0)
spatial_temp = data['coord']
_strip_brackets_commas = str.maketrans('', '', '[],')
spatial = [
    tuple(float(tok) for tok in line.translate(_strip_brackets_commas).split())
    for line in spatial_temp
]
# Bounding box of the points along the first and second coordinate
_pts = np.array(spatial)
x_min = _pts[:, 0].min(); x_max = _pts[:, 0].max()
y_min = _pts[:, 1].min(); y_max = _pts[:, 1].max()
# Study-window area (bounding box) and largest radius: half the shorter side
area = (x_max - x_min)*(y_max - y_min)
rmax = min(x_max - x_min, y_max - y_min) / 2 # np.sqrt(area)/2
# 50 evenly spaced radii from 0 to rmax, stored as a column vector
radii = np.linspace(0, rmax, 50)[:, None]
def ripley_K(spatial, radii, window_area=None):
    """Estimate Ripley's K at each radius, with no edge correction.

    For every radius r the estimate is
    ``A * 2 * (number of point pairs closer than r) / (n * (n - 1))``,
    where n is the number of points and A the study-window area.

    Parameters
    ----------
    spatial : sequence of (x, y) pairs
        Point coordinates.
    radii : array-like
        Radii to evaluate; flattened, so a column vector such as the
        notebook's (50, 1) ``radii`` is accepted.
    window_area : float, optional
        Area A of the study window. When omitted, the notebook-level
        ``area`` variable is used, as before.

    Returns
    -------
    numpy.ndarray
        One K value per radius. All NaN when fewer than two points are
        given, because no pair exists.
    """
    from scipy.spatial.distance import pdist

    if window_area is None:
        window_area = area
    pts = np.asarray(spatial, dtype=float)
    npts = pts.shape[0]
    radii_flat = np.asarray(radii, dtype=float).ravel()
    if npts < 2:
        return np.full(radii_flat.shape, np.nan)
    # Condensed vector of the n*(n-1)/2 pairwise Euclidean distances, sorted
    # once so every radius becomes a binary search instead of a full scan.
    pair_dists = np.sort(pdist(pts))
    # In a sorted array, the left insertion point of r equals the number of
    # entries strictly smaller than r (the indicator dist < r, summed).
    counts = np.searchsorted(pair_dists, radii_flat, side="left")
    return window_area * 2.0 * counts / (npts * (npts - 1))
def ripley_K_boundary(spatial, radii=radii):
    """Estimate Ripley's K at each radius using a per-pair edge-correction weight.

    Every pair (i, j) with i < j that is closer than the radius contributes
    ``1 / weight`` instead of 1. The weight is computed from the pair distance
    and from point i's distances, along each axis, to the nearest side of the
    bounding box of ``spatial``.

    Notes
    -----
    * The default ``radii`` is bound once, when the function is defined.
    * The final scaling reads the module-level ``area`` variable.
    * NOTE(review): coincident points give a pair distance of 0, which is used
      as a divisor in the weights below.
    """
    ## Ripley's K with edge correction
    ripley = np.zeros(len(radii))
    #npts = len(spatial)
    npts = np.shape(spatial)[0]                                 # Number of events in A
    diff = np.zeros(shape = (npts*(npts-1)//2,2))               # |dx|, |dy| for every pair i < j
    k = 0
    # Fill `diff` pair by pair: point i against all later points i+1..n-1
    for i in range(npts - 1):
        size = npts - i - 1
        diff[k:k + size] = abs(np.array(spatial[i]) - np.array(spatial[i+1:])) # distance
        k += size
    # Bounding box of the points passed in
    x_min = np.min(np.array(spatial).T[0]); x_max = np.max(np.array(spatial).T[0])
    y_min = np.min(np.array(spatial).T[1]); y_max = np.max(np.array(spatial).T[1])
    hor_dist = np.zeros(shape=(npts * (npts - 1)) // 2, dtype=np.double)
    ver_dist = np.zeros(shape=(npts * (npts - 1)) // 2, dtype=np.double)
    for k in range(npts - 1):                           # Finds horizontal and vertical distances from every event to nearest edge
        min_hor_dist = min(x_max - spatial[k][0], spatial[k][0] - x_min)
        min_ver_dist = min(y_max - spatial[k][1], spatial[k][1] - y_min)
        # start/end delimit the slots of the pairs whose first member is
        # point k, matching the layout used to fill `diff` above
        start = (k * (2 * (npts - 1) - (k - 1))) // 2
        end = ((k + 1) * (2 * (npts - 1) - k)) // 2
        hor_dist[start: end] = min_hor_dist * np.ones(npts - 1 - k)
        ver_dist[start: end] = min_ver_dist * np.ones(npts - 1 - k)
    dist = np.hypot(diff[:, 0], diff[:, 1])
    dist_ind = dist <= np.hypot(hor_dist, ver_dist)     # True if distance between events is less than or equal to distance to edge
    # Two candidate weights; w1 applies where dist_ind is True, w2 elsewhere
    w1 = (1 - (np.arccos(np.minimum(ver_dist, dist) / dist) + np.arccos(np.minimum(hor_dist, dist) / dist)) / np.pi)
    w2 = (3 / 4 - 0.5 * (np.arccos(ver_dist / dist * ~dist_ind) + np.arccos(hor_dist / dist * ~dist_ind)) / np.pi)
    weight = dist_ind * w1 + ~dist_ind * w2              # Weighting term
    for r in range(len(radii)):
        ripley[r] = ((dist < radii[r]) / weight).sum()   # Indicator function with weighting term
    # Same scaling as the uncorrected estimator: A * 2 * sum / (n * (n - 1))
    ripley = area * 2. * ripley / (npts * (npts - 1))
    return ripley
# Evaluate both estimators on the same points and radii. The uncorrected
# estimate gets its own name so the ripley_K function is not overwritten by
# its result (which would make any later call to ripley_K fail).
ripley_K_plain = ripley_K(spatial, radii)
ripley_K_edge = ripley_K_boundary(spatial, radii)
# plt.subplots returns (figure, axes), in that order.
fig, ax = plt.subplots()
ax.plot(radii, ripley_K_edge, color='red', linewidth=2, label='K_edgeCorrect')
ax.plot(radii, ripley_K_plain, color='black', linewidth=2, label='K')
# Both curves are drawn, so the title names both.
ax.set_title('K Function with and without edge adjustment', size=20)
# NOTE(review): radii are in the units of the "coord" column -- confirm that
# "(m)" is the right unit label.
ax.set_xlabel('Radii (m)', size=15)
ax.set_ylabel('K', size=15)
ax.legend();

## Cluster Size

In [None]:
import numpy as np
import hdbscan
import libpysal as ps
from pointpats.window import Window

# For each taxon: cluster its cells with HDBSCAN, plot the clusters with their
# convex hulls, then fit a rank-size (power-law) relation to the cluster sizes.
xlim = (0, np.max(coords[:, 1]))
ylim = (0, np.max(coords[:, 0]))

dims_im = (20,20)
spot_size = 10
lw=2
ft=20

# Column name -> list of values, one entry per taxon (NaN when no fit is possible).
dict_df = defaultdict(list)
_fit_keys = ('rval', 'pval', 'alpha', 'coef', 'intercept', 'score')
for scn in scn_unq:
    # Cluster this taxon's cell coordinates; label -1 marks unclustered cells
    col = dict_sciname_color[scn]
    bool_scn = scinames == scn
    coord_scn = coords[bool_scn]
    hdb = hdbscan.HDBSCAN(min_cluster_size=10).fit(coord_scn).labels_

    # Plot clustered cells colored by cluster label, unclustered cells in grey
    fig, ax = ip.general_plot(col='w', dims=dims_im, lw=lw, ft=ft)
    coord_scn_cl = coord_scn[hdb != -1]
    hdb_cl = hdb[hdb != -1]
    ax.scatter(coord_scn_cl[:,1], coord_scn_cl[:,0], s=spot_size, c=hdb_cl, cmap='gist_rainbow')
    coord_scn_non = coord_scn[hdb == -1]
    ax.scatter(coord_scn_non[:,1], coord_scn_non[:,0], s=spot_size, color=[0.75]*3)
    ax.set_xlim(xlim[0], xlim[1])
    ax.set_ylim(ylim[0], ylim[1])
    ax.invert_yaxis()
    ax.set_aspect('equal')
    # print(np.unique(hdb))

    # Count cells per cluster and outline each cluster with its convex hull
    # areas = []
    counts = []
    for cl in np.unique(hdb_cl):
        coords_cl = coord_scn[hdb == cl]
        ch_cl = ps.cg.convex_hull(coords_cl.tolist())
        ch_arr = np.array(to_ccf(ch_cl))
        ax.plot(ch_arr[:,1], ch_arr[:,0],'w', lw=lw)
        # w = Window([ch_cl])
        # areas.append(w.area)
        counts.append(coords_cl.shape[0])
        # cent = w.centroid
        # ax.text(cent[1], cent[0], int(w.area), color='w', fontsize=ft)
    plt.show()
    plt.close()

    # Record which taxon this row belongs to so the table can be interpreted.
    dict_df['sciname'].append(scn)

    # Rank-size power law (Mori, Smith, Hsu, 2020)
    counts_sort = sorted(counts)
    nclust = len(counts_sort)
    if nclust < 2:
        # A correlation / regression needs at least two clusters; keep the
        # row aligned with NaN placeholders instead of raising.
        for key in _fit_keys:
            dict_df[key].append(np.nan)
        continue
    # Rank of each cluster among the sizes sorted ascending: the smallest
    # cluster has rank nclust, the largest has rank 1.
    ranks = np.arange(nclust, 0, -1)
    # Gabaix and Ibragimov (2011): use (rank - 1/2) inside the log. Subtracting
    # 0.5 after taking the log would only shift the intercept and leave the
    # estimated exponent unchanged.
    Pr = (ranks - 0.5) / nclust
    fig, ax = ip.general_plot()
    lnpr = np.log(Pr)
    lncs = np.log(counts_sort)
    ax.plot(lnpr, lncs)
    plt.show()
    plt.close()
    rval, pval = stats.pearsonr(lnpr, lncs)
    dict_df['rval'].append(rval)
    dict_df['pval'].append(pval)
    # Regress ln(rank share) on ln(cluster size)
    model = LinearRegression().fit(lncs[:, None], lnpr)
    dict_df['alpha'].append(1 / model.coef_[0])
    dict_df['coef'].append(model.coef_[0])
    dict_df['intercept'].append(model.intercept_)
    dict_df['score'].append(model.score(lncs[:, None], lnpr))



In [None]:
pd.DataFrame(np.array([[0,1],[2,3]]), columns=['a','b'], index=['1','2'])

# Look at feature matrix

In [None]:
sample_compare_dir =  '../outputs/compare_samples'
feature_matrix_fn = sample_compare_dir + '/samples_feature_matrix.csv'

In [None]:
# Load the feature matrix (first CSV column becomes the row index), replace
# missing values with 0 and keep the raw values as a numpy array for the
# embeddings below.
feature_matrix = pd.read_csv(feature_matrix_fn, index_col=0)
feature_matrix = feature_matrix.fillna(0)
X = feature_matrix.values
feature_matrix.shape

### UMAP

In [None]:
# UMAP embedding of the feature matrix rows.
# n_neighbors=2, min_dist=1

fig, ax = ip.general_plot(dims=(10,10), col='w')
fit = umap.UMAP(n_neighbors=2, min_dist=1).fit(X)
# One embedded point per row of X; the first two columns are plotted.
u = fit.embedding_
ax.scatter(u[:,0], u[:,1], s=10)
ax.set_aspect('equal')


In [None]:
kmeans = clst.KMeans(n_clusters=9).fit(u)
cl_kmeans = kmeans.labels_

In [None]:
fig, ax = ip.general_plot(dims=(10, 10), col="w")

ax.scatter(u[:, 0], u[:, 1], s=10, c=cl_kmeans, cmap='tab10')
ax.set_aspect("equal")

In [None]:
for cl in np.unique(cl_kmeans):
    print(cl)
    bool_cl = cl_kmeans == cl
    print(feature_matrix.index.values[bool_cl])

### PCA

In [None]:

pca = PCA(n_components=10).fit(X)
pca.explained_variance_ratio_

In [None]:
cols = feature_matrix.columns.values

# For the first four principal components, print the three smallest and the
# three largest loadings, each followed by the feature name(s) carrying
# exactly that loading value.
for i, c in enumerate(pca.components_):
    if i >= 4:
        continue
    print(i)
    ranked = np.sort(c)
    vlow, vhigh = ranked[:3], ranked[-3:]
    for v in (vlow, vhigh):
        for v_ in v:
            bool_v = c == v_
            print(v_)
            print(cols[bool_v])



In [None]:
# PCA on standardized features (each column scaled to zero mean, unit variance).
pca = PCA()
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
pca_fit = pca.fit(Xs)
# Project the standardized matrix the PCA was fitted on. Transforming the
# unscaled X would apply the components to data in a different scale and
# offset than the model was fitted to.
Xt = pca_fit.transform(Xs)
plot = plt.scatter(Xt[:, 0], Xt[:, 1])

In [None]:
for i, c in enumerate(pca_fit.components_):
    if i < 4:
        print(i)
        vlow = np.sort(c)[:3]
        vhigh = np.sort(c)[-3:]
        for v in [vlow, vhigh]:
            for v_ in v:
                bool_v = c == v_
                print(v_)
                print(cols[bool_v])

In [None]:
kmeans_pca = clst.KMeans(n_clusters=3).fit(Xt[:,:2])
cl_kmeans_pca = kmeans_pca.labels_

In [None]:
fig, ax = ip.general_plot(dims=(10, 10), col="w")

ax.scatter(Xt[:, 0], Xt[:, 1], s=20, c=cl_kmeans_pca, cmap="tab10")
ax.set_aspect("equal")

In [None]:
for cl in np.unique(cl_kmeans_pca):
    print(cl)
    bool_cl = cl_kmeans_pca == cl
    print(feature_matrix.index.values[bool_cl])

### Compare groups

In [None]:
# Group label -> list of sample names (date-prefixed, one per field of view).
dict_group_sn = {
    "healthy_tooth": [
        "2023_02_18_hsdm_group_batch1_patient_1_fov_01",
        "2023_02_18_hsdm_group_batch1_patient_1_fov_02",
        "2023_02_18_hsdm_group_batch2_patient_10_fov_01",
        "2023_02_18_hsdm_group_batch2_patient_9_fov_01",
        "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_01",
        "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_02",
        "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_03",
        "2022_12_16_harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_01",
        "2022_12_16_harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_02",
        "2022_12_16_harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_03",
        "2022_12_16_harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_01",
        "2022_12_16_harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_02",
        "2022_12_16_harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_03",
        "2022_12_16_harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_01",
        "2022_12_16_harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_02",
        "2022_12_16_harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_03",
    ],
    "periodontitis_tooth": [
        "2023_02_18_hsdm_group_batch2_patient_11_fov_01",
        "2023_02_18_hsdm_group_batch1_patient_2_fov_01",
        "2023_02_18_hsdm_group_batch1_patient_3_fov_01",
        "2023_02_18_hsdm_group_batch1_patient_4_fov_01",
        "2023_02_18_hsdm_group_batch1_patient_7_fov_01",
    ],
    "healthy_implant": [
        "2023_02_18_hsdm_group_I_patient_11_fov_01",
        "2023_02_18_hsdm_group_I_patient_11_fov_02",
        "2023_02_18_hsdm_group_I_patient_13_fov_01",
        "2023_02_18_hsdm_group_I_patient_6_fov_01",
        "2023_10_16_hsdm_slide_IB_fov_01",
        "2023_10_16_hsdm_slide_IB_fov_03",
        "2023_02_08_hsdm_group_1_sample_06_fov_01",
        "2023_02_08_hsdm_group_1_sample_11_fov_01",
        "2023_02_08_hsdm_group_1_sample_12_fov_01",
        "2023_10_16_hsdm_slide_IB_fov_02",
        "2023_10_16_hsdm_slide_IL_fov_01",
        "2023_10_16_hsdm_slide_IL_fov_02",
        "2023_10_16_hsdm_slide_IL_fov_03",
        "2024_04_24_hsdm_group_I_patient_16_aspect_MB_fov_01",
        "2024_04_24_hsdm_group_I_patient_16_aspect_MB_fov_02",
        "2024_04_24_hsdm_group_I_patient_16_aspect_MB_fov_03",
    ],
    "severe_peri_implantitis": [
        "2023_02_18_hsdm_group_IV_patient_1_fov_01",
        "2023_02_18_hsdm_group_IV_patient_1_fov_02",
        "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_01",
        "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_02",
        "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_03",
        # NOTE(review): the "depth" field is empty in the next name ("depth__")
        # -- confirm it matches the actual filename.
        "2022_12_16_harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01",
        "2022_12_16_harvardwelch_patient_19_tooth_15_aspect_MF_depth_sub_fov_01",
        "2022_12_16_harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01",
        "2023_02_08_hsdm_group_4_sample_01_fov_01",
        "2023_02_08_hsdm_group_4_sample_01_fov_02",
        "2024_04_16_hsdmgel_group_IV_pat_4_asp_MB_fov_02",
        "2024_05_03_hsdm_group_IV_patient_4_aspect_ML_fov_01",
        "2024_05_03_hsdm_group_IV_patient_4_aspect_ML_fov_02",
        "2024_05_03_hsdm_group_IV_patient_3_aspect_DL_fov_02",
        "2024_05_03_hsdm_group_IV_patient_3_aspect_DL_fov_03",
    ],
    "moderate_peri_implantitis": [
        "2023_10_18_hsdm_slide_IIIB_fov_01",
        "2023_10_18_hsdm_slide_IIIL_fov_01",
        "2024_04_16_hsdmgel_group_III_pat_6_asp_ML_fov_03",
        "2024_04_19_hsdm_group_III_patient_7_aspect_ML_fov_02",
        "2024_04_19_hsdm_group_III_patient_7_aspect_ML_fov_03",
        "2024_04_19_hsdm_group_III_patient_7_aspect_ML_fov_04",
        "2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_01",
        "2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_03",
        "2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01",
        "2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_02",
        "2024_05_03_hsdm_group_III_patient_3_aspect_DL_fov_02",
        "2024_05_03_hsdm_group_III_patient_3_aspect_DL_fov_03",
    ],
    "mucositis_implant": [
        "2023_02_18_hsdm_group_II_patient_6_fov_01",
        "2023_02_18_hsdm_group_II_patient_7_fov_01",
        "2023_02_18_hsdm_group_II_patient_7_fov_02",
        "2023_02_08_hsdm_group_2_sample_06_fov_01",
        "2023_02_08_hsdm_group_2_sample_06_fov_02",
        "2023_10_18_hsdm_slide_IIL_fov_01",
        "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_04",
        "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_05",
        "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_06",
        "2024_04_19_hsdm_group_II_patient_14_aspect_MB_fov_01",
        "2024_04_19_hsdm_group_II_patient_14_aspect_MB_fov_02",
        "2024_04_19_hsdm_group_II_patient_15_aspect_MB_fov_01",
        "2024_04_19_hsdm_group_II_patient_15_aspect_MB_fov_02",
        "2024_04_27_hsdm_group_II_patient_11_aspect_DL_fov_02",
        "2024_04_27_hsdm_group_II_patient_11_aspect_DL_fov_03",
        "2024_04_27_hsdm_group_II_patient_11_aspect_DL_fov_04",
    ],

}

In [None]:
# Nested lookup: clinical group -> patient label -> list of sample names.
# The patient labels repeat across groups (e.g. "patient_6" appears in three
# groups), so a patient label is only meaningful together with its group key.
dict_group_pat_sn = {
    "healthy_implant": {
        "patient_11": [
            "2023_02_18_hsdm_group_I_patient_11_fov_01",
            "2023_02_18_hsdm_group_I_patient_11_fov_02",
            "2023_02_08_hsdm_group_1_sample_11_fov_01",
        ],
        "patient_16": [
            "2024_04_24_hsdm_group_I_patient_16_aspect_MB_fov_01",
            "2024_04_24_hsdm_group_I_patient_16_aspect_MB_fov_02",
            "2024_04_24_hsdm_group_I_patient_16_aspect_MB_fov_03",
            "2024_04_24_hsdm_group_I_patient_16_aspect_MB_fov_04",
        ],
        "patient_13": [
            "2023_02_18_hsdm_group_I_patient_13_fov_01",
        ],
        "patient_6": [
            "2023_02_18_hsdm_group_I_patient_6_fov_01",
            "2023_02_08_hsdm_group_1_sample_06_fov_01",
        ],
        "patient_12": [
            "2023_02_08_hsdm_group_1_sample_12_fov_01",
        ],
        "patient_4": [
            "2023_10_16_hsdm_slide_IB_fov_01",
            "2023_10_16_hsdm_slide_IB_fov_02",
            "2023_10_16_hsdm_slide_IB_fov_03",
            # NOTE(review): the next entry repeats IB_fov_02 from two lines
            # above — confirm whether a different FOV was intended.
            "2023_10_16_hsdm_slide_IB_fov_02",
            "2023_10_16_hsdm_slide_IL_fov_01",
            "2023_10_16_hsdm_slide_IL_fov_02",
            "2023_10_16_hsdm_slide_IL_fov_03",
        ],
        "patient_10": [
            "2024_04_16_hsdmgel_group_I_pat_10_asp_DL_fov_01",
            "2024_04_16_hsdmgel_group_I_pat_10_asp_DL_fov_02",
        ]
    },
    "mucositis_implant": {
        "patient_6":[
            "2023_02_18_hsdm_group_II_patient_6_fov_01",
            "2023_02_08_hsdm_group_2_sample_06_fov_01",
            "2023_02_08_hsdm_group_2_sample_06_fov_02",
        ],
        "patient_7":[
            "2023_02_18_hsdm_group_II_patient_7_fov_01",
            "2023_02_18_hsdm_group_II_patient_7_fov_02",
        ],
        "patient_9":[
            "2023_10_18_hsdm_slide_IIL_fov_01",
            "2023_10_16_hsdm_slide_IIB_fov_01",
        ],
        "patient_13":[
            "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_01",
            "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_02",
            "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_03",
            "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_04",
            "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_05",
            "2024_04_19_hsdm_group_II_patient_13_aspect_MB_fov_06",
        ],
        "patient_14":[
            "2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_01",
            "2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_02",
            "2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_03",
            "2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_04",
            "2024_04_19_hsdm_group_II_patient_14_aspect_MB_fov_01",
            "2024_04_19_hsdm_group_II_patient_14_aspect_MB_fov_02",
        ],
        "patient_15":[
            "2024_04_19_hsdm_group_II_patient_15_aspect_MB_fov_01",
            "2024_04_19_hsdm_group_II_patient_15_aspect_MB_fov_02",
        ],
        "patient_11":[
            "2024_04_27_hsdm_group_II_patient_11_aspect_DL_fov_01",
            "2024_04_27_hsdm_group_II_patient_11_aspect_DL_fov_02",
            "2024_04_27_hsdm_group_II_patient_11_aspect_DL_fov_03",
            "2024_04_27_hsdm_group_II_patient_11_aspect_DL_fov_04",
        ],
        "patient_12":[
            "2024_04_27_hsdm_group_II_patient_12_aspect_ML_fov_01",
            "2024_04_27_hsdm_group_II_patient_12_aspect_ML_fov_02",
            "2024_04_27_hsdm_group_II_patient_12_aspect_ML_fov_03",
            "2024_04_27_hsdm_group_II_patient_12_aspect_ML_fov_04",
        ],
        "patient_8":[
            "2024_04_27_hsdm_group_II_patient_8_aspect_DL_fov_01",
            "2024_04_27_hsdm_group_II_patient_8_aspect_DL_fov_02",
        ]
    },
    
    "mild_peri_implantitis": {
        "patient_2":[
            "2023_10_18_hsdm_slide_IIIB_fov_01",
            "2023_10_18_hsdm_slide_IIIL_fov_01",
        ],
        "patient_6":[
            "2024_04_16_hsdmgel_group_III_pat_6_asp_ML_fov_01",
            "2024_04_16_hsdmgel_group_III_pat_6_asp_ML_fov_02",
            "2024_04_16_hsdmgel_group_III_pat_6_asp_ML_fov_03",
        ],
        "patient_7":[
            "2024_04_19_hsdm_group_III_patient_7_aspect_ML_fov_01",
            "2024_04_19_hsdm_group_III_patient_7_aspect_ML_fov_02",
            "2024_04_19_hsdm_group_III_patient_7_aspect_ML_fov_03",
            "2024_04_19_hsdm_group_III_patient_7_aspect_ML_fov_04",
        ],
        "patient_5":[
            "2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_01",
            "2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_02",
            "2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_03",
            "2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_04",
        ],
        "patient_8":[
            "2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_01",
            "2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_02",
            "2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_03",
        ],
        "patient_11":[
            "2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01",
            "2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_02",
        ],
        "patient_3":[
            "2024_05_03_hsdm_group_III_patient_3_aspect_DL_fov_01",
            "2024_05_03_hsdm_group_III_patient_3_aspect_DL_fov_02",
            "2024_05_03_hsdm_group_III_patient_3_aspect_DL_fov_03",
        ],
    },

    "moderate_severe_peri_implantitis": {
        "patient_1":[
            "2023_02_18_hsdm_group_IV_patient_1_fov_01",
            "2023_02_18_hsdm_group_IV_patient_1_fov_02",
            "2023_02_08_hsdm_group_4_sample_01_fov_01",
            "2023_02_08_hsdm_group_4_sample_01_fov_02",
        ],
        "patient_14":[
            "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_01",
            "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_02",
            "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_03",
        ],
        "patient_18":[
            "2022_12_16_harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01",
        ],
        "patient_19":[
            "2022_12_16_harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01",
        ],
        "patient_4":[
            "2024_04_16_hsdmgel_group_IV_pat_4_asp_MB_fov_01",
            "2024_04_16_hsdmgel_group_IV_pat_4_asp_MB_fov_02",
            "2024_05_03_hsdm_group_IV_patient_4_aspect_ML_fov_01",
            "2024_05_03_hsdm_group_IV_patient_4_aspect_ML_fov_02",
            "2024_05_03_hsdm_group_IV_patient_4_aspect_ML_fov_03",
        ],
        "patient_3":[
            "2024_05_03_hsdm_group_IV_patient_3_aspect_DL_fov_01",
            "2024_05_03_hsdm_group_IV_patient_3_aspect_DL_fov_02",
            "2024_05_03_hsdm_group_IV_patient_3_aspect_DL_fov_03",
        ],
        "patient_2":[
            "2023_10_18_hsdm_slide_IVL_fov_01",
            "2023_10_18_hsdm_slide_IVB_fov_01",
        ]
    },


}

In [None]:
# Sample names to leave out of downstream analyses.
# NOTE(review): the code that applies this list, and the criterion used to
# pick these samples, are not visible in this section — confirm with the author.
exclude_sns = [
    '2023_10_16_hsdm_slide_IB_fov_01', 
    '2023_10_16_hsdm_slide_IB_fov_03', 
    '2023_10_16_hsdm_slide_IB_fov_02', 
    '2022_12_16_harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01',
    '2022_12_16_harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01',
    '2023_02_18_hsdm_group_II_patient_7_fov_02',
    '2023_02_18_hsdm_group_II_patient_7_fov_01',
    '2024_04_16_hsdmgel_group_I_pat_10_asp_DL_fov_01',
    '2024_04_16_hsdmgel_group_I_pat_10_asp_DL_fov_02',
    '2024_04_27_hsdm_group_II_patient_12_aspect_ML_fov_01',
    '2024_04_27_hsdm_group_II_patient_12_aspect_ML_fov_02',
    '2024_04_27_hsdm_group_II_patient_12_aspect_ML_fov_03',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_01',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_02',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_03',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_04',
    '2024_04_27_hsdm_group_II_patient_8_aspect_DL_fov_01',
    '2024_04_27_hsdm_group_II_patient_8_aspect_DL_fov_02',
    '2023_10_18_hsdm_slide_IIIB_fov_01',
    '2024_04_16_hsdmgel_group_III_pat_6_asp_ML_fov_01',
    '2024_05_03_hsdm_group_III_patient_3_aspect_DL_fov_01',
    '2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_01',
    '2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_03',
    '2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_04',
    '2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_01',
    '2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_02',    
    '2024_05_03_hsdm_group_IV_patient_3_aspect_DL_fov_01',
]

In [None]:

# dict_group_sn = {
#     "healthy_tooth": [
#         "2023_02_18_hsdm_group_batch1_patient_1_fov_01",
#         "2023_02_18_hsdm_group_batch1_patient_1_fov_02",
#         "2023_02_18_hsdm_group_batch2_patient_10_fov_01",
#         "2023_02_18_hsdm_group_batch2_patient_9_fov_01",
#         "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_01",
#         "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_02",
#         "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_03",
#         "2022_12_16_harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_01",
#         "2022_12_16_harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_02",
#         "2022_12_16_harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_03",
#         "2022_12_16_harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_01",
#         "2022_12_16_harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_02",
#         "2022_12_16_harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_03",
#         "2022_12_16_harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_01",
#         "2022_12_16_harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_02",
#         "2022_12_16_harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_03",
#     ],
#     "periodontitis_tooth": [
#         "2023_02_18_hsdm_group_batch2_patient_11_fov_01",
#         "2023_02_18_hsdm_group_batch1_patient_2_fov_01",
#         "2023_02_18_hsdm_group_batch1_patient_3_fov_01",
#         "2023_02_18_hsdm_group_batch1_patient_4_fov_01",
#         "2023_02_18_hsdm_group_batch1_patient_7_fov_01",
#     ],
#     "healthy_implant": [
#         "2023_02_18_hsdm_group_I_patient_11_fov_01",
#         "2023_02_18_hsdm_group_I_patient_11_fov_02",
#         "2023_02_18_hsdm_group_I_patient_13_fov_01",
#         "2023_02_18_hsdm_group_I_patient_6_fov_01",
#         "2023_10_16_hsdm_slide_IB_fov_01",
#         "2023_10_16_hsdm_slide_IB_fov_03",
#         "2023_02_08_hsdm_group_1_sample_06_fov_01",
#         "2023_02_08_hsdm_group_1_sample_11_fov_01",
#         "2023_02_08_hsdm_group_1_sample_12_fov_01",
#         "2023_10_16_hsdm_slide_IB_fov_02",
#         "2023_10_16_hsdm_slide_IL_fov_01",
#         "2023_10_16_hsdm_slide_IL_fov_02",
#         "2023_10_16_hsdm_slide_IL_fov_03",
#     ],
#     "severe_implant": [
#         "2023_02_18_hsdm_group_IV_patient_1_fov_01",
#         "2023_02_18_hsdm_group_IV_patient_1_fov_02",
#         "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_01",
#         "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_02",
#         "2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_03",
#         "2022_12_16_harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01",
#         "2022_12_16_harvardwelch_patient_19_tooth_15_aspect_MF_depth_sub_fov_01",
#         "2022_12_16_harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01",
#         "2023_02_08_hsdm_group_4_sample_01_fov_01",
#         "2023_02_08_hsdm_group_4_sample_01_fov_02",
#     ],
#     "mucositis_implant": [
#         "2023_02_18_hsdm_group_II_patient_6_fov_01",
#         "2023_02_18_hsdm_group_II_patient_7_fov_01",
#         "2023_02_18_hsdm_group_II_patient_7_fov_02",
#         "2023_02_08_hsdm_group_2_sample_06_fov_01",
#         "2023_02_08_hsdm_group_2_sample_06_fov_02",
#         "2023_10_18_hsdm_slide_IIL_fov_01",
#     ],
#     "moderate_implant": [
#         "2023_10_18_hsdm_slide_IIIB_fov_01",
#         "2023_10_18_hsdm_slide_IIIL_fov_01",
#     ],
# }

In [None]:
# dict_sn_group = {}
# for k, vs in dict_group_sn.items():
#     for v in vs:
#         dict_sn_group[v] = k

In [None]:
# Flatten the nested group -> patient -> samples table into a direct
# sample-name -> group lookup (the patient level is dropped).
dict_sn_group = {}
for group_name, patient_table in dict_group_pat_sn.items():
    for sample_names in patient_table.values():
        dict_sn_group.update(dict.fromkeys(sample_names, group_name))

In [None]:
# Scratch check: number of occurrences of each distinct value, reported in
# ascending order of the values themselves.
a = np.array([1, 1, 1, 2, 2, 3, 3, 3, 3])
counts = np.unique(a, return_counts=True)[1]
counts

In [None]:
# Scratch check: build an example cluster-size filename, then recover the
# taxon name from it with a look-behind on the "sciname_" prefix.
a = "/{date}_{sn}_sciname_{scn}_cluster_size.npy".format(scn='Veillonella', sn=2, date=1)

sciname_match = re.search('(?<=sciname_)[A-Za-z]+', a)
sciname_match.group(0)

In [None]:
# dict_group_sn = {
#     "healthy_tooth": [
#         "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_01",
#         "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_02",
#         "2022_12_16_harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_03",
#         "hsdm_group_batch2_patient_10_fov_01_2023_02_18",
#         "hsdm_group_batch2_patient_9_fov_01_2023_02_18",
#         "harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_01_2022_12_16",
#         "harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_02_2022_12_16",
#         "harvardwelch_patient_9_tooth_15_aspect_MB_depth_supra_fov_03_2022_12_16",
#         "harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_01_2022_12_16",
#         "harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_02_2022_12_16",
#         "harvardwelch_patient_9_tooth_3_aspect_D_depth_supra_fov_03_2022_12_16",
#         "harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_01_2022_12_16",
#         "harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_02_2022_12_16",
#         "harvardwelch_patient_1_tooth_31_aspect_ML_depth_supra_fov_03_2022_12_16",
#         "hsdm_group_batch1_patient_1_fov_01_2023_02_18",
#         "hsdm_group_batch1_patient_1_fov_02_2023_02_18",
#     ],
#     "disease_tooth": [
#         "hsdm_group_batch1_patient_2_fov_01_2023_02_18",
#         "hsdm_group_batch1_patient_3_fov_01_2023_02_18",
#         "hsdm_group_batch1_patient_4_fov_01_2023_02_18",
#         "hsdm_group_batch1c_patient_7_fov_01_2023_02_18",
#         "2023_02_18_hsdm_group_batch2_patient_11_fov_01",
#     ],
#     "healthy_implant": [
#         "hsdm_group_I_patient_11_fov_01_2023_02_18",
#         "hsdm_group_I_patient_11_fov_02_2023_02_18",
#         "hsdm_group_I_patient_13_fov_01_2023_02_18",
#         "hsdm_group_I_patient_6_fov_01_2023_02_18",
#         "hsdm_group_1_sample_06_fov_01_2023_02_08",
#         "hsdm_group_1_sample_11_fov_01_2023_02_08",
#         "hsdm_group_1_sample_12_fov_01_2023_02_08",
#     ],
#     "moderate_severe_implant": [
#         "harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_01_2022_12_16",
#         "harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_03_2022_12_16",
#         "harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01_2022_12_16",
#         "harvardwelch_patient_19_tooth_15_aspect_MF_depth_sub_fov_01_2022_12_16",
#         "harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01_2022_12_16",
#         "hsdm_group_4_sample_01_fov_01_2023_02_08",
#         "hsdm_group_4_sample_01_fov_02_2023_02_08",
#         "hsdm_group_IV_patient_1_fov_01_2023_02_18",
#         "hsdm_group_IV_patient_1_fov_02_2023_02_18",
#         "harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_02_2022_12_16",
#     ],
#     "mucositis_implant": [
#         "hsdm_group_2_sample_06_fov_01_2023_02_08",
#         "hsdm_group_2_sample_06_fov_02_2023_02_08",
#         "hsdm_group_II_patient_7_fov_01_2023_02_18",
#         "hsdm_group_II_patient_7_fov_02_2023_02_18",
#         "hsdm_group_II_patient_6_fov_01_2023_02_18",
#     ],
# }

## Get cluster size distribution

In [None]:
# Path of the YAML file that the cells below load as the nested
# sample -> taxon -> cluster-sizes dictionary.
cluster_size_dict_fn = sample_compare_dir + "/dict_sample_sciname_clustersize.yaml"
# Cell output: True when that file is present on disk.
os.path.exists(cluster_size_dict_fn)

In [None]:
# Folder that receives the cluster-size distribution figures saved below.
cluster_size_dir = sample_compare_dir + '/cluster_size_distribution'

In [None]:
# Load the nested {sample name: {taxon: cluster sizes}} dictionary.
# NOTE(review): yaml.unsafe_load can construct arbitrary Python objects, so it
# must only be pointed at files this pipeline wrote itself. The loaded values
# are later used with .tolist(), so they are presumably numpy arrays, which
# yaml.safe_load would not rebuild — confirm before swapping loaders.
with open(cluster_size_dict_fn, 'r') as f:
    dict_sn_scn_clsize = yaml.unsafe_load(f)
# Cell output: number of samples in the dictionary.
len(dict_sn_scn_clsize)

Pool all clusters and plot distribution

In [None]:
# Pool every cluster size from every sample and every taxon into one flat list.
sizes_all = []
for taxon_sizes in dict_sn_scn_clsize.values():
    for sizes in taxon_sizes.values():
        sizes_all.extend(sizes.tolist())

len(sizes_all)

In [None]:
# Empirical complementary cumulative distribution of the pooled cluster
# sizes: after an ascending sort, the i-th entry (0-based) is paired with
# Pr = (nclust - i) / nclust.
counts_sort = np.sort(sizes_all)
nclust = len(counts_sort)
Pr = np.arange(nclust, 0, -1) / nclust

# Fit a straight line in log-log space using only clusters with more than 20
# and fewer than 1e4 cells.
bool_l = counts_sort > 20
bool_u = counts_sort < 1e4
in_fit = bool_l & bool_u
lnc = np.log(counts_sort[in_fit])
lnp = np.log(Pr[in_fit])
slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)

fig, ax = ip.general_plot()
ax.scatter(counts_sort, Pr, color='k')

# Overlay the fitted line Pr = exp(intercept) * x**slope. x is an ndarray so
# the power is element-wise no matter what scalar type `slope` has.
x = np.array([2e1, 3e4])
y = math.exp(intercept) * x**slope
ax.plot(x, y, 'r')
# Put the slope annotation slightly above the fitted line at x = 300.
x = 3e2
y = math.exp(intercept) * x**slope + 0.1
ax.text(x, y, 'Slope = ' + str(round(slope,2)), color='r')

ax.set_yscale("log")
ax.set_xscale("log")
# Report the actual number of pooled clusters rather than a hard-coded count.
ax.set_title('All clusters (n={})'.format(nclust))
ax.set_xlabel("log(number of cells)")
ax.set_ylabel("log(Pr(ncells > x))")
out_fn = cluster_size_dir + '/all_clusters_size_distribution.png'
ip.check_dir(out_fn)
ip.save_fig(out_fn)

Each image

In [None]:
# Pool, for each image, the cluster sizes of all of its taxa.
dict_sn_clsizes = defaultdict(list)
for sample_name, taxon_sizes in dict_sn_scn_clsize.items():
    for sizes in taxon_sizes.values():
        dict_sn_clsizes[sample_name].extend(sizes.tolist())

len(dict_sn_clsizes)

In [None]:
# For each sample group: draw every image's complementary cumulative
# cluster-size distribution on log-log axes, then overlay a guide line built
# from the median slope and median intercept of the per-image log-log fits.
colors = plt.get_cmap('tab10').colors
# Cycle through the palette so any number of groups gets a colour.
dict_group_color = {
    group: colors[i % len(colors)] for i, group in enumerate(dict_group_sn)
}

for group, sns in dict_group_sn.items():
    color = dict_group_color[group]
    fig, ax = ip.general_plot()
    slopes = []
    intercepts = []
    for sn in sns:
        # .get() keeps the defaultdict from gaining an empty entry for a
        # sample that has no recorded cluster sizes.
        clsizes = dict_sn_clsizes.get(sn, [])
        counts_sort = np.sort(clsizes)
        nclust = len(counts_sort)
        Pr = np.arange(nclust, 0, -1) / nclust
        ax.scatter(counts_sort, Pr, s=2, color=color)

        # Fit only clusters with more than 20 and fewer than 1e4 cells.
        in_fit = (counts_sort > 20) & (counts_sort < 1e4)
        lnc = np.log(counts_sort[in_fit])
        lnp = np.log(Pr[in_fit])
        # A line fit needs at least two distinct x values; images without
        # them contribute points to the plot but no slope.
        if lnc.size < 2 or np.ptp(lnc) == 0:
            continue
        slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)
        if np.isnan(slope) or np.isnan(intercept):
            continue
        slopes.append(slope)
        intercepts.append(intercept)

    # Guide line from the medians; skipped when no image produced a fit.
    if slopes:
        slope_median = np.median(slopes)
        intercept_median = np.median(intercepts)
        x = np.array([2e1, 1e4])
        y = math.exp(intercept_median) * x**slope_median
        ax.plot(x, y, "r")
        # Anchor the annotation on the guide line at x = 300.
        x = 3e2
        y = math.exp(intercept_median) * x**slope_median
        print(intercept_median, slope_median)
        print(x, y)
        ax.text(
            x,
            y,
            "Slope median = " + str(round(slope_median, 2)),
            color="r",
            ha="left",
            va="bottom",
        )

    ax.set_yscale("log")
    ax.set_xscale("log")
    ax.set_title(group)
    ax.set_xlabel("log(number of cells)")
    ax.set_ylabel("log(Pr(ncells > x))")

    out_fn = cluster_size_dir + '/size_distribution_by_sample_group_{}.png'.format(group)
    ip.check_dir(out_fn)
    ip.save_fig(out_fn)

Plot implants together

In [None]:
# Cell output: the sorted cluster-size array currently bound to `counts_sort`
# (it is reassigned by each of the plotting cells, so the value depends on
# which one ran last).
counts_sort

In [None]:
# Overlay, on a single log-log axis, the complementary cumulative
# cluster-size distribution of every group whose name contains "implant",
# pooling all images of a group together.
smin, smax = 1e10, -1e10

colors = plt.get_cmap("tab10").colors
dict_group_color = dict(zip(dict_group_sn, colors))
fig, ax = ip.general_plot()

for group, sns in dict_group_sn.items():
    if 'implant' not in group:
        continue
    color = dict_group_color[group]

    # Concatenate the cluster sizes of all images in this group.
    clsizes = []
    for sn in sns:
        clsizes.extend(dict_sn_clsizes[sn])

    counts_sort = np.sort(np.array(clsizes))
    nclust = len(counts_sort)
    Pr = np.arange(nclust, 0, -1) / nclust

    # Log-log regression restricted to 20 < size < 1e4.
    bool_l = counts_sort > 20
    bool_u = counts_sort < 1e4
    fit_window = np.logical_and(bool_l, bool_u)
    lnc = np.log(counts_sort[fit_window])
    lnp = np.log(Pr[fit_window])

    slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)
    # Remember the steepest and shallowest slopes with their intercepts.
    if slope < smin:
        smin, imin = slope, intercept
    if slope > smax:
        smax, imax = slope, intercept
    ax.scatter(counts_sort, Pr, s=2, color=color, label=group)

    # Axis styling is re-applied per group; the title ends up naming the
    # last implant group drawn.
    ax.set_yscale("log")
    ax.set_xscale("log")
    ax.set_title(group)
    ax.set_xlabel("log(ncells in cluster)")
    ax.set_ylabel("log(Pr(ncells > x))")
ax.legend()
    # out_fn = cluster_size_dir + "/size_distribution_by_sample_group_{}.png".format(
    #     group
    # )
    # ip.check_dir(out_fn)
    # ip.save_fig(out_fn)

Plot each taxon

In [None]:
# Pool, for each taxon, its cluster sizes across all images.
dict_scn_clsizes = defaultdict(list)
for taxon_sizes in dict_sn_scn_clsize.values():
    for taxon, sizes in taxon_sizes.items():
        dict_scn_clsizes[taxon].extend(sizes.tolist())

len(dict_scn_clsizes)

In [None]:
# Draw each taxon's complementary cumulative cluster-size distribution
# (pooled over all images) on one log-log axis, overlay a guide line built
# from the median fitted slope/intercept, and save the legend separately.
colors = plt.get_cmap('tab20').colors

slopes = []
intercepts = []
fig, ax = ip.general_plot()
for i, (scn, clsizes) in enumerate(dict_scn_clsizes.items()):
    counts_sort = np.sort(clsizes)
    nclust = len(counts_sort)
    Pr = np.arange(nclust, 0, -1) / nclust

    # Log-log regression restricted to 20 < size < 1e4.
    in_fit = (counts_sort > 20) & (counts_sort < 1e4)
    lnc = np.log(counts_sort[in_fit])
    lnp = np.log(Pr[in_fit])
    slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)
    slopes.append(slope)
    intercepts.append(intercept)
    # Cycle through the 20-colour palette so more than 20 taxa do not raise
    # an IndexError.
    ax.scatter(counts_sort, Pr, s=2, label=scn, color=colors[i % len(colors)])

# Medians that ignore NaN fits, so one degenerate taxon cannot blank the
# guide line.
smean = np.nanmedian(slopes)
imean = np.nanmedian(intercepts)
x = np.array([2e1, 1e4])
y = math.exp(imean) * x**smean
ax.plot(x, y, "r")
# Put the annotation slightly above the guide line at x = 300.
x = 3e2
y = (math.exp(imean) * x**smean) + 0.1
print(x, y)
ax.text(x, y, "Slope median = " + str(round(smean, 2)), color="r", ha="left", va="bottom")

ax.set_yscale("log")
ax.set_xscale("log")
ax.set_title("All clusters from each taxon")
ax.set_xlabel("log(number of cells)")
ax.set_ylabel("log(Pr(ncells > x))")

plt.figure(fig)
out_fn = cluster_size_dir + '/all_taxa_clusters_size_distribution.png'
ip.check_dir(out_fn)
ip.save_fig(out_fn)

# Build the legend directly on the "Legend plot" figure so that the figure
# made current before saving is the one that actually holds the legend.
# NOTE(review): assumes ip.save_fig writes the current pyplot figure, as the
# plt.figure(...) calls around it suggest — confirm.
legendFig = plt.figure("Legend plot")
legendFig.clf()  # a figure with this label may survive from an earlier run
axl = legendFig.add_subplot(111)
axl.axis(False)
label_params = ax.get_legend_handles_labels()
lgnd = axl.legend(*label_params, loc="center", bbox_to_anchor=(0.5, 0.5))
# Legend.legendHandles was renamed to legend_handles in newer Matplotlib;
# support both spellings.
legend_handles = getattr(lgnd, "legend_handles", None)
if legend_handles is None:
    legend_handles = lgnd.legendHandles
for handle in legend_handles:
    # Enlarge the tiny (s=2) scatter markers so they are readable.
    handle.set_sizes([20])

plt.figure(legendFig)
out_fn = cluster_size_dir + '/all_taxa_clusters_size_distribution_legend.png'
ip.check_dir(out_fn)
ip.save_fig(out_fn)

Plot slope for each taxon in each image

In [None]:
# NOTE(review): this mapping is not referenced anywhere in this cell; it is
# kept unchanged in case cells outside this section rely on it.
dict_group_sgroup = {
    "healthy_implant": "healthy",
    "severe_implant": "disease",
    "moderate_implant": "disease",
    "mucositis_implant": "disease",
    "periodontitis_tooth": "disease",
    "healthy_tooth": "healthy",
}

# Taxon labels that are merged under one display name.
scn_aliases = {
    "Neisseria": "Neisseriaceae",
    "TM7": "Saccharibacteria",
    "TM": "Saccharibacteria",
}

# Collect one fitted log-log slope per (image, taxon) pair, together with the
# group of the image, for every image whose group name contains "implant".
dict_scn_slopes = defaultdict(list)
dict_scn_groups = defaultdict(list)
for sn, dict_scn_clsize in dict_sn_scn_clsize.items():
    group = dict_sn_group[sn]
    if 'implant' not in group:
        continue
    for scn, clsizes in dict_scn_clsize.items():
        # Taxa with three or fewer clusters in an image are skipped.
        if len(clsizes) <= 3:
            continue
        scn = scn_aliases.get(scn, scn)

        counts_sort = np.sort(np.array(clsizes))
        nclust = len(counts_sort)
        Pr = np.arange(nclust, 0, -1) / nclust

        # Log-log regression restricted to 20 < size < 1e4.
        in_fit = (counts_sort > 20) & (counts_sort < 1e4)
        lnc = np.log(counts_sort[in_fit])
        lnp = np.log(Pr[in_fit])

        slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)
        if not np.isnan(slope):
            dict_scn_slopes[scn].append(slope)
            dict_scn_groups[scn].append(group)


# Order taxa by the median of their fitted slopes.
means = [np.median(s) for s in dict_scn_slopes.values()]
scns = list(dict_scn_slopes)
scns_sort = [scn for _, scn in sorted(zip(means, scns))]


xticks = np.arange(len(scns_sort)) + 1
dict_sciname_ind = dict(zip(scns_sort, xticks))

s=100

fig, ax = ip.general_plot(dims=(20,10), ft=20)
for scn in scns_sort:
    x = dict_sciname_ind[scn]
    # Negate the fitted slopes so the plotted exponent is positive.
    slopes = -np.array(dict_scn_slopes[scn])
    # Horizontal jitter of +/- 0.1 around the taxon's tick position.
    xs = x + np.random.rand(len(slopes)) * 0.2 - 0.1
    box = ax.boxplot([slopes], positions=[x], vert=True, widths=0.5, patch_artist=True)
    for artists in box.values():
        for artist in artists:
            artist.set_color('k')
            artist.set_alpha(0.25)

    groups = np.array(dict_scn_groups[scn])
    # Group names such as "mild_peri_implantitis" never equal the bare string
    # "peri_implantitis", so match on the suffix to draw those points too.
    is_peri = np.array([g.endswith("peri_implantitis") for g in groups], dtype=bool)
    group_masks = (
        (groups == "healthy_implant", "tab:blue"),
        (groups == "mucositis_implant", "tab:green"),
        (is_peri, "tab:red"),
    )
    for mask, group_color in group_masks:
        ax.scatter(xs[mask], slopes[mask], color=group_color, s=s)

_ = ax.set_xticks(
    xticks, scns_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0,np.max(xticks) + 1)
ax.set_ylabel("Power law exponent for each tilescan")

In [None]:

# Taxa to show in the compact boxplot. "Neisseriaceae" must be spelled as it
# appears among the taxon names in scns_sort, otherwise it is silently skipped.
toplot = ['Streptococcus','Lautropia', 'Porphyromonas','Selenomonas','Neisseriaceae','Veillonella', 'Pasteurellaceae','Prevotella']

s=5

fig, ax = ip.general_plot(dims=(2,2), ft=7)
x = 1
scn_list = []
xticks = []
# Walk the taxa in their existing sorted order and give each selected taxon
# the next consecutive x position.
for scn in scns_sort:
    if scn not in toplot:
        continue
    # Negate the fitted slopes so the plotted exponent is positive.
    slopes = -np.array(dict_scn_slopes[scn])
    # Horizontal jitter of +/- 0.1 around the taxon's position.
    xs = x + np.random.rand(len(slopes)) * 0.2 - 0.1
    box = ax.boxplot([slopes], positions=[x], vert=True, widths=0.5, patch_artist=True)
    for artists in box.values():
        for artist in artists:
            artist.set_color('k')
            artist.set_alpha(0.25)

    groups = np.array(dict_scn_groups[scn])
    # Group names such as "mild_peri_implantitis" never equal the bare string
    # "peri_implantitis", so match on the suffix to draw those points too.
    is_peri = np.array([g.endswith("peri_implantitis") for g in groups], dtype=bool)
    group_masks = (
        (groups == "healthy_implant", "tab:blue"),
        (groups == "mucositis_implant", "tab:green"),
        (is_peri, "tab:red"),
    )
    for mask, group_color in group_masks:
        ax.scatter(xs[mask], slopes[mask], color=group_color, s=s)

    scn_list.append(scn)
    xticks.append(x)

    x += 1

_ = ax.set_xticks(
    xticks, scn_list, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0,np.max(xticks) + 1)
ax.set_ylim(0,2)


out_fn = cluster_size_dir + "/size_distribution_boxplot_select.pdf"
ip.check_dir(out_fn)
ip.save_fig(out_fn)

Plot specific curves

In [None]:
# Cell output: the base output directory (assigned outside this section).
out_dir

In [None]:
# Compare the cluster-size distributions of two taxa within one image and
# overlay each taxon's fitted power-law line.
scns = ['Pasteurellaceae','Selenomonas']
date = "2023_02_08"
sn = "hsdm_group_1_sample_12_fov_01"
key = date + "_" + sn

dims = (2,1.5)
ft=6
lw=1

fig, ax = ip.general_plot(dims=dims, ft=ft, lw=lw)
for scn in scns:
    color = dict_sciname_color[scn]
    clsizes = dict_sn_scn_clsize[key][scn]
    counts_sort = np.sort(clsizes)
    nclust = len(counts_sort)
    Pr = np.arange(nclust, 0, -1) / nclust
    # Label each series with its taxon; both series come from the same
    # sample, so the sample name would not tell them apart in a legend.
    ax.scatter(counts_sort, Pr, s=2, label=scn, color=color)

    # Log-log regression restricted to 10 < size < 1e4.
    # NOTE(review): the lower bound is 10 here but 20 in the other fits of
    # this notebook, and the line below is drawn from x = 20 — confirm the
    # intended threshold.
    in_fit = (counts_sort > 10) & (counts_sort < 1e4)
    lnc = np.log(counts_sort[in_fit])
    lnp = np.log(Pr[in_fit])
    slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)

    # Fitted line Pr = exp(intercept) * x**slope. x is an ndarray so the
    # power is element-wise no matter what scalar type `slope` has.
    x = np.array([2e1, 1e4])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=color)
    print(scn)
    print('Slope: ',slope)
    print('rsqure: ',r_value**2)


ax.set_yscale("log")
ax.set_xscale("log")

spatial_dir = out_dir + '/spatial_statistics'
cl_size_dir = spatial_dir + '/cluster_size'
cluster_slope_fmt = cl_size_dir + '/plots/{date}_{sn}_scinames_{scn0}_{scn1}_power_law_plot.pdf'
out_fn = cluster_slope_fmt.format(date=date, sn=sn, scn0=scns[0], scn1=scns[1])
ip.check_dir(out_fn)
ip.save_fig(out_fn)


Plot slope for each sciname in each group

In [None]:
# Collect one fitted log-log slope per (image, taxon) pair and bin it by the
# group of the image, for every image whose group name contains "implant".
dict_group_slopes = defaultdict(list)
for sn, dict_scn_clsize in dict_sn_scn_clsize.items():
    group = dict_sn_group[sn]
    if 'implant' not in group:
        continue
    # The plot below has a single "peri_implantitis" box, but group names such
    # as "mild_peri_implantitis" carry a severity prefix; pool every group
    # ending in "peri_implantitis" under that one key so the box is populated.
    # NOTE(review): confirm that pooling the severities is the intended display.
    plot_group = "peri_implantitis" if group.endswith("peri_implantitis") else group
    for scn, clsizes in dict_scn_clsize.items():
        # Taxa with three or fewer clusters in an image are skipped.
        if len(clsizes) <= 3:
            continue

        counts_sort = np.sort(np.array(clsizes))
        nclust = len(counts_sort)
        Pr = np.arange(nclust, 0, -1) / nclust

        # Log-log regression restricted to 20 < size < 1e4.
        in_fit = (counts_sort > 20) & (counts_sort < 1e4)
        lnc = np.log(counts_sort[in_fit])
        lnp = np.log(Pr[in_fit])

        slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)
        if not np.isnan(slope):
            dict_group_slopes[plot_group].append(slope)

# Despite the names, these hold group labels and their x positions here.
scns_sort = ['healthy_implant', 'mucositis_implant', 'peri_implantitis']

xticks = np.arange(len(scns_sort)) + 1
dict_sciname_ind = dict(zip(scns_sort, xticks))

s=5

fig, ax = ip.general_plot(dims=(2,2), ft=7)

for scn in scns_sort:
    x = dict_sciname_ind[scn]
    # Negate the fitted slopes so the plotted exponent is positive.
    slopes = -np.array(dict_group_slopes[scn])
    # Horizontal jitter of +/- 0.1 around the group's tick position.
    xs = x + np.random.rand(len(slopes)) * 0.2 - 0.1
    box = ax.boxplot([slopes], positions=[x], vert=True, widths=0.5)

    ax.scatter(xs, slopes, color='k', s=s)

_ = ax.set_xticks(
    xticks, scns_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0,np.max(xticks) + 1)
ax.set_ylim(0,2)

# The filename has no placeholder, so no .format() call is needed.
out_fn = cluster_size_dir + "/size_distribution_boxplot_bygroup.png"
ip.check_dir(out_fn)
ip.save_fig(out_fn)

Plot slope for each tilescan in each group

In [None]:
# Fit one power-law exponent per implant tilescan (all taxa pooled) to the
# cluster-size distribution, then compare the exponents by group.

# Only cluster sizes strictly inside this window enter the regression.
size_lower = 20
size_upper = 1e4

dict_group_slopes = defaultdict(list)
for sn, clsizes in dict_sn_clsizes.items():
    group = dict_sn_group[sn]
    if 'implant' not in group:
        continue
    if len(clsizes) <= 3:
        continue

    # Empirical complementary CDF over the sorted sizes: the smallest
    # cluster gets probability 1, the largest gets 1 / nclust.
    counts_sort = np.sort(np.array(clsizes))
    nclust = len(counts_sort)
    Pr = np.arange(nclust, 0, -1) / nclust

    in_window = (counts_sort > size_lower) & (counts_sort < size_upper)
    lnc = np.log(counts_sort[in_window])
    lnp = np.log(Pr[in_window])

    # The length check above runs before filtering, so the window can still
    # leave fewer than two distinct sizes; a line cannot be fitted to that
    # and linregress would fail.
    if np.unique(lnc).size < 2:
        continue

    slope, intercept, r_value, p_value, std_err = stats.linregress(lnc, lnp)
    if not np.isnan(slope):
        dict_group_slopes[group].append(slope)

# NOTE: despite the name, these are group labels, not scinames.
scns_sort = ['healthy_implant', 'mucositis_implant', 'peri_implantitis']

xticks = np.arange(len(scns_sort)) + 1
dict_sciname_ind = dict(zip(scns_sort, xticks))

s = 5

fig, ax = ip.general_plot(dims=(2,2), ft=7)

for scn in scns_sort:
    x = dict_sciname_ind[scn]
    # Negate so the exponent is shown as a positive number
    slopes = -np.array(dict_group_slopes[scn])
    # Horizontal jitter of +/- 0.1 so overlapping points stay visible
    xs = x + np.random.rand(len(slopes)) * 0.2 - 0.1
    box = ax.boxplot([slopes], positions=[x], vert=True, widths=0.5)

    ax.scatter(xs, slopes, color='k', s=s)

_ = ax.set_xticks(
    xticks, scns_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0, np.max(xticks) + 1)
ax.set_ylim(0, 2)
# ax.set_ylabel('Power law exponent for each tilescan')

out_fn = cluster_size_dir + "/size_distribution_boxplot_tiles_bygroup.png"
ip.check_dir(out_fn)
ip.save_fig(out_fn)

In [None]:
# Scratch check of a log-log slope: y goes 9 -> 36 while x goes 8 -> 4, i.e. slope -2
(np.log(36) - np.log(9)) / (np.log(4) - np.log(8))

In [None]:
# Display the group ordering used for the boxplots above
scns_sort

##  Count things

In [None]:
# Per-sample output directory template; {date} and {sn} are filled in with str.format
out_dir = "../outputs/{date}/{date}_{sn}"

# Segmentation outputs: one properties CSV per tile index {m}
out_dir_seg = out_dir + '/segs'
props_fmt = out_dir_seg + "/{date}_{sn}_M_{m}_props.csv"

# Spatial-statistics outputs: one cluster-size array per sciname {scn}
spatial_dir = out_dir + '/spatial_statistics'
cl_size_dir = spatial_dir + '/cluster_size'
cluster_size_fmt = cl_size_dir + '/{date}_{sn}_sciname_{scn}_cluster_size.npy'

In [None]:
# dict_group_counts = defaultdict(dict)
# for group, sns in dict_group_sn.items():
#     n_scans = 0
#     n_tiles = 0
#     n_cells = 0 
#     n_clusters = 0
#     for s in sns:
#         n_scans += 1
#         # date, sn = re.split("(?<=^\d{4}_\d{2}_\d{2})_", s)
#         # props_glob = props_fmt.format(date=date, sn=sn, m='*')
#         # props_fns = glob.glob(props_glob)
#         # n_tiles += len(props_fns)
#         # for pfn in props_fns:
#         #     prop = pd.read_csv(pfn)
#         #     n_cells += prop.shape[0]
        
#         # cl_glob = cluster_size_fmt.format(date=date, sn=sn, scn='*')
#         # cl_fns = glob.glob(cl_glob)
#         # for fn in cl_fns:
#         #     clust = np.load(fn)
#         #     n_clusters += len(np.unique(clust))
#     dict_group_counts[group]['scans'] = n_scans
#     # dict_group_counts[group]['tiles'] = n_tiles
#     # dict_group_counts[group]['cells'] = n_cells
#     # dict_group_counts[group]['clusters'] = n_clusters

# dict_group_counts

        

In [None]:
# Scratch check: zeros_like keeps the shape of `a` and applies the requested dtype
a = np.array([[1,2],[3,4]])
np.zeros_like(a, dtype=int)

In [None]:
# Count tilescans per group. Tile, cell and cluster counting is currently
# commented out, so only the 'scans' entry is filled for each group.
dict_group_counts = defaultdict(dict)
for group, sns in dict_group_sn.items():
    n_scans = 0
    n_tiles = 0
    n_cells = 0 
    n_clusters = 0
    for s in sns:
        n_scans += 1
        # date, sn = re.split("(?<=^\d{4}_\d{2}_\d{2})_", s)
        # props_glob = props_fmt.format(date=date, sn=sn, m='*')
        # props_fns = glob.glob(props_glob)
        # n_tiles += len(props_fns)
        # for pfn in props_fns:
        #     prop = pd.read_csv(pfn)
        #     n_cells += prop.shape[0]
        
        # cl_glob = cluster_size_fmt.format(date=date, sn=sn, scn='*')
        # cl_fns = glob.glob(cl_glob)
        # for fn in cl_fns:
        #     clust = np.load(fn)
        #     n_clusters += len(np.unique(clust))
    dict_group_counts[group]['scans'] = n_scans
    # dict_group_counts[group]['tiles'] = n_tiles
    # dict_group_counts[group]['cells'] = n_cells
    # dict_group_counts[group]['clusters'] = n_clusters

# Display the resulting {group: {'scans': n}} mapping
dict_group_counts

        

In [None]:
# One row per group, one column per count; drop the two natural-tooth groups
a = pd.DataFrame(dict_group_counts).T
b = a[(a.index != 'healthy_tooth') & (a.index != 'periodontitis_tooth')]
b

In [None]:
# Total tiles over the remaining groups.
# NOTE(review): needs a 'tiles' column; the counting cell above only fills
# 'scans' unless its commented-out tile counting is re-enabled.
b.tiles.sum()

In [None]:
# Total cells over the remaining groups.
# NOTE(review): needs a 'cells' column; the counting cell above only fills
# 'scans' unless its commented-out cell counting is re-enabled.
b.cells.sum()

### Counting post exclude

In [None]:
# Count patients, tilescans, tiles and cells per group, ignoring every
# tilescan listed in exclude_sns. A patient is counted only when at least
# one of their tilescans survives the exclusion.

# Splits "<YYYY_MM_DD>_<sample name>" at the underscore right after the date.
# Raw string so that \d is not treated as an (invalid) string escape.
date_split_re = re.compile(r"(?<=^\d{4}_\d{2}_\d{2})_")
excluded = set(exclude_sns)

dict_group_counts = defaultdict(dict)
for gr, ps_dict in dict_group_pat_sn.items():
    n_pats = 0
    n_scans = 0
    n_tiles = 0
    n_cells = 0
    for pat, sns in ps_dict.items():
        kept = [bn for bn in sns if bn not in excluded]
        if not kept:
            continue
        n_pats += 1
        for bn in kept:
            n_scans += 1
            date, sn = date_split_re.split(bn)
            # One props CSV per tile; one row per segmented cell
            props_glob = props_fmt.format(date=date, sn=sn, m='*')
            props_fns = glob.glob(props_glob)
            n_tiles += len(props_fns)
            for pfn in props_fns:
                prop = pd.read_csv(pfn)
                n_cells += prop.shape[0]
    dict_group_counts[gr]['pats'] = n_pats
    dict_group_counts[gr]['scans'] = n_scans
    dict_group_counts[gr]['tiles'] = n_tiles
    dict_group_counts[gr]['cells'] = n_cells

                               


In [None]:
# Per-group counts as a table (rows = groups)
pd.DataFrame(dict_group_counts).T


In [None]:
# Column totals over all groups
pd.DataFrame(dict_group_counts).T.sum(axis=0)

## Box counting dimension

In [None]:
# Show the scinames that the box-counting loops below iterate over
print(scn_unq)

In [None]:
# Inspect the bounding box of the current window
window.bbox

In [None]:
# Scratch check: argmin returns the index of the smallest entry (0 here)
np.argmin([1,2])

In [None]:
# Box-counting dimension per taxon: start from a coarse grid, double the
# number of boxes along each axis at every step, count occupied boxes and
# regress log(count) on log(box size).

# Get cells window
convex_hull = ps.cg.convex_hull(coords.tolist())
ch_arr = np.array(to_ccf(convex_hull))
# plt.plot(ch_arr[:, 1], ch_arr[:, 0])
# plt.gca().invert_yaxis()
window = Window([convex_hull])

# Initial grid: 2 boxes across the short side of the bounding box and
# ceil(long / short) boxes along the long side.
bbox = window.bbox
w_shp = [bbox[2] - bbox[0], bbox[3] - bbox[1]]
ind_dmin = int(np.argmin(w_shp))
# Use "the other axis" as the long one. For a square window argmin and
# argmax both return 0, which would leave the second axis with 0 boxes.
ind_dmax = 1 - ind_dmin
dmin = w_shp[ind_dmin]
dmax = w_shp[ind_dmax]
dim_xy_init = np.array([0, 0])
dim_xy_init[ind_dmin] = 2
dim_xy_init[ind_dmax] = int(math.ceil(dmax / dmin))
dmax_i_um_init = dmax / dim_xy_init[ind_dmax] * res_umpix

# Stop refining once the box edge along the long axis drops to this size (um)
dmiu_min = 5

slopes = []
for scn in scn_unq:
    # NOTE(review): Prevotella is skipped here; the reason is not recorded.
    if scn == 'Prevotella':
        continue
    fig, ax = ip.general_plot()
    print(scn)
    # Get point pattern
    col_obs = dict_sciname_color[scn]
    bool_scn = scinames == scn
    coord_scn = coords[bool_scn]
    pp = PointPattern(coord_scn, window=window)

    # Count occupied boxes at each grid resolution
    dxy = dim_xy_init.copy()
    dmax_i_um = dmax_i_um_init
    counts = []
    box_size = []
    while dmax_i_um > dmiu_min:
        # dxy[i] is the number of boxes along coordinate i, matching
        # w_shp[i]. RectangleM divides the first coordinate's range by
        # count_column, so dxy[0] is the column count.
        rect = RectangleM(
            pp, count_column=int(dxy[0]), count_row=int(dxy[1])
        ).point_location_sta()
        c = np.array(list(rect.values()))
        counts.append(int(np.count_nonzero(c)))
        box_size.append(dmax_i_um)

        dxy *= 2
        dmax_i_um = dmax / dxy[ind_dmax] * res_umpix

    # Get regression; the box-counting dimension is minus the fitted slope
    lnc = np.log(counts)
    lns = np.log(box_size)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    print(slope)
    slopes.append(-slope)

    ax.scatter(box_size, counts, color=col_obs, label=scn)

    # Overlay the fitted power law count = exp(intercept) * size**slope
    x = np.array([min(box_size), max(box_size)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col_obs)

    ax.set_xscale("log")
    ax.set_yscale("log")
    plt.legend()
    plt.show()
    plt.close()



In [None]:
# Scratch check of the diagonal formula used below: sqrt(3**2 + 4**2) = 5
np.sqrt(np.sum(np.array([3,4])**2))

In [None]:
# Box-counting dimension per taxon using (approximately) square boxes whose
# target edge shrinks linearly from box_edge_init_um to box_edge_min_um.

# Get cells window
convex_hull = ps.cg.convex_hull(coords.tolist())
ch_arr = np.array(to_ccf(convex_hull))
# plt.plot(ch_arr[:, 1], ch_arr[:, 0])
# plt.gca().invert_yaxis()
window = Window([convex_hull])

# Window extent along the first and second coordinate, in pixels
bbox = window.bbox
w_shp = np.array([bbox[2] - bbox[0], bbox[3] - bbox[1]])
box_edge_init_um = 50
box_edge_min_um = 5
box_edge_step_um = 5

slopes = []
for scn in scn_unq:
    fig, ax = ip.general_plot()
    print(scn)
    # Get point pattern
    col_obs = dict_sciname_color[scn]
    bool_scn = scinames == scn
    coord_scn = coords[bool_scn]
    pp = PointPattern(coord_scn, window=window)

    # Count occupied boxes at each box size
    box_edge_um = box_edge_init_um
    counts = []
    box_size = []
    while box_edge_um >= box_edge_min_um:
        box_edge = box_edge_um / res_umpix
        # Number of boxes needed along each coordinate to cover the window
        nxy = np.ceil(w_shp / box_edge).astype(int)
        # nxy[i] belongs to w_shp[i]. RectangleM divides the first
        # coordinate's range by count_column, so nxy[0] is the column count;
        # this keeps the grid consistent with dxy computed below.
        rect = RectangleM(
            pp, count_column=int(nxy[0]), count_row=int(nxy[1])
        ).point_location_sta()
        c = np.array(list(rect.values()))
        counts.append(int(np.count_nonzero(c)))

        # Record the actual box diagonal (um) rather than the target edge,
        # because rounding nxy up makes the real boxes slightly smaller.
        dxy = w_shp / nxy
        diag = np.sqrt(np.sum(dxy**2))
        diag_um = diag * res_umpix
        box_size.append(diag_um)

        box_edge_um -= box_edge_step_um

    # Get regression; the box-counting dimension is minus the fitted slope
    lnc = np.log(counts)
    lns = np.log(box_size)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    print(slope, r_value**2)
    slopes.append(-slope)

    ax.scatter(box_size, counts, color=col_obs, label=scn)

    # Overlay the fitted power law count = exp(intercept) * size**slope
    x = np.array([min(box_size), max(box_size)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col_obs)

    ax.set_xscale("log")
    ax.set_yscale("log")
    plt.legend()
    plt.show()
    plt.close()



In [None]:
# Quick look at the spread of the fitted slopes: every point is drawn at x = 0
x = np.array([0]*len(slopes)) 
plt.scatter(x, slopes)

## Measure of homogeneity

Define a new Rectangle grid class that also computes per-cell diversity measures

In [None]:
class RectangleM_new:
    """
    Rectangle grid structure for quadrat-based method, extended with
    per-cell label statistics (Shannon / Simpson diversity, multi-label
    flag).

    Parameters
    ----------
    pp                : :class:`.PointPattern`
                        Point Pattern instance. Only its ``mbb`` and
                        ``points`` attributes are read.
    labels            : sequence
                        One label per point, in the same order as
                        ``pp.points``.
    count_column      : integer
                        Number of rectangles in the horizontal
                        direction. Use in pair with count_row to
                        fully specify a rectangle. Incompatible with
                        rectangle_width and rectangle_height.
    count_row         : integer
                        Number of rectangles in the vertical
                        direction. Use in pair with count_column to
                        fully specify a rectangle. Incompatible with
                        rectangle_width and rectangle_height.
    rectangle_width   : float
                        Rectangle width. Use in pair with
                        rectangle_height to fully specify a rectangle.
                        Incompatible with count_column & count_row.
    rectangle_height  : float
                        Rectangle height. Use in pair with
                        rectangle_width to fully specify a rectangle.
                        Incompatible with count_column & count_row.

    Attributes
    ----------
    pp                : :class:`.PointPattern`
                        Point Pattern instance.
    mbb               : array
                        Minimum bounding box for the point pattern.
    points            : array
                        x,y coordinates of the point points.
    labels            : array
                        Label of each point.
    count_column      : integer
                        Number of columns.
    count_row         : integer
                        Number of rows.
    rectangle_width   : float
                        Width of one cell.
    rectangle_height  : float
                        Height of one cell.
    num               : integer
                        Number of rectangular quadrats.
    dict_id_labels    : dict
                        keys: rectangle id, values: list of the labels
                        of the points in that cell. Only set after one
                        of the ``get_*`` methods has been called, and
                        only contains cells holding at least one point.

    """

    def __init__(self, pp, labels, count_column = 3, count_row = 3,
                 rectangle_width = 0, rectangle_height = 0):
        self.mbb = pp.mbb
        self.pp = pp
        self.points = np.asarray(pp.points)
        self.labels = np.array(labels)
        x_range = self.mbb[2]-self.mbb[0]
        y_range = self.mbb[3]-self.mbb[1]
        # Logical `and`, not bitwise `&`: `&` raises TypeError as soon as
        # the sizes are floats, which is their documented type.
        if rectangle_width and rectangle_height:
            self.rectangle_width = rectangle_width
            self.rectangle_height = rectangle_height

            # calculate column count and row count
            self.count_column = int(math.ceil(x_range / rectangle_width))
            self.count_row = int(math.ceil(y_range / rectangle_height))
        else:
            self.count_column = count_column
            self.count_row = count_row

            # calculate the actual width and height of cell
            self.rectangle_width = x_range/float(count_column)
            self.rectangle_height = y_range/float(count_row)
        self.num = self.count_column * self.count_row

    def _cell_ids(self):
        """
        Compute the rectangle id of every point.

        Ids are row-major: ``id = row * count_column + column``, where the
        column comes from the first coordinate and the row from the
        second.

        Returns
        -------
        ids : array of int
              One rectangle id per point, in point order.
        """
        if self.points.size == 0:
            return np.empty(0, dtype=int)
        index_x = (self.points[:, 0] - self.mbb[0]) // self.rectangle_width
        index_y = (self.points[:, 1] - self.mbb[1]) // self.rectangle_height
        # Points lying exactly on the upper boundary of the bounding box
        # would index one past the last cell; fold them into the last cell.
        index_x[index_x == self.count_column] -= 1
        index_y[index_y == self.count_row] -= 1
        return (index_y * self.count_column + index_x).astype(int)

    def point_location_sta(self):
        """
        Count the point events in each cell.

        Returns
        -------
        dict_id_count : dict
                        keys: rectangle id, values: number of point
                        events in each cell. Every cell of the grid has
                        an entry, including empty ones.
        """
        dict_id_count = {cell_id: 0 for cell_id in range(self.num)}
        for cell_id in self._cell_ids():
            dict_id_count[int(cell_id)] += 1
        return dict_id_count

    def _get_dict_id_labels(self):
        """
        Collect the labels of the points in each cell.

        Stores the result in ``self.dict_id_labels`` (keys: rectangle id,
        values: list of labels). Cells without any point get no entry.
        Returns nothing.
        """
        dict_id_labels = defaultdict(list)
        for cell_id, lab in zip(self._cell_ids(), self.labels):
            dict_id_labels[int(cell_id)].append(lab)
        self.dict_id_labels = dict_id_labels

    def get_shannon_diversities(self):
        """
        Shannon entropy of the labels in each occupied cell.

        Returns
        -------
        dict_idx_shannon : dict
                           keys: rectangle id, values:
                           ``-sum(p * ln(p))`` over the label
                           proportions ``p`` in that cell.
        """
        self._get_dict_id_labels()
        dict_idx_shannon = {}
        for idx, labels in self.dict_id_labels.items():
            _, n_per_label = np.unique(np.array(labels), return_counts=True)
            p = n_per_label / len(labels)
            dict_idx_shannon[idx] = -np.sum(p * np.log(p))
        return dict_idx_shannon

    def get_simpson_diversities(self):
        """
        Simpson diversity index of the labels in each occupied cell.

        Returns
        -------
        dict_idx_simpson : dict
                           keys: rectangle id, values:
                           ``1 - sum(n_l * (n_l - 1)) / (N * (N - 1))``
                           where ``n_l`` is the number of points with
                           label ``l`` and ``N`` the number of points in
                           the cell. Cells holding a single point get 0,
                           because the formula is undefined for N = 1.
        """
        self._get_dict_id_labels()
        dict_idx_simpson = {}
        for idx, labels in self.dict_id_labels.items():
            nlab = len(labels)
            if nlab > 1:
                _, n_per_label = np.unique(np.array(labels), return_counts=True)
                same_label_pairs = np.sum(n_per_label * (n_per_label - 1))
                D = 1 - same_label_pairs / (nlab*(nlab-1))
            else:
                D = 0
            dict_idx_simpson[idx] = D
        return dict_idx_simpson

    def get_multispecies_tiles(self):
        """
        Flag the occupied cells that contain more than one distinct label.

        Returns
        -------
        dict_idx_multi : dict
                         keys: rectangle id, values: 1 if the cell holds
                         at least two different labels, otherwise 0.
        """
        self._get_dict_id_labels()
        dict_idx_multi = {}
        for idx, labels in self.dict_id_labels.items():
            dict_idx_multi[idx] = int(len(np.unique(labels)) > 1)
        return dict_idx_multi


In [None]:
# Scratch check of the Simpson formula on an empty label set: with nlab = 0
# the denominator nlab*(nlab-1) is zero, so D is not a usable number.
labels = np.array([])
nlab = len(labels)
etas = []
for l in np.unique(labels):
    nl = sum(labels==l)
    etas.append(nl*(nl-1))
D = 1 - np.sum(etas) / (nlab*(nlab-1))
D

Do fuzzy box counting on Simpson diversity

In [None]:
# Fuzzy box counting for one tilescan: instead of counting occupied boxes,
# sum the Simpson diversity of every occupied box at each box size and
# regress log(sum) on log(box diagonal).
date = "2023_02_08"
sn = "hsdm_group_1_sample_12_fov_01"

centroid_sciname_fn = centroid_sciname_fmt.format(date=date, sn=sn)
centroid_sciname = pd.read_csv(centroid_sciname_fn)
# NOTE(review): eval() executes whatever text is stored in the "coord"
# column. Acceptable for self-generated tables; switch to
# ast.literal_eval if these files can come from anywhere else.
coords = np.array([eval(c) for c in centroid_sciname["coord"].values])
scinames = centroid_sciname["sciname"].values
scn_unq = np.unique(scinames)

# Get cells window
convex_hull = ps.cg.convex_hull(coords.tolist())
ch_arr = np.array(to_ccf(convex_hull))
# plt.plot(ch_arr[:, 1], ch_arr[:, 0])
# plt.gca().invert_yaxis()
window = Window([convex_hull])

# Window extent along the first and second coordinate, in pixels. It must
# be recomputed for this sample; reusing w_shp left over from another cell
# would size the grid for a different image.
bbox = window.bbox
w_shp = np.array([bbox[2] - bbox[0], bbox[3] - bbox[1]])
box_edge_init_um = 50
# The target box edge is divided by this factor at every step
div_edgesize = 1.5
box_edge_min_um = 5
col = 'k'

fig, ax = ip.general_plot()
# Get point pattern (all taxa together)
pp = PointPattern(coords, window=window)

box_edge_um = box_edge_init_um
counts = []
box_size = []
while box_edge_um >= box_edge_min_um:
    print(box_edge_um)
    box_edge = box_edge_um / res_umpix
    # Number of boxes needed along each coordinate to cover the window
    nxy = np.ceil(w_shp / box_edge).astype(int)

    # nxy[i] belongs to w_shp[i]. The grid divides the first coordinate's
    # range by count_column, so nxy[0] is the column count and nxy[1] the
    # row count; this keeps the grid consistent with dxy computed below.
    rect = RectangleM_new(
        pp,
        count_column=int(nxy[0]),
        count_row=int(nxy[1]),
        labels=scinames
    ).get_simpson_diversities()
    Hs = np.array(list(rect.values()))
    count = sum(Hs)
    print(count)
    counts.append(count)

    # Record the actual box diagonal (um) rather than the target edge
    dxy = w_shp / nxy
    diag = np.sqrt(np.sum(dxy**2))
    diag_um = diag * res_umpix
    box_size.append(diag_um)

    box_edge_um /= div_edgesize

# Get regression
lnc = np.log(counts)
lns = np.log(box_size)
slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
print(slope, r_value**2)
# NOTE(review): appends to the `slopes` list created by the per-taxon
# box-counting cell; that cell must have run first.
slopes.append(-slope)

# Label with the sample name: this cell pools all taxa, so there is no
# current sciname to label with.
ax.scatter(box_size, counts, color=col, label=sn)

# Overlay the fitted power law sum = exp(intercept) * size**slope
x = np.array([min(box_size), max(box_size)])
y = math.exp(intercept) * x**slope
ax.plot(x, y, color=col)
ax.set_xscale("log")
ax.set_yscale("log")
plt.show()
plt.close()

print(box_edge_um)

In [None]:
# Fraction of occupied boxes (at the last box size evaluated above) whose Simpson index is exactly 0
sum(Hs == 0) / len(Hs)

In [None]:
# Fuzzy box counting for one tilescan: instead of counting occupied boxes,
# sum the Simpson diversity of every occupied box at each box size and
# regress log(sum) on log(box diagonal).
date = "2023_10_18"
sn = "hsdm_slide_IIL_fov_01"

centroid_sciname_fn = centroid_sciname_fmt.format(date=date, sn=sn)
centroid_sciname = pd.read_csv(centroid_sciname_fn)
# NOTE(review): eval() executes whatever text is stored in the "coord"
# column. Acceptable for self-generated tables; switch to
# ast.literal_eval if these files can come from anywhere else.
coords = np.array([eval(c) for c in centroid_sciname["coord"].values])
scinames = centroid_sciname["sciname"].values
scn_unq = np.unique(scinames)

# Get cells window
convex_hull = ps.cg.convex_hull(coords.tolist())
ch_arr = np.array(to_ccf(convex_hull))
# plt.plot(ch_arr[:, 1], ch_arr[:, 0])
# plt.gca().invert_yaxis()
window = Window([convex_hull])

# Window extent along the first and second coordinate, in pixels
bbox = window.bbox
w_shp = np.array([bbox[2] - bbox[0], bbox[3] - bbox[1]])
box_edge_init_um = 50
# The target box edge is divided by this factor at every step
div_edgesize = 1.5
box_edge_min_um = 5
col = 'k'

fig, ax = ip.general_plot()
# Get point pattern (all taxa together)
pp = PointPattern(coords, window=window)

box_edge_um = box_edge_init_um
counts = []
box_size = []
while box_edge_um >= box_edge_min_um:
    box_edge = box_edge_um / res_umpix
    # Number of boxes needed along each coordinate to cover the window
    nxy = np.ceil(w_shp / box_edge).astype(int)

    # nxy[i] belongs to w_shp[i]. The grid divides the first coordinate's
    # range by count_column, so nxy[0] is the column count and nxy[1] the
    # row count; this keeps the grid consistent with dxy computed below.
    rect = RectangleM_new(
        pp,
        count_column=int(nxy[0]),
        count_row=int(nxy[1]),
        labels=scinames
    ).get_simpson_diversities()
    # ).get_shannon_diversities()
    Hs = np.array(list(rect.values()))
    count = sum(Hs)
    counts.append(count)

    # Record the actual box diagonal (um) rather than the target edge
    dxy = w_shp / nxy
    diag = np.sqrt(np.sum(dxy**2))
    diag_um = diag * res_umpix
    box_size.append(diag_um)

    box_edge_um /= div_edgesize

# Get regression
lnc = np.log(counts)
lns = np.log(box_size)
slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
print(slope, r_value**2)
# NOTE(review): appends to the `slopes` list created by the per-taxon
# box-counting cell; that cell must have run first.
slopes.append(-slope)

# Label with the sample name: this cell pools all taxa, so there is no
# current sciname to label with.
ax.scatter(box_size, counts, color=col, label=sn)

# Overlay the fitted power law sum = exp(intercept) * size**slope
x = np.array([min(box_size), max(box_size)])
y = math.exp(intercept) * x**slope
ax.plot(x, y, color=col)
ax.set_xscale("log")
ax.set_yscale("log")
plt.show()
plt.close()



In [None]:
# Fraction of occupied boxes (at the last box size evaluated above) whose Simpson index is exactly 0
sum(Hs == 0) / len(Hs)

### After running box counting on all

fuzzy box counting

In [None]:
# Load the fuzzy box-counting results table; the first CSV column becomes
# the index (looked up by sample name further down).
fuzzy_box_count_all_fn = sample_compare_dir + '/fuzzy_box_counting_table.csv'
fuzzy_box_count_all = pd.read_csv(fuzzy_box_count_all_fn, index_col=0)
fuzzy_box_count_all

In [None]:
# Second fuzzy box-counting table, read from the 20240506 subfolder; used
# below as a fallback for samples missing from the first table.
fuzzy_box_count_all2_fn = sample_compare_dir + '/20240506/fuzzy_box_counting_table.csv'
fuzzy_box_count_all2 = pd.read_csv(fuzzy_box_count_all2_fn, index_col=0)
fuzzy_box_count_all2

In [None]:
# Tilescans ("<date>_<sample name>") left out of the group comparisons below.
# NOTE(review): the exclusion criteria are not recorded in this notebook.
exclude_sns = [
    '2023_10_16_hsdm_slide_IB_fov_01', 
    '2023_10_16_hsdm_slide_IB_fov_03', 
    '2023_10_16_hsdm_slide_IB_fov_02', 
    '2022_12_16_harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01',
    '2022_12_16_harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01',
    '2023_02_18_hsdm_group_II_patient_7_fov_02',
    '2023_02_18_hsdm_group_II_patient_7_fov_01',
    '2024_04_16_hsdmgel_group_I_pat_10_asp_DL_fov_01',
    '2024_04_16_hsdmgel_group_I_pat_10_asp_DL_fov_02',
    '2024_04_27_hsdm_group_II_patient_12_aspect_DL_fov_01',
    '2024_04_27_hsdm_group_II_patient_12_aspect_DL_fov_02',
    '2024_04_27_hsdm_group_II_patient_12_aspect_DL_fov_03',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_01',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_02',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_03',
    '2024_04_16_hsdmgel_group_II_pat_14_asp_DB_fov_04',
    '2024_04_27_hsdm_group_II_patient_8_aspect_DL_fov_01',
    '2024_04_27_hsdm_group_II_patient_8_aspect_DL_fov_02',
    '2023_10_18_hsdm_slide_IIIB_fov_01',
    '2024_04_16_hsdmgel_group_III_pat_6_asp_ML_fov_01',
    '2024_05_03_hsdm_group_III_patient_3_aspect_DL_fov_01',
    '2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_01',
    '2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_03',
    '2024_04_24_hsdm_group_III_patient_5_aspect_DB_fov_04',
    '2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_01',
    '2024_04_24_hsdm_group_III_patient_8_aspect_DB_fov_02',    
    '2024_05_03_hsdm_group_IV_patient_3_aspect_DL_fov_01',
]

In [None]:
# Collect the fuzzy box-counting value (minus the fitted slope) of every
# non-excluded tilescan, grouped by clinical group.
dict_group_bcvals = defaultdict(list)
for group, sns in dict_group_sn.items():
    print(group)
    for sn in sns:
        if sn in exclude_sns:
            continue
        # Results are split over two tables: use the second one only when
        # the sample is missing from the first. Catch KeyError alone so
        # that any other failure is not silently hidden.
        try:
            bcval = -fuzzy_box_count_all.loc[sn, 'slope']
        except KeyError:
            bcval = -fuzzy_box_count_all2.loc[sn, 'slope']

        dict_group_bcvals[group].append(bcval)
        # if bcval > 1.85:
        #     print(sn, bcval)



In [None]:
# Boxplot with jittered points of the fuzzy box-counting values per group.
group_sort = ['healthy_implant', 'mucositis_implant', 'moderate_peri_implantitis', 'severe_peri_implantitis']

xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = dict(zip(group_sort, xticks))

s = 5

fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

for group in group_sort:
    x = dict_group_xtick[group]
    bcvals = dict_group_bcvals[group]
    # Horizontal jitter of +/- 0.1 so overlapping points stay visible
    xs = x + np.random.rand(len(bcvals)) * 0.2 - 0.1

    box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)

    ax.scatter(xs, bcvals, color='k', s=s)

# Tick labels are hidden on purpose; re-enable for a labelled version:
# _ = ax.set_xticks(
#     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
# )
_ = ax.set_xticks([])
ax.set_xlim(0, np.max(xticks) + 1)
# ax.set_ylabel('Homogeneity\n(Fuzzy box counting fractal dimension)')
ax.set_ylim(1.4, 2)
out_fn = sample_compare_dir + "/box_counting/fuzzy_box_counting_bygroup.pdf"
ip.check_dir(out_fn)
ip.save_fig(out_fn)

In [None]:
# t-test with equal_var=False for every ordered pair of distinct groups
# (each unordered pair is therefore printed twice, once per direction).
ordered_pairs = [
    (first, second)
    for first in group_sort
    for second in group_sort
    if first != second
]
for g0, g1 in ordered_pairs:
    bc0 = dict_group_bcvals[g0]
    bc1 = dict_group_bcvals[g1]
    print(g0, 'vs.', g1)
    print(stats.ttest_ind(bc0, bc1, equal_var=False))

multispecies box counting

In [None]:
# Load the multispecies box-counting table.
# NOTE: this rebinds fuzzy_box_count_all, replacing the fuzzy table loaded earlier.
fuzzy_box_count_all_fn = sample_compare_dir + '/multispecies_box_counting_table.csv'
fuzzy_box_count_all = pd.read_csv(fuzzy_box_count_all_fn, index_col=0)
fuzzy_box_count_all

In [None]:
# Tilescans left out of the multispecies comparison.
# NOTE: this rebinds exclude_sns, replacing the longer list defined earlier.
exclude_sns = [
    '2023_10_16_hsdm_slide_IB_fov_01', 
    '2023_10_16_hsdm_slide_IB_fov_03', 
    '2023_10_16_hsdm_slide_IB_fov_02', 
    '2022_12_16_harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01',
    '2022_12_16_harvardwelch_patient_19_tooth_15_aspect_MF_depth_sub_fov_01',
    '2022_12_16_harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01',
    '2023_02_18_hsdm_group_II_patient_7_fov_02',

]

In [None]:
# Gather minus the fitted slope of every non-excluded tilescan, per group.
dict_group_bcvals = defaultdict(list)
for group, sns in dict_group_sn.items():
    print(group)
    for sn in sns:
        if sn in exclude_sns:
            continue
        bcval = -fuzzy_box_count_all.loc[sn, 'slope']
        dict_group_bcvals[group].append(bcval)


In [None]:
# Boxplot with jittered points of the multispecies box-counting values per group.
group_sort = ['healthy_implant', 'mucositis_implant', 'peri_implantitis']

xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = dict(zip(group_sort, xticks))

s = 5

fig, ax = ip.general_plot(dims=(4,5), ft=12)

for group in group_sort:
    x = dict_group_xtick[group]
    bcvals = dict_group_bcvals[group]
    # Horizontal jitter of +/- 0.1 so overlapping points stay visible
    jitter = np.random.rand(len(bcvals)) * 0.2
    xs = x + jitter - 0.1

    box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
    ax.scatter(xs, bcvals, color='k', s=s)

_ = ax.set_xticks(
    xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0, np.max(xticks) + 1)
ax.set_ylabel('Homogeneity\n(Box counting fractal dimension)')
ax.set_ylim(1, 2)
# out_fn = cluster_size_dir + "/size_distribution_boxplot_bygroup.png".format(
#     group
# )
# ip.check_dir(out_fn)
# ip.save_fig(out_fn)

## Combine patient data points

Load values into dict

In [None]:
# Check the output directory template currently in use
out_dir

In [None]:
# Load the fuzzy box-counting slope of every non-excluded tilescan into
# {group: {patient: [slope, ...]}}.
spatial_dir = out_dir + '/spatial_statistics'
fuzzy_box_counting_fmt = spatial_dir + '/box_counting/{date}_{sn}_fuzzy_box_counting_regression.csv'

# Splits "<YYYY_MM_DD>_<sample name>" at the underscore right after the date.
# Raw string so that \d is not treated as an (invalid) string escape.
date_split_re = re.compile(r"(?<=^\d{4}_\d{2}_\d{2})_")

dict_group_pat_slopes = defaultdict(lambda: defaultdict(list))
for gr, ps_dict in dict_group_pat_sn.items():
    for pat, sns in ps_dict.items():
        for bn in sns:
            # NOTE(review): uses whichever exclude_sns list was defined last
            if bn in exclude_sns:
                continue
            date, sn = date_split_re.split(bn)
            fn = fuzzy_box_counting_fmt.format(date=date, sn=sn)
            box_counting = pd.read_csv(fn)
            # Only the first row's slope is used
            slope = box_counting['slope'].values[0]
            dict_group_pat_slopes[gr][pat].append(slope)



In [None]:
# Collapse each patient's tilescans to one value: minus the mean slope.
dict_group_slmeans = defaultdict(list)
for gr, ps_dict in dict_group_pat_slopes.items():
    for pat_slopes in ps_dict.values():
        dict_group_slmeans[gr].append(-np.mean(pat_slopes))


In [None]:
# Boxplot with jittered points of the per-patient fuzzy box-counting values by group.
group_sort = ['healthy_implant', 'mucositis_implant', 'mild_peri_implantitis', 'moderate_severe_peri_implantitis']

xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = dict(zip(group_sort, xticks))

s = 5

fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

for group in group_sort:
    x = dict_group_xtick[group]
    bcvals = dict_group_slmeans[group]
    # Horizontal jitter of +/- 0.1 so overlapping points stay visible
    xs = x + np.random.rand(len(bcvals)) * 0.2 - 0.1

    box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)

    ax.scatter(xs, bcvals, color='k', s=s)

# Tick labels are hidden on purpose; re-enable for a labelled version:
# _ = ax.set_xticks(
#     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
# )
_ = ax.set_xticks([])
ax.set_xlim(0, np.max(xticks) + 1)
# ax.set_ylabel('Homogeneity\n(Fuzzy box counting fractal dimension)')
ax.set_ylim(1.5, 2)

out_fn = sample_compare_dir + "/box_counting/fuzzy_box_counting_bygroup_patient.pdf"
ip.check_dir(out_fn)
ip.save_fig(out_fn)



In [None]:
# t-test with equal_var=False for every ordered pair of distinct groups
# (each unordered pair is therefore printed twice, once per direction).
ordered_pairs = [
    (first, second)
    for first in group_sort
    for second in group_sort
    if first != second
]
for g0, g1 in ordered_pairs:
    bc0 = dict_group_slmeans[g0]
    bc1 = dict_group_slmeans[g1]
    print(g0, 'vs.', g1)
    print(stats.ttest_ind(bc0, bc1, equal_var=False))

Kruskal–Wallis test for a difference in medians between groups

In [None]:

# Test all groups in group_sort at once instead of hard-coding four indices,
# so the call stays correct if the list of groups changes.
stats.kruskal(*(dict_group_slmeans[g] for g in group_sort))

Dunn pairwise test

In [None]:
# Pass the groups explicitly in group_sort order. Taking dict .values()
# follows insertion order, which need not match group_sort and can include
# groups that were not part of the Kruskal-Wallis test. Rows/columns 1..k of
# the result correspond to group_sort[0..k-1].
posthoc_dunn([dict_group_slmeans[g] for g in group_sort])

In [None]:
# Same Dunn test with Bonferroni-adjusted p-values. Groups are passed in
# group_sort order (dict .values() follows insertion order instead), so
# rows/columns 1..k of the result correspond to group_sort[0..k-1].
posthoc_dunn([dict_group_slmeans[g] for g in group_sort], p_adjust='bonferroni')

## Plot simpson diversity curves

In [None]:
# Inspect the per-patient slopes of the healthy implant group
dict_group_pat_slopes['healthy_implant']

Load Xas dicts

In [None]:
# The two tilescans whose Simpson diversity curves are compared
sns = [
    '2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_02',
    '2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01',
]

# Give each sample the tab10 colour at its position in the list
cols = plt.get_cmap('tab10').colors
dict_bn_col = dict(zip(sns, cols))

In [None]:
# Path templates for the per-sample multifractal outputs; {date} and {sn}
# are filled in with str.format.
out_dir = '../outputs/{date}/{date}_{sn}'
spatial_dir = out_dir + '/spatial_statistics'
multifractal_dir = spatial_dir + '/multifractal'
# YAML file read below as {area: {q: [partition values]}}
local_diversity_dict_fmt = multifractal_dir + '/{date}_{sn}_dict_area_q_partition_vals.yaml'


In [None]:
# For each sample, load the {area: {q: values}} dictionary and reduce it to
# one summed Simpson diversity per box area.

# Splits "<YYYY_MM_DD>_<sample name>" at the underscore right after the date.
# Raw string so that \d is not treated as an (invalid) string escape.
date_split_re = re.compile(r"(?<=^\d{4}_\d{2}_\d{2})_")

dict_bn_areas_Xas = {}
for bn in sns:
    date, sn = date_split_re.split(bn)
    print(date, sn)
    ldd_fn = local_diversity_dict_fmt.format(date=date, sn=sn)
    # NOTE(review): yaml.unsafe_load can construct arbitrary Python objects;
    # only use it on files produced by this pipeline.
    with open(ldd_fn, 'r') as f:
        dict_area_q_Xas = yaml.unsafe_load(f)
    areas = []
    Dsums = []
    for area, dict_q_Xas in dict_area_q_Xas.items():
        areas.append(area)
        # Take the q = 2 values and convert each to D = 1 - X before summing
        Xas = dict_q_Xas[2]
        Ds = 1 - np.array(Xas)
        Dsums.append(np.sum(Ds))
    # Stored as [areas, summed diversities], in matching order
    dict_bn_areas_Xas[bn] = [areas, Dsums]




In [None]:
# Plot summed Simpson diversity against box edge length on log-log axes for
# each sample, with the fitted power law and two reference slopes.
marker_size = 2

fig, ax = ip.general_plot(dims=(1.9,2.5), ft=6)

# Overall edge range across samples, used for the reference lines so they
# do not depend on whichever sample happened to be plotted last.
edge_lo, edge_hi = np.inf, -np.inf
for bn in sns:
    areas, Dsums = dict_bn_areas_Xas[bn]
    # Box edge length = square root of the box area
    edges = np.array(areas) ** (1/2)
    edge_lo = min(edge_lo, edges.min())
    edge_hi = max(edge_hi, edges.max())

    lnc = np.log(Dsums)
    lns = np.log(edges)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    print('fit: ',slope, ', r^2: ', r_value**2, ', intercept: ', intercept)

    col = dict_bn_col[bn]
    ax.scatter(edges, Dsums, color=col, s=marker_size)

    # Fitted power law sum = exp(intercept) * edge**slope
    x = np.array([min(edges), max(edges)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col)

ax.set_xscale("log")
ax.set_yscale("log")
# ax.set_xticks([], labels=[])
ax.xaxis.set_minor_formatter(tck.NullFormatter())

# Plot idealized slopes (-2 and -1) as dotted grey guides; the intercepts
# are hand-picked for placement on the figure.
color = (0.5,0.5,0.5)
linestyle = ':'

x = np.array([edge_lo, edge_hi])
for intercept, slope in ((12.25, -2), (6, -1)):
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=color, ls=linestyle)

sample_compare_dir =  '../outputs/compare_samples'
out_fn = sample_compare_dir + "/box_counting/fuzzy_box_counting_curves.pdf"
# Same check_dir-before-save_fig sequence as the other figure cells
ip.check_dir(out_fn)
ip.save_fig(out_fn)

In [None]:
np.array(list(dict_area_q_Xas.keys()))**(1/2)

## Plot simpson diversity curves cosdist

In [None]:
# Samples compared in the cosine-distance version of the curves
sns = [
    '2023_02_08_hsdm_group_1_sample_06_fov_01',
    '2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01',
]

cols = plt.get_cmap('tab10').colors
# Pair the samples, in order, with successive tab10 colours
dict_bn_col = dict(zip(sns, cols))

In [None]:
# Path templates built on out_dir; {date} and {sn} are filled in later with str.format
spatial_dir = ''.join((out_dir, '/spatial_statistics'))
multifractal_dir = ''.join((spatial_dir, '/multifractal'))
local_diversity_dict_fmt = ''.join(
    (multifractal_dir, '/{date}_{sn}_dict_area_q_partition_vals.yaml')
)


In [None]:
# For every sample, load the dict {box area: {q: partition values}} from the
# multifractal output directory and reduce it to one summed value per box area.
dict_bn_areas_Xas = {}
for bn in sns:
    # Split "<YYYY_MM_DD>_<sample name>" at the underscore that follows the date.
    # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
    date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", bn)
    print(date,sn)
    ldd_fn = local_diversity_dict_fmt.format(date=date, sn=sn)
    with open(ldd_fn, 'r') as f:
        # NOTE(review): unsafe_load can instantiate arbitrary Python objects;
        # only point this at YAML files written by our own pipeline.
        dict_area_q_Xas = yaml.unsafe_load(f)
    areas = []
    Dsums = []
    for area, dict_q_Xas in dict_area_q_Xas.items():
        areas.append(area)
        # Values stored under key 2 (presumably the q = 2 partition values)
        Xas = dict_q_Xas[2]
        # Per-box complement 1 - X, summed over all boxes of this area
        Ds = 1 - np.array(Xas)
        Dsums.append(np.sum(Ds))
    # Despite the dict name, the stored pair is [areas, summed (1 - X) values]
    dict_bn_areas_Xas[bn] = [areas, Dsums]




In [None]:
marker_size = 2

fig, ax = ip.general_plot(dims=(1.9,2.5), ft=6)
for bn in sns:
    areas, Dsums = dict_bn_areas_Xas[bn]
    # Box edge length is the square root of the box area
    edges = np.array(areas) ** (1/2)

    # Straight-line fit in log-log space
    lnc, lns = np.log(Dsums), np.log(edges)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    print('fit: ',slope, ', r^2: ', r_value**2, ', intercept: ', intercept)

    # Data points and fitted power law in the sample's colour
    col = dict_bn_col[bn]
    ax.scatter(edges, Dsums, color=col, s=marker_size)
    x = np.array([min(edges), max(edges)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col)

    ax.set_xscale("log")
    ax.set_yscale("log")
    # ax.set_xticks([], labels=[])
    ax.xaxis.set_minor_formatter(tck.NullFormatter())

# Plot idealized slopes (dotted grey), spanning the edge range of the last sample
color = (0.5,0.5,0.5)
linestyle = ':'
for intercept, slope in ((12.25, -2), (6.5, -1)):
    x = np.array([min(edges), max(edges)])
    y = math.exp(intercept) * x**slope
    ax.plot(x,y, color=color, ls=linestyle)

sample_compare_dir =  '../outputs/compare_samples'
out_fn = sample_compare_dir + "/box_counting/fuzzy_box_counting_curves_cosdist.pdf"
ip.save_fig(out_fn)

In [None]:
np.array(list(dict_area_q_Xas.keys()))**(1/2)

## Plot abundance of each genus grouped by clinical diagnosis

In [None]:
# Location of the per-sample classification outputs
out_fmt_classif = ''.join((out_dir, "/classif"))
abundances_fmt = ''.join((out_fmt_classif, '/{date}_{sn}_scinames.npy'))

# Taxa that get their own plot, in plotting order
sciname_list = [
    'Pasteurellaceae', 'Corynebacterium', 'Veillonella',
    'Actinomyces', 'Selenomonas', 'Rothia',
    'Porphyromonas', 'Capnocytophaga', 'Prevotella',
    'Streptococcus', 'Gemella', 'Campylobacter',
    'Lautropia', 'Leptotrichia', 'Neisseriaceae',
    'Treponema', 'Fusobacterium', 'Saccharibacteria',
]

# Labels that are replaced by the name used in sciname_list
dict_sci_rename = dict(
    zip(('TM7', 'Neisseria'), ('Saccharibacteria', 'Neisseriaceae'))
)

In [None]:
# dict_group_sn = {
#     'a':['2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01'],
#     'b':['2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_02']
# }

In [None]:
# Relative abundance (fraction of cells) of every taxon, per sample, collected
# by group: dict_group_scn_abund[group][taxon] -> one value per sample.
dict_group_scn_abund = defaultdict(lambda: defaultdict(list))
# NOTE: this loop rebinds the notebook-level name `sns`.
for group, sns in dict_group_sn.items():
    print(group)
    for sn_ in sns:
        # Get fns
        # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
        date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", sn_)
        abundances_fn = abundances_fmt.format(date=date, sn=sn)

        # Load files
        abundances = np.load(abundances_fn)

        # Count cells per taxon in a single pass. Labels that map to the same
        # renamed taxon are merged so each sample contributes exactly one
        # value per taxon.
        ncells = len(abundances)
        scn_unq, scn_counts = np.unique(abundances, return_counts=True)
        dict_scn_count = defaultdict(int)
        for sciname, count in zip(scn_unq, scn_counts):
            scn = dict_sci_rename.get(sciname, sciname)
            dict_scn_count[scn] += count

        # add values
        for scn, count in dict_scn_count.items():
            dict_group_scn_abund[group][scn].append(count / ncells)

        # Zeros for missed taxa
        for sciname in sciname_list:
            if sciname not in dict_scn_count:
                dict_group_scn_abund[group][sciname].append(0)


In [None]:
# NOTE(review): these group names differ from the ones used in the absolute
# abundance cells further down; a name missing from dict_group_scn_abund just
# yields an empty column because it is a defaultdict - confirm the names.
group_sort = ['healthy_implant', 'mucositis_implant', 'moderate_peri_implantitis', 'severe_peri_implantitis']
# group_sort = ['a','b']

# One x position (1, 2, ...) per group
xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = {group: xtick for group, xtick in zip(group_sort, xticks)}

s=5

# One figure per taxon: per-sample abundances, one jittered column per group
for scn in sciname_list:
    print(scn)
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

    for group in group_sort:
        x = dict_group_xtick[group]
        bcvals = dict_group_scn_abund[group][scn]
        # Horizontal jitter in [-0.1, 0.1) around the group's tick
        jitter = np.random.rand(len(bcvals)) * 0.2
        xs = [x] * len(bcvals) + jitter - 0.1

        # color = dict_sciname_color[scn]
        # box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        # if scn == 'Veillonella': print(bcvals)
        ax.scatter(xs, bcvals, color='k', s=s)
    plt.show()
    # _ = ax.set_xticks(
    #     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
    # )
    # _ = ax.set_xticks([])
    # ax.set_xlim(0,np.max(xticks) + 1)
    # ax.set_ylim(-0.01,1.01)
    # ax.set_ylabel('Homogeneity\n(Fuzzy box counting fractal dimension)')
    # ax.set_ylim(1.4,2)

    # out_fn = sample_compare_dir + "/box_counting/2024_05_06_fuzzy_box_counting_bygroup.pdf".format(
    #     group
    # )
    # ip.check_dir(out_fn)
    # ip.save_fig(out_fn)

### Absolute abundance

In [None]:
# Templates for the aggregated absolute-abundance files of one sample
out_fmt_absabund = ''.join((out_fmt_classif, '/absolute_abundance'))
volume_agg_fmt = ''.join(
    (out_fmt_absabund, '/{date}_{sn}_absolute_abundance_volume_aggregated.npy')
)
scinames_agg_fmt = ''.join(
    (out_fmt_absabund, '/{date}_{sn}_absolute_abundance_scinames_aggregated.npy')
)



In [None]:
# date = '2022_12_16'
# sn = 'harvardwelch_patient_10_tooth_8_aspect_MB_depth_supra_fov_02'


# volume_agg_fn = volume_agg_fmt.format(date=date, sn=sn)
# volume_agg = np.load(volume_agg_fn)
# scinames_agg_fn = scinames_agg_fmt.format(date=date, sn=sn)
# scinames_agg = np.load(scinames_agg_fn)

# print(volume_agg)
# scinames_agg

In [None]:
# Taxa that get their own plot, in plotting order
sciname_list = [
    'Pasteurellaceae', 'Corynebacterium', 'Veillonella',
    'Actinomyces', 'Selenomonas', 'Rothia',
    'Porphyromonas', 'Capnocytophaga', 'Prevotella',
    'Streptococcus', 'Gemella', 'Campylobacter',
    'Lautropia', 'Leptotrichia', 'Neisseriaceae',
    'Treponema', 'Fusobacterium', 'Saccharibacteria',
]

# Labels that are replaced by the name used in sciname_list
dict_sci_rename = dict(
    zip(('TM7', 'Neisseria'), ('Saccharibacteria', 'Neisseriaceae'))
)

In [None]:
# Pool all non-excluded samples of a patient:
# dict_group_pat_volscis[group][patient] -> {'vol': summed volume_agg[0],
#                                            'scinames': concatenated labels}
# NOTE(review): exclude_sns and dict_group_pat_sn are not defined in this
# cell; the cell that defines exclude_sns must be run first.
# NOTE: the inner loop rebinds the notebook-level name `sns`.
dict_group_pat_volscis = defaultdict(lambda: defaultdict(dict))
for gr, ps_dict in dict_group_pat_sn.items():
    for pat, sns in ps_dict.items():
        # Patients whose samples are all excluded get no entry at all
        sns_kept = [bn for bn in sns if bn not in exclude_sns]
        if not sns_kept:
            continue

        dict_vs = dict_group_pat_volscis[gr][pat]
        dict_vs['vol'] = 0
        dict_vs['scinames'] = []
        for bn in sns_kept:
            # Raw string: "\d" inside a plain string literal is an invalid escape sequence.
            date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", bn)

            volume_agg_fn = volume_agg_fmt.format(date=date, sn=sn)
            volume_agg = np.load(volume_agg_fn)

            # Only the first element of the saved volume array is used
            dict_vs['vol'] += volume_agg[0]

            scinames_agg_fn = scinames_agg_fmt.format(date=date, sn=sn)
            scinames_agg = np.load(scinames_agg_fn)

            dict_vs['scinames'] += scinames_agg.tolist()


In [None]:
# Absolute abundance (label count / pooled volume) of each taxon per patient:
# dict_group_sci_abunds[group][taxon] -> one value per patient with that taxon.
# NOTE(review): unlike the relative-abundance cell, patients in which a taxon
# is absent contribute no zero here - confirm that is intended before testing.
dict_group_sci_abunds = defaultdict(lambda: defaultdict(list))
for gr, pvs_dict in dict_group_pat_volscis.items():
    for pat, vs_dict in pvs_dict.items():
        scinames_pat = vs_dict['scinames']
        volume_pat = vs_dict['vol']
        names, counts = np.unique(scinames_pat, return_counts=True)

        # Merge labels that map to the same renamed taxon (e.g. when a
        # patient's pooled samples contain both an old and a new label), so
        # each patient contributes at most one value per taxon.
        dict_sci_count = defaultdict(int)
        for sci, c in zip(names, counts):
            dict_sci_count[dict_sci_rename.get(sci, sci)] += c

        for sci, c in dict_sci_count.items():
            dict_group_sci_abunds[gr][sci].append(c / volume_pat)


In [None]:
group_sort = ['healthy_implant', 'mucositis_implant', 'mild_peri_implantitis', 'moderate_severe_peri_implantitis']

# One x position (1, 2, ...) per group
xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = dict(zip(group_sort, xticks))

s=5

# The output directory is the same for every taxon: create it once, without
# the check-then-create pattern.
abund_dir_compare = sample_compare_dir + "/absolute_abundances"
os.makedirs(abund_dir_compare, exist_ok=True)

# One figure per taxon: a box plot plus jittered per-patient points per group
for scn in sciname_list:
    print(scn)
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

    for group in group_sort:
        # Load classif
        x = dict_group_xtick[group]
        bcvals = dict_group_sci_abunds[group][scn]
        # Horizontal jitter in [-0.1, 0.1) around the group's tick
        xs = [x] * len(bcvals) + np.random.rand(len(bcvals)) * 0.2 - 0.1

        color = dict_sciname_color[scn]
        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        # if scn == 'Veillonella': print(bcvals)
        ax.scatter(xs, bcvals, color=color, s=s)

    _ = ax.set_xticks([])

    out_fn = abund_dir_compare + "/absolute_abundances_bygroup_sciname_{}.pdf".format(
        scn
    )
    ip.check_dir(out_fn)
    ip.save_fig(out_fn)
    plt.show()

Kruskal-Wallis test

In [None]:
# Kruskal-Wallis test across all groups in group_sort, one test per taxon.
# kruskal_ps[i] is the p-value for sciname_list[i].
kruskal_ps = []
for sci in sciname_list:
    # One sample of abundances per group; works for any number of groups
    samples = [dict_group_sci_abunds[gr][sci] for gr in group_sort]
    test = stats.kruskal(*samples)
    kruskal_ps.append(test.pvalue)
kruskal_ps = np.array(kruskal_ps)

In [None]:
np.where(kruskal_ps <= 0.05)

In [None]:
kruskal_ps[np.where(kruskal_ps <= 0.05)]

In [None]:
# Print every taxon with its Kruskal-Wallis p-value.
# The loop variable must not be called `ps`: that name is the libpysal alias
# (`import libpysal as ps`) that later cells call as ps.cg.convex_hull.
for sci, pval in zip(sciname_list, kruskal_ps):
    print(sci, pval)

In [None]:
# Taxa whose Kruskal-Wallis p-value is at or below 0.05
kruskal_sci = [
    sci_name
    for sci_name, p_val in zip(sciname_list, kruskal_ps)
    if p_val <= 0.05
]
kruskal_sci

Dunn pairwise test

In [None]:
# Dunn post-hoc comparison between groups for the significant taxa plus two
# taxa that are always reported
for sci in [*kruskal_sci, 'Actinomyces', 'Corynebacterium']:
    print(sci)
    vals = [dict_group_sci_abunds[gr][sci] for gr in group_sort]
    print(posthoc_dunn(vals))


## Just look at largest clusters

Get largest clusters for each sciname

In [None]:
# topn = 1
# dict_sn_large_clusters = defaultdict(dict)
# for sn, dict_scn_clsize in dict_sn_scn_clsize.items():
#     for scn, clsizes in dict_scn_clsize.items():      
#         cls_topn = np.sort(clsizes)[-topn:]
#         cls_mean = np.mean(cls_topn)
#         dict_sn_large_clusters[sn][scn] = cls_mean
        

Get largest clusters for each sample

In [None]:
# topn = 1
# dict_sn_topn = {}
# for sn, dict_scn_clsize in dict_sn_large_clusters.items():
#     group = dict_sn_group[sn]
#     sizes = []
#     for scn, size in dict_scn_clsize.items():
#         sizes.append(size)
#     sz_sort = np.sort(sizes)
#     sz_topn = sz_sort[-topn:]
#     sz_mean = np.mean(sz_topn)
#     dict_sn_topn[sn] = sz_mean
    


        

In [None]:
# Largest cluster size over all taxa, per sample (0 if nothing exceeds 0)
dict_sn_top = {}
for sn, dict_scn_clsize in dict_sn_scn_clsize.items():
    sn_top = 0
    for scn, clsizes in dict_scn_clsize.items():
        # Largest cluster of this taxon
        scn_top = np.sort(clsizes)[-1]
        if scn_top > sn_top:
            sn_top = scn_top
    dict_sn_top[sn] = sn_top

        

In [None]:
positions

In [None]:
s = 10

fig, ax = ip.general_plot(dims=(5,5), ft=12)

# Groups shown, left to right, at x = 1, 2, 3
group_list = ['healthy_implant', 'mucositis_implant', 'peri_implantitis']
xticks = np.arange(len(group_list)) + 1
dict_group_xtick = dict(zip(group_list, xticks))
# col_list = 
# dict_group_color = dict(group_list, col_list)

dict_group_sizes = defaultdict(list)
for sn, size in dict_sn_top.items():
    group = dict_sn_group[sn]
    # Select on membership in group_list. A substring test for 'implant' also
    # matches any other "...implant..." group name and then fails with a
    # KeyError on the tick lookup.
    # NOTE(review): samples from groups outside group_list are skipped silently.
    if group in dict_group_xtick:
        # Jitter in [-0.1, 0.1) around the group's tick
        x = dict_group_xtick[group] + np.random.rand() * 0.2 - 0.1
        dict_group_sizes[group].append(size)
        ax.scatter(x, size, color='k', s=s)

sizes = [dict_group_sizes[g] for g in group_list]
# positions = [[xt]*len(s) for xt, s in zip(xticks, sizes)]

box = ax.boxplot(sizes, positions=xticks, vert=True, widths=0.5)

_ = ax.set_xticks(
    xticks, group_list, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0,np.max(xticks) + 1)
ax.set_ylabel('Largest cluster in tilescan')


In [None]:
# Scratch: add rows to an empty frame. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported way to do this.
a = pd.DataFrame([])
pd.concat([a, pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})])

# Test out multifractal analysis

In [None]:
class RectangleM_new:
    """
    Rectangle grid structure for quadrat-based methods on labelled points.

    The minimum bounding box of the point pattern is divided into a regular
    grid of rectangles. Every point, together with its label, is assigned to
    one grid cell, and per-cell statistics of the labels are computed.

    Parameters
    ----------
    pp                : :class:`.PointPattern`
                        Point Pattern instance. Only its ``mbb`` and
                        ``points`` attributes are used.
    labels            : sequence
                        One label per point, in the same order as
                        ``pp.points``.
    count_column      : integer
                        Number of rectangles in the horizontal
                        direction. Use in pair with count_row to
                        fully specify a rectangle. Incompatible with
                        rectangle_width and rectangle_height.
    count_row         : integer
                        Number of rectangles in the vertical
                        direction. Use in pair with count_column to
                        fully specify a rectangle. Incompatible with
                        rectangle_width and rectangle_height.
    rectangle_width   : float
                        Rectangle width. Use in pair with
                        rectangle_height to fully specify a rectangle.
                        Incompatible with count_column & count_row.
    rectangle_height  : float
                        Rectangle height. Use in pair with
                        rectangle_width to fully specify a rectangle.
                        Incompatible with count_column & count_row.

    Attributes
    ----------
    pp                : :class:`.PointPattern`
                        Point Pattern instance.
    mbb               : array
                        Minimum bounding box for the point pattern.
    points            : array
                        x,y coordinates of the points.
    labels            : array
                        Label of each point.
    count_column      : integer
                        Number of columns.
    count_row         : integer
                        Number of rows.
    num               : integer
                        Number of rectangular quadrats.
    dict_id_labels    : dict
                        keys: rectangle id, values: list of the labels of
                        the points in that rectangle. Only rectangles
                        holding at least one point are present. Set by the
                        ``get_*`` methods.

    """

    def __init__(self, pp, labels, count_column = 3, count_row = 3,
                 rectangle_width = 0, rectangle_height = 0):
        self.mbb = pp.mbb
        self.pp = pp
        self.points = np.asarray(pp.points)
        self.labels = np.array(labels)
        x_range = self.mbb[2]-self.mbb[0]
        y_range = self.mbb[3]-self.mbb[1]
        # Logical `and`, not bitwise `&`: `&` raises TypeError for float
        # sizes and gives wrong answers for ints (e.g. 2 & 4 == 0).
        if rectangle_width and rectangle_height:
            self.rectangle_width = rectangle_width
            self.rectangle_height = rectangle_height

            # calculate column count and row count
            self.count_column = int(math.ceil(x_range / rectangle_width))
            self.count_row = int(math.ceil(y_range / rectangle_height))
        else:
            self.count_column = count_column
            self.count_row = count_row

            # calculate the actual width and height of cell
            self.rectangle_width = x_range/float(count_column)
            self.rectangle_height = y_range/float(count_row)
        self.num = self.count_column * self.count_row

    def _cell_ids(self):
        """
        Compute the rectangle id of every point.

        Ids are row-major: ``index_y * count_column + index_x``.

        Returns
        -------
        ids : array of int
              One rectangle id per point, in point order.
        """
        pts = self.points
        if pts.size == 0:
            return np.empty(0, dtype=int)
        index_x = (pts[:, 0] - self.mbb[0]) // self.rectangle_width
        index_y = (pts[:, 1] - self.mbb[1]) // self.rectangle_height
        # Points exactly on the upper bound of the bounding box would fall
        # one cell past the grid; put them in the last column / row instead.
        index_x[index_x == self.count_column] -= 1
        index_y[index_y == self.count_row] -= 1
        return (index_y * self.count_column + index_x).astype(int)

    @staticmethod
    def _label_counts(labels):
        """Return the number of occurrences of each distinct label."""
        return np.unique(np.asarray(labels), return_counts=True)[1]

    def point_location_sta(self):
        """
        Count the point events in each cell.

        Returns
        -------
        dict_id_count : dict
                        keys: rectangle id, values: number of point
                        events in each cell.
        """
        dict_id_count = {cell_id: 0 for cell_id in range(self.num)}
        for cell_id in self._cell_ids().tolist():
            dict_id_count[cell_id] += 1
        return dict_id_count

    def _get_dict_id_labels(self):
        """
        Group the point labels by the cell their point falls in.

        Returns nothing; the result is stored in ``self.dict_id_labels``
        (keys: rectangle id, values: list of labels). Cells without points
        get no entry.
        """
        dict_id_labels = defaultdict(list)
        for cell_id, lab in zip(self._cell_ids().tolist(), self.labels):
            dict_id_labels[cell_id].append(lab)
        self.dict_id_labels = dict_id_labels

    def get_shannon_diversities(self):
        """
        Shannon entropy ``-sum(p * ln(p))`` of the labels in each occupied cell.

        Returns
        -------
        dict_idx_shannon : dict
                           keys: rectangle id, values: entropy.
        """
        self._get_dict_id_labels()
        dict_idx_shannon = {}
        for idx, labels in self.dict_id_labels.items():
            p = self._label_counts(labels) / len(labels)
            dict_idx_shannon[idx] = -np.sum(p * np.log(p))
        return dict_idx_shannon

    def get_simpson_diversities(self):
        """
        Simpson diversity ``1 - sum(n_l (n_l - 1)) / (N (N - 1))`` of each
        occupied cell, where ``n_l`` is the count of label ``l`` and ``N`` the
        number of points in the cell. Cells with a single point get 0.

        Returns
        -------
        dict_idx_simpson : dict
                           keys: rectangle id, values: diversity.
        """
        self._get_dict_id_labels()
        dict_idx_simpson = {}
        for idx, labels in self.dict_id_labels.items():
            nlab = len(labels)
            if nlab > 1:
                counts = self._label_counts(labels)
                D = 1 - np.sum(counts * (counts - 1)) / (nlab*(nlab-1))
            else:
                D = 0
            dict_idx_simpson[idx] = D
        return dict_idx_simpson

    def get_multispecies_tiles(self):
        """
        Flag occupied cells that hold more than one distinct label.

        Returns
        -------
        dict_idx_multi : dict
                         keys: rectangle id, values: 1 if the cell holds
                         more than one distinct label, else 0.
        """
        self._get_dict_id_labels()
        dict_idx_multi = {}
        for idx, labels in self.dict_id_labels.items():
            dict_idx_multi[idx] = 1 if len(np.unique(labels)) > 1 else 0
        return dict_idx_multi

    def get_partition_values(self, q):
        """
        Partition value of order ``q`` for each occupied cell.

        With ``p_l`` the fraction of the cell's points carrying label ``l``,
        the value is ``sum(p_l ** q)``, or ``-sum(p_l * ln(p_l))`` for
        ``q == 1``.

        Parameters
        ----------
        q : number
            Moment order.

        Returns
        -------
        dict_idx : dict
                   keys: rectangle id, values: partition value.
        """
        self._get_dict_id_labels()
        dict_idx = {}
        for idx, labels in self.dict_id_labels.items():
            p = self._label_counts(labels) / len(labels)
            if q == 1:
                Xa = np.sum(-p * np.log(p))
            else:
                Xa = np.sum(p**q)
            dict_idx[idx] = Xa
        return dict_idx

    def get_partition_values_noreplace(self, q):
        """
        Partition value of order ``q`` using sampling without replacement.

        For ``q != 1`` each label with count ``n_l >= |q|`` contributes
        ``(prod_{k<|q|} (n_l - k) / prod_{k<|q|} (N - k)) ** sign(q)``, where
        ``N`` is the number of points in the cell; labels with fewer than
        ``|q|`` points contribute 0. For ``q == 1`` the value is
        ``-sum(p_l * ln(p_l))``. Cells with fewer than ``|q|`` points get 0.

        Parameters
        ----------
        q : integer
            Moment order.

        Returns
        -------
        dict_idx : dict
                   keys: rectangle id, values: partition value.
        """
        self._get_dict_id_labels()
        n_draws = abs(q)
        dict_idx = {}
        for idx, labels in self.dict_id_labels.items():
            nlab = len(labels)
            Xs = []
            if nlab >= n_draws:
                # Offsets 0 .. |q|-1 subtracted for the successive draws
                offsets = np.arange(n_draws)
                for nl in self._label_counts(labels):
                    if q == 1:
                        p = nl / nlab
                        Xs.append(-p * np.log(p))
                    elif nl >= n_draws:
                        p_ = (
                            np.prod(nl - offsets) / np.prod(nlab - offsets)
                        ) ** np.sign(q)
                        Xs.append(p_)
                    else:
                        Xs.append(0)
            else:
                Xs.append(0)
            dict_idx[idx] = np.sum(Xs)
        return dict_idx

In [None]:
# Scratch check: an integer zero raised to a negative power raises
# ZeroDivisionError, so this cell stops a "Run All".
(0)**-1

Get partition values

In [None]:
# Compute, for one sample, the mean partition value of every order q over a
# series of shrinking box sizes (result: dict_q_Xams[q] -> one value per size).
# NOTE(review): centroid_sciname_fmt, res_umpix and w_shp are not defined in
# this cell; they must be left over from earlier cells.
date = "2023_02_08"
sn = "hsdm_group_1_sample_12_fov_01"

centroid_sciname_fn = centroid_sciname_fmt.format(date=date, sn=sn)
centroid_sciname = pd.read_csv(centroid_sciname_fn)
# NOTE(review): eval() executes whatever text is in the "coord" column; only
# use on CSV files written by our own pipeline.
coords = np.array([eval(c) for c in centroid_sciname["coord"].values])
scinames = centroid_sciname["sciname"].values
scn_unq = np.unique(scinames)

# Get cells window
# `ps` must still be the libpysal module here (`import libpysal as ps`)
convex_hull = ps.cg.convex_hull(coords.tolist())
ch_arr = np.array(to_ccf(convex_hull))
# plt.plot(ch_arr[:, 1], ch_arr[:, 0])
# plt.gca().invert_yaxis()
window = Window([convex_hull])

# Get boxes initial
bbox = window.bbox
# NOTE(review): the w_shp assignment below is commented out, yet w_shp is
# used in the loop - confirm which value it is supposed to have.
# w_shp = np.array([bbox[2] - bbox[0], bbox[3] - bbox[1]])
box_edge_init_um = 50
div_edgesize = 1.5
box_edge_min_um = 5
col='k'
# range of moments
qrange = np.arange(-5,5)

# Get point pattern
pp = PointPattern(coords, window=window)
# Get qstatistic
box_edge_um = box_edge_init_um
box_size = []
dict_q_Xams = defaultdict(list)
# Divide the box edge by div_edgesize each round until it drops below the minimum
while box_edge_um >= box_edge_min_um:
    print(box_edge_um)
    box_edge = box_edge_um / res_umpix
    nxy = np.ceil(w_shp / box_edge).astype(int)
    for q in qrange:
        # NOTE(review): both the column and the row count use nxy[1];
        # nxy[0] is never passed to the grid - confirm this is intended.
        rect = RectangleM_new(
            pp, 
            count_column = nxy[1], 
            count_row = nxy[1],
            labels=scinames
        ).get_partition_values(q)
        # Mean over the boxes that contain at least one point
        Xas = np.array(list(rect.values()))
        Xam = np.mean(Xas)
        dict_q_Xams[q].append(Xam)
    # Box size recorded for this round: diagonal of a w_shp / nxy box, in um
    dxy = w_shp / nxy
    diag = np.sqrt(np.sum(dxy**2))
    diag_um = diag * res_umpix
    box_size.append(diag_um)

    box_edge_um /= div_edgesize



In [None]:
# Regress the q = 1 values against log box size.
# NOTE(review): lns is not set in this cell; it is whatever an earlier cell left behind.
lnc = dict_q_Xams[1]
stats.linregress(lns, lnc)

Look at positive q values

In [None]:
# For q = 0..4: fit log(partition value) against log(box size), print the
# slope and z_q = slope / (1 - q), and plot data and fit on log-log axes.
zqs = []
dict_q_col = dict(zip(qrange, plt.get_cmap('tab10').colors))
fig, ax = ip.general_plot(dims=(10,5))
for q in range(5):
    # Get regression
    Xams = dict_q_Xams[q]
    # For q == 1 the stored value is already an entropy-type sum, so it is
    # regressed directly instead of its logarithm
    lnc = np.log(Xams) if q != 1 else np.array(Xams)
    lns = np.log(box_size)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    zq = slope/(1-q) if q != 1 else slope
    print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)
    zqs.append(zq)

    # Exponentiate the q == 1 values so they sit on the log axis like the others
    Xams = np.exp(Xams) if q == 1 else Xams
    col = dict_q_col[q]
    ax.scatter(box_size, Xams, label=q, color=col)

    # Fitted power law between the smallest and largest box size.
    # Scatter and line both carry label=q, so each q appears twice in the legend.
    x = np.array([min(box_size), max(box_size)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col, label=q)


ax.set_xscale("log")
ax.set_yscale("log")
plt.legend()
plt.show()
plt.close()

print(box_edge_um)

Look at negative q values

In [None]:
np.arange(-5,1)

In [None]:
# For q = -5..0: fit log(partition value) against log(areas), print the slope
# and z_q = slope / (1 - q), and plot data and fit on log-log axes.
# NOTE(review): this cell regresses against `areas`, while the positive-q cell
# uses `box_size`; the cell that fills dict_q_Xams does not fill `areas`, so
# its value comes from whichever cell set it last - confirm.
zqs = []
dict_q_col = dict(zip(qrange, plt.get_cmap('tab10').colors))
fig, ax = ip.general_plot(dims=(10,5))
for q in range(-5,1):
    # Get regression
    Xams = dict_q_Xams[q]
    # The q == 1 branches are never taken in this range
    lnc = np.log(Xams) if q != 1 else np.array(Xams)
    lns = np.log(areas)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    zq = slope/(1-q) if q != 1 else slope
    print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)
    zqs.append(zq)

    Xams = np.exp(Xams) if q == 1 else Xams
    col = dict_q_col[q]
    ax.scatter(areas, Xams, label=q, color=col)

    # Fitted power law between the smallest and largest x value
    x = np.array([min(areas), max(areas)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col)
    # Annotate the fit at the fixed position x = 1e3
    x = 1e3
    y = math.exp(intercept) * x**slope
    ax.text(
        x,y, 
        (
            "Slope = " + str(round(slope, 2)) 
            + ",\nR^2 = " + str(round(r_value**2, 4))
        ),
        ha="right",
        va="bottom",
        color=col
    )

ax.set_xscale("log")
ax.set_yscale("log")
plt.legend()
plt.show()
plt.close()

print(box_edge_um)

Look at fractal dimension as a function of q

In [None]:
# z_q = slope / (1 - q) of the log-log fit for every q in qrange, plotted against q.
# NOTE(review): `areas` is not set by the cell that fills dict_q_Xams; its
# value comes from whichever cell set it last - confirm.
zqs = []
for q in qrange:
    # Get regression
    Xams = dict_q_Xams[q]
    # For q == 1 the stored value is regressed directly instead of its logarithm
    lnc = np.log(Xams) if q != 1 else np.array(Xams)
    lns = np.log(areas)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    zq = slope/(1-q) if q != 1 else slope
    print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)
    zqs.append(zq)
fig, ax = ip.general_plot(dims=(10,5))
ax.scatter(qrange, zqs)

In [None]:
# Scratch: four identical rows [1, 2, 3], then name the columns from an array
a = pd.DataFrame(np.tile(np.array([1,2,3]), (4, 1)))
a.columns = np.array(['a','b','c'])
a

### Compare with a different sample

Get partition values

In [None]:
# Same computation as for the first sample, stored per sample name:
# dict_sn_q_Xams[sn][q] -> mean partition value per box size. This cell also
# records the box areas.
# NOTE(review): centroid_sciname_fmt, res_umpix and w_shp are not defined in
# this cell; they must be left over from earlier cells.
date = "2023_10_18"
sn = "hsdm_slide_IIL_fov_01"

centroid_sciname_fn = centroid_sciname_fmt.format(date=date, sn=sn)
centroid_sciname = pd.read_csv(centroid_sciname_fn)
# NOTE(review): eval() executes whatever text is in the "coord" column; only
# use on CSV files written by our own pipeline.
coords = np.array([eval(c) for c in centroid_sciname["coord"].values])
scinames = centroid_sciname["sciname"].values
scn_unq = np.unique(scinames)

# Get cells window
# `ps` must still be the libpysal module here (`import libpysal as ps`)
convex_hull = ps.cg.convex_hull(coords.tolist())
ch_arr = np.array(to_ccf(convex_hull))
# plt.plot(ch_arr[:, 1], ch_arr[:, 0])
# plt.gca().invert_yaxis()
window = Window([convex_hull])

# Get boxes initial
bbox = window.bbox
# NOTE(review): the w_shp assignment below is commented out, yet w_shp is
# used in the loop - confirm which value it is supposed to have.
# w_shp = np.array([bbox[2] - bbox[0], bbox[3] - bbox[1]])
box_edge_init_um = 50
div_edgesize = 1.5
box_edge_min_um = 5
col='k'
# range of moments
qrange = np.arange(-5,5)

# Get point pattern
pp = PointPattern(coords, window=window)
# Get qstatistic
box_edge_um = box_edge_init_um
box_size = []
areas = []
dict_sn_q_Xams = defaultdict(lambda: defaultdict(list))
# Divide the box edge by div_edgesize each round until it drops below the minimum
while box_edge_um >= box_edge_min_um:
    print(box_edge_um)
    box_edge = box_edge_um / res_umpix
    nxy = np.ceil(w_shp / box_edge).astype(int)
    for q in qrange:
        # NOTE(review): both the column and the row count use nxy[1];
        # nxy[0] is never passed to the grid - confirm this is intended.
        rect = RectangleM_new(
            pp, 
            count_column = nxy[1], 
            count_row = nxy[1],
            labels=scinames
        ).get_partition_values(q)
        # Mean over the boxes that contain at least one point
        Xas = np.array(list(rect.values()))
        Xam = np.mean(Xas)
        dict_sn_q_Xams[sn][q].append(Xam)
    # Box size recorded for this round: diagonal and area of a w_shp / nxy box, in um
    dxy = w_shp / nxy
    diag = np.sqrt(np.sum(dxy**2))
    diag_um = diag * res_umpix
    box_size.append(diag_um)
    areas.append(np.prod(dxy * res_umpix))

    box_edge_um /= div_edgesize



Positive q

In [None]:
# For q = 0..4 of the current sample `sn`: fit log(partition value) against
# log(box size), print slope and z_q, and plot data and fit on log-log axes.
dict_q_col = dict(zip(qrange, plt.get_cmap('tab10').colors))
fig, ax = ip.general_plot(dims=(10,5))
for q in range(5):
    # Get regression
    Xams = dict_sn_q_Xams[sn][q]
    # For q == 1 the stored value is regressed directly instead of its logarithm
    lnc = np.log(Xams) if q != 1 else np.array(Xams)
    lns = np.log(box_size)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    zq = slope/(1-q) if q != 1 else slope
    print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)

    # Exponentiate the q == 1 values so they sit on the log axis like the others
    Xams = np.exp(Xams) if q == 1 else Xams
    col = dict_q_col[q]
    ax.scatter(box_size, Xams, label=q, color=col)

    # Scatter and line both carry label=q, so each q appears twice in the legend
    x = np.array([min(box_size), max(box_size)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col, label=q)


ax.set_xscale("log")
ax.set_yscale("log")
plt.legend()
plt.show()
plt.close()

print(box_edge_um)

negative q

In [None]:
# For q = -5..0 of the current sample `sn`: fit log(partition value) against
# log(box size), print slope and z_q, and plot data and fit on log-log axes.
dict_q_col = dict(zip(qrange, plt.get_cmap('tab10').colors))
fig, ax = ip.general_plot(dims=(10,5))
for q in range(-5,1):
    # Get regression
    Xams = dict_sn_q_Xams[sn][q]
    # The q == 1 branches are never taken in this range
    lnc = np.log(Xams) if q != 1 else np.array(Xams)
    lns = np.log(box_size)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    zq = slope/(1-q) if q != 1 else slope
    print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)

    Xams = np.exp(Xams) if q == 1 else Xams
    col = dict_q_col[q]
    ax.scatter(box_size, Xams, label=q, color=col)

    # Fitted power law between the smallest and largest box size
    x = np.array([min(box_size), max(box_size)])
    y = math.exp(intercept) * x**slope
    ax.plot(x, y, color=col)


ax.set_xscale("log")
ax.set_yscale("log")
plt.legend()
plt.show()
plt.close()

print(box_edge_um)

Comparison between fractal dimension values

In [None]:
# z_q curves of the first sample (dict_q_Xams) and of the current sample `sn`
# (dict_sn_q_Xams), drawn on one axis.
# NOTE(review): both fits use the same `areas` list (the one left by the most
# recent partition-value cell); linregress needs it to have as many entries
# as each Xams list - confirm it matches both samples.
zqs = []
for q in qrange:
    # Get regression
    Xams = dict_q_Xams[q]
    # For q == 1 the stored value is regressed directly instead of its logarithm
    lnc = np.log(Xams) if q != 1 else np.array(Xams)
    lns = np.log(areas)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    zq = slope/(1-q) if q != 1 else slope
    print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)
    zqs.append(zq)

zqs1 = []
for q in qrange:
    # Get regression
    Xams = dict_sn_q_Xams[sn][q]
    lnc = np.log(Xams) if q != 1 else np.array(Xams)
    lns = np.log(areas)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
    zq = slope/(1-q) if q != 1 else slope
    print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)
    zqs1.append(zq)

fig, ax = ip.general_plot(dims=(10,5))
ax.plot(qrange, zqs, '-o')
ax.plot(qrange, zqs1, '-o')

### Do sum instead of mean?

In [None]:
# Partition values for two samples in one loop:
# dict_sn_q_Xass[sn][q] -> one value per box size.
# NOTE(review): centroid_sciname_fmt, res_umpix and w_shp are not defined in
# this cell; they must be left over from earlier cells.
# NOTE: this rebinds the notebook-level name `sns` (here: names without date).
dates = ["2023_02_08", "2023_10_18"]
sns = ["hsdm_group_1_sample_12_fov_01", "hsdm_slide_IIL_fov_01"]

dict_sn_q_Xass = defaultdict(lambda: defaultdict(list))
for date, sn in zip(dates, sns):
    centroid_sciname_fn = centroid_sciname_fmt.format(date=date, sn=sn)
    centroid_sciname = pd.read_csv(centroid_sciname_fn)
    # NOTE(review): eval() executes whatever text is in the "coord" column;
    # only use on CSV files written by our own pipeline.
    coords = np.array([eval(c) for c in centroid_sciname["coord"].values])
    scinames = centroid_sciname["sciname"].values
    scn_unq = np.unique(scinames)

    # Get cells window
    # `ps` must still be the libpysal module here (`import libpysal as ps`)
    convex_hull = ps.cg.convex_hull(coords.tolist())
    ch_arr = np.array(to_ccf(convex_hull))
    # plt.plot(ch_arr[:, 1], ch_arr[:, 0])
    # plt.gca().invert_yaxis()
    window = Window([convex_hull])

    # Get boxes initial
    bbox = window.bbox
    # NOTE(review): the w_shp assignment below is commented out, yet w_shp is
    # used in the loop, with the same value for both samples - confirm.
    # w_shp = np.array([bbox[2] - bbox[0], bbox[3] - bbox[1]])
    box_edge_init_um = 50
    div_edgesize = 1.5
    box_edge_min_um = 5
    col='k'
    # range of moments
    qrange = np.arange(-4,6)

    # Get point pattern
    pp = PointPattern(coords, window=window)
    # Get qstatistic
    box_edge_um = box_edge_init_um
    box_size = []
    # box_size and areas are reset for every sample, so after the loop they
    # describe the last sample only
    areas = []
    while box_edge_um >= box_edge_min_um:
        print(box_edge_um)
        box_edge = box_edge_um / res_umpix
        nxy = np.ceil(w_shp / box_edge).astype(int)
        for q in qrange:
            # NOTE(review): the RectangleM_new class defined in this notebook
            # has get_partition_values and get_partition_values_noreplace but
            # no get_partition_values_new - confirm which one is meant.
            # NOTE(review): both the column and the row count use nxy[1].
            rect = RectangleM_new(
                pp, 
                count_column = nxy[1], 
                count_row = nxy[1],
                labels=scinames
            ).get_partition_values_new(q)
            # NOTE(review): the section heading asks about a sum, but the
            # values are still averaged here.
            Xas = np.array(list(rect.values()))
            Xam = np.mean(Xas)
            dict_sn_q_Xass[sn][q].append(Xam)
        # Box size recorded for this round: diagonal and area of a w_shp / nxy box, in um
        dxy = w_shp / nxy
        diag = np.sqrt(np.sum(dxy**2))
        diag_um = diag * res_umpix
        box_size.append(diag_um)
        area = np.prod(dxy * res_umpix)
        areas.append(area)

        box_edge_um /= div_edgesize



Plot positive q values

In [None]:
areas

In [None]:
lnc

In [None]:
# One figure per sample for q = 0..4: fit log(partition value) against
# log(areas), print slope and z_q, plot data and fit on log-log axes.
# NOTE(review): `areas` holds the values of the last sample processed by the
# cell above and is reused for every sample here.
for date, sn in zip(dates, sns):
    print(sn)
    dict_q_col = dict(zip(qrange, plt.get_cmap('tab10').colors))
    fig, ax = ip.general_plot(dims=(10,5))
    for q in range(5):
        # Get regression
        Xams = dict_sn_q_Xass[sn][q]
        # For q == 1 the stored value is regressed directly instead of its logarithm
        lnc = np.log(Xams) if q != 1 else np.array(Xams)
        lns = np.log(areas)
        slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
        zq = slope/(1-q) if q != 1 else slope
        print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)

        # Exponentiate the q == 1 values so they sit on the log axis like the others
        Xams = np.exp(Xams) if q == 1 else Xams
        col = dict_q_col[q]
        ax.scatter(areas, Xams, label=q, color=col)

        # Scatter and line both carry label=q, so each q appears twice in the legend
        x = np.array([min(areas), max(areas)])
        y = math.exp(intercept) * x**slope
        ax.plot(x, y, color=col, label=q)


    ax.set_xscale("log")
    ax.set_yscale("log")
    # NOTE(review): the x data are box areas, but the label says edge length - confirm.
    ax.set_xlabel("log(edge length (μm))")
    ax.set_ylabel("log(partition function)")
    plt.legend()
    plt.show()
    plt.close()

    print(box_edge_um)

Negative q values

In [None]:
# One figure per sample for q = -4..0: fit log(partition value) against
# log(areas), print slope and z_q, plot data and fit on log-log axes.
# NOTE(review): `areas` holds the values of the last sample processed by the
# partition-value cell and is reused for every sample here.
for date, sn in zip(dates, sns):
    print(sn)
    dict_q_col = dict(zip(qrange, plt.get_cmap('tab10').colors))
    fig, ax = ip.general_plot(dims=(10,5))
    for q in range(-4,1):
        # Get regression
        Xams = dict_sn_q_Xass[sn][q]
        # The q == 1 branches are never taken in this range
        lnc = np.log(Xams) if q != 1 else np.array(Xams)
        lns = np.log(areas)
        slope, intercept, r_value, p_value, std_err = stats.linregress(lns, lnc)
        zq = slope/(1-q) if q != 1 else slope
        print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)

        Xams = np.exp(Xams) if q == 1 else Xams
        col = dict_q_col[q]
        ax.scatter(areas, Xams, label=q, color=col)

        # Scatter and line both carry label=q, so each q appears twice in the legend
        x = np.array([min(areas), max(areas)])
        y = math.exp(intercept) * x**slope
        ax.plot(x, y, color=col, label=q)


    ax.set_xscale("log")
    ax.set_yscale("log")
    # NOTE(review): the x data are box areas, but the label says edge length - confirm.
    ax.set_xlabel("log(edge length (μm))")
    ax.set_ylabel("log(partition function)")
    plt.legend()
    plt.show()
    plt.close()

    print(box_edge_um)

Compare z values

In [None]:
# z_q as a function of q, one curve per sample, all on a single axis.
fig, ax = ip.general_plot(dims=(10,5))
for sn in sns:
    print(sn)
    zqs = []
    for q in qrange:
        # Regress the (log) partition values on log box area.
        Xams = dict_sn_q_Xass[sn][q]
        if q != 1:
            lnc = np.log(Xams)
        else:
            lnc = np.array(Xams)
        lns = np.log(areas)
        fit = stats.linregress(lns, lnc)
        slope, r_value = fit.slope, fit.rvalue
        # z_q is the slope divided by (1 - q); q == 1 uses the slope itself.
        if q != 1:
            zq = slope/(1-q)
        else:
            zq = slope
        print('q:', q,', fit: ',slope, ', r^2: ', r_value**2, ', zq: ',zq)
        zqs.append(zq)
    ax.plot(qrange, zqs, '-o', label=sn)
plt.legend()
ax.set_xlabel("q")
ax.set_ylabel("z_q")

## After running multifractal pipeline

In [None]:
# Table of z_q values: the first CSV column becomes the row index (rows are
# looked up by sample name below) and the remaining columns are the q values.
multifractal_all_fn = '{}/multifractal_zq_values.csv'.format(sample_compare_dir)
multifractal_all = pd.read_csv(multifractal_all_fn, index_col=0)
multifractal_all.shape

In [None]:
multifractal_all

In [None]:
# Sample names that the group-level loops skip (they test
# `sn not in exclude_sns` / `bn not in exclude_sns` before using a sample).
exclude_sns = [
    '2023_10_16_hsdm_slide_IB_fov_01', 
    '2023_10_16_hsdm_slide_IB_fov_03', 
    '2023_10_16_hsdm_slide_IB_fov_02', 
    '2022_12_16_harvardwelch_patient_18_tooth_2_aspect_MB_depth__fov_01',
    '2022_12_16_harvardwelch_patient_19_tooth_15_aspect_MF_depth_sub_fov_01',
    '2022_12_16_harvardwelch_patient_19_tooth_30_aspect_MB_depth_sub_fov_01',
    '2023_02_18_hsdm_group_II_patient_7_fov_02',
    '2023_02_18_hsdm_group_II_patient_7_fov_01',

]

In [None]:
# q values are the column labels of the z_q table (as read from the CSV
# header, so later cells convert them with .astype(int) where needed).
qrange = multifractal_all.columns.to_numpy()
qrange

In [None]:
# group -> q -> list of z_q values, one per sample that is not excluded.
dict_group_q_zqs = defaultdict(lambda: defaultdict(list))
for group, sns in dict_group_sn.items():
    print(group)
    kept_sns = [name for name in sns if name not in exclude_sns]
    if not kept_sns:
        # Nothing to record; leave the group absent from the result.
        continue
    for q in qrange:
        dict_group_q_zqs[group][q].extend(
            multifractal_all.loc[name, q] for name in kept_sns
        )

In [None]:
# One figure per q: box plot of z_q for each group with the individual sample
# values overlaid as jittered points.
group_sort = ['healthy_implant', 'mucositis_implant', 'peri_implantitis']

xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = dict(zip(group_sort, xticks))

s=5

for q in qrange:
    fig, ax = ip.general_plot(dims=(4,4), ft=12)

    for group in group_sort:
        x = dict_group_xtick[group]
        bcvals = dict_group_q_zqs[group][q]
        # Horizontal jitter of +/-0.1 around the group's tick position.
        xs = [x] * len(bcvals) + np.random.rand(len(bcvals)) * 0.2 - 0.1

        # color = dict_sciname_color[scn]
        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)

        ax.scatter(xs, bcvals, color='k', s=s)

    _ = ax.set_xticks(
        xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
    )
    ax.set_xlim(0,np.max(xticks) + 1)
    # Doubled braces emit literal '{' / '}' so the label reads $Z_{q}$; this
    # also works when q is not a string.
    ax.set_ylabel('$Z_{{{}}}$'.format(q))
    # Show the figure, then close it. The original passed the plt.close
    # function object to plt.show instead of calling it.
    plt.show()
    plt.close()

In [None]:
# z_q curves for three hand-picked images; only columns 4..7 of the z_q table
# are drawn.
fig, ax = ip.general_plot(dims=(3,2), ft=7)
sns = [
    "2023_02_08_hsdm_group_1_sample_11_fov_01", 
    "2023_10_18_hsdm_slide_IIL_fov_01", 
    "2023_02_18_hsdm_group_IV_patient_1_fov_01"
]
colors = [
    'tab:blue',
    'tab:green',
    'tab:red',
]
col_sel = slice(4, 8)
for sn, c in zip(sns, colors):
    zqs = multifractal_all.loc[sn, :].values
    q_vals = qrange.astype(int)[col_sel]
    ax.plot(q_vals, zqs[col_sel], '-o', label=sn, color=c)
# plt.legend()
# ax.set_xlabel("q")
# ax.set_ylabel("z_q")
ax.set_xticks([0,1,2,3])

# Save next to the other multifractal comparison outputs.
multifractal_dir = sample_compare_dir + '/multifractal'
out_fn = multifractal_dir + '/zq_curves_three_images.pdf'
ip.check_dir(out_fn)
ip.save_fig(out_fn, transp=False)

In [None]:
# z_q curve of every non-excluded sample, coloured by group.
dict_group_col = {
    'healthy_implant':'tab:blue',
    'mucositis_implant':'tab:green',
    'peri_implantitis':'tab:red',
}

fig, ax = ip.general_plot(dims=(10,10), ft=20)
for group, sns in dict_group_sn.items():
    if group not in group_sort:
        continue
    col = dict_group_col[group]
    for sn in sns:
        if sn in exclude_sns:
            continue
        zqs = multifractal_all.loc[sn, :].values
        ax.plot(qrange.astype(int), zqs, '-o', color=col)

## Plot values from power spectrum

In [None]:
# Path templates; the {date} and {sn} placeholders are filled with
# str.format(date=..., sn=...) where the templates are used.
out_dir = "../outputs/{date}/{date}_{sn}"

spatial_dir = "/".join([out_dir, 'spatial_statistics'])

power_spectrum_dir = "/".join([spatial_dir, 'power_spectrum'])
power_spectrum_fmt = "/".join(
    [power_spectrum_dir, '{date}_{sn}_power_spectrum_fit.csv']
)


In [None]:
# "<date>_<sn>" -> sciname -> [slope, r_squared], read from every power
# spectrum fit table that exists on disk.
dict_sn_scn_psloper = defaultdict(dict)
for date, dict_sn_fns in dict_date_sn_fns.items():
    for sn in dict_sn_fns:
        fit_fn = power_spectrum_fmt.format(date=date, sn=sn)
        if not os.path.exists(fit_fn):
            continue
        power_spectrums = pd.read_csv(fit_fn)
        dsn = '{}_{}'.format(date,sn)
        for _, row in power_spectrums.iterrows():
            dict_sn_scn_psloper[dsn][row['sciname']] = [
                row['slope'], row['r_squared']
            ]

In [None]:
# Box plot of the (negated) power-spectrum slope per taxon, taxa ordered by
# median slope, with one point per sample coloured by group and sized by the
# r^2 of the fit.
smin = 1e10
smax = -1e10

# Alternative spellings mapped onto the names used for plotting.
scn_rename = {
    "Neisseria": "Neisseriaceae",
    "TM7": "Saccharibacteria",
    "TM": "Saccharibacteria",
}

dict_scn_slopes = defaultdict(list)
dict_scn_rsquared = defaultdict(list)
dict_scn_groups = defaultdict(list)
for sn, dict_scn_psloper in dict_sn_scn_psloper.items():
    try:
        group = dict_sn_group[sn]
    except KeyError:
        # Sample without a group assignment: print it; the 'implant' filter
        # below then skips it. Only a missing key is tolerated here, any
        # other error still surfaces.
        group = "none"
        print(sn)
    if 'implant' not in group:
        continue
    for scn, (slope, rsquared) in dict_scn_psloper.items():
        scn = scn_rename.get(scn, scn)
        if not np.isnan(slope):
            dict_scn_slopes[scn].append(slope)
            dict_scn_rsquared[scn].append(rsquared)
            dict_scn_groups[scn].append(group)


# Order taxa by the median of their raw slopes (ties fall back to the name).
means = [np.median(s) for s in dict_scn_slopes.values()]
scns = list(dict_scn_slopes.keys())
scns_sort = [name for _, name in sorted(zip(means, scns))]


xticks = np.arange(len(scns_sort)) + 1
dict_sciname_ind = dict(zip(scns_sort, xticks))

s=100

fig, ax = ip.general_plot(dims=(20,10), ft=20)
dict_group_col = {
    'healthy_implant':'tab:blue',
    'mucositis_implant':'tab:purple',
    'mild_peri_implantitis':'tab:orange',
    'moderate_severe_peri_implantitis':'tab:red',
}
for scn in scns_sort:
    x = dict_sciname_ind[scn]
    # Plot the negated slope.
    slopes = -np.array(dict_scn_slopes[scn])
    # Horizontal jitter of +/-0.1 around the taxon's tick position.
    xs = [x] * len(slopes) + np.random.rand(len(slopes)) * 0.2 - 0.1
    box = ax.boxplot([slopes], positions=[x], vert=True, widths=0.5, patch_artist=True)
    # Draw every box-plot artist in translucent black so the points stand out.
    for vs in box.values():
        for v in vs:
            v.set_color('k')
            v.set_alpha(0.25)

    groups = np.array(dict_scn_groups[scn])
    sizes = s*np.array(dict_scn_rsquared[scn])
    # Only the four groups listed in dict_group_col get scatter points; any
    # other group containing 'implant' contributes to the box only.
    for g, col in dict_group_col.items():
        boolg = groups == g
        ax.scatter(
            xs[boolg], slopes[boolg],  s=sizes[boolg], color=col)

_ = ax.set_xticks(
    xticks, scns_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0,np.max(xticks) + 1)
ax.set_ylabel("Power spectrum slope for each tilescan")


# out_fn = cluster_size_dir + "/size_distribution_by_sample_group_{}.png".format(
#     group
# )
# ip.check_dir(out_fn)
# ip.save_fig(out_fn)

In [None]:

# Compact version of the slope box plot restricted to a few genera, saved as
# a PDF.
toplot = ['Porphyromonas','Streptococcus','Selenomonas','Pasteurellaceae', 'Veillonella', 'Lautropia','Prevotella']

s=5

# fig, ax = ip.general_plot(dims=(4,4), ft=7)
fig, ax = ip.general_plot(dims=(2,2), ft=7)

# Scatter colour per group, drawn in this order.
group_point_colors = (
    ("healthy_implant", "tab:blue"),
    ("mucositis_implant", "tab:cyan"),
    ("mild_peri_implantitis", "tab:pink"),
    ("moderate_severe_peri_implantitis", "tab:red"),
)

scn_list = []
xticks = []
for x, scn in enumerate(toplot, start=1):
    # Plot the negated slope.
    slopes = -np.array(dict_scn_slopes[scn])
    n_pts = len(slopes)
    # Horizontal jitter of +/-0.1 around the tick position.
    xs = [x] * n_pts + np.random.rand(n_pts) * 0.2 - 0.1
    box = ax.boxplot([slopes], positions=[x], vert=True, widths=0.5, patch_artist=True)
    for artists in box.values():
        for artist in artists:
            artist.set_color('k')
            artist.set_alpha(0.25)

    groups = np.array(dict_scn_groups[scn])
    for group_name, point_col in group_point_colors:
        in_group = groups == group_name
        ax.scatter(xs[in_group], slopes[in_group], color=point_col, s=s)

    scn_list.append(scn)
    xticks.append(x)

_ = ax.set_xticks(
    xticks, scn_list, rotation=45, ha="right", va="top", rotation_mode="anchor"
)
ax.set_xlim(0,np.max(xticks) + 1)
# ax.set_ylim(0,2)
# ax.set_ylabel("Power law exponent for each tilescan")

power_spectrum_compare_dir = sample_compare_dir + '/power_spectrum'
out_fn = power_spectrum_compare_dir + "/slopes_boxplot_select_genera.pdf"
ip.check_dir(out_fn)
ip.save_fig(out_fn)

## Bray–Curtis dissimilarity vs distance

### Get patches

Load file

In [None]:
# The two samples compared in this section and the colour used for each.
sns = [
    '2023_02_08_hsdm_group_1_sample_06_fov_01',
    '2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01'
]


cols = plt.get_cmap('tab10').colors
dict_bn_col = {
    '2023_02_08_hsdm_group_1_sample_06_fov_01': cols[0],
    '2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01':cols[1]
}

out_fmt_classif = out_dir + "/classif"
out_dir_coords = out_fmt_classif + "/coords_240705_cosdist"
centroid_sciname_fmt = out_dir_coords + '/{date}_{sn}_centroid_sciname.csv'

# Load one centroid/sciname table per sample.
cent_scis = []
for bn in sns:
    # Split "YYYY_MM_DD_<rest>" into the date and the rest of the name. The
    # pattern is a raw string so \d reaches the regex engine unchanged
    # instead of being an invalid string escape.
    date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", bn)
    cs = pd.read_csv(centroid_sciname_fmt.format(date=date, sn=sn))
    cent_scis.append(cs)

In [None]:
cent_scis[0].columns

In [None]:
# Pixel size is read from this one CZI file and reused for every sample below.
czi_fn = "../data/2022_12_16_harvardwelch_patient_14_tooth_14_aspect_MB_depth_sub_fov_02tile_las_488.czi"
resolution = fsi.get_resolution(czi_fn)
# NOTE(review): the 1e6 factor presumably converts metres/pixel to
# micrometres/pixel — confirm against fsi.get_resolution.
res_umpix = 1e6 * resolution


In [None]:
# Fixed taxon order. The per-patch count vectors compared with Bray-Curtis
# are built by iterating this list, so its order defines the vector layout.
# Names that do not appear here are not counted.
scinames_sort = [
    'Pasteurellaceae',
    'Corynebacterium',
    'Veillonella',
    'Actinomyces',
    'Selenomonas',
    'Rothia',
    'Porphyromonas',
    'Capnocytophaga',
    'Prevotella',
    'Streptococcus',
    'Gemella',
    'Campylobacter',
    'Lautropia',
    'Leptotrichia',
    'Neisseriaceae',
    'Treponema',
    'Fusobacterium',
    'Saccharibacteria'
]

# Names found in the classification tables mapped onto the names used in
# scinames_sort; applied before counting.
dict_sci_rename = {
    'TM7':'Saccharibacteria',
    'Neisseria':'Neisseriaceae'
}

In [None]:
# For n_compare random pairs of cells per sample, record the Euclidean
# distance between the two cells and the Bray-Curtis dissimilarity between
# the taxon counts of the patches (radius_um) around them.
radius_um = 5
n_compare = 10000

radius_pix = radius_um / res_umpix
euc_bc_list = []
for cent_sci in cent_scis:
    # NOTE(review): eval() executes whatever text is stored in the `coord`
    # column; only use it on tables produced by this pipeline (consider
    # ast.literal_eval if the column holds plain literals).
    coords = np.array([eval(c) for c in cent_sci.coord.values])
    scinames_cosdist = cent_sci.sciname.values

    # Indices of all cells within radius_pix of every cell.
    nbrs = NearestNeighbors(radius=radius_pix)
    nbrs.fit(coords)
    nn = nbrs.radius_neighbors(coords, return_distance=False)

    # Draw the random pairs up front (two distinct indices per pair).
    choice = np.arange(coords.shape[0])
    rand_lst = [
        np.random.choice(choice, size=(1,2), replace=False) 
        for _ in range(n_compare)
    ]

    euc_bc = []
    for pair in tqdm(rand_lst):
        ind_a, ind_b = np.squeeze(pair)
        euclid = distance.euclidean(coords[ind_a], coords[ind_b])

        # Taxon count vector (ordered as scinames_sort) for each patch.
        nn_mats = []
        for ind in (ind_a, ind_b):
            names = np.array([
                dict_sci_rename.get(scinames_cosdist[j], scinames_cosdist[j])
                for j in nn[ind]
            ])
            nn_mats.append(
                [np.count_nonzero(names == sci) for sci in scinames_sort]
            )

        bc = distance.braycurtis(nn_mats[0], nn_mats[1])
        euc_bc.append([euclid, bc])
    euc_bc_list.append(euc_bc)



In [None]:
# Bin the Bray-Curtis values by pair distance. Bin i covers
# [d[i], d[i+1]) with the edges converted from micrometres to pixels.
d_um = [10,20,30,40,50,60,70,80,90,100,110,120,140]
d = [dp / res_umpix for dp in d_um]
bin_edges = list(zip(d[:-1], d[1:]))
ddb_list = []
for euc_bc in euc_bc_list:
    dict_dist_bcvals = defaultdict(list)
    for euc, bc in euc_bc:
        for i, (lower, upper) in enumerate(bin_edges):
            if lower <= euc < upper:
                dict_dist_bcvals[i].append(bc)
    ddb_list.append(dict_dist_bcvals)

In [None]:
# Number of pairs that landed in each distance bin, per sample.
for dict_dist_bcvals in ddb_list:
    for dist in dict_dist_bcvals:
        bcvals = dict_dist_bcvals[dist]
        print(dist, len(bcvals))

In [None]:
# One figure per distance bin: Bray-Curtis values of each sample as a box
# plot with the individual values overlaid as jittered points, followed by a
# t-test between the first two samples.
xticks = np.arange(len(ddb_list)) + 1
dict_group_xtick = dict(zip(np.arange(len(ddb_list)), xticks))

s = 1
alpha = 0.1

n_bins = len(d) - 1
for di in range(n_bins):
    print(di)
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

    bcvalist = []
    for i, dict_dist_bcvals in enumerate(ddb_list):
        x = dict_group_xtick[i]
        bcvals = dict_dist_bcvals[di]
        bcvalist.append(bcvals)
        n_vals = len(bcvals)
        print(n_vals)
        # Horizontal jitter of +/-0.1 around the sample's tick position.
        xs = [x] * n_vals + np.random.rand(n_vals) * 0.2 - 0.1

        color = dict_bn_col[sns[i]]
        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        ax.scatter(xs, bcvals, color=color, s=s, alpha=alpha)
    plt.show()
    print(stats.ttest_ind(bcvalist[0], bcvalist[1]))

Limited radius

In [None]:
# Same random-pair comparison as before, but only pairs whose separation lies
# strictly between rliml_um and rlimu_um are kept.
radius_um = 5
n_compare = 100000
rlimu_um = 20
rliml_um = 10

rlimu = rlimu_um / res_umpix
rliml = rliml_um / res_umpix
radius_pix = radius_um / res_umpix
euc_bc_list = []
for cent_sci in cent_scis:
    # NOTE(review): eval() executes whatever text is stored in the `coord`
    # column; only use it on tables produced by this pipeline.
    coords = np.array([eval(c) for c in cent_sci.coord.values])
    scinames_cosdist = cent_sci.sciname.values

    # Indices of all cells within radius_pix of every cell.
    nbrs = NearestNeighbors(radius=radius_pix)
    nbrs.fit(coords)
    nn = nbrs.radius_neighbors(coords, return_distance=False)

    choice = np.arange(coords.shape[0])
    rand_lst = [
        np.random.choice(choice, size=(1,2), replace=False) 
        for _ in range(n_compare)
    ]

    euc_bc = []
    for pair in tqdm(rand_lst):
        ind_a, ind_b = np.squeeze(pair)
        euclid = distance.euclidean(coords[ind_a], coords[ind_b])
        if not (rliml < euclid < rlimu):
            # Pair separation outside the window of interest.
            continue

        # Taxon count vector (ordered as scinames_sort) for each patch.
        nn_mats = []
        for ind in (ind_a, ind_b):
            names = np.array([
                dict_sci_rename.get(scinames_cosdist[j], scinames_cosdist[j])
                for j in nn[ind]
            ])
            nn_mats.append(
                [np.count_nonzero(names == sci) for sci in scinames_sort]
            )

        bc = distance.braycurtis(nn_mats[0], nn_mats[1])
        euc_bc.append([euclid, bc])
    euc_bc_list.append(euc_bc)



In [None]:
# Single distance bin [10, 20) um, with the edges converted to pixels.
d_um = [10,20]
d = [dp / res_umpix for dp in d_um]
bin_edges = list(zip(d[:-1], d[1:]))
ddb_list = []
for euc_bc in euc_bc_list:
    dict_dist_bcvals = defaultdict(list)
    for euc, bc in euc_bc:
        for i, (lower, upper) in enumerate(bin_edges):
            if lower <= euc < upper:
                dict_dist_bcvals[i].append(bc)
    ddb_list.append(dict_dist_bcvals)

In [None]:
# Number of pairs that landed in each distance bin, per sample.
for dict_dist_bcvals in ddb_list:
    for dist in dict_dist_bcvals:
        bcvals = dict_dist_bcvals[dist]
        print(dist, len(bcvals))

In [None]:
# One figure per distance bin: box plot per sample plus jittered points.
xticks = np.arange(len(ddb_list)) + 1
dict_group_xtick = dict(zip(np.arange(len(ddb_list)), xticks))

s = 1
alpha = 0.1

n_bins = len(d) - 1
for di in range(n_bins):
    print(di)
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

    for i, dict_dist_bcvals in enumerate(ddb_list):
        x = dict_group_xtick[i]
        bcvals = dict_dist_bcvals[di]
        n_vals = len(bcvals)
        print(n_vals)
        # Horizontal jitter of +/-0.1 around the sample's tick position.
        xs = [x] * n_vals + np.random.rand(n_vals) * 0.2 - 0.1

        color = dict_bn_col[sns[i]]
        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        ax.scatter(xs, bcvals, color=color, s=s, alpha=alpha)
    plt.show()

In [None]:
sorted([[3,4],[2,254],[1,21]])

In [None]:
# Per sample: Bray-Curtis values plotted against their rank after sorting the
# [distance, Bray-Curtis] rows (i.e. by distance, ties by Bray-Curtis).
xticks = np.arange(len(ddb_list)) + 1
dict_group_xtick = dict(zip(np.arange(len(ddb_list)), xticks))

s = 1
alpha = 0.01

for i, euc_bc in enumerate(euc_bc_list):
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)
    rows_sorted = sorted(euc_bc)
    bcvals_sort = [bc_val for _, bc_val in rows_sorted]
    x = np.arange(len(euc_bc))

    color = dict_bn_col[sns[i]]
    ax.scatter(x, bcvals_sort, color=color, s=s)
    plt.show()

Limited radius heatmap

In [None]:
# Random-pair Bray-Curtis comparison scanned over every combination of patch
# radius (radius_um) and upper separation limit (rlimu_um).
#
# The original cell passed the whole radius array to NearestNeighbors and
# used `euclid < rlimu` (an array) as an `if` condition, which raises. Here
# each radius gets its own neighbour query and each upper limit is tested
# separately. Results are stored per (radius_um, rlimu_um) key; `euc_bc_list`
# is kept for the cells below and holds the first radius / first upper limit.
radius_um = np.array([5,10])
n_compare = 100000
rlimu_um = np.array([20,30,40])
rliml_um = 10

rlimu = rlimu_um / res_umpix
rliml = rliml_um / res_umpix
radius_pix = radius_um / res_umpix

radius_pairs = list(zip(radius_um.tolist(), radius_pix.tolist()))
limit_pairs = list(zip(rlimu_um.tolist(), rlimu.tolist()))

# (radius_um, rlimu_um) -> list with one [[euclid, bc], ...] list per sample.
dict_rad_rlimu_euc_bc = defaultdict(list)
for cent_sci in cent_scis:
    # NOTE(review): eval() executes whatever text is stored in the `coord`
    # column; only use it on tables produced by this pipeline.
    coords = np.array([eval(c) for c in cent_sci.coord.values])
    scinames_cosdist = cent_sci.sciname.values
    # Apply the rename map once per sample instead of once per neighbour.
    names_all = np.array(
        [dict_sci_rename.get(sci, sci) for sci in scinames_cosdist]
    )

    # One neighbour query per patch radius (the radius must be a scalar).
    dict_rad_nn = {}
    for rad_um, rad_pix in radius_pairs:
        nbrs = NearestNeighbors(radius=rad_pix)
        nbrs.fit(coords)
        dict_rad_nn[rad_um] = nbrs.radius_neighbors(
            coords, return_distance=False
        )

    choice = np.arange(coords.shape[0])
    rand_lst = [
        np.random.choice(choice, size=(1,2), replace=False) 
        for _ in range(n_compare)
    ]

    dict_key_rows = defaultdict(list)
    # iterate through random pairs of cells
    for r in tqdm(rand_lst):
        r = np.squeeze(r)
        euclid = distance.euclidean(coords[r[0]], coords[r[1]])
        # Upper limits whose (rliml, limit) window contains this pair.
        lims_hit = [
            lim_um for lim_um, lim_pix in limit_pairs
            if rliml < euclid < lim_pix
        ]
        if not lims_hit:
            continue
        for rad_um, nn in dict_rad_nn.items():
            # Taxon count vector (ordered as scinames_sort) for each patch.
            nn_mats = [
                [
                    np.count_nonzero(names_all[nn[ind]] == sci)
                    for sci in scinames_sort
                ]
                for ind in r
            ]
            bc = distance.braycurtis(nn_mats[0], nn_mats[1])
            for lim_um in lims_hit:
                dict_key_rows[(rad_um, lim_um)].append([euclid, bc])

    for rad_um, _ in radius_pairs:
        for lim_um, _ in limit_pairs:
            dict_rad_rlimu_euc_bc[(rad_um, lim_um)].append(
                dict_key_rows[(rad_um, lim_um)]
            )

# Backward-compatible view used by the binning / plotting cells that follow.
euc_bc_list = dict_rad_rlimu_euc_bc[(radius_pairs[0][0], limit_pairs[0][0])]



In [None]:
# Single distance bin [10, 20) um, with the edges converted to pixels.
d_um = [10,20]
d = [dp / res_umpix for dp in d_um]
bin_edges = list(zip(d[:-1], d[1:]))
ddb_list = []
for euc_bc in euc_bc_list:
    dict_dist_bcvals = defaultdict(list)
    for euc, bc in euc_bc:
        for i, (lower, upper) in enumerate(bin_edges):
            if lower <= euc < upper:
                dict_dist_bcvals[i].append(bc)
    ddb_list.append(dict_dist_bcvals)

In [None]:
# Number of pairs that landed in each distance bin, per sample.
for dict_dist_bcvals in ddb_list:
    for dist in dict_dist_bcvals:
        bcvals = dict_dist_bcvals[dist]
        print(dist, len(bcvals))

In [None]:
# One figure per distance bin: box plot per sample plus jittered points.
xticks = np.arange(len(ddb_list)) + 1
dict_group_xtick = dict(zip(np.arange(len(ddb_list)), xticks))

s = 1
alpha = 0.1

n_bins = len(d) - 1
for di in range(n_bins):
    print(di)
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

    for i, dict_dist_bcvals in enumerate(ddb_list):
        x = dict_group_xtick[i]
        bcvals = dict_dist_bcvals[di]
        n_vals = len(bcvals)
        print(n_vals)
        # Horizontal jitter of +/-0.1 around the sample's tick position.
        xs = [x] * n_vals + np.random.rand(n_vals) * 0.2 - 0.1

        color = dict_bn_col[sns[i]]
        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        ax.scatter(xs, bcvals, color=color, s=s, alpha=alpha)
    plt.show()

## Grouped bray curtis analysis

In [None]:
beta_diversity_fmt = out_dir + '/beta_diversity/{date}_{sn}_dict_radius_distance_bray_curtis_mean.yaml'

# group -> patient -> list with one loaded YAML dictionary per non-excluded
# sample. Later cells index each dictionary as [radius][distance].
dict_group_pat_bcs = defaultdict(lambda: defaultdict(list))
for gr, ps_dict in dict_group_pat_sn.items():
    for pat, sns in ps_dict.items():
        for bn in sns:
            if bn in exclude_sns:
                continue
            # Split "YYYY_MM_DD_<rest>" into date and sample name. Raw string
            # so \d is not treated as an (invalid) string escape.
            date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", bn)
            fn = beta_diversity_fmt.format(date=date, sn=sn)
            # NOTE(review): yaml.unsafe_load can construct arbitrary Python
            # objects; only load files written by this pipeline.
            with open(fn, 'r') as f:
                dict_radius_distance_bc = yaml.unsafe_load(f)
            dict_group_pat_bcs[gr][pat].append(dict_radius_distance_bc)

Plot bray curtis vs distance

In [None]:
# Bray-Curtis vs distance for key 10 of the loaded dictionaries: two
# healthy samples in black, two mild peri-implantitis samples in red.
fig, ax = ip.general_plot()
plot_specs = [
    ('healthy_implant', 'patient_6', 0, '.k'),
    ('healthy_implant', 'patient_6', 1, '.k'),
    ('mild_peri_implantitis', 'patient_11', 0, '.r'),
    ('mild_peri_implantitis', 'patient_11', 1, '.r'),
]
for group_name, patient, sample_i, marker in plot_specs:
    dict_dist_bc = dict_group_pat_bcs[group_name][patient][sample_i][10]
    for dist, bc in dict_dist_bc.items():
        ax.plot(dist * res_umpix, bc, marker)



In [None]:
# Same plot as above for key 20 of the loaded dictionaries.
fig, ax = ip.general_plot()
plot_specs = [
    ('healthy_implant', 'patient_6', 0, '.k'),
    ('healthy_implant', 'patient_6', 1, '.k'),
    ('mild_peri_implantitis', 'patient_11', 0, '.r'),
    ('mild_peri_implantitis', 'patient_11', 1, '.r'),
]
for group_name, patient, sample_i, marker in plot_specs:
    dict_dist_bc = dict_group_pat_bcs[group_name][patient][sample_i][20]
    for dist, bc in dict_dist_bc.items():
        ax.plot(dist * res_umpix, bc, marker)



In [None]:
# Key 5 of the loaded dictionaries, first sample of four different patients.
# NOTE(review): 'patient_11' is looked up under 'healthy_implant' here but
# under 'mild_peri_implantitis' elsewhere — confirm the intended patient.
fig, ax = ip.general_plot()
plot_specs = [
    ('healthy_implant', 'patient_6', '.k'),
    ('healthy_implant', 'patient_11', '.k'),
    ('mild_peri_implantitis', 'patient_11', '.r'),
    ('mild_peri_implantitis', 'patient_2', '.r'),
]
for group_name, patient, marker in plot_specs:
    dict_dist_bc = dict_group_pat_bcs[group_name][patient][0][5]
    for dist, bc in dict_dist_bc.items():
        ax.plot(dist * res_umpix, bc, marker)

# ax.set_xscale("log")
# ax.set_yscale("log")  

In [None]:
# One figure per patch radius: mean Bray-Curtis vs distance for every loaded
# sample, coloured by group, saved as a PDF.
s = 1

# Colours follow group_sort. Groups that hold data but are missing from
# group_sort are appended, so the colour lookup below cannot raise KeyError
# when group_sort still holds an older, shorter list.
groups_to_color = list(group_sort) + [
    gr for gr in dict_group_pat_bcs if gr not in group_sort
]
dict_group_color = dict(zip(groups_to_color, plt.get_cmap('tab10').colors))

for rad in [5,10,20]:
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

    for gr, ps_dict in dict_group_pat_bcs.items():
        col = dict_group_color[gr]
        for pat, dcts in ps_dict.items():
            for dict_radius_distance_bc in dcts:
                dict_dist_bc = dict_radius_distance_bc[rad]
                # `dist_key` rather than `d`, so the bin-edge list `d`
                # defined in earlier cells is not overwritten.
                for dist_key, bc in dict_dist_bc.items():
                    ax.plot(dist_key * res_umpix, bc, '.', color=col, ms=s, mec=col)
    out_fn = sample_compare_dir + "/bray_curtis/bray_curtis_mean_patch_radius_{}.pdf".format(
        rad
    )
    ip.check_dir(out_fn)
    ip.save_fig(out_fn)



Mean beta diversity at a separation of 100 µm

In [None]:
list(dict_radius_distance_bc[rad].keys())

In [None]:
100 / res_umpix

In [None]:
# Per patient: mean (over that patient's samples) of the Bray-Curtis value at
# patch radius `rad` and the `dst`-th distance key, collected per group.
rad = 10
# NOTE(review): `dst` rebinds the name imported at the top of the notebook
# (`from pointpats import distance_statistics as dst`); re-running earlier
# cells that use that module after this one will fail.
dst = 3

dict_group_bcs = defaultdict(list)
for gr, ps_dict in dict_group_pat_bcs.items():
    for pat, dcts in ps_dict.items():
        bcs = []
        for dict_radius_distance_bc in dcts:
            dict_dist_bc = dict_radius_distance_bc[rad]
            dists = list(dict_dist_bc)
            dist_key = dists[dst]
            print(dist_key)
            bcs.append(dict_dist_bc[dist_key])
        dict_group_bcs[gr].append(np.mean(bcs))

In [None]:
dists[dst] * res_umpix


In [None]:
# Box plot of the per-patient mean Bray-Curtis value for each group, with the
# individual patients overlaid as jittered points.
group_sort = ['healthy_implant', 'mucositis_implant', 'mild_peri_implantitis', 'moderate_severe_peri_implantitis']

xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = dict(zip(group_sort, xticks))

s=5

fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)

for x, group in zip(xticks, group_sort):
    bcvals = dict_group_bcs[group]
    n_vals = len(bcvals)
    # Horizontal jitter of +/-0.1 around the group's tick position.
    xs = [x] * n_vals + np.random.rand(n_vals) * 0.2 - 0.1

    box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)

    ax.scatter(xs, bcvals, color='k', s=s)

# _ = ax.set_xticks(
#     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
# )
_ = ax.set_xticks([])
ax.set_xlim(0,np.max(xticks) + 1)
# ax.set_ylim(0,2)
# ax.set_ylabel('Homogeneity\n(Fuzzy box counting fractal dimension)')
# ax.set_ylim(1.5,2)

# out_fn = sample_compare_dir + "/box_counting/2024_07_08_fuzzy_box_counting_bygroup_patient.pdf".format(
#     group
# )
# ip.check_dir(out_fn)
# ip.save_fig(out_fn)



Kruskal–Wallis test for a difference in medians between groups

In [None]:

# Kruskal-Wallis H-test across the first four groups of group_sort.
stats.kruskal(*(dict_group_bcs[group_sort[gi]] for gi in range(4)))

Dunn pairwise test

In [None]:
# Dunn's pairwise test with the groups passed in group_sort order (the same
# order as the Kruskal-Wallis call and the box plot) instead of the
# dictionary's insertion order. Groups without any value are left out.
dunn_groups = [g for g in group_sort if len(dict_group_bcs.get(g, [])) > 0]
# Position i in the result table corresponds to dunn_groups[i - 1].
print(dict(enumerate(dunn_groups, start=1)))
posthoc_dunn([dict_group_bcs[g] for g in dunn_groups])

## Average diversity nocell-> nan

In [None]:
# The two samples compared in this section, paired with the first two tab10
# colours.
sns = [
    '2023_02_08_hsdm_group_1_sample_06_fov_01',
    '2024_04_27_hsdm_group_III_patient_11_aspect_DL_fov_01'
]

cols = plt.get_cmap('tab10').colors
dict_bn_col = dict(zip(sns, cols))

In [None]:
# Path templates for the per-sample partition-value dictionaries; {date} and
# {sn} are filled in with str.format where the template is used.
spatial_dir = "/".join([out_dir, 'spatial_statistics'])
multifractal_dir = "/".join([spatial_dir, 'multifractal'])
local_diversity_nocell_nan_dict_fmt = "/".join(
    [multifractal_dir, '{date}_{sn}_dict_area_q_partition_vals_nocell_nan.yaml']
)


Plot shannon diversities

In [None]:
area_i = 0
q_i = 1

# sample -> area rank (0 = smallest area key) -> array of the q_i values with
# NaN entries removed.
dict_bn_shan = defaultdict(dict)
for bn in sns:
    # Split "YYYY_MM_DD_<rest>" into date and sample name. Raw string so \d
    # is not treated as an (invalid) string escape.
    date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", bn)
    print(date,sn)
    ldd_fn = local_diversity_nocell_nan_dict_fmt.format(date=date, sn=sn)
    # NOTE(review): yaml.unsafe_load can construct arbitrary Python objects;
    # only load files written by this pipeline.
    with open(ldd_fn, 'r') as f:
        dict_area_q_Xas = yaml.unsafe_load(f)
    areas = sorted(dict_area_q_Xas)
    for a, area in enumerate(areas):
        Xas = np.array(dict_area_q_Xas[area][q_i])
        is_nan = np.isnan(Xas)
        # Report how many entries are NaN before dropping them.
        print(is_nan.sum())
        dict_bn_shan[bn][a] = Xas[~is_nan]

    # Dsums = []
    # for area, dict_q_Xas in dict_area_q_Xas.items():
    #     areas.append(area)
    #     Xas = dict_q_Xas[2]
    #     Ds = 1 - np.array(Xas)
    #     Dsums.append(np.sum(Ds))
    # dict_bn_areas_Xas[bn] = [areas, Dsums]




In [None]:
# One figure per area rank (first five): values of each sample as jittered
# points with a box plot on top.
xticks = np.arange(len(sns)) + 1
dict_group_xtick = dict(zip(np.arange(len(sns)), xticks))

s = 1
alpha = 0.1
for area_i in range(5):
    print(area_i)
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)
    for x, bn in zip(xticks, sns):
        vals = dict_bn_shan[bn][area_i]
        n_vals = len(vals)
        # Horizontal jitter of +/-0.1 around the sample's tick position.
        xs = [x] * n_vals + np.random.rand(n_vals) * 0.2 - 0.1

        color = dict_bn_col[bn]
        ax.scatter(xs, vals, color=color, s=s, alpha=alpha)

        box = ax.boxplot([vals], positions=[x], vert=True, widths=0.5)






Plot simpson diversities

In [None]:
q_i = 2

# sample -> area rank (0 = smallest area key) -> array of 1 - value for the
# q_i entries, with NaN entries removed first.
dict_bn_simp = defaultdict(dict)
for bn in sns:
    # Split "YYYY_MM_DD_<rest>" into date and sample name. Raw string so \d
    # is not treated as an (invalid) string escape.
    date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", bn)
    print(date,sn)
    ldd_fn = local_diversity_nocell_nan_dict_fmt.format(date=date, sn=sn)
    # NOTE(review): yaml.unsafe_load can construct arbitrary Python objects;
    # only load files written by this pipeline.
    with open(ldd_fn, 'r') as f:
        dict_area_q_Xas = yaml.unsafe_load(f)
    areas = sorted(dict_area_q_Xas)
    for a, area in enumerate(areas):
        Xas = np.array(dict_area_q_Xas[area][q_i])
        Xas = Xas[~np.isnan(Xas)]
        Ds = 1 - Xas
        dict_bn_simp[bn][a] = Ds

    # Dsums = []
    # for area, dict_q_Xas in dict_area_q_Xas.items():
    #     areas.append(area)
    #     Xas = dict_q_Xas[2]
    #     Ds = 1 - np.array(Xas)
    #     Dsums.append(np.sum(Ds))
    # dict_bn_areas_Xas[bn] = [areas, Dsums]




In [None]:
# One figure per area rank (first five): values of each sample as jittered
# points with a box plot on top.
xticks = np.arange(len(sns)) + 1
dict_group_xtick = dict(zip(np.arange(len(sns)), xticks))

s = 1
alpha = 0.1
for area_i in range(5):
    print(area_i)
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)
    for x, bn in zip(xticks, sns):
        vals = dict_bn_simp[bn][area_i]
        n_vals = len(vals)
        # Horizontal jitter of +/-0.1 around the sample's tick position.
        xs = [x] * n_vals + np.random.rand(n_vals) * 0.2 - 0.1

        color = dict_bn_col[bn]
        ax.scatter(xs, vals, color=color, s=s, alpha=alpha)

        box = ax.boxplot([vals], positions=[x], vert=True, widths=0.5)






In [None]:
a = [0,1,2,3,np.nan]

In [None]:
np.array(a)[np.isnan(a)]

## Grouped alpha diversity

### shannon

Get mean and std dev for each tile and group

In [None]:
q = 1

# group -> patient -> area rank -> list of [mean, std] pairs, one pair per
# non-excluded sample, computed over the non-NaN q values.
dict_group_pat_area_shan = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for gr, ps_dict in dict_group_pat_sn.items():
    for pat, sns in tqdm(ps_dict.items()):
        for bn in sns:
            if bn in exclude_sns:
                continue
            # Split "YYYY_MM_DD_<rest>" into date and sample name. Raw string
            # so \d is not treated as an (invalid) string escape.
            date, sn = re.split(r"(?<=^\d{4}_\d{2}_\d{2})_", bn)
            ldd_fn = local_diversity_nocell_nan_dict_fmt.format(date=date, sn=sn)
            # NOTE(review): yaml.unsafe_load can construct arbitrary Python
            # objects; only load files written by this pipeline.
            with open(ldd_fn, 'r') as f:
                dict_area_q_Xas = yaml.unsafe_load(f)
            areas = sorted(dict_area_q_Xas)
            for i, a in enumerate(areas):
                Xas = np.array(dict_area_q_Xas[a][q])
                Xas = Xas[~np.isnan(Xas)]
                # If every entry was NaN, mean and std are NaN as well.
                mean = np.mean(Xas)
                std = np.std(Xas)
                dict_group_pat_area_shan[gr][pat][i].append([mean, std])

Average by patient

In [None]:
# group -> area rank -> list with one [mean, std] pair per patient, each pair
# being the column-wise average over that patient's samples.
dict_group_area_patmeans = defaultdict(lambda: defaultdict(list))
for gr, psa_dict in dict_group_pat_area_shan.items():
    for as_dict in psa_dict.values():
        for area, sample_pairs in as_dict.items():
            patmean = np.array(sample_pairs).mean(axis=0)
            dict_group_area_patmeans[gr][area].append(patmean)

Plot mean for varying area

In [None]:
# For the first five area ranks: box plot of the per-patient mean (element 0
# of each pair) by group, followed by a Kruskal-Wallis test across the groups.
group_sort = ['healthy_implant', 'mucositis_implant', 'mild_peri_implantitis', 'moderate_severe_peri_implantitis']

xticks = np.arange(len(group_sort)) + 1
dict_group_xtick = dict(zip(group_sort, xticks))

s=5

for ai in range(5):
    fig, ax = ip.general_plot(dims=(1.9,1.75), ft=7)
    for x, group in zip(xticks, group_sort):
        bcvals = [pair[0] for pair in dict_group_area_patmeans[group][ai]]
        n_vals = len(bcvals)
        # Horizontal jitter of +/-0.1 around the group's tick position.
        xs = [x] * n_vals + np.random.rand(n_vals) * 0.2 - 0.1

        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)

        ax.scatter(xs, bcvals, color='k', s=s)

    # _ = ax.set_xticks(
    #     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
    # )
    _ = ax.set_xticks([])
    ax.set_xlim(0,np.max(xticks) + 1)
    plt.show()

    print(stats.kruskal(*[
        [pair[0] for pair in dict_group_area_patmeans[group_sort[gi]][ai]]
        for gi in range(4)
    ]))

    # out_fn = sample_compare_dir + "/alpha_diversity/2024_07_12_shannon_mean_bygroup_patient.pdf".format(
    #     group
    # )
    # ip.check_dir(out_fn)
    # ip.save_fig(out_fn)



Plot stddev for varying area

In [None]:
# Groups in plotting order; each one is drawn at x = 1, 2, 3, 4.
group_sort = [
    'healthy_implant',
    'mucositis_implant',
    'mild_peri_implantitis',
    'moderate_severe_peri_implantitis',
]

xticks = np.arange(1, len(group_sort) + 1)
dict_group_xtick = {grp: tick for grp, tick in zip(group_sort, xticks)}

# Marker size for the per-patient points.
s = 5

# One figure per area index, this time for the standard-deviation column.
for ai in range(5):
    fig, ax = ip.general_plot(dims=(1.9, 1.75), ft=7)
    samples_by_group = []
    for group in group_sort:
        x = dict_group_xtick[group]
        # Column 1 of each per-patient row is the std (column 0 is the mean).
        bcvals = [row[1] for row in dict_group_area_patmeans[group][ai]]
        samples_by_group.append(bcvals)
        # Horizontal jitter in [-0.1, 0.1) so overlapping points stay visible.
        xs = x + np.random.rand(len(bcvals)) * 0.2 - 0.1

        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        ax.scatter(xs, bcvals, color='k', s=s)

    # Group labels are intentionally hidden; to show them instead use:
    # ax.set_xticks(
    #     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
    # )
    _ = ax.set_xticks([])
    ax.set_xlim(0, np.max(xticks) + 1)

    # Saving is disabled for this plot (NOTE(review): the file name below still
    # says "mean"; adjust it before re-enabling):
    # out_fn = sample_compare_dir + "/alpha_diversity/2024_07_12_shannon_mean_bygroup_patient.pdf".format(
    #     group
    # )
    # ip.check_dir(out_fn)
    # ip.save_fig(out_fn)
    plt.show()

    # Kruskal-Wallis test on the same per-group values, in group_sort order.
    print(stats.kruskal(*samples_by_group))


### simpson

Get mean and std dev for each tile and group

In [None]:
# Key used to pick the per-tile values out of each area's dict
# (presumably the diversity order; confirm against the code that wrote the YAML).
q = 2

# Matches the single "_" that directly follows a leading YYYY_MM_DD date, so
# splitting on it yields (date, sample name). Written as a raw string: "\d" in a
# plain string literal is an invalid escape sequence (SyntaxWarning on 3.12+).
# Compiled once here instead of on every iteration of the loops below.
date_sn_sep = re.compile(r"(?<=^\d{4}_\d{2}_\d{2})_")

# group -> patient -> area index -> list of [mean, std], one entry per sample.
dict_group_pat_area_simp = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
# area index -> every area value seen at that index across samples.
dict_ai_area = defaultdict(list)
for gr, ps_dict in dict_group_pat_sn.items():
    for pat, sns in tqdm(ps_dict.items()):
        for bn in sns:
            if bn in exclude_sns:
                continue
            date, sn = date_sn_sep.split(bn)
            ldd_fn = local_diversity_nocell_nan_dict_fmt.format(date=date, sn=sn)
            # NOTE(review): yaml.unsafe_load can construct arbitrary Python
            # objects. Acceptable only because these files are produced by this
            # pipeline; never point this at untrusted YAML.
            with open(ldd_fn, 'r') as f:
                dict_area_q_Xas = yaml.unsafe_load(f)
            # Areas are indexed by rank so samples are compared position-wise.
            areas = sorted(dict_area_q_Xas)
            for i, a in enumerate(areas):
                Xas = np.array(dict_area_q_Xas[a][q])
                Xas = Xas[~np.isnan(Xas)]
                # Work with the complement of the stored value.
                # NOTE(review): presumably converts a Simpson concentration into
                # a Gini-Simpson-style index -- confirm what the YAML stores.
                Xas = 1 - Xas
                # NOTE(review): if every value was NaN, Xas is empty here and
                # mean/std become NaN (with a RuntimeWarning), which then
                # propagates into the patient averages.
                mean = np.mean(Xas)
                std = np.std(Xas)
                dict_group_pat_area_simp[gr][pat][i].append([mean, std])
                dict_ai_area[i].append(a)

Average by patient

In [None]:
# Collapse the per-sample [mean, std] rows of every patient into a single
# [mean, std] row, stored as group -> area index -> list of patient rows.
dict_group_area_patmeans = defaultdict(lambda: defaultdict(list))
for gr, psa_dict in dict_group_pat_area_simp.items():
    for as_dict in psa_dict.values():
        for area, sample_rows in as_dict.items():
            # Column-wise average across this patient's samples.
            patient_row = np.asarray(sample_rows).mean(axis=0)
            dict_group_area_patmeans[gr][area].append(patient_row)

Plot mean for varying area

In [None]:
# Groups in plotting order; each one is drawn at x = 1, 2, 3, 4.
group_sort = [
    'healthy_implant',
    'mucositis_implant',
    'mild_peri_implantitis',
    'moderate_severe_peri_implantitis',
]

xticks = np.arange(1, len(group_sort) + 1)
dict_group_xtick = {grp: tick for grp, tick in zip(group_sort, xticks)}

# Marker size for the per-patient points.
s = 5

# One figure per area index: box plot plus jittered patient points per group,
# a Kruskal-Wallis test across groups, and a PDF saved per area index.
# NOTE(review): this loop covers 9 area indices while the stddev plot for the
# same data covers only 5 -- confirm which count is intended.
for ai in range(9):
    fig, ax = ip.general_plot(dims=(1.9, 1.75), ft=7)
    samples_by_group = []
    for group in group_sort:
        x = dict_group_xtick[group]
        # Column 0 of each per-patient row is the mean (column 1 is the std).
        bcvals = [row[0] for row in dict_group_area_patmeans[group][ai]]
        samples_by_group.append(bcvals)
        # Horizontal jitter in [-0.1, 0.1) so overlapping points stay visible.
        xs = x + np.random.rand(len(bcvals)) * 0.2 - 0.1

        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        ax.scatter(xs, bcvals, color='k', s=s)

    # Group labels are intentionally hidden; to show them instead use:
    # ax.set_xticks(
    #     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
    # )
    _ = ax.set_xticks([])
    ax.set_xlim(0, np.max(xticks) + 1)

    # Same per-group values that were just plotted, in group_sort order.
    print(stats.kruskal(*samples_by_group))

    # Save before showing, while the figure is still the active one.
    out_fn = sample_compare_dir + "/alpha_diversity/simpson_mean_bygroup_patient_area_{}.pdf".format(
        ai
    )
    ip.check_dir(out_fn)
    ip.save_fig(out_fn)
    plt.show()



In [None]:
# For every area index, report the mean area and its square root.
for i, areas in dict_ai_area.items():
    mean_area = np.mean(areas)
    print(i, mean_area, mean_area ** (1 / 2))

Plot stddev for varying area

In [None]:
# Groups in plotting order; each one is drawn at x = 1, 2, 3, 4.
group_sort = [
    'healthy_implant',
    'mucositis_implant',
    'mild_peri_implantitis',
    'moderate_severe_peri_implantitis',
]

xticks = np.arange(1, len(group_sort) + 1)
dict_group_xtick = {grp: tick for grp, tick in zip(group_sort, xticks)}

# Marker size for the per-patient points.
s = 5

# One figure per area index, this time for the standard-deviation column.
# NOTE(review): only 5 area indices are plotted here, whereas the matching mean
# plot loops over 9 -- confirm which count is intended.
for ai in range(5):
    fig, ax = ip.general_plot(dims=(1.9, 1.75), ft=7)
    samples_by_group = []
    for group in group_sort:
        x = dict_group_xtick[group]
        # Column 1 of each per-patient row is the std (column 0 is the mean).
        bcvals = [row[1] for row in dict_group_area_patmeans[group][ai]]
        samples_by_group.append(bcvals)
        # Horizontal jitter in [-0.1, 0.1) so overlapping points stay visible.
        xs = x + np.random.rand(len(bcvals)) * 0.2 - 0.1

        box = ax.boxplot([bcvals], positions=[x], vert=True, widths=0.5)
        ax.scatter(xs, bcvals, color='k', s=s)

    # Group labels are intentionally hidden; to show them instead use:
    # ax.set_xticks(
    #     xticks, group_sort, rotation=45, ha="right", va="top", rotation_mode="anchor"
    # )
    _ = ax.set_xticks([])
    ax.set_xlim(0, np.max(xticks) + 1)

    # Saving is disabled for this plot (NOTE(review): the file name below still
    # refers to the Shannon mean; adjust it before re-enabling):
    # out_fn = sample_compare_dir + "/alpha_diversity/2024_07_12_shannon_mean_bygroup_patient.pdf".format(
    #     group
    # )
    # ip.check_dir(out_fn)
    # ip.save_fig(out_fn)
    plt.show()

    # Kruskal-Wallis test on the same per-group values, in group_sort order.
    print(stats.kruskal(*samples_by_group))
