In [None]:
import scanpy as sc
import os
import numpy as np
import pandas as pd
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import pylab as pl
from scipy.spatial import cKDTree
import numpy as np
from sklearn.neighbors import KDTree
from tqdm.notebook import tqdm

In [None]:
# Batch / time-point label for this run. It selects the integration sub-folder that
# is loaded below, the `batch` value used to subset cells for plotting, and it is
# embedded in every saved figure's file name.
experiment = "9hr_avr"

In [None]:
# Folder (relative to the current working directory) that receives the saved PDFs.
figure_output_folder = "6ac"

In [None]:
# Read the three AnnData files stored in this experiment's integration folder:
# the spatial cells, the sequencing cells, and the combined latent object.
integration_dir = os.path.join("../../data/integration", experiment)
spatial_data = sc.read(os.path.join(integration_dir, "spatial_data.h5ad"))
seq_data = sc.read(os.path.join(integration_dir, "seq_data.h5ad"))
latent_adata = sc.read(os.path.join(integration_dir, "latent_adata.h5ad"))

In [None]:
# Load the full sequencing dataset, keep the five samples used in this analysis, and
# shorten both the `sample.order` labels and the cell names.
seq_file = "../../data/AvrRpt2_alone2.h5ad"

# Raw `sample.order` value -> short label. The keys double as the list of samples kept.
sample_order_labels = {
    "00_00_Mock_rep1": "mock",
    "02_AvrRpt2_04h_rep1": "4hr_avr",
    "02_AvrRpt2_06h_rep1": "6hr_avr",
    "02_AvrRpt2_09h_rep1": "9hr_avr",
    "02_AvrRpt2_24h_rep1": "avrrpt24",
}

seq_data_copy = sc.read(seq_file)

# Boolean indexing returns an AnnData *view*. `.copy()` turns it into a standalone
# object so the `.obs` edits below act on it directly instead of relying on the
# implicit view-to-copy conversion (which emits ImplicitModificationWarning).
seq_data_copy = seq_data_copy[
    seq_data_copy.obs["sample.order"].isin(list(sample_order_labels))
].copy()

sample_order = seq_data_copy.obs["sample.order"]
if isinstance(sample_order.dtype, pd.CategoricalDtype):
    # Renaming categories is the supported way to relabel a categorical column;
    # `Series.replace` on categoricals is deprecated in recent pandas releases.
    sample_order = sample_order.cat.rename_categories(sample_order_labels)
else:
    sample_order = sample_order.replace(sample_order_labels)
seq_data_copy.obs["sample.order"] = sample_order

# Shorten the sample part of every cell name. The cell names of `seq_data` are
# matched against these shortened names in a later cell.
seq_data_copy.obs.index = [
    i.replace("00_col_Mock_rep1", "00_Mock")
    .replace("col_AvrRpt2_24h_rep1", "AvrRpt2_24h")
    .replace("col_AvrRpt2_04h_rep1", "AvrRpt2_04h")
    .replace("col_AvrRpt2_06h_rep1", "AvrRpt2_06h")
    .replace("col_AvrRpt2_09h_rep1", "AvrRpt2_09h")
    for i in seq_data_copy.obs.index
]

In [None]:
# Densify both expression matrices. The `toarray` guard keeps this cell re-runnable:
# after the first run `.X` is already a NumPy array, which has neither `.A` nor
# `.toarray()`, so the original unconditional `.X.A` raised on a second execution.
if hasattr(seq_data_copy.X, "toarray"):
    seq_data_copy.X = seq_data_copy.X.toarray()
if hasattr(seq_data.X, "toarray"):
    seq_data.X = seq_data.X.toarray()

# Reorder the rows of `seq_data_copy` so that row k is the same cell as row k of
# `seq_data`; the neighbour indices computed later refer to `seq_data` row positions.
copy_names = seq_data_copy.obs.index.values
target_names = seq_data.obs.index.values

# Cell name -> row position of its first occurrence. Iterating in reverse lets the
# earliest position win, matching the previous `np.where(...)[0][0]` behaviour, while
# building the lookup once instead of scanning the whole array for every cell.
first_position = {
    obs_name: position
    for position, obs_name in reversed(list(enumerate(copy_names)))
}
missing_names = [
    obs_name for obs_name in target_names if obs_name not in first_position
]
if missing_names:
    raise KeyError(
        f"{len(missing_names)} cells of `seq_data` are absent from `seq_data_copy`, "
        f"e.g. {missing_names[:5]}"
    )
indices_match = [first_position[obs_name] for obs_name in target_names]

seq_data_copy = seq_data_copy[indices_match, :]
# Plain dense array of the aligned matrix, used for fast row lookups during imputation.
seq_data_copy_array = np.array(seq_data_copy.X)

In [None]:
# Impute expression for every spatial cell as the mean expression of its nearest
# sequencing cells in the `X_gimvi` embedding.
adata = latent_adata.copy()

# Step 1: split the combined latent object by its `labels` column.
# These subsets are kept for inspection; the averaging below does not use them.
spatial_cells = adata[adata.obs["labels"] == "spatial"].copy()
seq_cells = adata[adata.obs["labels"] == "seq"].copy()

# Step 2: build a KD-tree over the seq cells' `X_gimvi` coordinates.
seq_gimvi = seq_data.obsm["X_gimvi"]
kdtree = KDTree(seq_gimvi)

# Step 3: for each spatial cell, find its nearest seq cells.
n_neighbors = 30  # number of seq neighbours averaged per spatial cell
distances, indices = kdtree.query(spatial_data.obsm["X_gimvi"], k=n_neighbors)

# Step 4: average the neighbours' expression for each spatial cell.
# The output has one row per queried spatial cell (i.e. per row of `indices`), so it
# always lines up with `spatial_data`, which is where the columns are written later.
# Sizing it from `spatial_cells` tied it to a different object (`latent_adata`).
# The loop is kept on purpose: indexing with the full 2-D `indices` array at once
# would materialise an (n_spatial, n_neighbors, n_genes) intermediate.
averaged_expression = np.zeros((indices.shape[0], seq_data_copy_array.shape[1]))
for i in range(indices.shape[0]):
    seq_neighbors_indices = indices[i]
    seq_neighbors_expression = seq_data_copy_array[seq_neighbors_indices]
    averaged_expression[i] = np.mean(seq_neighbors_expression, axis=0)

### ALD1 mRNA imputed

In [None]:
# Store the per-cell `x` / `y` columns of `.obs` as an embedding so the plotting
# cells below can draw cells at their coordinates via `basis="spatial"`.
xy_columns = ["x", "y"]
spatial_data.obsm["X_spatial"] = spatial_data.obs.loc[:, xy_columns].to_numpy()

In [None]:
# Copy the imputed values of one gene into `spatial_data.obs["current_imputed"]`.
# `averaged_expression` is whichever imputation matrix was computed most recently;
# its columns follow `seq_data_copy.var`.
gene_to_impute = "ALD1"
gene_column = np.flatnonzero(seq_data_copy.var.index == gene_to_impute)[0]
spatial_data.obs["current_imputed"] = averaged_expression[:, gene_column]

In [None]:
# Make sure the output folder exists. `exist_ok=True` tolerates an existing folder
# without hiding real failures (e.g. permission errors), which the previous bare
# `except: pass` silently swallowed until `savefig` failed later.
os.makedirs(figure_output_folder, exist_ok=True)

sc.set_figure_params(dpi=400, dpi_save=400)

# Spatial map of the current imputed values, restricted to this experiment's cells.
fig = sc.pl.embedding(
    spatial_data[spatial_data.obs["batch"] == experiment],
    basis="spatial",
    color=["current_imputed"],
    cmap="Blues",
    size=10,
    vmax=1.8,
    vmin=-0.01,
    return_fig=True,
    show=False,
    title="ALD1 mRNA imputed",
)
# Raise this figure's DPI to 500 (the global setting above is 400).
fig.dpi = 500
fig.tight_layout()
fig.savefig(os.path.join(figure_output_folder, f"ald1_mrna_imputed_{experiment}.pdf"))

### EDS16 (ICS1) mRNA imputation

In [None]:
# Switch `current_imputed` to the EDS16 column of the current imputation matrix.
# The figure produced from it below is titled "ICS1".
gene_to_impute = "EDS16"
gene_column = np.flatnonzero(seq_data_copy.var.index == gene_to_impute)[0]
spatial_data.obs["current_imputed"] = averaged_expression[:, gene_column]

In [None]:
# Spatial map of the current imputed values for this experiment's cells,
# saved as a PDF in the figure folder.
in_experiment = spatial_data.obs["batch"] == experiment
fig = sc.pl.embedding(
    spatial_data[in_experiment],
    basis="spatial",
    color=["current_imputed"],
    title="ICS1 mRNA imputed",
    cmap="Blues",
    vmin=-0.01,
    vmax=2.5,
    size=10,
    show=False,
    return_fig=True,
)
fig.tight_layout()
pdf_name = f"ics1_mrna_imputed_{experiment}.pdf"
fig.savefig(os.path.join(figure_output_folder, pdf_name))

### ATAC imputation

In [None]:
# Load the ATAC counterpart of the sequencing dataset.
atac_file = "../../data/AvrRpt2_alone2_atac.h5ad"
atac_data_copy = sc.read(atac_file)

In [None]:
# Keep the five samples used in this analysis and shorten both the `sample.order`
# labels and the cell names of the ATAC object.

# Raw `sample.order` value -> short label. The keys double as the list of samples kept.
sample_order_labels = {
    "00_00_Mock_rep1": "mock",
    "02_AvrRpt2_04h_rep1": "4hr_avr",
    "02_AvrRpt2_06h_rep1": "6hr_avr",
    "02_AvrRpt2_09h_rep1": "9hr_avr",
    "02_AvrRpt2_24h_rep1": "avrrpt24",
}

# Boolean indexing returns an AnnData *view*. `.copy()` turns it into a standalone
# object so the `.obs` edits below act on it directly instead of relying on the
# implicit view-to-copy conversion (which emits ImplicitModificationWarning).
atac_data_copy = atac_data_copy[
    atac_data_copy.obs["sample.order"].isin(list(sample_order_labels))
].copy()

sample_order = atac_data_copy.obs["sample.order"]
if isinstance(sample_order.dtype, pd.CategoricalDtype):
    # Renaming categories is the supported way to relabel a categorical column;
    # `Series.replace` on categoricals is deprecated in recent pandas releases.
    sample_order = sample_order.cat.rename_categories(sample_order_labels)
else:
    sample_order = sample_order.replace(sample_order_labels)
atac_data_copy.obs["sample.order"] = sample_order

# Shorten the sample part of every cell name. The cell names of `seq_data` are
# matched against these shortened names in a later cell.
atac_data_copy.obs.index = [
    i.replace("00_col_Mock_rep1", "00_Mock")
    .replace("col_AvrRpt2_24h_rep1", "AvrRpt2_24h")
    .replace("col_AvrRpt2_04h_rep1", "AvrRpt2_04h")
    .replace("col_AvrRpt2_06h_rep1", "AvrRpt2_06h")
    .replace("col_AvrRpt2_09h_rep1", "AvrRpt2_09h")
    for i in atac_data_copy.obs.index
]

In [None]:
# Densify the ATAC matrix. The `toarray` guard keeps this cell re-runnable: after the
# first run `.X` is already a NumPy array, which has neither `.A` nor `.toarray()`,
# so the original unconditional `.X.A` raised on a second execution.
if hasattr(atac_data_copy.X, "toarray"):
    atac_data_copy.X = atac_data_copy.X.toarray()

# Reorder the rows of `atac_data_copy` so that row k is the same cell as row k of
# `seq_data`; the neighbour indices computed later refer to `seq_data` row positions.
copy_names = atac_data_copy.obs.index.values
target_names = seq_data.obs.index.values

# Cell name -> row position of its first occurrence. Iterating in reverse lets the
# earliest position win, matching the previous `np.where(...)[0][0]` behaviour, while
# building the lookup once instead of scanning the whole array for every cell.
first_position = {
    obs_name: position
    for position, obs_name in reversed(list(enumerate(copy_names)))
}
missing_names = [
    obs_name for obs_name in target_names if obs_name not in first_position
]
if missing_names:
    raise KeyError(
        f"{len(missing_names)} cells of `seq_data` are absent from `atac_data_copy`, "
        f"e.g. {missing_names[:5]}"
    )
indices_match = [first_position[obs_name] for obs_name in target_names]

atac_data_copy = atac_data_copy[indices_match, :]
# Plain dense array of the aligned matrix, used for fast row lookups during imputation.
atac_data_copy_array = np.array(atac_data_copy.X)

In [None]:
# Impute ATAC values for every spatial cell as the mean over its nearest sequencing
# cells in the `X_gimvi` embedding.
adata = latent_adata.copy()

# Step 1: split the combined latent object by its `labels` column.
# These subsets are kept for inspection; the averaging below does not use them.
spatial_cells = adata[adata.obs["labels"] == "spatial"].copy()
seq_cells = adata[adata.obs["labels"] == "seq"].copy()

# Step 2: build a KD-tree over the seq cells' `X_gimvi` coordinates.
seq_gimvi = seq_data.obsm["X_gimvi"]
kdtree = KDTree(seq_gimvi)

# Step 3: for each spatial cell, find its nearest seq cells.
n_neighbors = 100  # number of seq neighbours averaged per spatial cell
distances, indices = kdtree.query(spatial_data.obsm["X_gimvi"], k=n_neighbors)

# Step 4: average the neighbours' ATAC values for each spatial cell.
# The output has one row per queried spatial cell (i.e. per row of `indices`), so it
# always lines up with `spatial_data`, which is where the columns are written later.
# Sizing it from `spatial_cells` tied it to a different object (`latent_adata`).
# The loop is kept on purpose: indexing with the full 2-D `indices` array at once
# would materialise an (n_spatial, n_neighbors, n_features) intermediate.
averaged_expression = np.zeros((indices.shape[0], atac_data_copy_array.shape[1]))
for i in range(indices.shape[0]):
    seq_neighbors_indices = indices[i]
    seq_neighbors_expression = atac_data_copy_array[seq_neighbors_indices, :]
    averaged_expression[i] = np.mean(seq_neighbors_expression, axis=0)

In [None]:
# Attach each seq cell's own ALD1 value from the ATAC matrix to `seq_data.obs`.
# This is the measured, un-averaged value, taken row by row from `atac_data_copy`.
ald1_column = np.flatnonzero(atac_data_copy.var.index == "ALD1")[0]
seq_data.obs["ALD1_guess"] = np.array(atac_data_copy.X)[:, ald1_column]

In [None]:
# Seq cells in the `X_umap_gimvi` embedding, coloured by their ATAC-derived ALD1 value.
sc.pl.embedding(
    seq_data,
    basis="X_umap_gimvi",
    color="ALD1_guess",
)

In [None]:
# Spatial cells in the `X_umap_gimvi` embedding, coloured by `current_imputed`.
# NOTE(review): when the notebook is run top to bottom, `current_imputed` still holds
# the EDS16 mRNA imputation at this point; the ATAC-based values are only written in
# a later cell. Confirm which quantity this plot is meant to show.
sc.pl.embedding(
    spatial_data,
    basis="X_umap_gimvi",
    color="current_imputed",
    vmax=0.3,
)

In [None]:
# Same colouring on the default UMAP of `spatial_data`.
# NOTE(review): nothing in this notebook computes a plain UMAP for `spatial_data`;
# presumably `X_umap` is already stored in the loaded file - confirm.
sc.pl.umap(
    spatial_data,
    color="current_imputed",
)

In [None]:
# Copy the imputed ATAC values of one feature into `spatial_data.obs["current_imputed"]`.
# Columns of `averaged_expression` follow `atac_data_copy.var` at this point.
gene_to_impute = "ALD1"
gene_column = np.flatnonzero(atac_data_copy.var.index == gene_to_impute)[0]
spatial_data.obs["current_imputed"] = averaged_expression[:, gene_column]

In [None]:
# Spatial map of the current imputed values for this experiment's cells,
# saved as a PDF in the figure folder.
in_experiment = spatial_data.obs["batch"] == experiment
fig = sc.pl.embedding(
    spatial_data[in_experiment],
    basis="spatial",
    color=["current_imputed"],
    title="ALD1 ATAC imputed",
    cmap="Purples",
    vmin=0.05,
    vmax=0.12,
    size=10,
    show=False,
    return_fig=True,
)
fig.tight_layout()
pdf_name = f"ald1_atac_imputed_{experiment}.pdf"
fig.savefig(os.path.join(figure_output_folder, pdf_name))

In [None]:
# Switch `current_imputed` to the EDS16 column of the imputed ATAC matrix.
# The figure produced from it below is titled "ICS1".
gene_to_impute = "EDS16"
gene_column = np.flatnonzero(atac_data_copy.var.index == gene_to_impute)[0]
spatial_data.obs["current_imputed"] = averaged_expression[:, gene_column]

In [None]:
# Spatial map of the current imputed values for this experiment's cells,
# saved as a PDF in the figure folder.
in_experiment = spatial_data.obs["batch"] == experiment
fig = sc.pl.embedding(
    spatial_data[in_experiment],
    basis="spatial",
    color=["current_imputed"],
    title="ICS1 ATAC imputed",
    cmap="Purples",
    vmin=0.02,
    vmax=0.125,
    size=10,
    show=False,
    return_fig=True,
)
fig.tight_layout()
pdf_name = f"ics1_atac_imputed_{experiment}.pdf"
fig.savefig(os.path.join(figure_output_folder, pdf_name))

### Chromvar Imputation

In [None]:
# Load the chromVAR table; the first CSV column becomes the row index.
chromvar_file = "../../data/AvrRpt2_alone2_chromvar.csv"
chromvar_data_copy = pd.read_csv(chromvar_file, index_col=0)

In [None]:
# Re-read the full, unfiltered sequencing dataset; its `.obs` table is reused as the
# cell annotation of the chromVAR AnnData built in the next cell.
# NOTE: this rebinds `seq_data_copy`, discarding the filtered and row-aligned object
# created earlier in the notebook, so the mRNA imputation cells cannot be re-run
# after this point without first re-running the cells that built it.
seq_file = "../../data/AvrRpt2_alone2.h5ad"
seq_data_copy = sc.read(seq_file)

In [None]:
# Wrap the chromVAR table in an AnnData object: the table is transposed so that its
# columns become observations, and its row labels become the variable names.
# NOTE(review): `.obs` is taken from `seq_data_copy` purely by position - this assumes
# the CSV columns are in the same order as the rows of the sequencing file; confirm.
# NOTE: this rebinds `chromvar_data_copy` from a DataFrame to an AnnData, so the cell
# can only be re-run after re-running the `pd.read_csv` cell above.
chromvar_by_cell = chromvar_data_copy.T
chromvar_data_copy = sc.AnnData(
    X=chromvar_by_cell.values,
    obs=seq_data_copy.obs,
    var=pd.DataFrame(index=chromvar_by_cell.columns),
)

In [None]:
# Keep the five samples used in this analysis and shorten both the `sample.order`
# labels and the cell names of the chromVAR object.

# Raw `sample.order` value -> short label. The keys double as the list of samples kept.
sample_order_labels = {
    "00_00_Mock_rep1": "mock",
    "02_AvrRpt2_04h_rep1": "4hr_avr",
    "02_AvrRpt2_06h_rep1": "6hr_avr",
    "02_AvrRpt2_09h_rep1": "9hr_avr",
    "02_AvrRpt2_24h_rep1": "avrrpt24",
}

# Boolean indexing returns an AnnData *view*. `.copy()` turns it into a standalone
# object so the `.obs` edits below act on it directly instead of relying on the
# implicit view-to-copy conversion (which emits ImplicitModificationWarning).
chromvar_data_copy = chromvar_data_copy[
    chromvar_data_copy.obs["sample.order"].isin(list(sample_order_labels))
].copy()

sample_order = chromvar_data_copy.obs["sample.order"]
if isinstance(sample_order.dtype, pd.CategoricalDtype):
    # Renaming categories is the supported way to relabel a categorical column;
    # `Series.replace` on categoricals is deprecated in recent pandas releases.
    sample_order = sample_order.cat.rename_categories(sample_order_labels)
else:
    sample_order = sample_order.replace(sample_order_labels)
chromvar_data_copy.obs["sample.order"] = sample_order

# Shorten the sample part of every cell name. The cell names of `seq_data` are
# matched against these shortened names in a later cell.
chromvar_data_copy.obs.index = [
    i.replace("00_col_Mock_rep1", "00_Mock")
    .replace("col_AvrRpt2_24h_rep1", "AvrRpt2_24h")
    .replace("col_AvrRpt2_04h_rep1", "AvrRpt2_04h")
    .replace("col_AvrRpt2_06h_rep1", "AvrRpt2_06h")
    .replace("col_AvrRpt2_09h_rep1", "AvrRpt2_09h")
    for i in chromvar_data_copy.obs.index
]

In [None]:
# Reorder the rows of `chromvar_data_copy` so that row k is the same cell as row k of
# `seq_data`; the neighbour indices computed later refer to `seq_data` row positions.
copy_names = chromvar_data_copy.obs.index.values
target_names = seq_data.obs.index.values

# Cell name -> row position of its first occurrence. Iterating in reverse lets the
# earliest position win, matching the previous `np.where(...)[0][0]` behaviour, while
# building the lookup once instead of scanning the whole array for every cell.
first_position = {
    obs_name: position
    for position, obs_name in reversed(list(enumerate(copy_names)))
}
missing_names = [
    obs_name for obs_name in target_names if obs_name not in first_position
]
if missing_names:
    raise KeyError(
        f"{len(missing_names)} cells of `seq_data` are absent from "
        f"`chromvar_data_copy`, e.g. {missing_names[:5]}"
    )
indices_match = [first_position[obs_name] for obs_name in target_names]

chromvar_data_copy = chromvar_data_copy[indices_match, :]
# The former `seq_data.X = seq_data.X` self-assignment had no effect and was removed.
# Plain dense array of the aligned matrix, used for fast row lookups during imputation.
chromvar_data_copy_array = np.array(chromvar_data_copy.X)

In [None]:
# Impute chromVAR values for every spatial cell as the mean over its nearest
# sequencing cells in the `X_gimvi` embedding.
adata = latent_adata.copy()

# Step 1: split the combined latent object by its `labels` column.
# These subsets are kept for inspection; the averaging below does not use them.
spatial_cells = adata[adata.obs["labels"] == "spatial"].copy()
seq_cells = adata[adata.obs["labels"] == "seq"].copy()

# Step 2: build a KD-tree over the seq cells' `X_gimvi` coordinates.
seq_gimvi = seq_data.obsm["X_gimvi"]
kdtree = KDTree(seq_gimvi)

# Step 3: for each spatial cell, find its nearest seq cells.
n_neighbors = 30  # number of seq neighbours averaged per spatial cell
distances, indices = kdtree.query(spatial_data.obsm["X_gimvi"], k=n_neighbors)

# Step 4: average the neighbours' chromVAR values for each spatial cell.
# The output has one row per queried spatial cell (i.e. per row of `indices`), so it
# always lines up with `spatial_data`, which is where the columns are written later.
# Sizing it from `spatial_cells` tied it to a different object (`latent_adata`).
# The loop is kept on purpose: indexing with the full 2-D `indices` array at once
# would materialise an (n_spatial, n_neighbors, n_features) intermediate.
averaged_expression = np.zeros((indices.shape[0], chromvar_data_copy_array.shape[1]))
for i in range(indices.shape[0]):
    seq_neighbors_indices = indices[i]
    seq_neighbors_expression = chromvar_data_copy_array[seq_neighbors_indices]
    averaged_expression[i] = np.mean(seq_neighbors_expression, axis=0)

In [None]:
# Copy the imputed values of one chromVAR feature into
# `spatial_data.obs["current_imputed"]`. Columns of `averaged_expression` follow
# `chromvar_data_copy.var` at this point.
chromvar_to_impute = "MA1666.1_HSFB2B"
motif_column = np.flatnonzero(chromvar_data_copy.var.index == chromvar_to_impute)[0]
spatial_data.obs["current_imputed"] = averaged_expression[:, motif_column]

In [None]:
# Spatial map of the current imputed values for this experiment's cells,
# saved as a PDF in the figure folder.
in_experiment = spatial_data.obs["batch"] == experiment
fig = sc.pl.embedding(
    spatial_data[in_experiment],
    basis="spatial",
    color=["current_imputed"],
    title="HSFB2B Chromvar imputed",
    cmap="viridis",
    vmin=0.3,
    vmax=0.7,
    size=10,
    show=False,
    return_fig=True,
)
fig.tight_layout()
pdf_name = f"HSFB2B_chromvar_imputed_{experiment}.pdf"
fig.savefig(os.path.join(figure_output_folder, pdf_name))