In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Input data (as described in the Supplementary Methods of the source paper):
# unscaled, log-normalized counts, with conditions subsampled to the same number of cells,
# and 2000 highly variable genes calculated jointly across all perturbation conditions
# (including control) using scanpy [ref. 28] with default parameters.

In [3]:
from anndata import read_h5ad
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd 
import scanpy as sc
import seaborn as sns
import string
import sys
sys.path.append("../../scxmatch/src/")
from scxmatch import *
np.random.seed(42)

found cupy installation, will try use the GPU to calculate the distance matrix.


In [4]:
def subsample_adata(adata, group_by, n_cells=5000):
    """Draw the same number of cells from every group of ``adata.obs[group_by]``.

    Cells are drawn without replacement using NumPy's global random state
    (``np.random.choice``), so results are reproducible after ``np.random.seed``.

    Parameters
    ----------
    adata
        Annotated data object exposing an ``obs`` DataFrame and row indexing
        via ``adata[obs_names, :]``.
    group_by
        Name of the ``obs`` column that defines the groups.
    n_cells
        Number of cells to draw per group. Defaults to 5000 (the value that
        was previously hard-coded). Pass ``None`` to use the size of the
        smallest non-empty group.

    Returns
    -------
    A copy of ``adata`` restricted to the sampled cells, ordered group by group
    in the order the groups first appear in ``obs[group_by]``.

    Raises
    ------
    ValueError
        If any group contains fewer than ``n_cells`` cells.
    """
    labels = adata.obs[group_by]

    if n_cells is None:
        sizes = labels.value_counts()
        # Categorical columns report unused categories with a count of zero.
        sizes = sizes[sizes > 0]
        n_cells = int(sizes.min()) if len(sizes) else 0

    sampled_indices = []
    for group in labels.unique():
        group_indices = adata.obs.index[(labels == group).to_numpy()]
        if len(group_indices) < n_cells:
            raise ValueError(
                f"Cannot draw {n_cells} cells without replacement from group "
                f"{group!r} of '{group_by}': it only has {len(group_indices)} cells. "
                "Pass a smaller n_cells, or n_cells=None to use the smallest group size."
            )
        sampled_indices.extend(np.random.choice(group_indices, n_cells, replace=False))

    # Copy only the selected rows instead of duplicating the full dataset first.
    return adata[sampled_indices, :].copy()

In [5]:
def categorize_perturbation(value):
    """Map a perturbation label to a coarse category.

    Returns "control" for the exact label "control", "t_6" if the label
    contains "_6", "t_24" if it contains "_24" (checked in that order),
    and "unknown" otherwise.
    """
    if value == "control":
        return "control"
    # Order matters: the "_6" marker takes precedence over "_24".
    for marker, category in (("_6", "t_6"), ("_24", "t_24")):
        if marker in value:
            return category
    return "unknown"

In [7]:
def odd_components(G):
    """Print the component histogram of ``G`` and return how many entries are odd.

    Uses ``gt.label_components``; per the graph-tool documentation the second
    return value holds the number of vertices in each connected component.
    """
    _labels, component_sizes = gt.label_components(G)
    print(component_sizes)
    # size % 2 is 1 for odd sizes and 0 for even ones, so the sum is a count.
    parity = component_sizes % 2
    return np.sum(parity)

In [8]:
# Column of `adata.obs` that defines the conditions, followed by the two
# values of that column that are compared against each other.
group_by = "dose_value"
reference = 0
test_group = 10000

In [38]:
# Load the dataset stored in `sciplex_MCF7.hdf5`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
adata = read_h5ad("/data_nfs/datasets/scrnaseq_ji/sciplex_MCF7.hdf5")

In [40]:
# Number of cells whose `dose_value` equals `reference`.
# NOTE(review): this comment previously read "A549", but the file loaded in this
# notebook is sciplex_MCF7.hdf5 — confirm which cell line the displayed count belongs to.
adata.obs.dose_value.value_counts()[reference]

3927

In [12]:
# Keep only the cells of the two compared conditions (as an independent copy)
# and pass them through the project's `scanpy_setup` helper.
# NOTE(review): `adata` is overwritten, so re-running this cell operates on the
# already-filtered data; reload the dataset first for a clean re-run.
adata = scanpy_setup(
    adata[adata.obs[group_by].isin([test_group, reference]), :].copy()
)

In [13]:
# Compute the k-nearest-neighbour graph (k=20, squared Euclidean metric).
# The return value is discarded, so the result is presumably stored on `adata` — confirm in scxmatch.
kNN(adata, k=20, metric="sqeuclidean")

calculating kNN graph.


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Build the graph object `G` from `adata`.
# NOTE(review): presumably requires the kNN cell to have been run on this `adata` first — confirm.
G = construct_graph_via_kNN(adata)

5087271.946472168

5087271.946472168

In [26]:
# NOTE(review): when last executed this call raised
# `TypeError: list indices must be integers or slices, not Edge`.
# In addition, `matching` is only assigned by the `rosenbaum(...)` cell that follows,
# so this cell cannot succeed in a fresh top-to-bottom run. Fix the arguments or remove the cell.
compute_matching_cost(G, matching, )

TypeError: list indices must be integers or slices, not Edge

In [19]:
# Run the cross-match test between the `test_group` and `reference` conditions.
# With return_matching=True the call returns the statistics tuple together
# with the graph and the matching, which are unpacked here.
(p, z, s), G, matching = rosenbaum(
    adata,
    group_by=group_by,
    test_group=test_group,
    reference=reference,
    metric="sqeuclidean",
    k=20,
    return_matching=True,
)

  adata.obs["XMatch_group"] = np.where(adata.obs[group_by].isin(test_group), "test", "reference")


XMatch_group
test         5000
reference    5000
Name: count, dtype: int64
calculating kNN graph.
counting cross matches.


In [23]:
# Number of vertices in the graph returned by `rosenbaum`.
len(G.get_vertices())

10000

In [21]:
# Prints the component histogram of `G` and displays how many of its entries are odd.
odd_components(G)

[10000]


0

In [20]:
# Third element of the statistics tuple returned by `rosenbaum`
# (presumably the relative support — confirm against scxmatch).
s

0.844

In [None]:
# Settings for the k-influence figure: dose 0.0 is the control condition,
# and conditions are read from the "dose_value" column.
reference = 0.0
group_by = "dose_value"

In [None]:
# Replace `adata` with the output of `prepare`.
# NOTE(review): `prepare` is not defined in the visible part of this notebook —
# presumably it is provided by `from scxmatch import *`; confirm.
# Because `adata` is overwritten, re-running this cell applies `prepare` twice.
adata = prepare(adata)
#mcf7 = prepare(a2)
#k562 = prepare(a3)

In [None]:
# Load the k-influence results: one row per (control_group, metric) pair and
# one column per tested value of k.
dfs = pd.read_csv("../plots/fig2/sciplex_MCF7_combined_results_k_influence_k_small.csv")
# Collect the distinct groups before the column is moved into the index.
groups = sorted(dfs["control_group"].unique())
dfs = dfs.set_index(["control_group", "metric"])

# Disabled variant that additionally loads the A549 results and concatenates both tables:
#dfs_2 = pd.read_csv("../plots/fig2/sciplex_A549_combined_results_k_influence_k_small.csv").drop(columns=["7543"])

#for dfs in [dfs_1, dfs_2]:
#    dfs.rename({"Unnamed: 0": "control_group", "Unnamed: 1": "metric"}, axis=1, inplace=True)
#    groups = sorted(dfs["control_group"].unique())
#    dfs.set_index(["control_group", "metric"], inplace=True)
#dfs = pd.concat([dfs_1, dfs_2], axis=1)

In [None]:
# Inspect the column labels of the results table (the plotting cell parses them as integer k values).
dfs.columns

In [None]:
# One distinct hue per group plus one extra for the reference condition,
# which is appended after the groups.
colors = sns.color_palette("hls", len(groups) + 1)
pal = dict(zip(groups + [reference], colors))

In [None]:
# Store the grouping column as a categorical, then look up one palette colour per cell.
# Note that this rebinds `colors` from the palette to a per-cell list.
adata.obs[group_by] = adata.obs[group_by].astype("category")
colors = [pal[dose] for dose in adata.obs[group_by].values]

In [None]:
# Sort the columns, then the (control_group, metric) rows.
# `sort_index` returns a new DataFrame, so the result has to be assigned back:
# the previous `inplace=True` on the chained call only sorted a temporary copy
# and left `dfs` untouched.
dfs = dfs.sort_index(axis=1).sort_index(axis=0)

In [None]:
# Figure: influence of k on each test statistic, one curve per dose group,
# laid out on a 2 x 3 grid of panels.
fig, axes = plt.subplots(2, 3, figsize=(12, 8), sharex=False)

# The columns of `dfs` are the tested k values stored as strings; order them numerically.
k_values = np.unique(dfs.columns.astype(int))
k_columns = k_values.astype(str)
# Every k gets one evenly spaced slot on the x axis, shared by all curves and panels.
# Computed once here rather than taken from a variable leaked out of the inner loop.
tick_positions = np.arange(len(k_values))

# Panel labels: 'a', 'b', 'c', ...
labels = string.ascii_lowercase

percent_metric = 'Percentage of reference matching edges in matching'
metrics = ['p-val', 'z-score', 'relative support', percent_metric, 'Number of edges']
# Display titles; metrics that are not listed keep their own name.
panel_titles = {
    'p-val': "$P$-val",
    'z-score': "$z$-score",
    percent_metric: "contained % of perfect edges",
}

# Panel (0, 0) is reserved for the currently disabled UMAP overview and stays empty.
#axes[0,0].scatter(x=adata.obsm["X_umap"][:,0], y=adata.obsm["X_umap"][:,1], alpha=0.5, s=2, c=colors)
#axes[0,0].set_title("UMAP")
#axes[0,0].set_xticks([], [])
#axes[0,0].set_yticks([], [])

for i, metric in enumerate(metrics):
    # +1 skips the reserved first panel.
    ax = axes[(i + 1) // 3, (i + 1) % 3]
    for group in groups:
        values = dfs.loc[group].loc[metric, k_columns]
        if metric == percent_metric:
            # Fraction -> percent; builds a new Series so that `dfs` is never modified.
            values = values * 100
        ax.plot(tick_positions, values, 'o--', label=group, color=pal[group], alpha=0.7)
    if metric == 'p-val':
        ax.set_yscale('log')
    ax.set_title(panel_titles.get(metric, metric))
    ax.set_xlabel('k')
    ax.set_xticks(ticks=tick_positions)
    ax.set_xticklabels(labels=k_columns)
    ax.tick_params(axis='x', labelrotation=90)

# Letter every panel, including the reserved first one.
for i in range(np.prod(axes.shape)):
    ax = axes[i // 3, i % 3]
    ax.text(
        -0.05, 1.1,  # Position in axes coordinates
        labels[i],   # Corresponding letter
        transform=ax.transAxes,  # Relative to subplot
        fontsize=10, fontweight='bold', va='top', ha='left'
    )

# Legend entries: same colours as `pal`, with the reference dose shown as "Control" (last).
# A separate dict is built so that `pal` is left intact and this cell can be re-run.
legend_pal = {label: color for label, color in pal.items() if label != reference}
legend_pal["Control"] = pal[reference]
handles = [patches.Patch(color=color, label=label) for label, color in legend_pal.items()]
fig.legend(handles=handles, loc="upper center", bbox_to_anchor=(0.5, 1.07), ncol=len(legend_pal), title="Dose value")
plt.tight_layout()
# NOTE(review): the results plotted here are read from the MCF7 CSV, but the figure
# is written to a file named "A549" — confirm the intended output name.
plt.savefig("../plots/fig2/fig2-1_A549.pdf", bbox_inches="tight")