In [1]:
import numpy as np
import pandas as pd
import anndata
import matplotlib.pyplot as plt
import spatialdata as sd
import scanpy as sc
from scipy import stats
import seaborn as sns

In [None]:
# Read the annotated data object from 'filtered_data.h5ad'; the bare name on the
# last line makes the notebook display its summary as the cell output.
filtered_data = sc.read_h5ad('filtered_data.h5ad')
filtered_data

In [3]:
# Save the var index (the same identifiers later iterated over as genes) to
# 'var_index.csv'; index=False omits the row-label column.
filtered_data.var.index.to_series().to_csv('var_index.csv', index=False)

In [None]:
# Show the distinct values of the 'sample' column in obs; these labels are what
# the control/early/late group definitions further down must match.
filtered_data.obs['sample'].unique()

In [5]:
# Apply scanpy's log1p transform to the 'TE' layer.
# NOTE(review): the return value is discarded, so this presumably modifies
# filtered_data in place; re-running this cell without reloading the data would
# then log-transform an already transformed layer — confirm and run once only.
sc.pp.log1p(filtered_data,layer='TE')

In [None]:
from statsmodels.stats.multitest import multipletests
import numpy as np
from scipy import stats

# Per-gene comparison of the 'TE' layer between pooled treatment samples and the
# control sample.
#
# For every gene:
#   1. keep only cells whose value in X is >= min_total_rna,
#   2. compare TE in each treatment window against control with a Wilcoxon
#      rank-sum test,
#   3. record log2(mean treatment TE / mean control TE).
# After all genes are tested, the raw p-values of each comparison are corrected
# together with Benjamini-Hochberg. The correction has to see the whole family
# of tests: running it on one p-value at a time returns that p-value unchanged.
#
# Results are written to filtered_data.var as
#   '<group>_adjusted_p_value' and '<group>_log2_fold_change'.
control_group = 'C3control'
early_groups = ['B4Tg15min', 'B5Tg30min']
late_groups = ['C4Tg2h', 'C5Tg4h']
combined_groups = {'early': early_groups, 'late': late_groups}

# Minimum value in X a cell must have for a gene to be included in that gene's test.
min_total_rna = 2


def _dense_column(values):
    """Return a single-gene slice as a flat 1-D NumPy array.

    Sparse containers (anything exposing ``toarray``) are densified first so
    that boolean masking and ``len`` behave the same for dense and sparse data.
    """
    if hasattr(values, 'toarray'):
        values = values.toarray()
    return np.asarray(values).ravel()


# Sample membership does not depend on the gene, so build the masks once.
sample_labels = filtered_data.obs['sample']
control_mask = (sample_labels == control_group).to_numpy()
group_masks = {
    group_name: sample_labels.isin(members).to_numpy()
    for group_name, members in combined_groups.items()
}

raw_p_values = {group: [] for group in combined_groups}
results = {
    'adjusted_p_values': {},
    'log2_fold_change': {group: [] for group in combined_groups}
}

for gene in filtered_data.var_names:
    gene_view = filtered_data[:, gene]
    gene_te = _dense_column(gene_view.layers['TE'])
    gene_totalRNA = _dense_column(gene_view.X)

    valid_cells = gene_totalRNA >= min_total_rna
    control_te = gene_te[valid_cells & control_mask]

    for group_name, group_mask in group_masks.items():
        experimental_te = gene_te[valid_cells & group_mask]

        # Nothing to compare when either side has no usable cells.
        if len(experimental_te) == 0 or len(control_te) == 0:
            raw_p_values[group_name].append(np.nan)
            results['log2_fold_change'][group_name].append(np.nan)
            continue

        try:
            _, p_value = stats.ranksums(experimental_te, control_te)
            p_value = float(p_value)
        except ValueError:
            p_value = np.nan

        mean_experimental = np.mean(experimental_te)
        mean_control = np.mean(control_te)
        if mean_control == 0:
            # Ratio is undefined for a zero control mean.
            log2_fold_change = np.nan
        else:
            log2_fold_change = np.log2(mean_experimental / mean_control)

        raw_p_values[group_name].append(p_value)
        results['log2_fold_change'][group_name].append(log2_fold_change)

# Benjamini-Hochberg correction across all tested genes, separately for each
# comparison. Genes without a p-value stay NaN and are excluded from the family.
for group, p_list in raw_p_values.items():
    p_array = np.asarray(p_list, dtype=float)
    adjusted = np.full(p_array.shape, np.nan)
    tested = ~np.isnan(p_array)
    if tested.any():
        adjusted[tested] = multipletests(p_array[tested], method='fdr_bh')[1]
    results['adjusted_p_values'][group] = adjusted.tolist()

for group in combined_groups:
    filtered_data.var[f'{group}_adjusted_p_value'] = results['adjusted_p_values'][group]
    filtered_data.var[f'{group}_log2_fold_change'] = results['log2_fold_change'][group]

print(filtered_data.var.head())

In [7]:
# Export the four per-gene result columns of the early/late vs. control tests.
# The columns are selected by name rather than by position so the export stays
# correct even if filtered_data.var holds other columns after them; a missing
# column raises a KeyError instead of silently exporting the wrong data.
result_columns = [
    'early_adjusted_p_value',
    'early_log2_fold_change',
    'late_adjusted_p_value',
    'late_log2_fold_change',
]
last_four_columns = filtered_data.var[result_columns]

last_four_columns.to_csv('wilcoxon_results_last_four_columns.csv')

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Volcano plots for the early-vs-control and late-vs-control comparisons, drawn
# side by side from the per-gene columns stored in filtered_data.var. For each
# panel the significant up-/down-regulated gene names are also written to CSV.
# The figure is saved to 'volcano_plots_early_late.pdf' and then closed.
early_groups = ['B4Tg15min', 'B5Tg30min']  # not referenced below in this cell
late_groups = ['C4Tg2h', 'C5Tg4h']         # not referenced below in this cell

p_value_threshold = 0.05
log2_fold_change_threshold = 0.4

fig, axes = plt.subplots(1, 2, figsize=(18, 4))
fig.subplots_adjust(wspace=0.4)

early_log2fc = filtered_data.var['early_log2_fold_change']
early_padj = filtered_data.var['early_adjusted_p_value']

late_log2fc = filtered_data.var['late_log2_fold_change']
late_padj = filtered_data.var['late_adjusted_p_value']

panels = [
    (early_log2fc, early_padj, 'Early vs Control'),
    (late_log2fc, late_padj, 'Late vs Control'),
]

for i, (log2_fold_change, adjusted_p_values, title) in enumerate(panels):
    ax = axes[i]

    # y axis: -log10(adjusted p), capped at 60 so extreme values sit on the top edge.
    y_values = -np.log10(adjusted_p_values)
    y_capped = np.minimum(y_values, 60)

    significant = adjusted_p_values < p_value_threshold
    upregulated_genes = significant & (log2_fold_change > log2_fold_change_threshold)
    downregulated_genes = significant & (log2_fold_change < -log2_fold_change_threshold)

    # Draw all genes in grey first, then overlay the significant hits.
    point_layers = [
        (None, 'grey', 0.6),
        (upregulated_genes, 'red', 0.8),
        (downregulated_genes, 'blue', 0.8),
    ]
    for mask, colour, opacity in point_layers:
        xs = log2_fold_change if mask is None else log2_fold_change[mask]
        ys = y_capped if mask is None else y_capped[mask]
        sns.scatterplot(x=xs, y=ys, alpha=opacity, color=colour, ax=ax, s=10)

    # Threshold guides: fold-change cut-offs (vertical) and p-value cut-off (horizontal).
    for x_position in (log2_fold_change_threshold, -log2_fold_change_threshold):
        ax.axvline(x=x_position, color='blue', linestyle='--', linewidth=1.5)
    ax.axhline(y=-np.log10(p_value_threshold), color='green', linestyle='--', linewidth=1.5)

    ax.set_ylim(0, 60)
    ax.set_xlim(-2, 2)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Log2 Fold Change', fontsize=12)
    ax.set_ylabel('-Log10(Adjusted P-value)', fontsize=12)

    gene_index = filtered_data.var.index
    upregulated_gene_names = gene_index[upregulated_genes].tolist()
    downregulated_gene_names = gene_index[downregulated_genes].tolist()

    # Panel 0 is the early comparison, panel 1 the late one.
    period = ('early', 'late')[i]
    pd.Series(upregulated_gene_names).to_csv(f'{period}_upregulated_genes.csv', index=False, header=False)
    pd.Series(downregulated_gene_names).to_csv(f'{period}_downregulated_genes.csv', index=False, header=False)

fig.savefig('volcano_plots_early_late.pdf')
plt.close(fig)