In [None]:
import requests
import os
import scanpy as sc
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import gc


## Set parameters and paths

In [None]:
# Tunable analysis parameters shared by the cells below
n_hvg = 2000  # passed as `n_top_genes` when selecting highly variable genes
res = 1       # Leiden resolution used for the whole-dataset clustering

In [None]:
# Directory holding the downloaded input files and directory for this study's outputs
input_dir = 'Input_files'
output_dir = 'Output_Seeker_2023'

# Create both directories when they are missing. `exist_ok=True` replaces the
# separate "exists?" check, so nothing can go wrong between the check and the create.
for directory in (input_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

## Analysis for Seeker_2023_DS9

# Download the Seeker_2023 .h5ad file from the CELLxGENE dataset server into `input_dir`.
studyID = 'Seeker_2023'
download_link = 'https://datasets.cellxgene.cziscience.com/32c319ef-10e2-4948-8b40-093d2f9d7cb5.h5ad'
new_filename = os.path.join(input_dir, f"{studyID}.h5ad")

if os.path.exists(new_filename):
    # Re-running the notebook should not fetch the same file again.
    print(f"File already present at {new_filename}, skipping download")
else:
    os.makedirs(input_dir, exist_ok=True)
    # Write to a temporary name first so an interrupted download never leaves a
    # truncated file under the final name.
    partial_filename = new_filename + '.part'
    # stream=True avoids holding the whole file in memory; raise_for_status() stops
    # an HTTP error page from being saved as if it were the dataset.
    with requests.get(download_link, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(partial_filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_filename, new_filename)
    print(f"File downloaded and saved as {new_filename}")

In [None]:
# Load the study, apply gene/cell QC filters, then compute PCA, neighbours, UMAP and Leiden clusters.
studyID = 'Seeker_2023'
file = 'Seeker_2023.h5ad'
print(f'Start process {studyID}')
adata = sc.read_h5ad(os.path.join(input_dir, file))
print(f'    Before process: number of obs {adata.n_obs}, number of var {adata.n_vars}')

# Keep the original var index in a `GeneID` column and index genes by `feature_name` instead.
adata.var['GeneID'] = adata.var.index
adata.var.set_index('feature_name', inplace=True)
if adata.raw is not None:
    adata.raw.var.set_index('feature_name', inplace=True)
else:
    print("    adata.raw is None, skipping setting index.")
# Flag genes whose name starts with "MT-" so QC can report `pct_counts_mt`.
adata.var["mt"] = adata.var_names.str.startswith("MT-")

print('    Start QC')
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True, log1p=True)
sc.pp.filter_genes(adata, min_cells=3)

# Cell-level QC. `nFeature_RNA` / `nCount_RNA` are read from the obs table shipped with
# the file; `pct_counts_mt` was just added by calculate_qc_metrics.
# All thresholds are combined into one mask and the result is copied, so the
# in-place scanpy calls below work on a real AnnData instead of a view of a view.
keep_cells = (
    (adata.obs['nFeature_RNA'] > 200)
    & (adata.obs['pct_counts_mt'] < 10)
    & (adata.obs['nCount_RNA'] > 400)
    & (adata.obs['nCount_RNA'] < 60000)
)
adata = adata[keep_cells, :].copy()
print(f'    After process: number of obs {adata.n_obs}, number of var {adata.n_vars}')

# NOTE(review): highly_variable_genes is called with its default flavor and no prior
# normalisation step in this notebook - presumably `adata.X` is already
# log-normalised in the downloaded file; confirm.
sc.pp.highly_variable_genes(adata, n_top_genes=n_hvg)
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=30)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=res, n_iterations=2)


In [None]:
# Number of observations carrying each `cell_type` label in the current `adata`
adata.obs['cell_type'].value_counts()

In [None]:
# Display the summary representation of the processed `adata` object
adata

#### Figure 7A

In [None]:
# Figure 7A: UMAP of all cells, coloured by the `cell_type` annotation
print(f'    Start to plot {studyID}')
umap_save_suffix = f'_{studyID}_allcelltypes_leiden_res{res}.pdf'
sc.pl.umap(adata, color=['cell_type'], save=umap_save_suffix)

In [None]:
# Figure 7A: dot plot of marker genes across the annotated cell types
from collections import OrderedDict

# Marker genes grouped by the population they are used to recognise
gene_categories = {
    "microglia": ['BHLHE41', 'CX3CR1', 'P2RY12', 'TREM2', 'TMEM119', 'HEXB', 'SALL1'],
    "CAM": ['CD163', 'MRC1', 'LYVE1'],
    "oligodendrocytes": ["PLP1", "CNP"],
    "precursor_cells": ["PDGFRA", "PTPRZ1"],
    "astrocytes": ["GJA1", "GFAP"],
    "excitatory_neurons": ["SNAP25", "SLC17A7"],
    "inhibitory_neurons": ["SNAP25", "GAD1"],
    "reelin_positive_neurons": ["SNAP25", "RELN"],
    "endothelial_cells_pericytes": ["CLDN5", "NOTCH3"],
    "immune_cells": ["HLA-A", "PTPRC"]
}

# Flatten to a single list: first-seen order is kept and repeated genes
# (SNAP25 appears in three categories) are listed once.
goi = list(OrderedDict.fromkeys(
    gene for genes in gene_categories.values() for gene in genes
))

# Only genes present in the dataset can be plotted; report the ones that are left out
# so a misspelled symbol does not vanish silently.
goi_in_adata = [gene for gene in goi if gene in adata.var_names]
missing_genes = [gene for gene in goi if gene not in adata.var_names]
if missing_genes:
    print(f'    Genes not found in adata.var_names (left out of the dot plot): {missing_genes}')

# Define the new order of levels
new_order = [
    'microglial cell',
    'central nervous system macrophage',
    'oligodendrocyte',
    'oligodendrocyte precursor cell',
    'differentiation-committed oligodendrocyte precursor',
    'astrocyte',
    'neuron',
    'GABAergic neuron',
    'glutamatergic neuron',
    'cerebellar granule cell',
    'capillary endothelial cell',
    'endothelial cell of artery',
    'mural cell',
    'vascular associated smooth muscle cell',
    'leukocyte',
]

# pd.Categorical turns every label that is not in `categories` into NaN, so any
# cell type absent from `new_order` would silently lose its label. Report them.
unlisted_types = sorted(set(adata.obs['cell_type'].dropna().astype(str)) - set(new_order))
if unlisted_types:
    print(f'    WARNING: cell types not listed in new_order will be set to NaN: {unlisted_types}')

adata.obs['cell_type'] = pd.Categorical(adata.obs['cell_type'], categories=new_order, ordered=True)
print(adata.obs['cell_type'].cat.categories)

sc.pl.dotplot(
    adata,
    goi_in_adata,
    groupby="cell_type",
    standard_scale="var",
    title=f'Gene expression in each cell type in {studyID}',
    save=f'{studyID}_celltype.pdf',
)

#### Supplementary figure 14A

In [None]:
# Supplementary figure 14A: per-marker hexbin overlays on the whole-dataset UMAP

# Define the markers you are interested in
markers = ['P2RY12','AIF1','C1QB','GPR34','CSF1R','SLCO2B1','TBXAS1','DOCK8','PCDH9','PLP1']

# plt.savefig does not create missing directories
os.makedirs('figures', exist_ok=True)

# The UMAP coordinates are the same for every marker, so read them once
x = adata.obsm['X_umap'][:, 0]  # UMAP 1 coordinates
y = adata.obsm['X_umap'][:, 1]  # UMAP 2 coordinates

for marker in markers:
    # A marker missing from the dataset would otherwise abort the loop part-way through
    if marker not in adata.var_names:
        print(f'    {marker} not found in adata.var_names, skipping')
        continue

    # UMAP scatter coloured by marker expression; show=False keeps the figure open
    sc.pl.umap(adata, color=marker, show=False, cmap='viridis', legend_loc=None, s=20)
    ax = plt.gca()  # axis the overlay is drawn on

    # Expression of the marker taken from adata.X, which may be sparse or dense
    marker_values = adata[:, marker].X
    if hasattr(marker_values, 'toarray'):
        marker_values = marker_values.toarray()
    expression = np.asarray(marker_values).flatten()

    # With C=expression and reduce_C_function=np.mean every hexagon is coloured by the
    # mean expression of the cells falling inside it; mincnt=1 hides empty hexagons.
    hexbin_plot = ax.hexbin(x, y, C=expression, gridsize=70, cmap='inferno', reduce_C_function=np.mean, mincnt=1)

    # Axis labels and title
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title(f'Density plot for {marker}')

    # Colorbar for the hexbin layer
    plt.colorbar(hexbin_plot, label='Density')

    # Save, then display the figure
    plt.savefig(f'figures/umap_density_{studyID}_{marker}.pdf')
    plt.show()

#### Figure 7B

In [None]:
# Figure 7B: hexbin overlays of CD22 and BHLHE41 on the whole-dataset UMAP

# Define the markers you are interested in
markers = ["CD22",'BHLHE41']

# plt.savefig does not create missing directories
os.makedirs('figures', exist_ok=True)

# The UMAP coordinates are the same for every marker, so read them once
x = adata.obsm['X_umap'][:, 0]  # UMAP 1 coordinates
y = adata.obsm['X_umap'][:, 1]  # UMAP 2 coordinates

for marker in markers:
    # A marker missing from the dataset would otherwise abort the loop part-way through
    if marker not in adata.var_names:
        print(f'    {marker} not found in adata.var_names, skipping')
        continue

    # UMAP scatter coloured by marker expression; show=False keeps the figure open
    sc.pl.umap(adata, color=marker, show=False, cmap='viridis', legend_loc=None, s=20)
    ax = plt.gca()  # axis the overlay is drawn on

    # Expression of the marker taken from adata.X, which may be sparse or dense
    marker_values = adata[:, marker].X
    if hasattr(marker_values, 'toarray'):
        marker_values = marker_values.toarray()
    expression = np.asarray(marker_values).flatten()

    # With C=expression and reduce_C_function=np.mean every hexagon is coloured by the
    # mean expression of the cells falling inside it; mincnt=1 hides empty hexagons.
    hexbin_plot = ax.hexbin(x, y, C=expression, gridsize=70, cmap='inferno', reduce_C_function=np.mean, mincnt=1)

    # Axis labels and title
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title(f'Density plot for {marker}')

    # Colorbar for the hexbin layer
    plt.colorbar(hexbin_plot, label='Density')

    # Save, then display the figure
    plt.savefig(f'figures/umap_density_{studyID}_{marker}.pdf')
    plt.show()

#### Extract microglia

In [None]:
# Keep only the cells labelled 'microglial cell'. Slicing an AnnData returns a view of
# `adata`; `.copy()` makes `adata_m` an independent object so the in-place scanpy
# steps applied to it later (HVG, PCA, neighbours, UMAP, Leiden) do not act on a view.
adata_m = adata[adata.obs['cell_type'] == 'microglial cell', :].copy()
adata_m

#### Figure 7D

In [None]:
# Figure 7D: re-cluster the microglia subset and plot its Leiden clusters on a fresh UMAP
res = 0.05  # NOTE: rebinds the notebook-level `res`; cells run after this one see 0.05

# Same pipeline as for the full dataset, applied to `adata_m`
sc.pp.highly_variable_genes(adata_m, n_top_genes=n_hvg)
sc.tl.pca(adata_m)
sc.pp.neighbors(adata_m, n_pcs=30, n_neighbors=10)
sc.tl.umap(adata_m)
sc.tl.leiden(adata_m, n_iterations=2, resolution=res)

microglia_umap_suffix = f'_{studyID}_within_microglia_leiden_res{res}.pdf'
sc.pl.umap(
    adata_m,
    color=['leiden'],
    palette='Paired',
    legend_loc='on data',
    save=microglia_umap_suffix,
)

In [None]:
# Microglia marker genes checked per Leiden cluster of the microglia subset.
# 'CX3CR1' was previously misspelled 'CX3R1', so the membership filter below dropped
# it silently and the gene never appeared in the dot plot.
goi = ['P2RY12','TMEM119','AIF1','C1QB','CX3CR1','GPR34','CSF1R','SLCO2B1','TBXAS1','DOCK8','APBB1IP','PCDH9']

# Plot only genes present in the dataset, and report any that are left out
goi_in_adata = [gene for gene in goi if gene in adata_m.var_names]
missing_genes = [gene for gene in goi if gene not in adata_m.var_names]
if missing_genes:
    print(f'    Genes not found in adata_m.var_names (left out of the dot plot): {missing_genes}')

sc.pl.dotplot(adata_m, goi_in_adata, groupby="leiden", save=f'{studyID}_within_microglia_goi_res{res}_check_microgliaMarker.pdf')

#### Figure 7E Plot CD22 and BHLHE41

In [None]:
# Figure 7E: hexbin overlays of CD22 and BHLHE41 on the microglia-only UMAP

# Define the markers you are interested in
markers = ["CD22",'BHLHE41']

# plt.savefig does not create missing directories
os.makedirs('figures', exist_ok=True)

# The UMAP coordinates are the same for every marker, so read them once
x = adata_m.obsm['X_umap'][:, 0]  # UMAP 1 coordinates
y = adata_m.obsm['X_umap'][:, 1]  # UMAP 2 coordinates

for marker in markers:
    # A marker missing from the dataset would otherwise abort the loop part-way through
    if marker not in adata_m.var_names:
        print(f'    {marker} not found in adata_m.var_names, skipping')
        continue

    # UMAP scatter coloured by marker expression; show=False keeps the figure open
    sc.pl.umap(adata_m, color=marker, show=False, cmap='viridis', legend_loc=None, s=20)
    ax = plt.gca()  # axis the overlay is drawn on

    # Expression of the marker taken from adata_m.X, which may be sparse or dense
    marker_values = adata_m[:, marker].X
    if hasattr(marker_values, 'toarray'):
        marker_values = marker_values.toarray()
    expression = np.asarray(marker_values).flatten()

    # With C=expression and reduce_C_function=np.mean every hexagon is coloured by the
    # mean expression of the cells falling inside it; mincnt=1 hides empty hexagons.
    hexbin_plot = ax.hexbin(x, y, C=expression, gridsize=70, cmap='inferno', reduce_C_function=np.mean, mincnt=1)

    # Axis labels and title
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title(f'Density plot for {marker}')

    # Colorbar for the hexbin layer
    plt.colorbar(hexbin_plot, label='Density')

    # Save, then display the figure.
    # NOTE(review): "macrolia" in the file name looks like a misspelling of "microglia";
    # it is kept unchanged so existing output paths stay the same.
    plt.savefig(f'figures/umap_density_{studyID}_{marker}_within_macrolia.pdf')
    plt.show()

In [None]:
# Compare the per-cell BHLHE41/CD22 expression ratio between microglia Leiden clusters 1 and 0
from scipy.stats import ttest_ind


def _flat_expression(cells, gene):
    """Return the `.X` values of `gene` for `cells` as a flat NumPy array.

    Works whether `.X` is a sparse matrix (has `toarray`) or already dense.
    """
    values = cells[:, gene].X
    if hasattr(values, 'toarray'):
        values = values.toarray()
    return np.asarray(values).flatten()


def _log1p_ratio(cells, numerator, denominator, pseudocount=0.01):
    """Return log1p((numerator + pseudocount) / (denominator + pseudocount)) per cell.

    The pseudocount keeps the ratio finite for cells where the denominator is 0.
    It is added out of place, so an integer-typed matrix does not raise.
    """
    num = _flat_expression(cells, numerator) + pseudocount
    den = _flat_expression(cells, denominator) + pseudocount
    return np.log1p(num / den)


# Select cells from Leiden cluster 1 and cluster 0
cluster_1_cells = adata_m[adata_m.obs['leiden'] == '1']
cluster_0_cells = adata_m[adata_m.obs['leiden'] == '0']

# At a low clustering resolution one of the clusters may not exist; say so instead of
# letting the t-test return NaN without explanation.
for label, cells in (('1', cluster_1_cells), ('0', cluster_0_cells)):
    if cells.n_obs == 0:
        print(f'    WARNING: Leiden cluster {label} contains no cells; the comparison below is not meaningful')

# log1p of the pseudocounted BHLHE41/CD22 ratio for every cell of each cluster
bhlhe41_cd22_ratio_cluster_1_log = _log1p_ratio(cluster_1_cells, 'BHLHE41', 'CD22')
bhlhe41_cd22_ratio_cluster_0_log = _log1p_ratio(cluster_0_cells, 'BHLHE41', 'CD22')

# Create a DataFrame for plotting
data = {
    'BHLHE41_CD22_Ratio': np.concatenate([bhlhe41_cd22_ratio_cluster_1_log, bhlhe41_cd22_ratio_cluster_0_log]),
    'Group': ['Cluster 1'] * len(bhlhe41_cd22_ratio_cluster_1_log) + ['Cluster 0'] * len(bhlhe41_cd22_ratio_cluster_0_log)
}
df = pd.DataFrame(data)

# Calculate the median value for each group
median_values = df.groupby('Group')['BHLHE41_CD22_Ratio'].median()
print('Median value:',  median_values)

# Plot the data
fig, ax = plt.subplots()
df.boxplot(column='BHLHE41_CD22_Ratio', by='Group', ax=ax)

# Two-sample t-test between the groups (scipy default: equal variances assumed)
t_stat, p_value = ttest_ind(bhlhe41_cd22_ratio_cluster_1_log, bhlhe41_cd22_ratio_cluster_0_log)
print(p_value)

# Add median values as red bars. groupby returns the groups in sorted order;
# presumably the boxes are drawn at x = 1, 2, ... in that same order.
for i, group in enumerate(median_values.index):
    median = median_values[group]
    ax.plot([i + 1 - 0.2, i + 1 + 0.2], [median, median], color='red', lw=3)

plt.title('Boxplot of BHLHE41_CD22_Ratio by Group')
plt.suptitle('')  # remove the automatic "grouped by" super-title added by df.boxplot
plt.xlabel('Group')
plt.ylabel('BHLHE41_CD22_Ratio')
plt.savefig(f'figures/{studyID}_microglia_Cluster1vs0.pdf')
plt.show()