# Differentially Expressed Genes (DEGs) Analysis

In [None]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

### Define output path for saving CSV files

In [None]:
# Directory that receives the DEG tables exported by this notebook.
output_path = ".../DEGs"

# Create the directory (including missing parents) up front; an already
# existing directory is accepted thanks to exist_ok=True.
os.makedirs(name=output_path, exist_ok=True)

## Load the data

In [None]:
# Dataset currently in use: multiome, male donor.
data = sc.read_h5ad(
    "/ix/djishnu/peasena/primary_multiome/donor1_upmc/adata_aggregated_gene.leiden_updated.h5ad"
)

# Alternative dataset: scRNA-seq, male donor. Swap this line in (and the load
# above out) to analyse the scRNA-seq object instead.
#data = sc.read_h5ad("/ix/djishnu/Common_Folder/Jingyu_Data/scRNA_seq/human_B_cell_scRNA_seq_230327.umap.leiden_clusters.h5ad")

# Show the UMAP embedding coloured by Leiden cluster, to check which cluster
# ids are present before picking the groups to compare.
sc.pl.umap(
    data,
    color=['leiden'],
)
plt.show()



### Cluster Annotations for scRNA-seq data from multiome dataset

- Leiden cluster 3: GC

- Leiden cluster 7: PBs

In [None]:
# Restrict the data to the two clusters being compared. `.copy()` turns the
# slice into a standalone AnnData object: rank_genes_groups stores its result
# in `.uns`, and writing to a view makes anndata convert it implicitly (with
# an ImplicitModificationWarning).
subset = data[data.obs['leiden'].isin(['3', '7'])].copy()

# Wilcoxon rank-sum test of Leiden cluster 7 (PB) against reference cluster 3 (GC).
sc.tl.rank_genes_groups(subset, groupby='leiden', groups=['7'], reference='3', method='wilcoxon')

# The test results are stored under this key, with one field per tested group.
degs = subset.uns['rank_genes_groups']

# Collect the per-gene statistics reported for group '7' into a single table.
deg_df = pd.DataFrame({
    'genes': degs['names']['7'],                    # gene names, in ranked order
    'logfoldchanges': degs['logfoldchanges']['7'],  # log2 fold change vs. the reference
    'pvals': degs['pvals']['7'],                    # p-values
    'pvals_adj': degs['pvals_adj']['7'],            # multiple-testing corrected p-values
    'scores': degs['scores']['7'],                  # Wilcoxon test scores
})

# Save into the output directory configured at the top of the notebook
# instead of repeating the path literal here.
deg_df.to_csv(os.path.join(output_path, "GC_PB_DEGs.csv"))

### Cluster Annotations for scRNA-seq data

- Leiden cluster 1: GC

- Leiden clusters 0 and 8: PBs

In [None]:
# NOTE(review): the cluster ids used here (1 = GC; 0 and 8 = PBs) follow the
# scRNA-seq annotation, so `data` presumably has to be the scRNA-seq object
# from the loading cell rather than the multiome one - confirm before running.
# NOTE(review): log1p transforms `data` in place, so re-running this cell
# applies it again; also confirm the loaded matrix is not already
# log-transformed.
sc.pp.log1p(data)

# Restrict the data to the clusters being compared. `.copy()` turns the slice
# into a standalone AnnData object: rank_genes_groups stores its result in
# `.uns`, and writing to a view makes anndata convert it implicitly (with an
# ImplicitModificationWarning).
subset = data[data.obs['leiden'].isin(['1', '0', '8'])].copy()

# Wilcoxon rank-sum tests of Leiden clusters 0 and 8 (PBs), each against
# reference cluster 1 (GC).
sc.tl.rank_genes_groups(subset, groupby='leiden', groups=['0','8'], reference='1', method='wilcoxon')

# The test results are stored under this key, with one field per tested group.
degs = subset.uns['rank_genes_groups']


def _deg_table(results, group):
    """Return the statistics reported for `group` as a DataFrame tagged with that group.

    `results` is the mapping stored in `.uns['rank_genes_groups']`; `group` is
    the cluster label whose field is read from each result array.
    """
    table = pd.DataFrame({
        'genes': results['names'][group],                    # gene names, in ranked order
        'logfoldchanges': results['logfoldchanges'][group],  # log2 fold change vs. the reference
        'pvals': results['pvals'][group],                    # p-values
        'pvals_adj': results['pvals_adj'][group],            # multiple-testing corrected p-values
        'scores': results['scores'][group],                  # Wilcoxon test scores
    })
    table['group'] = group
    return table


# One table per PB cluster, both measured against reference cluster '1'.
deg_df = _deg_table(degs, '0')
deg_df_8 = _deg_table(degs, '8')

# Stack the two tables and order them by score, highest first.
combined_deg_df = pd.concat([deg_df, deg_df_8], ignore_index=True)
combined_deg_df = combined_deg_df.sort_values(by='scores', ascending=False)
# Each gene can appear once per group; after the sort, keep='first' retains
# the row with the higher score for every gene.
combined_deg_df = combined_deg_df.drop_duplicates(subset='genes', keep='first')

# Save into the output directory configured at the top of the notebook
# instead of repeating the path literal here.
combined_deg_df.to_csv(os.path.join(output_path, "scRNAseq_GC_PB_DEGs.csv"))