In [1]:
import scanpy as sc

# Load the full AnnData object from disk; every later cell derives its data
# from `dr`.
h5ad_path = '/storage/lingyuan2/STATES_data/withDR.h5ad'
dr = sc.read_h5ad(h5ad_path)


In [None]:
# Restrict the dataset to cells whose `sample` annotation is 'C3control'.
# `.copy()` materializes the subset as an independent AnnData object rather
# than a view onto `dr`.
c3control_mask = dr.obs['sample'] == 'C3control'
dr_c3 = dr[c3control_mask].copy()
dr_c3

In [3]:
import numpy as np
import scanpy as sc

# Expression filter: keep a gene only if its value in X is >= 2 in at least
# 10% of the cells in dr_c3 (threshold truncated to an integer cell count).
min_cells = int(dr_c3.n_obs * 0.1)

# Per-gene number of cells passing the >= 2 cutoff (sum over the cell axis).
gene_pass_counts = np.sum(dr_c3.X >= 2, axis=0)
if isinstance(gene_pass_counts, np.matrix):
    # An np.matrix result stays 2-D (presumably the case when X is a scipy
    # sparse matrix); flatten it so it can be used as a 1-D column mask.
    gene_pass_counts = np.squeeze(np.asarray(gene_pass_counts))

gene_filter = gene_pass_counts >= min_cells
dr_c3 = dr_c3[:, gene_filter].copy()


In [None]:
# Per-point table stored alongside the AnnData object.
points_df = dr_c3.uns['points_df']

# Genes to exclude from the point-level analysis.
genes_to_filter = [
    'ADGRL2',
    'BATF3',
    'BBS10',
    'BLOC1S3',
    'CEBPD',
    'CENPO',
    'CHAF1B',
    'DEPDC7',
    'EIF4EBP2',
    'ENPP4',
    'GATA2',
    'GOLPH3L',
    'LAMTOR3',
    'NOL6',
    'PLPP5',
    'QPRT',
    'RARS1',
    'SESN3',
    'SLC35C1',
    'STEAP2',
    'TPRA1',
    'TTI1',
    'UBP1',
    'VPS13D',
    'VSIG10',
    'VWA8',
    'XAB2',
    'ZNF335',
]

# Keep every point whose gene is NOT on the exclusion list.
is_excluded_gene = points_df['gene'].isin(genes_to_filter)
filtered_points_df = points_df.loc[~is_excluded_gene]
filtered_points_df

In [None]:
# Keep only the points recorded under the 'Control' condition.
is_control = filtered_points_df['condition'] == 'Control'
filtered_points_df_control = filtered_points_df.loc[is_control]
filtered_points_df_control

In [6]:
# Drop points whose gene is no longer present in dr_c3.var.index, so the point
# table only contains genes that are still columns of dr_c3.
gene_in_dr_c3 = filtered_points_df_control['gene'].isin(dr_c3.var.index)
filtered_points_df_control = filtered_points_df_control.loc[gene_in_dr_c3]


In [None]:
# Sanity check: how many distinct genes remain in the Control point table.
print(f"Number of unique genes: {filtered_points_df_control['gene'].nunique()}")

In [None]:
# Derive each point's `group` label from the last '_'-separated token of
# `feature_name` (the TE calculation compares it against 'rbRNA' / 'ntRNA').
#
# `.assign` returns a new DataFrame and the name is rebound to it. Writing the
# column in place would assign into a frame produced by boolean filtering,
# which raises SettingWithCopyWarning and is ambiguous about which object is
# modified. Rebinding also keeps this cell safe to re-run.
filtered_points_df_control = filtered_points_df_control.assign(
    group=filtered_points_df_control['feature_name'].str.split('_').str[-1]
)
filtered_points_df_control

In [11]:
import pandas as pd
import numpy as np

# Define DR bins. With right=False these are the three left-closed, right-open
# intervals [0, 0.33), [0.33, 0.66), [0.66, 1.0).
bins = [0, 0.33, 0.66, 1.0]

# Bin DR once for the whole table instead of once per gene. `.assign` returns
# a new frame, so neither `filtered_points_df_control` nor a groupby slice is
# written to (assigning a column onto a groupby slice raises
# SettingWithCopyWarning).
points_with_dr_bin = filtered_points_df_control.assign(
    DR_bin=pd.cut(
        filtered_points_df_control['DR'],
        bins=bins,
        include_lowest=True,
        right=False,
    )
)

# NOTE(review): because the intervals are right-open, a DR of exactly 1.0 (and
# any DR outside the bin range, or missing) is assigned NaN and is not counted
# in any bin. The bin edges are left unchanged to keep the output labels
# stable; confirm whether DR == 1.0 should belong to the last bin.
n_unbinned = int(points_with_dr_bin['DR_bin'].isna().sum())
if n_unbinned:
    print(
        f"Note: {n_unbinned} point(s) have a DR outside [0, 1.0) or missing "
        "and are excluded from the TE calculation."
    )

# Calculate TE = rbRNA / (rbRNA + ntRNA) for each gene in each DR bin.
results = []
for gene, gene_data in points_with_dr_bin.groupby('gene'):
    # observed=False is spelled out so every DR bin is iterated for every
    # gene, including bins with no points (reported below with TE = NaN).
    # Relying on the implicit default is deprecated in recent pandas and
    # would drop empty bins once the default changes.
    for bin_range, bin_data in gene_data.groupby('DR_bin', observed=False):
        # Count rbRNA and ntRNA points in this bin
        rbRNA_counts = int((bin_data['group'] == 'rbRNA').sum())
        ntRNA_counts = int((bin_data['group'] == 'ntRNA').sum())
        total_counts = rbRNA_counts + ntRNA_counts

        # TE is undefined when the bin holds neither rbRNA nor ntRNA points
        te = rbRNA_counts / total_counts if total_counts > 0 else np.nan

        results.append({
            'gene': gene,
            'DR_bin': bin_range,
            'TE': te,
            'rbRNA_counts': rbRNA_counts,
            'ntRNA_counts': ntRNA_counts
        })

# Create DataFrame with results (one row per gene x DR bin)
te_by_dr_bin_gene = pd.DataFrame(results)


In [None]:
# Persist the per-gene, per-DR-bin TE table as CSV (the DataFrame index is
# written as the first column), then display it.
te_csv_path = '/storage/lingyuan2/STATES_data/te_by_dr_bin_gene_control1021_3bin.csv'
te_by_dr_bin_gene.to_csv(te_csv_path)
te_by_dr_bin_gene