In [17]:
import scanpy as sc

# Load the h5ad file with scanpy; later cells read the points table from `dr.uns`.
withdr_path = '/storage/lingyuan2/STATES_data/withDR.h5ad'
dr = sc.read_h5ad(withdr_path)


In [None]:
# The `points_df` table stored under `dr.uns`.
points_df = dr.uns['points_df']

# Genes removed from the table before any downstream analysis.
genes_to_filter = ['ADGRL2', 'BATF3', 'BBS10', 'BLOC1S3', 'CEBPD', 'CENPO', 'CHAF1B', 'DEPDC7', 'EIF4EBP2', 'ENPP4', 'GATA2', 'GOLPH3L', 'LAMTOR3', 'NOL6', 'PLPP5', 'QPRT', 'RARS1', 'SESN3', 'SLC35C1', 'STEAP2', 'TPRA1', 'TTI1', 'UBP1', 'VPS13D', 'VSIG10', 'VWA8', 'XAB2', 'ZNF335']

# `.copy()` makes the filtered frame an independent object, so adding columns to it
# in later cells does not trigger pandas' SettingWithCopyWarning.
points_df = points_df[~points_df['gene'].isin(genes_to_filter)].copy()
points_df

In [None]:
# Derive `group` from the text after the last underscore of `feature_name`.
# `.assign` returns a new frame instead of writing into the filtered slice, which
# avoids SettingWithCopyWarning; `rsplit(..., n=1)` only splits off the last token.
points_df = points_df.assign(
    group=points_df['feature_name'].str.rsplit('_', n=1).str[-1]
)


In [None]:
# Count points per (gene, condition) in each group, spread the groups into columns
# (missing combinations become 0), and keep only ntRNA / rbRNA under *_count names.
count_columns = {'ntRNA': 'ntRNA_count', 'rbRNA': 'rbRNA_count'}

gene_counts = (
    points_df
    .groupby(['gene', 'condition', 'group'])
    .size()
    .unstack(fill_value=0)[['ntRNA', 'rbRNA']]
    .rename(columns=count_columns)
    .reset_index()
)

gene_counts



In [None]:
# TE = share of a row's counted points (rbRNA + ntRNA) that are rbRNA.
# Rows where both counts are 0 evaluate to NaN (0 / 0).
total_count = gene_counts['rbRNA_count'] + gene_counts['ntRNA_count']
gene_counts['TE'] = gene_counts['rbRNA_count'] / total_count
gene_counts

In [8]:
# Mean DR of the points in each gene-condition combination.
mean_dr = (
    points_df
    .groupby(['gene', 'condition'])['DR']
    .mean()
    .reset_index()
    .rename(columns={'DR': 'mean_DR'})
)

# Attach mean_DR to gene_counts. A mean_DR column left over from a previous run of
# this cell is dropped first; otherwise re-running would produce mean_DR_x /
# mean_DR_y and break every later cell that reads 'mean_DR'.
gene_counts = (
    gene_counts
    .drop(columns='mean_DR', errors='ignore')
    .merge(mean_dr, on=['gene', 'condition'], how='left')
)


In [None]:
# Display gene_counts as it stands after the preceding merge cell.
gene_counts

In [33]:
# Save gene_counts to the current working directory (relative path); the DataFrame
# index is written as the first column because `index` is left at its default.
gene_counts_csv = 'gene_counts.csv'
gene_counts.to_csv(gene_counts_csv)

In [None]:
import pandas as pd
from scipy.stats import pearsonr

# Pearson correlation between TE and mean_DR across conditions, one row per gene.
MIN_POINTS = 2  # pearsonr needs at least two (TE, mean_DR) pairs; with exactly two, r is always +/-1

results = []

# One pass over the groups instead of re-filtering the whole table for every gene.
# sort=False keeps genes in order of first appearance (same order as .unique());
# observed=True skips unused categories if 'gene' is a categorical column.
for gene, gene_data in gene_counts.groupby('gene', sort=False, observed=True):
    # Drop conditions where TE or mean_DR is missing, so a single NaN does not
    # propagate into the correlation for the whole gene.
    pairs = gene_data[['TE', 'mean_DR']].dropna()
    if len(pairs) < MIN_POINTS:
        continue

    corr, pval = pearsonr(pairs['TE'], pairs['mean_DR'])
    results.append({'gene': gene, 'corr': corr, 'pval': pval})

# Explicit columns keep the frame usable (e.g. for merges on 'gene') even when no
# gene had enough data points and `results` is empty.
corr_results = pd.DataFrame(results, columns=['gene', 'corr', 'pval'])

# Display the correlation results
corr_results


In [11]:
# Wide tables: one row per gene, one column per condition, prefixed with the
# quantity they hold (TE_<condition>, mean_DR_<condition>).
te_pivot = (
    gene_counts
    .pivot(index='gene', columns='condition', values='TE')
    .add_prefix('TE_')
)
dr_pivot = (
    gene_counts
    .pivot(index='gene', columns='condition', values='mean_DR')
    .add_prefix('mean_DR_')
)

# Columns already present from an earlier run of this cell are dropped before
# merging; otherwise a re-run would duplicate them with _x / _y suffixes.
stale_columns = [
    col for col in corr_results.columns
    if col in te_pivot.columns or col in dr_pivot.columns
]

# Left merges keep every gene in corr_results; 'gene' is the index of both pivots.
corr_results = (
    corr_results
    .drop(columns=stale_columns)
    .merge(te_pivot, on='gene', how='left')
    .merge(dr_pivot, on='gene', how='left')
)


In [None]:
# Display corr_results with the per-condition TE_* and mean_DR_* columns attached.
corr_results

In [13]:
# Save the per-gene correlation table; the DataFrame index is written as the first
# column because `index` is left at its default.
corr_csv = '/storage/lingyuan2/STATES_data/corr.csv'
corr_results.to_csv(corr_csv)

In [None]:
# Unique gene names listed in the control file.
control_path = '/storage/lingyuan2/STATES_data/te_by_dr_bin_gene_control1021_3bin.csv'
control_genes = pd.read_csv(control_path)['gene'].unique()

# Keep only the correlation rows whose gene appears in that list.
in_control = corr_results['gene'].isin(control_genes)
filtered_corr_results = corr_results[in_control]
filtered_corr_results

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import transforms  # not used in this cell; kept in case other cells rely on it

# Keep text as text (not outlined paths) in SVG exports.
plt.rcParams['svg.fonttype'] = 'none'

# Histogram + KDE of the per-gene correlations.
fig, ax = plt.subplots(figsize=(4, 3))
sns.histplot(filtered_corr_results['corr'], bins=50, kde=True, edgecolor=None, alpha=0.3, ax=ax)

# The rug is drawn in a strip below y = 0 whose height is 5% of the histogram's top.
ymax = ax.get_ylim()[1]
rug_bottom = -ymax * 0.05

highlight_genes = ['PXDN', 'SLC16A1', 'CALR', 'EIF1', 'GNL3', 'SLC3A2']
highlight_color = 'red'

# Correlation of each highlighted gene; None when the gene is absent from the
# filtered table or its correlation is NaN (nothing could be drawn for it).
highlight_corrs = {}
for gene in highlight_genes:
    matches = filtered_corr_results.loc[filtered_corr_results['gene'] == gene, 'corr']
    if matches.empty or np.isnan(matches.values[0]):
        highlight_corrs[gene] = None
    else:
        highlight_corrs[gene] = matches.values[0]

# Background rug: every correlation that is not one of the highlighted values,
# drawn with a single vlines call instead of one Line2D artist per gene.
corr_values = filtered_corr_results['corr'].dropna().to_numpy()
highlighted_values = [value for value in highlight_corrs.values() if value is not None]
background = corr_values[~np.isin(corr_values, highlighted_values)]
if background.size:
    ax.vlines(background, rug_bottom, 0, colors='gray', alpha=0.5, linewidth=0.1, zorder=2)

# Highlighted genes: one labelled line each, drawn on top of the background rug.
for gene, value in highlight_corrs.items():
    if value is not None:
        ax.plot([value, value], [rug_bottom, 0], color=highlight_color, alpha=1,
                linewidth=0.5, label=gene, zorder=3)

# Baseline separating the histogram from the rug strip.
ax.axhline(0, color='black', linewidth=1, linestyle='-', zorder=1)

ax.set_ylim(rug_bottom, ymax)
ax.set_title('Distribution of Significant Correlations')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Frequency')
ax.grid(False)

# Legend restricted to the highlighted genes, first handle per label.
handles, labels = ax.get_legend_handles_labels()
legend_entries = {}
for handle, label in zip(handles, labels):
    if label in highlight_genes:
        legend_entries.setdefault(label, handle)
if legend_entries:
    ax.legend(list(legend_entries.values()), list(legend_entries.keys()))
#plt.savefig('corr_distribution_1021.svg', format='svg')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['svg.fonttype'] = 'none'

# Gene to inspect; used for the filter and for every label below.
target_gene = 'SLC16A1'

# Rows of gene_counts for that gene (one per condition), without missing TE / mean_DR
# values, which pearsonr and polyfit cannot handle.
target_data = (
    gene_counts[gene_counts['gene'] == target_gene]
    .dropna(subset=['TE', 'mean_DR'])
)
if len(target_data) < 2:
    raise ValueError(
        f"Need at least 2 conditions with TE and mean_DR for {target_gene}, "
        f"found {len(target_data)}"
    )

# Calculate correlation between TE and mean_DR
corr, pval = pearsonr(target_data['TE'], target_data['mean_DR'])

# Print the correlation results
print(f"Correlation between TE and mean_DR for {target_gene} across conditions:")
print(f"Pearson r = {corr:.4f}, p-value = {pval:.4f}")

# Plot the relationship
fig, ax = plt.subplots(figsize=(2, 2))
ax.scatter(target_data['mean_DR'], target_data['TE'])

# Least-squares line TE = m * mean_DR + b through the same points.
m, b = np.polyfit(target_data['mean_DR'], target_data['TE'], 1)
ax.plot(target_data['mean_DR'], m * target_data['mean_DR'] + b, color='red')

ax.set_title(f'TE vs mean_DR for {target_gene} across conditions\nr = {corr:.4f}, p = {pval:.4f}')
ax.set_xlabel('mean_DR')
ax.set_ylabel('TE')
# No legend: none of the artists carries a label, so plt.legend() only emitted a
# "no artists with labels" warning without drawing anything.
#plt.savefig('SLC16A1_corr.svg', format='svg')
plt.show()
