# Looking For Correlations with the transgene

In [None]:
import os
import anndata as ad
import numpy as np
import scanpy as sc
import pandas as pd

from wrapper_functions import *

In [None]:
# Automatically re-load wrapper functions after an update
# Find details here: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [None]:
# Experiment settings built from names that are presumably provided by the
# star-import of wrapper_functions (Organism, Analyze, Protocol).
# NOTE(review): the organism is set to rat, while the gene lists used later in this
# notebook (MGI, "mouse genes from biomart", CellCommuNet filtered by mus musculus)
# are mouse resources — confirm this is intended.
organism = Organism.rat
analyze_params = Analyze(protocol=Protocol.FF, organism=organism)

In [None]:
# Analysed samples are read from the 'analyzed' folder inside the current working directory.
root_path = os.getcwd()
# NOTE(review): `inpath` is not referenced by any later cell visible in this notebook.
inpath='your_inpath_folder' # Replace with the location of your samples
results_folder = os.path.join(root_path, 'analyzed')

In [None]:
# Collect the regular files in the results folder. os.listdir() returns entries in
# arbitrary order, so sort them to load the samples in a reproducible order.
file_names = sorted(
    f for f in os.listdir(results_folder)
    if os.path.isfile(os.path.join(results_folder, f))
)

# Load every .h5ad file; ad.read_h5ad replaces the deprecated ad.read alias.
adata_list = [
    ad.read_h5ad(os.path.join(results_folder, file))
    for file in file_names
    if file.endswith('.h5ad')
]

In [None]:
# Run the project helper norm_hvg on the loaded samples.
# presumably normalises counts and selects highly variable genes per sample — verify in wrapper_functions
adatas = norm_hvg(adata_list)

In [None]:
transgene_id = 'cisAAV-CMV-GFP-WPRE'
num_top_genes = 10


def _pearson_with_gene(counts, gene_idx):
    """Return the Pearson correlation of column ``gene_idx`` with every column.

    Gives the same values as ``np.corrcoef(counts, rowvar=False)[gene_idx]``
    (up to floating-point round-off) but computes only the requested row, so
    the full genes x genes matrix is never allocated.

    Parameters
    ----------
    counts : array-like, shape (n_cells, n_genes)
        Dense expression matrix.
    gene_idx : int
        Column index of the reference gene.

    Returns
    -------
    numpy.ndarray, shape (n_genes,)
        Correlation coefficients; NaN for zero-variance columns, as with
        ``np.corrcoef``.
    """
    counts = np.asarray(counts, dtype=np.float64)
    centered = counts - counts.mean(axis=0)
    # Per-gene root sum of squares without building another cells x genes array.
    norms = np.sqrt(np.einsum('ij,ij->j', centered, centered))
    covariances = centered.T @ centered[:, gene_idx]
    # Zero-variance genes give 0/0; keep them as NaN without emitting warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        coefficients = covariances / (norms * norms[gene_idx])
    # Guard against round-off pushing |r| marginally above 1.
    return np.clip(coefficients, -1.0, 1.0)


# Initialize lists to store results
correlations = []
genders = []
treatments = []
gene_names = []
sample_names = []

for adata in adatas:
    # Samples that do not contain the transgene contribute nothing.
    if transgene_id not in adata.var_names:
        continue

    gene_idx = list(adata.var_names).index(transgene_id)

    # Densify only when X is sparse; a dense X has no .toarray() method.
    expression = adata.X
    if hasattr(expression, 'toarray'):
        normalized_counts = expression.toarray()
    else:
        normalized_counts = np.asarray(expression)

    # Metadata is taken from the first cell, i.e. each AnnData is assumed to
    # hold a single sample — TODO confirm.
    treatment = adata.obs['Condition'].values[0]
    gender = adata.obs['Gender'].values[0]
    sample = adata.obs['Sample_ID'].values[0]

    gene_correlations = _pearson_with_gene(normalized_counts, gene_idx)

    correlations.extend(gene_correlations)
    gene_names.extend(adata.var_names)
    treatments.extend([treatment] * len(gene_correlations))
    genders.extend([gender] * len(gene_correlations))
    sample_names.extend([sample] * len(gene_correlations))


# Create a DataFrame from the results (one row per gene per sample)
df = pd.DataFrame({'Gene': gene_names, 'Gender': genders, 'Treatment': treatments, 'Correlation': correlations, 'Sample': sample_names})
# The transgene correlates perfectly with itself; drop that row.
df = df[df['Gene'] != transgene_id]


In [None]:
# Mean correlation of each gene within every (Gender, Treatment) group,
# ordered from the strongest to the weakest absolute correlation.
result = (
    df.groupby(['Gene', 'Gender', 'Treatment'])['Correlation']
    .mean()
    .reset_index()
    .assign(Absolute_Correlation=lambda frame: frame['Correlation'].abs())
    .sort_values(by='Absolute_Correlation', ascending=False)
)
# Keep the first num_top_genes rows of each (Gender, Treatment) group.
top_correlations = result.groupby(['Gender', 'Treatment']).head(num_top_genes)

In [None]:
# Show the top genes selected per (Gender, Treatment) group.
top_correlations

In [None]:
# Restrict the averaged correlations to genes that reached the top list of any group,
# then reshape to one row per (Gender, Treatment) and one column per gene.
TopGenes = top_correlations['Gene'].unique()
result_filtered = result.loc[result['Gene'].isin(TopGenes)]
heatmap_data = result_filtered.pivot(
    index=['Gender', 'Treatment'],
    columns='Gene',
    values='Correlation',
)

In [None]:
# Clustered heatmap of the per-group correlations.
cg = sns.clustermap(
    heatmap_data,
    annot=False,
    cmap='coolwarm',
    cbar=True,
    cbar_pos=(0.85, 0.8, 0.025, 0.1),
)
# Hide both dendrogram axes.
for dendrogram_ax in (cg.ax_row_dendrogram, cg.ax_col_dendrogram):
    dendrogram_ax.set_visible(False)
# Collapse the x-range of the (hidden) row dendrogram axis.
cg.ax_row_dendrogram.set_xlim([0,0])

In [None]:
# Figure version of the heatmap: genes as rows, (Gender, Treatment) groups as columns,
# diverging palette centred on 0 and clipped to [-0.5, 0.5].
cg = sns.clustermap(heatmap_data.T, annot=False, cmap='vlag', cbar=True,
                    linewidths=0.75, linecolor='black',
                    dendrogram_ratio=(.175, .025), center=0, vmin=-0.5, vmax=0.5,
                    square=True, cbar_pos=(0.05, 0.75, 0.05, 0.18))
cg.ax_row_dendrogram.set_visible(False)
cg.ax_col_dendrogram.set_visible(False)
cg.cax.set_title('Pearson \n Correlation', pad=10)
cg.ax_row_dendrogram.set_xlim([0,0])

# Save under <root_path>/figures instead of a user-specific absolute path, which
# fails on any other machine. The folder is created on demand.
figures_folder = os.path.join(root_path, 'figures')
os.makedirs(figures_folder, exist_ok=True)
# Save through the ClusterGrid itself so the call does not depend on pyplot's
# "current figure" (or on `plt`, which this notebook does not import explicitly).
# bbox_inches=None keeps the full canvas, as a plain Figure.savefig would.
cg.savefig(os.path.join(figures_folder, 'Correlation_Heatmap.jpg'), dpi=300, bbox_inches=None)

## Top correlated genes from Chromosome Y

We extracted the genes located on chromosome Y from MGI and Ensembl.

In [None]:
# Tab-separated chromosome-Y gene table from MGI; its 'Symbol' column is used below.
MGI_Ygenes_df = pd.read_csv("MGImarkerQuery_20240305_Ygenes.txt", sep="\t")

In [None]:
# Preview the MGI chromosome-Y table.
MGI_Ygenes_df

In [None]:
# Tab-separated chromosome-Y gene export (mart_export file); its 'Gene name' column is used below.
ENSEMBLE_Ygenes_df =  pd.read_csv("mart_export_Ygenes.txt", sep="\t")

In [None]:
# Preview the Ensembl chromosome-Y table.
ENSEMBLE_Ygenes_df

In [None]:
# Union of the chromosome-Y gene symbols reported by either source.
final_list = list(
    set(MGI_Ygenes_df['Symbol'].unique().tolist()).union(
        ENSEMBLE_Ygenes_df['Gene name'].unique().tolist()
    )
)

In [None]:
# Averaged correlations restricted to chromosome-Y genes.
result_ychromosome = result.loc[result['Gene'].isin(final_list)]

In [None]:
# Chromosome-Y genes, strongest absolute correlation first.
result_ychromosome.sort_values(by='Absolute_Correlation', ascending=False)

## Top correlated receptors

We extracted mouse genes from biomart that are associated with receptor activity according to their GO annotations (GO:0038023, GO:0019041 or GO:0004872)

In [None]:
# Tab-separated export of receptor-annotated genes (mart_export file); its 'Gene name' column is used below.
ENSEMBLE_genes_receptors_df =  pd.read_csv("mart_export_Receptors.txt", sep="\t")

In [None]:
# Preview the receptor gene table.
ENSEMBLE_genes_receptors_df

In [None]:
# Averaged correlations restricted to genes listed in the receptor export.
result_receptors = result.loc[
    result['Gene'].isin(ENSEMBLE_genes_receptors_df['Gene name'].unique().tolist())
]

Some of the genes seem to be ligands rather than receptors, but, for instance, the Mup genes are annotated with insulin receptor activity.

In [None]:
# 50 receptor-annotated rows with the strongest absolute correlation.
result_receptors.sort_values(by='Absolute_Correlation', ascending=False).head(50)

In [None]:
# Hand-written list of receptor gene symbols used for the focused look-ups below.
# NOTE(review): the selection criteria are not recorded in this notebook — consider documenting them.
receptors_set = [
    'Met',
    'AU040320',
    'Fgfr1',
    'Hspg2',
    'Rpsa',
    'Cd9',
    'Itgb5',
    'Itgav',
    'Itgb1',
] 

In [None]:
# Averaged correlations restricted to the hand-written receptor list.
result_receptors_2 = result.loc[result['Gene'].isin(receptors_set)]

In [None]:
# Hand-written receptor list, strongest absolute correlation first (at most 50 rows).
result_receptors_2.sort_values(by='Absolute_Correlation', ascending=False).head(50)

We extracted the list of receptors from [CellCommuNet](https://academic.oup.com/nar/advance-article/doi/10.1093/nar/gkad906/7321072?login=true). To get receptors that are relevant in mouse liver, we filtered by *Mus musculus*, normal condition, study type single, and tissue liver. Of note, the complexes use human nomenclature, so I will drop them.

In [None]:
# Cell-communication results table; its 'Receptor' column is used below.
df_cell_interactions = pd.read_csv('CellCommResults.csv')
# Preview the loaded table.
df_cell_interactions

In [None]:
# Receptors listed in the cell-communication table, followed by the hand-written receptor list.
receptors_cellComm = [
    *df_cell_interactions['Receptor'].unique().tolist(),
    *receptors_set,
]
receptors_cellComm

In [None]:
# Averaged correlations restricted to the combined receptor list, strongest first (at most 50 rows).
result_receptors_3 = result.loc[result['Gene'].isin(receptors_cellComm)]
result_receptors_3.sort_values(by='Absolute_Correlation', ascending=False).head(50)

## What if we also group per sample, to see differences between samples?

In [None]:
# Long-format table: one row per gene per sample with its correlation to the transgene
# (the transgene's own row was removed when df was built).
df

In [None]:
# Mean correlation per (Gene, Sample), ordered from the strongest to the
# weakest absolute correlation.
result_perSample = (
    df.groupby(['Gene', 'Sample'])['Correlation']
    .mean()
    .reset_index()
    .assign(Absolute_Correlation=lambda frame: frame['Correlation'].abs())
    .sort_values(by='Absolute_Correlation', ascending=False)
)

In [None]:
# Show the per-sample correlations, strongest absolute correlation first.
result_perSample

In [None]:
# Per-sample correlations restricted to the hand-written receptor list.
result_perSample_receptorSet = result_perSample.loc[
    result_perSample['Gene'].isin(receptors_set)
]

In [None]:
# Reshape to one row per sample and one column per receptor gene.
# NOTE: this rebinds heatmap_data, replacing the earlier (Gender, Treatment) pivot.
heatmap_data = result_perSample_receptorSet.pivot(index=['Sample'], columns='Gene', values='Correlation')

In [None]:
# Clustered heatmap of the per-sample receptor correlations.
cg = sns.clustermap(
    heatmap_data,
    annot=False,
    cmap='coolwarm',
    cbar=True,
    cbar_pos=(0.85, 0.8, 0.025, 0.1),
)
# Hide both dendrogram axes.
for dendrogram_ax in (cg.ax_row_dendrogram, cg.ax_col_dendrogram):
    dendrogram_ax.set_visible(False)
# Collapse the x-range of the (hidden) row dendrogram axis.
cg.ax_row_dendrogram.set_xlim([0,0])

In [None]:
! jupyter nbconvert --to html 11_Top_CorrelatedGenes_withTransgene.ipynb