In [None]:
from cellassign import assign_cats

import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

import pandas as pd

import seaborn as sns

import scanpy as sc
import scanpy.external as sce

import triku as tk

In [None]:
# Colour map for gene expression on UMAPs: the first 65 of 80 evenly spaced
# magma colours, with the lowest colour replaced by light grey.
magma = mpl.colors.LinearSegmentedColormap.from_list(
    "",
    [(0.88, 0.88, 0.88, 1)]
    + [plt.get_cmap('magma')(position) for position in np.linspace(0, 1, 80)][1:65],
)

In [None]:
# Single seed passed as random_state to the stochastic steps of the notebook.
seed = 0
sc.settings.set_figure_params(dpi=100)

In [None]:
# Alevin reports genes as Ensembl IDs; download an Ensembl ID -> gene symbol
# table from BioMart so that the IDs can be translated afterwards.

from pybiomart import Server

server = Server(host='http://www.ensembl.org')

df = (
    server
    .marts['ENSEMBL_MART_ENSEMBL']
    .datasets['mmusculus_gene_ensembl']
    .query(attributes=['ensembl_gene_id', 'external_gene_name'])
)

# Lookup keyed by Ensembl gene ID, valued by gene name.
dict_ensemble_gene = {
    ensembl_id: gene_name
    for ensembl_id, gene_name in zip(df['Gene stable ID'], df['Gene name'])
}

# Data loading and QC

In [None]:
adata = sc.read('data/ARAUZO_03/20230623/output_nfcore/alevin/mtx_conversions/combined_matrix.h5ad')

# Keep the original Ensembl IDs; they are restored as var_names before the cellxgene export.
adata.var['ensemble_ID'] = adata.var_names.copy()

# Translate Ensembl IDs to gene symbols. An ID is tagged 'NA' (and dropped below) when it is
# missing from the BioMart table OR when BioMart has no symbol for it: an empty symbol field
# is read back as NaN (a float), which would otherwise end up as a non-string var name.
adata.var_names = [
    symbol if isinstance(symbol, str) else 'NA'
    for symbol in map(dict_ensemble_gene.get, adata.var_names)
]
# .copy() turns the sliced view into an independent AnnData before it is modified.
adata = adata[:, adata.var_names != 'NA'].copy()

adata.var['gene_symbol'] = adata.var_names.copy()

# Several Ensembl IDs can share one symbol; make the names unique.
adata.var_names_make_unique()

In [None]:
# Store the current object under .raw, then cast the working matrix to float32
# (if it stays int64 it cannot be normalised later).
adata.raw = adata
adata.X = adata.X.astype('float32')

In [None]:
# Top 20 genes according to scanpy's highest_expr_genes plot.
sc.pl.highest_expr_genes(
    adata,
    n_top=20,
)

In [None]:
# Cells first, then genes (the order matters: gene counts are computed on the
# cells that survive). The 100-count cell threshold is based on a previous analysis.
sc.pp.filter_cells(
    adata,
    min_counts=100,
)
sc.pp.filter_genes(
    adata,
    min_counts=20,
)


In [None]:
# Flag mitochondrial genes (symbols starting with 'mt-') as 'mt' and compute
# QC metrics, including the log1p-transformed ones used in the plots that follow.
adata.var['mt'] = adata.var_names.str.startswith('mt-')
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=True,
    inplace=True,
)

In [None]:
# Distribution of the three QC metrics.
sc.pl.violin(
    adata,
    ['log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_mt'],
    jitter=0.4,
    multi_panel=True,
)
# Total counts against mitochondrial fraction.
sc.pl.scatter(
    adata,
    x='log1p_total_counts',
    y='pct_counts_mt',
)
# Total counts against number of detected genes.
sc.pl.scatter(
    adata,
    x='log1p_total_counts',
    y='log1p_n_genes_by_counts',
)

In [None]:
# Number of detected genes (log1p) per sample.
fig, ax = plt.subplots(figsize=(10, 4))
df = adata.obs[['sample', 'log1p_n_genes_by_counts']].rename(
    columns={'sample': 'x', 'log1p_n_genes_by_counts': 'y'}
)
sns.violinplot(data=df, x='x', y='y', ax=ax)

In [None]:
# Per-sample (lower, upper) exclusive bounds on log1p_n_genes_by_counts.
gene_count_bounds = {
    'AZ7845': (7.2, 8.5),
    'AZ7846': (7, 8.5),
}
# Cells at or above this percentage of mitochondrial counts are discarded.
max_pct_counts_mt = 12

# Build one boolean mask: a cell is kept when it belongs to a listed sample, its
# gene count lies within that sample's bounds, and its mitochondrial fraction is
# below the threshold. Cells from samples not listed above are dropped.
keep_cell = np.zeros(adata.n_obs, dtype=bool)
for sample_name, (lower, upper) in gene_count_bounds.items():
    keep_cell |= (
        (adata.obs['sample'] == sample_name)
        & (adata.obs.log1p_n_genes_by_counts > lower)
        & (adata.obs.log1p_n_genes_by_counts < upper)
    ).values
keep_cell &= (adata.obs.pct_counts_mt < max_pct_counts_mt).values

# .copy() yields an independent AnnData instead of a view, so that the in-place
# steps that follow do not modify (or implicitly copy) a view.
adata = adata[keep_cell, :].copy()

## Doublet flagging

In [None]:
# Flag (not remove) putative doublets with Scrublet on the whole dataset at once.
sce.pp.scrublet(
    adata,
    batch_key=None,
    expected_doublet_rate=0.05,
    random_state=seed,
    knn_dist_metric='cosine',
    log_transform=False,
)

print(f"Number of doublets: {adata.obs['predicted_doublet'].sum()}")

In [None]:
# Cells remaining in each sample after QC (count of matching rows in obs).
print(f'N cells in AZ7845: {(adata.obs["sample"] == "AZ7845").sum()}')
print(f'N cells in AZ7846: {(adata.obs["sample"] == "AZ7846").sum()}')

# Data processing

In [None]:
# Remove genes left without counts after cell filtering, then normalise the
# counts per cell and log1p-transform them.
sc.pp.filter_genes(
    adata,
    min_counts=1,
)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [None]:
# Feature selection: triku is run after a first PCA + neighbour graph.
# The neighbourhood size is half the square root of the number of cells.
sc.pp.pca(adata, random_state=seed)
sc.pp.neighbors(
    adata,
    n_neighbors=int(0.5 * adata.n_obs ** 0.5),
    random_state=seed,
    metric='correlation',
)
tk.tl.triku(adata, use_raw=False)

In [None]:
# Recompute PCA on the genes flagged as highly variable (presumably set by triku
# in the previous cell - confirm), rebuild the neighbour graph and embed with UMAP.
sc.pp.pca(
    adata,
    random_state=seed,
    use_highly_variable=True,
)
sc.pp.neighbors(
    adata,
    n_neighbors=int(0.5 * adata.n_obs ** 0.5),
    random_state=seed,
    metric='correlation',
)

sc.tl.umap(adata)

In [None]:
# Two clusterings: a coarse one ('leiden') and a finer one ('leiden_sub').
sc.tl.leiden(
    adata,
    resolution=0.07,
    key_added='leiden',
)
sc.tl.leiden(
    adata,
    resolution=1,
    key_added='leiden_sub',
)

In [None]:
# Clusterings and sample of origin, labelled on the embedding.
sc.pl.umap(
    adata,
    color=['leiden', 'leiden_sub', 'sample'],
    ncols=3,
    alpha=0.4,
    legend_loc='on data',
)
# QC metrics on the same embedding.
sc.pl.umap(
    adata,
    color=['log1p_n_genes_by_counts', 'pct_counts_mt'],
)

# Kranocyte characterisation

In [None]:
# Two marker gene sets (labelled A and B) whose expression is plotted on the UMAP below.
A_markers = [
    'Smim41',
    'Col9a2',
    'Dlk1',
    'Shisa3',
    'Saa1',
    'Nipal1',
]
B_markers = [
    'Lypd2',
    'Wnt6',
    'Cldn1',
    'Moxd1',
    'Mansc4',
    'Dleu7',
    'Efnb3',
    'Stra6',
    'Sbspon',
    'Hcn4',
    'Cldn22',
]


In [None]:
# Expression of the A marker set (values from adata.X, not adata.raw).
sc.pl.umap(
    adata,
    color=A_markers,
    cmap=magma,
    use_raw=False,
)

In [None]:
# Expression of the B marker set (values from adata.X, not adata.raw).
sc.pl.umap(
    adata,
    color=B_markers,
    cmap=magma,
    use_raw=False,
)

In [None]:
# Cells were FACS-sorted as PDPN(+) CD31(-); check the corresponding genes
# (Pdpn, Pecam1) together with a few other markers.

sc.pl.umap(
    adata,
    color=['Pdpn', 'Pecam1', 'Pdgfra', 'Tnmd', 'Lum', 'Prg4'],
    cmap=magma,
    use_raw=False,
)

## Analysis of major populations

In [None]:
# Marker gene lists used to annotate populations.
# Four base lists (krano_12, FAP, tnmd, satellite) hold markers shared by a family of
# populations; dict_markers then builds one list per population as `base + specific`.
# The plotting cells further down recover the population-specific part with
# `dict_markers[name][len(base):]`, so the base list must remain the prefix of each entry.

# Markers shared by the Krano_1 and Krano_2 entries.
# NOTE(review): 'Vwa1', 'Malt1', 'Greb1' and 'Nipal1' appear twice in this list —
# confirm whether the repetition is intentional before de-duplicating.
krano_12 = ['Nr2f2', 'Gfra1', 'Vwa1', 'Cdh19', 'Rasgrp2', 'Sipa1l1', 'Sorcs2', 'Smim41', 'Ptgfr', 'Plxdc1', 'Nrp2', 'Septin9', 
            'Gng2', 'Ngfr', 'Malt1', 'Col9a2', 'Plxnc1', 'Rcsd1', 'Nipal1', 'Greb1', 'Pla2g7', 'Sox9', 'Matn2', 'Ogfrl1', 
            'Gfra2', 'Cp', 'Vwa1', 'P2ry1', 'Hlf', 'Sdc3', 'Mylk', 'Trf', 'Gli1', 'Cst6', 'Pcp4l1', 'Malt1', 'Kcnk2', 
            'Greb1', 'Nipal1']

# Markers shared by the FAP_1 to FAP_4 entries.
FAP = ['Cd248', 'Dpt', 'Dpep1', 'Adamts5', 'Emilin2', 'Pcsk6', 'Ifi205', 'Col6a6', 'Procr', 'Osr1', 'Ifi207', 
                        'Hspb8', 'Wnt11', 'Tmeff2']

# Markers shared by the TNMD_1 to TNMD_3 entries.
tnmd = ['Col12a1', 'Cilp2', 'Fmod', 'Col11a1', 'Chad', 'Col8a2', 'Comp', 'Tnmd', 'Pdgfa', 'Actn1', 'Tns3', 'Scx', 'Gas2', 
        'Ccn4', 'Col11a2', 'Kctd1', 'Myo1d', 'Ptpn14', 'Runx1', 'Tnc', 'Ptx4', 'Col13a1', 'Zfp185', 'Gnai1', 'Cdh2', 'P3h2', 
        'Emb', 'Atp6v0a4', 'Rflnb', 'Kcnma1', 'Edil3', 'Ppfibp2']

# Markers shared by the three 'Sat ...' entries.
satellite = ['Chodl', 'Pax7', 'Edn3', 'Notch3', 'Cavin2', 'Peg3', 'Fgfr4', 'Myf5', 'Cdh15', 'Chrdl2', 
                          'Heyl', 'Prox1', 'Gal', 'Tanc2', 'Des', 'Rapsn', 'Ank3', 'Jsrp1', 'Notch1', 'Cd82', 'Hs6st2', 
                          'Traf3ip3']

# Population name -> marker list (base list followed by population-specific genes).
# Some specific lists repeat genes already present in their base list (e.g. 'Cd248' in
# FAP_1, 'Fmod' in TNMD_2, 'Pax7' in 'Sat Pax7+'), and TNMD_3 lists 'Spon1' twice.
# NOTE(review): whether repeated genes change the assign_cats result is not visible here.
dict_markers = {'FAP_1': FAP + ['Cd248', 'Efemp1', 'Efhd1', 'Dpp4', 'Sema3c', 'Pi16', 'Emilin2', 'Anxa3', 'Pla1a', 'Pcolce2', 
                                'Gfpt2', 'Procr', 'Cadm3', 'Stmn4', 'Limch1', 'Il18', 'Cd55', 'Cmah', 'Lurap1l', 'Mustn1', 
                                'Arl4d', 'Car8', 'Aif1l', 'Pde8a', 'Uchl1', 'Smpd3', 'Dact2', 'Rorb', 'Sytl2', 'Chst1', 'Tmem158', 
                                'Fam167a', 'Tubb4a', 'Sbsn', 'Islr2', 'Ackr2', 'Adamts16', 'Duoxa1', 'Aldh1a3', 'Gap43', 'Kcnk5'],
                'FAP_2': FAP + ['Smoc2', 'Col4a1', 'Col15a1', 'Lamb1', 'Hsd11b1', 'Mme', 'Col4a2', 'Vwa1', 'Cxcl14', 'Atp1a2',
                                'Lifr', 'Rgma', 'Ret', 'Fbln7', 'Gpm6b', 'Cyria', 'P2ry1', 'Hlf', 'Cldn15', 'Colgalt2', 
                                'Sorl1', 'Rem1', 'Vtn', 'Prex2', 'Alpl', 'Ces1d', 'Tmem64', 'Cbfa2t3', 'Clec14a', 
                                'P2ry14', 'Tspan9', 'Fscn1', 'Spon1', 'Csgalnact1', 'Plxna2', 'Kcna2', 'Adcyap1r1', 'Hunk'],
                'FAP_3': FAP + ['Srpx', 'Fbln1', 'Gdf10', 'Igfbp7', 'Gpnmb', 'C2', 'Sfrp1', 'C7', 'Dkk2', 'Serpina3n', 
                                'Armh4', 'Clec11a', 'Inmt', 'C4b', 'Lox', 'Chrdl1', 'Ism1', 'Emb', 'Abcc9', 'Grem2'],
                'FAP_4': FAP + ['Mgp', 'Meox2', 'Meox1', 'Hmcn1', 'Clec1a', 'Serpine2', 'Prg4', 'Kctd12', 'Clu', 'Cdh11', 
                                'Ap1s2', 'Daam2', 'Tmem204', 'Ezr', 'Plekha6', 'Etl4', 'Myo10', 'Ptgir', 'Tenm3', 'Cgnl1', 
                                'Etv1', 'Myo1b', 'Arhgdib', 'Trib2', 'Entrep1', 'Piezo2', 'Tbx2', 'Ptn', 'Enpp1', 'Col8a2', 
                                'Rspo3', 'Klf5', 'Mettl24', 'Tmem176a', 'Ddit4l', 'Stmnd1', 'Dchs2', 'Fam180a'], 
                'Krano_1': krano_12 + ['Cpe', 'Slc1a3', 'Tec', 'Tenm2', 'Piezo2', 'Kif21a', 'Foxd1', 'Mamdc2', 
                                       'Foxd2os', 'Unc13c', 'Cldn1', 
                                       'Col28a1', # Found in terminally differentiated Schwann and Merkel cells
                                       'Kcnk2', 'Klf15', 'Prxl2a', 'Spp1', 'Trabd2b', 'Pilra', 'Lrp4', 'Slc27a1', 
                                       'Foxd2', 'Foxs1', 'Grin2b', 
                                       'Shisa2', 'Sox8', 'Homer2',  # Positive in Tnmd+ population
                                       'Acot1', 'Clic6', 
                                       'Col26a1', 'Cdkn2b', # Partially in Krano_2
                                       'Ndnf', 'Asns', 'Gjb5', 'Celf4', 'Gdf11', 'Sfxn5', 'Cerox1', 
                                       'Rab20', 'Sbspon', 'Lypd6', 'Ccl9', 'Rida', 
                                       'Ifitm1'], 
                'Krano_2': krano_12 + ['Sorl1', 'Spon1', 'Alpl', 'Gpld1', 'Rgs17', 'Psat1', 'Trpm6', 'Nkd1', 'Sphkap', 
                                       'Dlk1', 'Thrsp', 'Susd5', 'Fetub', 'Nrk', 'Saa1', 'Itga8', 'Usp35', 'Lbh', 
                                       'Grm8', 'Emilin3', 'Necab1', 'Gria1', 'Il12a', 'Col6a6'],
                'TNMD_1': tnmd + ['Sparcl1', 'Col22a1', 'Chodl', 'Rbp1', 'Naalad2', 'Cdh2', 'Col18a1', 'Tnfrsf21', 'Scube2', 
                                  'Postn', 'Rab31', 'Metrnl', 'Plcb4', 'Crispld1', 'Lrrn2', 'Cdh5', 'Tubb2b', 
                                  'Reln', 'Fbn2', 'Cotl1', 'Olfml2a', 'Pla2g7', 'Cdk5rap2', 'Shisal2b', 'Man2a2', 'Adam23', 
                                  'Prph', 'Gab2', 'Prex2', 'Frem1', 'Tyms', 'Rapgef4', 'Epas1', 'Atp1a2', 'Egflam'],
                'TNMD_2': tnmd + ['Fmod', 'Serping1', 'Ccdc3', 'Fxyd6', 'Cav1', 'Sema3b', 'Angptl7', 'Mylk', 'C3', 'Egfl6', 
                                  'Wif1', 'Pcolce2', 'Grem2', 'Sema3a', 'Ntn1', 'Cgref1', 'Itga2', 'Ggta1', 'Bmp3', 'Hpgd', 
                                  'Uts2r', 'Fgf9', 'Cav2', 'Bmpr1b', 'Chrdl1', 'Fndc5', 'Sned1', 'Thy1', 'Il33', 'Fez1', 
                                  'Loxl4'],
                'TNMD_3': tnmd + ['Itm2a', 'Gpx3', 'Bicc1', 'Dlx5', 'Fgl2', 'Gpc1', 'Cemip2', 'Ptn', 'Cdkn1c', 'Wnt16', 
                                  'Tmem100', 'Flrt2', 'Crabp2', 'Fgfr2', 'Slc20a2', 'Epha3', 'Cdc42ep3', 'Gfra1', 'S100b', 
                                  'Ostn', 'Shisa2', 'Megf6', 'H19', 'Prss23', 'Slc1a3', 'Spon1', 'Nr4a2', 'Spon1', 'Gdf10', 
                                  'Pthlh', 'Igsf3', 'Kcns1', 'Igf2', 'Ror2', 'Aqp1', 'Serpine2', 'Nppc'],
                'Sat Cav1+': satellite + ['Cav1', 'Kitl', 'Cd36', 'Emcn', 'Itga1', 'Sparcl1', 'Cxcl12', 'Myct1', 'Tspan13', 'Fabp4', 
                              'Flt1', 'Kdr', 'Cdh5', 'Esam', 'Tie1', 'S1pr1', 'Cped1', 'Adgrf5', 'Apbb2', 'Nrp1', 'Rasgrp3', 
                              'Adgrl4', 'Elk3', 'Slfn5', 'Epas1', 'Tns1', 'Clic5', 'Pecam1', 'Cd93', 'Egfl7', 'Car8', 'F11r', 
                              'Ptprm', 'Gimap6', 'Tmem204', 'Ptprb', 'Sult1a1', 'Rgcc', 'Arhgef15', 'Nos3', 'Meox2', 'Fgd5', 
                              'Aqp7', 'Ushbp1', 'Sox18', 'Arhgap31'],
                'Sat Pax7+': satellite + ['Chrdl2', 'Myf5', 'Pax7', 'Edn3', 'Olfml2a', 'Heyl', 'Erfe', 'Fgfr4', 'Msc', 
                              'Hs6st2', 'Rapsn', 'Tac4', 'Tenm4', 'Serinc2', 'Gm13703', 'Golm2', 'Megf10', 'Nppc', 'Pde1c', 
                              'Flnc', 'Iqgap2', 'Slc7a2'],
                'Sat fibro': satellite + ['Rnase4', 'S100a6', 'Dcn', 'Col1a2', 'Clec3b', 'Gsn', 'Serpinf1', 'C1s1', 'Col6a2', 
                             'Mfap5', 'Col5a2', 'Col14a1', 'Ifi207', 'Pcolce', 'Lum', 'Igfbp6', 'Axl', 'Myoc', 'Adamts2', 'Ecm1', 
                             'Sulf2', 'Lgi2', 'Pdlim2', 'Cd248', 'C3', 'Slc1a5', 'Dpep1', 'Emilin2', 'Dpysl3'],
                    
                }



In [None]:
# For every population, list the markers that are absent from the dataset's gene names.
for key, val in dict_markers.items():
    print(key, list(filter(lambda gene: gene not in adata.var_names, val)))

In [None]:
# Rank genes for each coarse Leiden cluster.
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden',
)

In [None]:
# First 150 ranked genes of Leiden cluster '3'.
sc.pl.umap(
    adata,
    color=adata.uns['rank_genes_groups']['names']['3'][:150],
    cmap=magma,
    use_raw=False,
)

## Analysing Tnmd+ populations

In [None]:
# Subset to Leiden cluster '2' (analysed in this section as the Tnmd+ population)
# and recompute PCA, neighbour graph and UMAP on that subset only.
# .copy() materialises the subset: the scanpy calls below write into the object,
# and without it they would operate on a view of `adata`.
adata_tnmd = adata[adata.obs['leiden'] == '2'].copy()

sc.pp.pca(adata_tnmd, random_state=seed, use_highly_variable=True)
# Neighbourhood size: half the square root of the number of cells in the subset.
sc.pp.neighbors(adata_tnmd, n_neighbors=int(0.5 * len(adata_tnmd) ** 0.5), random_state=seed, metric='correlation')

sc.tl.umap(adata_tnmd)

In [None]:
# Sub-cluster the Tnmd subset and show the clusters on its own UMAP.
sc.tl.leiden(
    adata_tnmd,
    resolution=0.2,
    key_added='leiden_tnmd',
)
sc.pl.umap(
    adata_tnmd,
    color=['leiden_tnmd'],
)

In [None]:
# Rank genes for each Tnmd sub-cluster.
sc.tl.rank_genes_groups(
    adata_tnmd,
    groupby='leiden_tnmd',
)

In [None]:
# First 170 ranked genes of Tnmd sub-cluster '2'.
sc.pl.umap(
    adata_tnmd,
    color=adata_tnmd.uns['rank_genes_groups']['names']['2'][:170],
    cmap=magma,
    use_raw=False,
)

## Analysing FAP populations

In [None]:
# Subset to Leiden cluster '0' (analysed in this section as the FAP population)
# and recompute PCA, neighbour graph and UMAP on that subset only.
# .copy() materialises the subset: the scanpy calls below write into the object,
# and without it they would operate on a view of `adata`.
adata_FAP = adata[adata.obs['leiden'] == '0'].copy()

sc.pp.pca(adata_FAP, random_state=seed, use_highly_variable=True)
# Neighbourhood size: half the square root of the number of cells in the subset.
sc.pp.neighbors(adata_FAP, n_neighbors=int(0.5 * len(adata_FAP) ** 0.5), random_state=seed, metric='correlation')

sc.tl.umap(adata_FAP)

In [None]:
# Sub-cluster the FAP subset and show the clusters on its own UMAP.
sc.tl.leiden(
    adata_FAP,
    resolution=0.3,
    key_added='leiden_fap',
)
sc.pl.umap(
    adata_FAP,
    color=['leiden_fap'],
)

In [None]:
# Rank genes for each FAP sub-cluster.
sc.tl.rank_genes_groups(
    adata_FAP,
    groupby='leiden_fap',
)

In [None]:
# First 170 ranked genes of FAP sub-cluster '3'.
sc.pl.umap(
    adata_FAP,
    color=adata_FAP.uns['rank_genes_groups']['names']['3'][:170],
    cmap=magma,
    use_raw=False,
)

## Analysing Satellite populations

Part of the characterisation is from https://elifesciences.org/articles/51576

In [None]:
# Subset to Leiden cluster '1' (analysed in this section as the satellite population)
# and recompute PCA, neighbour graph and UMAP on that subset only.
# .copy() materialises the subset: the scanpy calls below write into the object,
# and without it they would operate on a view of `adata`.
adata_SAT = adata[adata.obs['leiden'] == '1'].copy()

sc.pp.pca(adata_SAT, random_state=seed, use_highly_variable=True)
# Neighbourhood size: half the square root of the number of cells in the subset.
sc.pp.neighbors(adata_SAT, n_neighbors=int(0.5 * len(adata_SAT) ** 0.5), random_state=seed, metric='correlation')

sc.tl.umap(adata_SAT)

In [None]:
# Sub-cluster the satellite subset and show the clusters on its own UMAP.
sc.tl.leiden(
    adata_SAT,
    resolution=0.2,
    key_added='leiden_sat',
)
sc.pl.umap(
    adata_SAT,
    color=['leiden_sat'],
)

In [None]:
# Rank genes for each satellite sub-cluster.
sc.tl.rank_genes_groups(
    adata_SAT,
    groupby='leiden_sat',
)

In [None]:
# First 170 ranked genes of satellite sub-cluster '0'.
sc.pl.umap(
    adata_SAT,
    color=adata_SAT.uns['rank_genes_groups']['names']['0'][:170],
    cmap=magma,
    use_raw=False,
)

## Applying cellassign

In [None]:
# High-resolution clustering; these clusters are the groups passed to assign_cats.
sc.tl.leiden(
    adata,
    resolution=8,
    key_added='leiden_assigncats',
)
sc.pl.umap(
    adata,
    color=['leiden_assigncats'],
)

In [None]:
# Assign one of the dict_markers populations to each 'leiden_assigncats' cluster.
assign_cats(
    adata,
    dict_cats=dict_markers,
    column_groupby='leiden_assigncats',
    quantile_gene_sel=0.9,
    diff=0.05,
)

In [None]:
# Assigned populations on the UMAP.
sc.pl.umap(
    adata,
    color=['assigned_cats'],
    cmap=magma,
    use_raw=False,
)

## Plotting markers of populations

### FAPs (general)

In [None]:
# Markers shared by all FAP populations.
sc.pl.umap(
    adata,
    color=FAP,
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### FAPs (1)

In [None]:
# Assigned populations plus the FAP_1 markers that follow the shared FAP prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['FAP_1'][len(FAP):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### FAPs (2)

In [None]:
# Assigned populations plus the FAP_2 markers that follow the shared FAP prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['FAP_2'][len(FAP):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### FAPs (3)

In [None]:
# Assigned populations plus the FAP_3 markers that follow the shared FAP prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['FAP_3'][len(FAP):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### FAPs (4)

In [None]:
# Assigned populations plus the FAP_4 markers that follow the shared FAP prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['FAP_4'][len(FAP):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### Krano (general)

In [None]:
# Assigned populations plus the markers shared by both Krano populations.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + krano_12,
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### Krano (1)

In [None]:
# Assigned populations plus the Krano_1 markers that follow the shared krano_12 prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['Krano_1'][len(krano_12):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### Krano (2)

In [None]:
# Assigned populations plus the Krano_2 markers that follow the shared krano_12 prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['Krano_2'][len(krano_12):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### Satellite (general)

In [None]:
# Assigned populations plus the markers shared by all satellite populations.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + satellite,
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### Satellite (1)

In [None]:
# Assigned populations plus the 'Sat Cav1+' markers that follow the shared satellite prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['Sat Cav1+'][len(satellite):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### Satellite (2)

In [None]:
# Assigned populations plus the 'Sat Pax7+' markers that follow the shared satellite prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['Sat Pax7+'][len(satellite):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### TNMD (general)

In [None]:
# Assigned populations plus the markers shared by all TNMD populations.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + tnmd,
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### TNMD (1)

In [None]:
# Assigned populations plus the TNMD_1 markers that follow the shared tnmd prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['TNMD_1'][len(tnmd):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### TNMD (2)

In [None]:
# Assigned populations plus the TNMD_2 markers that follow the shared tnmd prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['TNMD_2'][len(tnmd):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

### TNMD (3)

In [None]:
# Assigned populations plus the TNMD_3 markers that follow the shared tnmd prefix.
sc.pl.umap(
    adata,
    color=['assigned_cats'] + dict_markers['TNMD_3'][len(tnmd):],
    cmap=magma,
    use_raw=False,
    ncols=5,
)

# Preparing adata for cellxgene

In [None]:
# Dataset-level metadata for cellxgene.
# NOTE(review): the title value looks like a placeholder still to be filled in - confirm.
for uns_key, uns_value in (
    ('schema_version', '3.0.0'),
    ('title', 'POR PONER'),
    ('batch_condition', 'sample'),
    ('default_embedding', 'X_umap'),
):
    adata.uns[uns_key] = uns_value

In [None]:
# Per-cell metadata for cellxgene. The first eight columns hold one constant value
# for every cell. NOTE(review): the 'XOXO' values look like placeholders that still
# need real ontology term IDs - confirm before publishing.
for obs_key, obs_value in {
    'organism_ontology_term_id': 'NCBITaxon:10090',
    'tissue_ontology_term_id': 'XOXO',
    'assay_ontology_term_id': 'EFO:0009922',  # 10x 3' v3
    'disease_ontology_term_id': 'PATO:0000461',
    'cell_type_ontology_term_id': 'XOXO',
    'self_reported_ethnicity_ontology_term_id': 'na',
    'development_stage_ontology_term_id': 'XOXO',
    'sex_ontology_term_id': 'XOXO',
}.items():
    adata.obs[obs_key] = obs_value

# The donor is identified by the sample each cell comes from.
adata.obs['donor_id'] = adata.obs['sample']
adata.obs['suspension_type'] = 'cell'


In [None]:
# Use the Ensembl IDs saved at load time as gene identifiers again.
adata.var_names = adata.var.loc[:, 'ensemble_ID']

In [None]:
# Save the annotated object next to the input matrix.
adata.write_h5ad(
    'data/ARAUZO_03/20230623/output_nfcore/alevin/mtx_conversions/combined_matrix_cellxgene.h5ad'
)

# Exporting HTML

In [None]:
# Convert the notebook to HTML with nbconvert (shell command run from the cell).
# NOTE(review): the notebook path is absolute; presumably it is specific to the
# author's machine - adjust it when running elsewhere.
!jupyter nbconvert --to html /data/Proyectos/kranocito/4_Analysis_of_krano_dataset.ipynb