In [None]:
import seaborn as sns
import scanpy as sc
import scanpy.external as sce
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
from scipy import cluster
import math
import matplotlib.pyplot as plt
from itertools import permutations
import random
from scipy import cluster
from scipy import stats
import os
import glob
import pickle
import pandas as pd
import numpy as np
import os
from matplotlib.pyplot import figure
from collections import Counter
from scipy.stats import spearmanr, pearsonr

#Global scanpy settings used by every cell below
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()  # record the package versions in the notebook output
sc.settings.set_figure_params(dpi=80)  # figure resolution for the scanpy plots

In [None]:
#Function to compute counts per 10,000
def cpm(x):
    """Rescale column ``0`` of ``x`` so that its values are counts per 10,000.

    Despite the name, the scale factor is 10,000 rather than one million.
    Column ``0`` of ``x`` is overwritten in place.

    Parameters
    ----------
    x : pandas.DataFrame (or any object supporting ``x[0]`` get/set)
        Column ``0`` holds the counts to rescale.

    Returns
    -------
    The same object ``x`` that was passed in, with column ``0`` replaced by
    ``value * 10000 / total`` where ``total`` is the sum of the column.
    """
    counts = list(x[0])
    total = sum(counts)
    x[0] = [count * 10000 / total for count in counts]
    return x

In [None]:
#Read in and filter the data

#First, get the 1-1 orthologs from ENSEMBL
#Rows containing any missing value are dropped before the homology filter is applied
orthos = pd.read_csv("mouse_rat_orthologs.txt", sep = "\t").dropna()

#Keep only the pairs annotated as one-to-one orthologs
one2one = orthos[orthos["Mouse homology type"] == "ortholog_one2one"]

#dorth: "Gene stable ID" -> "Mouse gene stable ID" (its keys are used further down to select rat genes)
#d_eg:  "Mouse gene stable ID" -> "Mouse gene name"
#Built column-wise rather than with iterrows(); as with the row loop, a later row
#overwrites an earlier one if a key occurs more than once
dorth = dict(zip(one2one["Gene stable ID"], one2one["Mouse gene stable ID"]))
d_eg = dict(zip(one2one["Mouse gene stable ID"], one2one["Mouse gene name"]))

#Sample name -> placeholder 0; each placeholder is replaced by that sample's AnnData object in the loop below
ids = {"RM1":0, "RM2":0, "RM3":0, "MR1":0, "MR2":0, "MR3":0, "WT":0}
#ids = {"MR1":0}

#For each sample: read the three count matrices, assign every barcode to a species
#from the mouse+rat matrix, then build one AnnData of 1-1 orthologs (mouse + rat cells)
#and store it in ids[sample]
for i in ids:
    
    #Read in the data
    #Three 10x matrix directories are expected per sample:
    #<sample>_Mouse_Rat_Raw, <sample>_Mouse_Raw and <sample>_Rat_Raw
    v1 = sc.read_10x_mtx(i + "_Mouse_Rat_Raw")
    v1.obs["Sample"] = np.repeat(i, v1.obs.shape[0])
    v1_m = sc.read_10x_mtx(i + "_Mouse_Raw")
    v1_m.obs["Sample"] = np.repeat(i, v1_m.obs.shape[0])
    v1_r = sc.read_10x_mtx(i + "_Rat_Raw")
    v1_r.obs["Sample"] = np.repeat(i, v1_r.obs.shape[0])
    #"BCS" = barcode + "_" + sample; the same key format is used for the species calls below
    v1_m.obs["BCS"] = [x + "_" + i for x in list(v1_m.obs.index)]
    v1_r.obs["BCS"] = [x + "_" + i for x in list(v1_r.obs.index)]
    
    #Save the right gene ID (mouse or rat)
    #Genes are split by the first character of gene_ids ("m" or "r")
    #NOTE(review): presumably this is the genome prefix of the combined reference - confirm
    mouse_keep = []
    rat_keep = []
    for index, row in v1.var.iterrows():
        if row["gene_ids"][0] == "m":
            mouse_keep.append(row["gene_ids"])
        elif row["gene_ids"][0] == "r":
            rat_keep.append(row["gene_ids"])
    
    #Do basic filtering of genes and cells, filter to only keep the right genes
    sc.pp.filter_cells(v1, min_genes=5)
    sc.pp.filter_genes(v1, min_cells=1)
    #Transpose so that genes are the observations and can be selected by gene_ids,
    #then transpose back: v1_rg holds the rat genes, v1_mg the mouse genes, for the same cells
    v1 = v1.T
    v1_rg = v1[v1.obs["gene_ids"].isin(rat_keep)]
    v1_rg = v1_rg.copy().T
    v1_mg = v1[v1.obs["gene_ids"].isin(mouse_keep)]
    v1_mg = v1_mg.copy().T
    v1 = v1.T
    
    #Compute total counts per cell per genome and mitochondrial counts
    #Only "total_counts" from these QC metrics is read below
    #NOTE(review): 'mt-' is matched against the gene names of the combined matrix; if those
    #names carry a genome prefix nothing will match - confirm
    v1_mg.var['mt'] = v1_mg.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(v1_mg, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    v1_rg.var['mt'] = v1_rg.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(v1_rg, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    
    #Assign cell barcodes to species
    rat = v1_rg.obs
    mouse = v1_mg.obs
    d = {}
    
    #Compute the total counts aligned to the rat genome and the total counts aligned to the mouse genome
    #d[barcode_sample] = [rat total + 1, mouse total + 1]; the +1 keeps the denominator below non-zero
    for index, row in rat.iterrows():
        d[index + "_" + i] = [row["total_counts"]+1]

    for index, row in mouse.iterrows():
        d[index + "_" + i].append(row["total_counts"]+1)
    
    #Classify cells using the cutoff of 70% of reads needing to come from one species
    #ratio is the rat fraction: > 0.7 -> rat, < 0.3 -> mouse, anything in between -> doublet
    #From here on, rat and mouse are lists of barcode_sample keys (the names are reused)
    rat = []
    mouse = []
    doublet = []
    for key in d.keys():
        ratio = d[key][0]/(d[key][1] + d[key][0])
        if ratio > 0.7:
            rat.append(key)
        elif ratio < 0.3:
            mouse.append(key)
        else:
            doublet.append(key)
    
    #Standard filtering from scanpy
    sc.pp.filter_cells(v1_m, min_genes=200)
    sc.pp.filter_cells(v1_r, min_genes=200)
    
    #Now we work with the data aligned to only a single species
    
    #Create a new anndata object with correct species assignments and put in dictionary
    #NOTE(review): v1_m_filt / v1_r_filt are subsets (views) when .obs and .var are assigned
    #below; anndata is expected to turn them into real objects on assignment - confirm
    v1_m_filt = v1_m[v1_m.obs["BCS"].isin(mouse)]
    v1_m_filt.obs["Species"] = np.repeat("Mouse", len(list(v1_m_filt.obs.index)))
    v1_r_filt = v1_r[v1_r.obs["BCS"].isin(rat)]
    v1_r_filt.obs["Species"] = np.repeat("Rat", len(list(v1_r_filt.obs.index)))
    
    #Filter to keep only 1-1 orthologs and reassign gene names appropriately
    #Mouse genes are selected by the mouse IDs (values of dorth), rat genes by the keys of dorth
    v1_m_filt.var["Gene Name"] = v1_m_filt.var.index
    v1_m_filt = v1_m_filt.T
    v1_m_filt = v1_m_filt[v1_m_filt.obs["gene_ids"].isin(list(dorth.values()))]
    v1_m_filt = v1_m_filt.copy().T

    v1_r_filt.var["Gene Name"] = v1_r_filt.var.index
    v1_r_filt = v1_r_filt.T
    v1_r_filt = v1_r_filt[v1_r_filt.obs["gene_ids"].isin(list(dorth.keys()))]
    v1_r_filt = v1_r_filt.copy().T
    
    #Rename the rat genes to their mouse ortholog
    #rat gene ID -> mouse gene ID (dorth) -> mouse gene name (d_eg)
    new_names = []
    for index, row in v1_r_filt.var.iterrows():
        new_names.append(d_eg[dorth[row["gene_ids"]]])
    v1_r_filt.var["New Gene Name"] = new_names
    v1_r_filt.var.index = v1_r_filt.var["New Gene Name"]
    #Keep only gene names that occur exactly once in the rat object ...
    c = Counter(v1_r_filt.var.index)
    keepr = []
    for key in c.keys():
        if c[key] == 1:
            keepr.append(key)

    #... and exactly once in the mouse object
    c = Counter(v1_m_filt.var.index)
    keepm = []
    for key in c.keys():
        if c[key] == 1:
            keepm.append(key)
    
    #Filter the genes
    #keep = gene names that are unique in both species
    keep = np.intersect1d(keepr, keepm)
    v1_r_filt = v1_r_filt.T
    v1_r_filt = v1_r_filt[v1_r_filt.obs["New Gene Name"].isin(keep)]
    v1_m_filt = v1_m_filt.T
    v1_m_filt = v1_m_filt[v1_m_filt.obs["Gene Name"].isin(keep)]

    v1_m_filt = v1_m_filt.copy().T
    v1_r_filt = v1_r_filt.copy().T
    print(i, "# of rat cells", len(list(v1_r_filt.obs.index)), "# of mouse cells", len(list(v1_m_filt.obs.index)))

    #Now that gene names are fixed, concatenate and assign in dictionary
    #Mouse cells come first, rat cells second
    #NOTE(review): later cells read a var column called "New Gene Name-1"; presumably that is
    #the rat-only "New Gene Name" column with the suffix added by concatenate - confirm
    v = v1_m_filt.concatenate([v1_r_filt])
    ids[i] = v

In [None]:
#Merge the per-sample AnnData objects, in the order of the ids dictionary, into one object
#The first sample is taken as-is; every later one is concatenated onto the running result
v = 0
ind = 0
for i, sample_adata in ids.items():
    v = v.concatenate([sample_adata]) if ind else sample_adata
    ind = 1

#Drop genes detected in fewer than 10 cells (modifies v in place)
sc.pp.filter_genes(v, min_cells=10)

#Restrict to the genes listed in the first column of the sex-chromosome gene file
sex_chrom_table = pd.read_csv("Mouse_Sex_Chr_Genes.txt", sep = "\t", header = None)
f = list(sex_chrom_table[0])

#Transpose so genes are the observations, select by gene name, then transpose back
genes_as_obs = v.T
on_sex_chrom = genes_as_obs.obs["New Gene Name-1"].isin(f)
vs = genes_as_obs[on_sex_chrom].copy().T

In [None]:
### NOT USED ###

#For each sample, split into mouse and rat
#vs is the sex-chromosome-gene subset built in the previous cell
#Only MR1, RM1, RM2 and WT are split here; RM3, MR2 and MR3 are not
v_MR1 = vs[vs.obs["Sample"].isin(["MR1"])]
v_MR1_m = v_MR1[v_MR1.obs["Species"].isin(["Mouse"])]
v_MR1_r = v_MR1[v_MR1.obs["Species"].isin(["Rat"])]

v_RM1 = vs[vs.obs["Sample"].isin(["RM1"])]
v_RM1_m = v_RM1[v_RM1.obs["Species"].isin(["Mouse"])]
v_RM1_r = v_RM1[v_RM1.obs["Species"].isin(["Rat"])]

v_RM2 = vs[vs.obs["Sample"].isin(["RM2"])]
v_RM2_m = v_RM2[v_RM2.obs["Species"].isin(["Mouse"])]
v_RM2_r = v_RM2[v_RM2.obs["Species"].isin(["Rat"])]

v_WT = vs[vs.obs["Sample"].isin(["WT"])]
v_WT_m = v_WT[v_WT.obs["Species"].isin(["Mouse"])]
v_WT_r = v_WT[v_WT.obs["Species"].isin(["Rat"])]

#Write out the CPM for each gene on sex chromosomes to determine the sex of each sample
#c numbers the output files 1..8 in the order of the list below
#NOTE: for RM2 the rat object is listed before the mouse object, unlike the other samples,
#so file 5 is RM2 rat and file 6 is RM2 mouse
c = 0
for i in [v_MR1_m, v_MR1_r, v_RM1_m, v_RM1_r, v_RM2_r, v_RM2_m, v_WT_m, v_WT_r]:
    print(i)
    c += 1
    #Drop cells with fewer than 100 detected genes (counted among the sex-chromosome genes only)
    sc.pp.filter_cells(i, min_genes=100)
    #Dense cells x genes table; .todense() assumes i.X is a sparse matrix - TODO confirm
    z = pd.DataFrame(i.X.todense())
    z.columns = i.var.index
    z.index = i.obs.index
    #z["Species"] = i.obs["Species"]
    #z["Sample"] = i.obs["Sample"]
    #Transpose to genes x cells and sum over the cells: one total per gene, in column 0
    z = z.T
    z = pd.DataFrame(z.sum(axis=1))
    #z has a single column here, so the row sum is that column again; cpm() rescales it to counts per 10,000
    z["CPM"] = cpm(pd.DataFrame(z.sum(axis=1)))
    z.to_csv("Final/DetermineSex/Determine_Sex" + str(c) + ".csv")

In [None]:
#Now we can proceed with the rest of the analysis
#Rebuild the combined object from the per-sample objects, in the order of the ids dictionary
v = 0
ind = 0
for i, sample_adata in ids.items():
    v = v.concatenate([sample_adata]) if ind else sample_adata
    ind = 1

#Do basic filtering again: drop genes detected in fewer than 10 cells (modifies v in place)
sc.pp.filter_genes(v, min_cells=10)

#### Doing without sex chromsomes ####
#Genes listed in the first column of the sex-chromosome gene file are removed
sex_chrom_table = pd.read_csv("Mouse_Sex_Chr_Genes.txt", sep = "\t", header = None)
f = list(sex_chrom_table[0])

#Transpose so genes are the observations, drop the listed genes, then transpose back
genes_as_obs = v.T
not_sex_linked = ~genes_as_obs.obs["New Gene Name-1"].isin(f)
v = genes_as_obs[not_sex_linked].copy().T

In [None]:
#Save an initial file as a starting place
#v is the combined object with the sex-chromosome genes removed, before any QC or normalisation
v.write("Final/All_Cells_Start.h5ad")

In [None]:
#Do further filtering and clustering
#Flag genes whose name starts with "mt-" and compute the per-cell QC metrics
v.var['mt'] = v.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(v, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

#Remove low quality cells
#Keep cells with fewer than 7500 detected genes and less than 15% mitochondrial counts
#NOTE(review): these subsets are views and are modified in place below; anndata is
#expected to copy on modification - confirm
v = v[v.obs.n_genes_by_counts < 7500, :]
v = v[v.obs.pct_counts_mt < 15, :]

#Save raw data
#.raw is set before normalisation, so it keeps the un-normalised values of the retained cells
v.raw = v

#Normalize, compute highly variable genes and PCA as recommended by scanpy
#Scale each cell to 10,000 total counts, then log1p-transform
sc.pp.normalize_total(v, target_sum=1e4)
sc.pp.log1p(v)
sc.pp.highly_variable_genes(v, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(v)
sc.tl.pca(v, svd_solver='arpack')
sc.pl.pca_variance_ratio(v, log=True)

#Use harmony to correct for batch effects
#Species is the batch variable; the neighbour graph is then built on the
#harmony-corrected embedding "X_pca_harmony" rather than on "X_pca"
sce.pp.harmony_integrate(v, key = 'Species', basis = "X_pca", max_iter_harmony = 20)
sc.pp.neighbors(v, n_neighbors=10, n_pcs=40, use_rep = "X_pca_harmony")

In [None]:
#Do UMAP stuff and leiden clustering
#0, 3, 4, 5 is blood-derived
#1 is neuronal
#2 is connective
#NOTE(review): the three lines above disagree with the cluster lists in the next cell
#(from_blood = [0, 1, 4, 5, 6, 7], from_con = [3], from_brain = [2, 8]); presumably they
#date from an earlier clustering run - confirm which assignment is current
sc.tl.leiden(v, resolution=0.1)
sc.tl.umap(v)
sc.pl.umap(v, color=['leiden'])
#Marker genes per leiden cluster (Wilcoxon test), computed on v.X rather than v.raw
x = sc.tl.rank_genes_groups(v, 'leiden', method='wilcoxon', use_raw = False)
sc.pl.rank_genes_groups(v, n_genes=25, sharey=False)

In [None]:
#Classify as from blood, brain, or connective tissue
#The integers are leiden cluster labels of v from the clustering cell above
from_blood = [0, 1, 4, 5, 6, 7]
from_con = [3]
from_brain = [2, 8]

#Create sub-objects for each one
#leiden labels are stored as strings, hence the str() conversion
#v_con and v_brain are modified in place by the reclustering cells further down, so they
#are taken as explicit copies (as is done for v_inhib) instead of views of v
#v_blood is not modified in this notebook and is left as a view
v_blood = v[v.obs["leiden"].isin([str(x) for x in from_blood])]
v_con = v[v.obs["leiden"].isin([str(x) for x in from_con])].copy()
v_brain = v[v.obs["leiden"].isin([str(x) for x in from_brain])].copy()

In [None]:
#Reclustering brain data using same strategy as above
#Highly variable genes, PCA, harmony (Species as the batch variable) and the neighbour
#graph are all recomputed on the brain subset only, with the same parameters as for all cells
sc.pp.highly_variable_genes(v_brain, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(v_brain)
sc.tl.pca(v_brain, svd_solver='arpack')
sc.pl.pca_variance_ratio(v_brain, log=True)
sce.pp.harmony_integrate(v_brain, key = 'Species', basis = "X_pca", max_iter_harmony = 20)
sc.pp.neighbors(v_brain, n_neighbors=10, n_pcs=40, use_rep = "X_pca_harmony")

In [None]:
#Reclustering brain data using same strategy as above, continued
#Resolution 0.5 here, compared with 0.1 for the all-cell clustering
sc.tl.leiden(v_brain, resolution=0.5)
sc.tl.umap(v_brain)
sc.pl.umap(v_brain, color=['leiden'])
#Marker genes per brain cluster (Wilcoxon test), computed on v_brain.X rather than .raw
x = sc.tl.rank_genes_groups(v_brain, 'leiden', method='wilcoxon', use_raw = False)
sc.pl.rank_genes_groups(v_brain, n_genes=25, sharey=False)


In [None]:
### DESCRIPTION OF CELL TYPES AND MARKER GENES USED TO IDENTIFY THEM ###

#0 Glutamatergic neurons
#1 GABAergic Meis2+ neurons
#2 GABAergic progenitors
#3 MGE inhibitory neurons (Nkx2-1+)
#4 Glutamatergic progenitors
#5 Immature glutamatergic neurons
#6 Spinal cord excitatory neurons (Hoxb8+)
#7 Spinal cord inhibitory neurons (Hoxb8+)
#8 Nr2f2+ excitatory, contains CR cells so needs to be subclustered
#9 GABAergic neurons
#10 Intermediate progenitors
#11 Thalamic neurons (Tcf7l2+)
#12 Spinal cord progenitors
#13 I think these are olfactory bulb excitatory neurons
#14 A very small population of glial-like cells I think
#15 No idea
#16 Also no idea

#8 was split into:
#17 Nr2f2+, Nhlh2+ excitatory neurons recently exiting the cell cycle
#18 Cajal-Retzius cells

#NOTE: this file is written by the "Subclustering 8" cell further down, so on a fresh
#kernel that cell has to be run (on the freshly clustered v_brain) before this one
v_brain = sc.read("Final/Brain_Subclustered_Prelim.h5ad")

#Groups of leiden cluster labels, following the descriptions above
#Only inhib is used in the cells visible below (inhibitory subclustering)
excit = [0, 5]
inhib = [1, 3, 9]
exc_prog = [4]
inh_prog = [2]
inter_prog = [10]
spinal_excit = [6]
spinal_inhib = [7]
excit_other = [17]

sc.pl.umap(v_brain, color=['leiden'])
#Marker genes are plotted from v_brain.X (use_raw = False), not from .raw
sc.pl.umap(v_brain, color=["Gad2", "Slc17a6", "Eomes", "Gli3", "Hoxb8", "Reln", "Nr2f2", "Ebf1", "Mki67", "Zfhx3", "Meis2", "Nkx2-1"], use_raw = False)

In [None]:
#Subcluster the inhibitory neurons
#inhib is the list of brain cluster labels defined in the annotation cell above
#The subset is copied, so v_brain itself is not changed by the steps below
v_inhib = v_brain[v_brain.obs["leiden"].isin([str(x) for x in inhib])].copy()
#No PCA / harmony / neighbours call is made here, so leiden and UMAP run on
#whatever neighbour graph the subset carries over from v_brain
sc.tl.leiden(v_inhib, resolution=0.35)
sc.tl.umap(v_inhib)
sc.pl.umap(v_inhib, color=['leiden'])
x = sc.tl.rank_genes_groups(v_inhib, 'leiden', method='wilcoxon', use_raw = False)
sc.pl.rank_genes_groups(v_inhib, n_genes=25, sharey=False)

In [None]:
#Eliminate 6 as it is likely thalamic interneurons
#Group 2 and 5 together due to lack of distinguishing markers
#Rest stay separate
#The decisions above are only recorded as comments; this cell does not change any labels
sc.pl.umap(v_inhib, color=['leiden'])
#NOTE(review): unlike the other gene-coloured UMAP calls in this notebook, this one does
#not pass use_raw = False - confirm that plotting from .raw is intended here
sc.pl.umap(v_inhib, color=['Erbb4', 'Meis2', 'Zfhx3', 'Nkx2-1', 'Usp29', 'Zic1', 'Tenm3', 'Tcf7l2', 'Nfib', 'Adarb2', 'Peg3'])
#NOTE: written to the working directory, not to "Final/" like the other outputs
v_inhib.write("GABAergic_Forebrain.h5ad")

In [None]:
#Subclustering 8 as it contains Cajal-Retzius cells
#Take an explicit copy (as is done for v_inhib) so the clustering below writes to an
#independent object instead of to a view of v_brain
v_cr = v_brain[v_brain.obs["leiden"].isin(["8"])].copy()
sc.tl.leiden(v_cr, resolution=0.05)
sc.tl.umap(v_cr)
sc.pl.umap(v_cr, color=['leiden'])
x = sc.tl.rank_genes_groups(v_cr, 'leiden', method='wilcoxon', use_raw = False)
sc.pl.rank_genes_groups(v_cr, n_genes=25, sharey=False)

#Add the new correct annotations
#Sub-cluster k of v_cr becomes brain cluster k + 17 (so 0 -> "17", 1 -> "18", ...)
#The offset 17 is hard-coded: it assumes the brain clustering produced labels 0..16
d = {}
for index, label in v_cr.obs["leiden"].items():
    d[index] = str(int(label) + 17)

#Rewrite only the "leiden" column: cells that were subclustered get their new label,
#all other cells keep the one they had. The rest of .obs is left untouched, so the
#other columns keep their dtypes (rebuilding .obs row by row would not guarantee that)
v_brain.obs["leiden"] = [d.get(index, label) for index, label in v_brain.obs["leiden"].items()]

#This file is read back by the cell-type annotation cell above
v_brain.write("Final/Brain_Subclustered_Prelim.h5ad")

In [None]:
#Now let's process and subcluster the connective tissue
#Reclustering connective data
#Same steps and parameters as for the brain subset: highly variable genes, PCA,
#harmony (Species as the batch variable) and the neighbour graph on the corrected embedding
sc.pp.highly_variable_genes(v_con, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(v_con)
sc.tl.pca(v_con, svd_solver='arpack')
sc.pl.pca_variance_ratio(v_con, log=True)
sce.pp.harmony_integrate(v_con, key = 'Species', basis = "X_pca", max_iter_harmony = 20)
sc.pp.neighbors(v_con, n_neighbors=10, n_pcs=40, use_rep = "X_pca_harmony")

#Leiden clustering at resolution 0.15, UMAP and per-cluster marker genes (Wilcoxon test)
sc.tl.leiden(v_con, resolution=0.15)
sc.tl.umap(v_con)
sc.pl.umap(v_con, color=['leiden'])
x = sc.tl.rank_genes_groups(v_con, 'leiden', method='wilcoxon', use_raw = False)
sc.pl.rank_genes_groups(v_con, n_genes=25, sharey=False)

In [None]:
#Annotation of the connective-tissue leiden clusters:
#0 mesenchyme
#1 chondrocytes
#2 smooth muscle
#3 endothelial cells
#4, 5, 6 too few cells
#UMAP coloured by cluster and by the listed genes, plotted from v_con.X (use_raw = False)
sc.pl.umap(v_con, color=['leiden', 'Mki67', "Col2a1", "Acan", "Sox9", "Ptn", "Perp", "Runx2", "Tnnt1", "Col4a1", "Gpc5", "Col1a1", "Lamc3", "Runx1", "Tgfb2", "Adamts9", "Postn", "Epha7"], use_raw = False)

In [None]:
#The next few cells subcluster the mesenchymal and chondrocytes just as done for the connective tissue

#Connective-tissue cluster "0" (mesenchyme)
#Take an explicit copy (as is done for v_inhib) so the clustering below writes to an
#independent object instead of to a view of v_con
v_mes = v_con[v_con.obs["leiden"].isin(["0"])].copy()
sc.tl.leiden(v_mes, resolution=0.25)
sc.tl.umap(v_mes)
sc.pl.umap(v_mes, color=['leiden'])
#Marker genes per mesenchymal sub-cluster (Wilcoxon test), computed on v_mes.X rather than .raw
x = sc.tl.rank_genes_groups(v_mes, 'leiden', method='wilcoxon', use_raw = False)
sc.pl.rank_genes_groups(v_mes, n_genes=25, sharey=False)

In [None]:
#UMAP of the mesenchymal sub-clusters coloured by cluster and by the listed genes (from v_mes.X)
sc.pl.umap(v_mes, color=['leiden', 'Mki67', "Col2a1", "Twist2", "Isl1", "Bmp5", "Mecom", "Gli3", "Col3a1"], use_raw = False)

In [None]:
#Connective-tissue cluster "1" (chondrocytes)
#Take an explicit copy (as is done for v_inhib) so the clustering below writes to an
#independent object instead of to a view of v_con
v_chondro = v_con[v_con.obs["leiden"].isin(["1"])].copy()
sc.tl.leiden(v_chondro, resolution=0.25)
sc.tl.umap(v_chondro)
sc.pl.umap(v_chondro, color=['leiden'])
#Marker genes per chondrocyte sub-cluster (Wilcoxon test), computed on v_chondro.X rather than .raw
x = sc.tl.rank_genes_groups(v_chondro, 'leiden', method='wilcoxon', use_raw = False)
sc.pl.rank_genes_groups(v_chondro, n_genes=25, sharey=False)

In [None]:
#UMAP of the chondrocyte sub-clusters coloured by cluster and by the listed genes (from v_chondro.X)
sc.pl.umap(v_chondro, color=['leiden', 'Mki67', "Col1a2", "Bmpr1b", "Mef2c"], use_raw = False)

In [None]:
#Save files for later
#Both objects carry the sub-cluster labels computed above in their "leiden" column
v_mes.write("Final/Mesenchymal_Subclustered_Prelim.h5ad")
v_chondro.write("Final/Chondrocyte_Subclustered_Prelim.h5ad")