In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import csv
import pandas as pd
from scipy import stats
import re
from scipy.stats.stats import pearsonr
#import rpy2.robjects as robjects
import random
from statsmodels.stats.multitest import fdrcorrection
import copy
from collections import Counter
import seaborn as sns
from scipy.stats import binom_test
import gseapy as gs
import gzip
import os

In [None]:
# Load the sample metadata and narrow it down to European-ancestry control
# donors from the selected cohorts who were at least 25 years old at death.
cohorts_of_interest = ["CMC", "LIBD", "UCLA-ASD", "ROSMAP", "SZBDMulti-Seq"]
meta = (
    pd.read_csv("PEC2_sample_metadata.txt", sep = "\t")
    .loc[lambda d: d["Cohort"].isin(cohorts_of_interest)]
    .loc[lambda d: d["Disorder"].isin(["control", "Control"])]
    .loc[lambda d: d["1000G_ancestry"].isin(["EUR"])]
    # The integer cast is applied last, so it only sees rows that passed the filters above.
    .loc[lambda d: d["Age_death"].astype(int) >= 25]
)
# Individual IDs that later cells are allowed to use.
keep_inds = meta["Individual_ID"].tolist()

In [None]:
#Iterate through all the expression matrices
# Only the header line of each matrix is needed here, so every retained file is
# opened once and its per-label counts are reused for both the overview table
# and the sample filter below.
matrix_root = "snrna_expr_matrices/snrna_expr_matrices"
eligible_inds = set(keep_inds)
sample_counts = []  # (file name, individual ID, one-column frame of header-label counts)
for folder in os.listdir(matrix_root):
    for file in os.listdir(matrix_root + "/" + folder):
        indiv = file.replace("-annotated_matrix.txt.gz", "")
        if indiv not in eligible_inds:
            continue
        with gzip.open(matrix_root + "/" + folder + "/" + file, 'rt') as f:
            line = f.readline()
        # Count how often each tab-separated header label occurs
        # (presumably one column per cell, labelled by cell type -- confirm).
        ctd = Counter(line.replace("\n", "").split("\t"))
        df_ind = pd.DataFrame.from_dict(ctd, orient='index')
        df_ind.columns = [indiv]
        sample_counts.append((file, indiv, df_ind))

# Labels x individuals table of counts; labels absent from a sample become 0.
df_all = pd.DataFrame()
for _, _, df_ind in sample_counts:
    df_all = df_ind if df_all.empty else df_all.join(df_ind, how = "outer")
df_all = df_all.replace(np.nan, 0)

# For every label, the number of individuals with at least 50 occurrences
# (column 0 = label, column 1 = number of individuals), sorted by that number.
above_50 = (df_all >= 50).sum(axis = 1).reset_index()
above_50.columns = [0, 1]
above_50 = above_50.sort_values(1)

#Cell types to include based on the above analysis computing how many cells are in each dataset
include = ["Sncg", "L5/6 NP", "Lamp5 Lhx6", "L6b", "L6 CT", "Lamp5", "Sst", "Vip", "Pvalb", "L4 IT", "L6 IT", "L5 IT", "L2/3 IT"]

#Keep only the ones with enough cells per cell type
# A sample is kept when every included cell type is present with at least 50 occurrences.
keep_samp = []
for file, indiv, df_ind in sample_counts:
    if set(include) <= set(df_ind.index) and df_ind.loc[include, indiv].min() >= 50:
        keep_samp.append(file)
keep_genes = list(pd.read_csv("Human_Sestan_DLPFC_SampSize_1000_Round100.txt", sep = "\t")["0"])

In [None]:
#Downsample counts and number of cells
# `done` lists individual IDs to skip (presumably samples processed in an
# earlier session -- confirm). Default to an empty list so the cell also runs
# on a fresh kernel instead of raising NameError.
try:
    done
except NameError:
    done = []

# Make sure the output directory exists before writing into it.
os.makedirs("Downsamplings_Control", exist_ok = True)
for folder in os.listdir("snrna_expr_matrices/snrna_expr_matrices"):
    for file in os.listdir("snrna_expr_matrices/snrna_expr_matrices/" + folder):
        if file in keep_samp and file.replace("-annotated_matrix.txt.gz", "") not in done:
            print(file)
            v = pd.read_csv('snrna_expr_matrices/snrna_expr_matrices/' + folder + "/" + file, sep = "\t")
            v = v.set_index("featurekey")
            # Column labels carry a ".N" suffix after the cell type; strip it to
            # recover the cell type of every column, then address columns by position.
            cts = np.array([x.split(".")[0] for x in v.columns])
            v.columns = list(range(len(v.columns)))
            v = v.loc[np.intersect1d(keep_genes, v.index)]
            # Column positions per cell type do not change between rounds, so look them up once.
            cols_by_ct = {ct: np.where(cts == ct)[0] for ct in include}
            # NOTE(review): rounds start at 11 here although the correlation analysis
            # reads rounds 1-100; rounds 1-10 were presumably written by an earlier run -- confirm.
            for ite in range(11, 101):
                np.random.seed(ite)
                pseudo_all = None
                for ct in include:
                    # Draw 50 columns of this cell type without replacement and sum them
                    # into one pseudobulk column. A dedicated name is used so the global
                    # `keep_inds` list of individual IDs is not overwritten.
                    sampled_cols = np.random.choice(cols_by_ct[ct], replace = False, size = 50)
                    pseudo = pd.DataFrame(v[sampled_cols].sum(axis = 1))
                    pseudo.columns = [ct]
                    pseudo_all = pseudo if pseudo_all is None else pseudo_all.join(pseudo, how = "outer")
                pseudo_all = pseudo_all.replace(np.nan, 0)
                pseudo_all.to_csv("Downsamplings_Control/" + file.replace(".txt.gz", "") + "_Size50_Round" + str(ite) + ".txt", sep = "\t")

In [None]:
#Rewriting keep_indiv here
# Hard-coded matrix file names, built from the bare sample IDs plus the common file suffix.
keep_indiv = [
    sample_id + "-annotated_matrix.txt.gz"
    for sample_id in (
        'CMC_MSSM_049', 'CMC_MSSM_056', 'CMC_MSSM_089', 'CMC_MSSM_227',
        'CMC_MSSM_234', 'CMC_MSSM_272',
        'CON1', 'CON11', 'CON12', 'CON13', 'CON15', 'CON16', 'CON18', 'CON19',
        'CON20', 'CON21', 'CON23', 'CON3', 'CON4', 'CON5', 'CON6', 'CON9',
        '75QW', 'AN19760', 'HctZGA002',
    )
]

In [None]:
#Making keep_samp again just in case
# Strip the common file suffix so keep_samp holds the bare sample IDs.
matrix_suffix = "-annotated_matrix.txt.gz"
keep_samp = [fname.replace(matrix_suffix, "") for fname in keep_indiv]

In [None]:
#Compute cell type proportions
include = ["Sncg", "L5/6 NP", "Lamp5 Lhx6", "L6b", "L6 CT", "Lamp5", "Sst", "Vip", "Pvalb", "L4 IT", "L6 IT", "L5 IT", "L2/3 IT"]

# Restrict the cell-number table to the retained samples.
v = pd.read_csv("CellTypeNumbers_388.csv").set_index("individualID").loc[keep_samp]

# Total count per included cell type, summed over the retained samples.
totals = [v.loc[v["CellType"] == ct, "counts"].sum() for ct in include]
df_prop = pd.DataFrame({"Cell type": include, "Counts": totals})
# Proportion is relative to the included cell types only.
df_prop["Proportion"] = df_prop["Counts"] / df_prop["Counts"].sum()
df_prop = df_prop.set_index("Cell type")

In [None]:
#Go through and compute median correlation of within human variance with cell type proportion 
#Save iteration 2 as it is the first iteration that has the median correlation
# Global matplotlib tick settings.
plt.rcParams.update({
    'xtick.major.size': 10,
    'xtick.major.width': 1.5,
    'xtick.minor.size': 4,
    'xtick.minor.width': 1,
    'xtick.bottom': True,
    'ytick.left': True,
})

from scipy.stats import spearmanr
def downsample_counts(x, min_counts, iteration):
    """Downsample a count vector to a fixed total by multinomial sampling.

    Parameters
    ----------
    x : array-like
        Counts; they are normalised by their sum to give the sampling probabilities.
    min_counts : int
        Total number of counts in the returned vector.
    iteration : int
        Seed for the random draw; the same seed and inputs give the same result.

    Returns
    -------
    numpy.ndarray
        Integer vector of the same length as ``x`` whose entries sum to ``min_counts``.
    """
    prob = x/np.sum(x)
    # A private legacy generator seeded with `iteration` produces the same draw as
    # reseeding the global generator, but leaves the global random state untouched.
    rng = np.random.RandomState(iteration)
    return rng.multinomial(n=min_counts, pvals = prob, size = 1)[0]

def cpm(x):
    """Rescale ``x`` to counts per million: each entry divided by the total of ``x``, times 1,000,000."""
    total = np.sum(x)
    return x / total * 1000000

include = ["Sncg", "L5/6 NP", "Lamp5 Lhx6", "L6b", "L6 CT", "Lamp5", "Sst", "Vip", "Pvalb", "L4 IT", "L6 IT", "L5 IT", "L2/3 IT"]
import os
# One row per iteration: [iteration, Spearman rho, p-value].
final_out = []
df_save = 0
for iteration in range(1, 101):
    # Files of this round end in e.g. "Round7.txt"; the "Round" prefix keeps
    # round 1 from also matching rounds 11, 21, ...
    round_suffix = "Round" + str(iteration) + ".txt"
    df = None
    for file in os.listdir("Downsamplings_Control"):
        if round_suffix in file:
            v = pd.read_csv("Downsamplings_Control/" + file, sep = "\t").set_index("featurekey")
            # Label every column as "<cell type>-<sample>".
            sample_id = file.replace("-annotated_matrix_Size50_" + round_suffix, "")
            v.columns = [x + "-" + sample_id for x in list(v.columns)]
            # Downsample every cell type of this sample to the smallest column total.
            lowest_counts = int(v.sum(axis = 0).min())
            # NOTE(review): the seed passed here is the constant 1 for every column and
            # every round, not the loop's `iteration` -- confirm this is intended.
            v = v.apply(downsample_counts, axis = 0, min_counts = lowest_counts, iteration = 1)
            # After downsampling, all columns must share the same total.
            assert v.sum(axis = 0).nunique() == 1
            df = v.copy() if df is None else df.join(v, how = "outer")
    if df is None:
        raise FileNotFoundError("No files ending in " + round_suffix + " found in Downsamplings_Control")
    out = []
    for i in include:
        # Match the cell type exactly: columns are "<cell type>-<sample>", and a plain
        # substring test would also put the "Lamp5 Lhx6-..." columns under "Lamp5".
        keep_cols = [j for j in df.columns if j.startswith(i + "-")]
        df_cur = df[keep_cols]
        # Keep genes whose mean count across samples is at least 25, then convert to CPM.
        df_cur = df_cur[df_cur.mean(axis = 1) >= 25]
        df_cur = df_cur.apply(cpm, axis = 0)
        # Correlate every sample with the across-sample mean profile.
        centroid = df_cur.mean(axis = 1)
        rhos = [spearmanr(df_cur[j], centroid)[0] for j in keep_cols]
        out.append([i, np.mean(rhos), np.median(rhos)])
    df_plot = pd.DataFrame(out)
    df_plot.columns = ["Cell type", "Mean", "Median"]

    df_plot = df_plot.set_index("Cell type")
    df_plot = df_plot.join(df_prop)
    if iteration == 2:
        df_save = df_plot.copy()
    # Relate cell-type abundance (log10 proportion) to 1 - mean correlation with the centroid.
    spear = spearmanr(np.log10(df_plot["Proportion"]), 1-df_plot["Mean"])
    final_out.append([iteration, spear[0], spear[1]])
# Columns: 0 = iteration, 1 = Spearman rho, 2 = p-value.
df = pd.DataFrame(final_out)

In [None]:
#Write out iteration 2 for plotting later
# df_save is the per-cell-type table (Mean and Median correlation joined with the
# proportion table) captured at iteration 2; it is written tab-separated.
df_save.to_csv("WithinHuman_ToPlot.txt", sep = "\t")

In [None]:
#Print the median p-value etc.
# df has one row per iteration: column 1 is the Spearman correlation, column 2 its p-value.
print(np.median(df[1]))
print(np.median(df[2]))