In [None]:
##########################################################
#                                                        #
#        Genome-wide CRISPR screen in human T cells      #
#              reveals regulators of FOXP3               #
#                                                        #
##########################################################
#                                                        #
#                   Permutation test                     #
#                                                        #
##########################################################

In [None]:
# Libraries

import glob,os
import numpy as np
import scipy.stats as sp_stats
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from statsmodels.distributions.empirical_distribution import ECDF
import argparse
import fastcluster
from scipy.cluster.hierarchy import dendrogram
import seaborn as sns
from statsmodels.stats.multitest import multipletests
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score
from adjustText import adjust_text
import scanpy as sc
import pandas as pd
import re

In [None]:
# Load functions 
# From https://github.com/klarman-cell-observatory/Perturb-CITE-seq

def fit_lm(X, y, l1_ratio=0.5, alpha=0.0005, max_iter=10000, z_score=False):
    """Fit an elastic-net regression of ``y`` on ``X``.

    Parameters
    ----------
    X : array-like
        Predictor matrix handed to ``ElasticNet.fit``.
    y : array-like
        Response handed to ``ElasticNet.fit``; standardised first when
        ``z_score`` is true.
    l1_ratio, alpha, max_iter
        Forwarded unchanged to the ``ElasticNet`` constructor.
    z_score : bool
        When true, ``y`` is z-scored along axis 0 before fitting.

    Returns
    -------
    tuple
        ``(coef_, estimator)``: the fitted coefficients and the fitted
        ``ElasticNet`` instance they were read from.
    """
    response = sp_stats.zscore(y, axis=0) if z_score else y
    model = ElasticNet(
        precompute=True,
        l1_ratio=l1_ratio,
        alpha=alpha,
        max_iter=max_iter,
    )
    model.fit(X, response)
    return model.coef_, model


# Calculate null distributions for given covariate (specified by "cov_ind" in X matrix) [adapted]
def shuffle_and_fit_AG(X, y, cov_ind, ntc_ind, num_iters=1000):
    """Build a permutation null distribution for one covariate's coefficients.

    On each iteration the values of column ``cov_ind`` are permuted among the
    rows where either column ``cov_ind`` or column ``ntc_ind`` is >= 0.5.
    All other rows and all other columns are left untouched. The model is
    then refit with ``fit_lm`` (``alpha=0.0001``) and the coefficients of
    ``cov_ind`` are collected.

    Parameters
    ----------
    X : numpy.ndarray
        2-D predictor matrix. It is not modified; each iteration works on a copy.
    y : array-like
        Response passed to ``fit_lm``; its rows must line up with the rows of ``X``.
    cov_ind : int
        Column of ``X`` whose null distribution is being built.
    ntc_ind : int
        Column of ``X`` whose rows are pooled with the ``cov_ind`` rows for the
        shuffle (presumably the non-targeting-control indicator -- confirm).
    num_iters : int
        Number of shuffle-and-refit rounds.

    Returns
    -------
    numpy.ndarray
        1-D concatenation, over iterations, of ``coef[:, cov_ind]``; empty
        when ``num_iters`` is 0.
    """
    # The mask depends only on the unshuffled X, so compute it once.
    shuffle_rows = np.logical_or(X[:, cov_ind] >= 0.5, X[:, ntc_ind] >= 0.5)

    null_chunks = []
    for _ in range(num_iters):
        X_shuffled = X.copy()
        # Permute in place so every row of X_shuffled still corresponds to the
        # same row of y. Re-stacking masked and unmasked rows would reorder X
        # while y keeps its original order, misaligning the regression.
        X_shuffled[shuffle_rows, cov_ind] = np.random.permutation(X[shuffle_rows, cov_ind])
        lm_coefs, _model = fit_lm(X_shuffled, y, alpha=0.0001)
        null_chunks.append(lm_coefs[:, cov_ind].flatten())

    if not null_chunks:
        return np.array([])
    # Concatenate once instead of growing an array with np.append each round.
    return np.concatenate(null_chunks)

# Calculate empirical p values for a given covariate (specified by "cov_ind") given regulatory matrix and null distributions
def calc_p_vals(beta_mat, null_distrib, cov_ind):
    """Empirical one-sided p-values for one covariate's coefficients.

    Parameters
    ----------
    beta_mat : numpy.ndarray
        2-D coefficient matrix; column ``cov_ind`` holds the observed values.
    null_distrib : array-like
        Null sample used to build the empirical CDF.
    cov_ind : int
        Column of ``beta_mat`` to evaluate.

    Returns
    -------
    numpy.ndarray
        One p-value per entry of ``beta_mat[:, cov_ind]``. Negative
        coefficients get the ECDF value at the coefficient; coefficients
        >= 0 get one minus that value. Entries matching neither test
        (e.g. NaN) stay at 1.
    """
    observed = beta_mat[:, cov_ind].flatten()
    null_cdf = ECDF(null_distrib)

    below_zero = observed < 0
    at_or_above_zero = observed >= 0

    p_vals = np.ones(observed.size)
    # Lower tail for negative effects, upper tail otherwise.
    p_vals[below_zero] = null_cdf(observed[below_zero])
    p_vals[at_or_above_zero] = 1 - null_cdf(observed[at_or_above_zero])
    return p_vals


In [None]:
# Load data

# Covariate matrix: tab-separated, first column used as the row index.
X = pd.read_csv("Xsecond_matrix.txt", index_col=0, sep="\t")
X_array = X.to_numpy()

# Expression matrix indexed by "CellBarcode", re-indexed to the row labels of X
# so both matrices share the same row order.
expr_mtx = pd.DataFrame(
    pd.read_csv("Y_matrix.txt", index_col="CellBarcode", sep="\t"),
    index=X.index.values,
)

# Previously fitted coefficients; the array copy is transposed so that
# coeff_array[:, i] corresponds to row i of coeff_mtx.
coeff_mtx = pd.read_csv("Beta_matrix_LR4.txt", index_col=0, sep="\t")
coeff_array = coeff_mtx.to_numpy().T

In [None]:
# Permutation test over every gRNA covariate

SEED = 0               # fixes NumPy's global RNG so the permutation nulls are reproducible
FIRST_GRNA_COV = 7     # the first 7 columns of X don't correspond to gRNA
NTC_IND = 133          # column passed as ntc_ind (presumably the non-targeting control -- confirm)
NUM_PERMUTATIONS = 100

np.random.seed(SEED)

ncov = np.shape(X)[1]

# Valid column indices are 0 .. ncov-1, so the range must stop at ncov
# (an upper bound of ncov+1 indexes one column past the end of X_array).
cov_names = []
pval_columns = []
for cov in range(FIRST_GRNA_COV, ncov):
    null_d = shuffle_and_fit_AG(X_array, expr_mtx, cov, ntc_ind=NTC_IND, num_iters=NUM_PERMUTATIONS)
    pval_columns.append(calc_p_vals(coeff_array, null_d, cov))
    cov_names.append(coeff_mtx.index.values[cov])

# Assemble the table once: one column per covariate, one row per column of coeff_mtx.
pval_array = pd.DataFrame(
    np.column_stack(pval_columns) if pval_columns else None,
    index=coeff_mtx.columns.values,
    columns=cov_names,
)

pval_array.to_csv('PermutationTest_Pval.txt', sep='\t')