In [1]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import napari
from scipy.sparse import issparse
from scipy.stats import ranksums, false_discovery_control
from tqdm import tqdm
import pickle
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
%matplotlib qt

In [4]:
def hex2rgb(hexstr):
    return tuple(int(hexstr[i:i+2], 16) for i in (0, 2, 4))


def compute_pearson_residuals(X, theta=100.0, clip=None, 
                              copy=False, return_params=False):
    """from dynamo-release"""

    """Compute Pearson residuals from count data.

    Pearson residuals are a measure of the deviation of observed counts from expected counts under a Poisson or negative
    binomial model.

    Args:
        X: array_like count matrix, shape (n_cells, n_genes).
        theta: the dispersion parameter for the negative binomial model. Must be positive.
        clip: The maximum absolute value of the residuals. Residuals with absolute value larger than `clip` are clipped
            to `clip`. If `None`,`clip` is set to the square root of the number of cells in `X`.
        check_values: whether to check if `X` contains non-negative integers. If `True` and non-integer values are
            found, a `UserWarning` is issued.
        copy: whether to make a copy of `X`.

    Returns:
        The Pearson residuals.
    """
    X = X.copy() if copy else X

    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError("Pearson residuals require theta > 0")
    # prepare clipping
    if clip is None:
        n = X.shape[0]
        clip = np.sqrt(n)
    if clip < 0:
        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

    if issparse(X):
        sums_genes = np.sum(X, axis=0)
        sums_cells = np.sum(X, axis=1)
        sum_total = np.sum(sums_genes).squeeze()
    else:
        sums_genes = np.sum(X, axis=0, keepdims=True)
        sums_cells = np.sum(X, axis=1, keepdims=True)
        sum_total = np.sum(sums_genes)

    mu = np.array(sums_cells @ sums_genes / sum_total)
    diff = np.array(X - mu)
    residuals = diff / np.sqrt(mu + mu**2 / theta)

    # clip
    residuals = np.clip(residuals, a_min=-clip, a_max=clip)
    
    if return_params:
        sigma = np.sqrt(mu + mu**2 / theta)
        return residuals, mu, sigma
    else:
        return residuals

    
def logxp1(counts):
    """return normalized log(x+1) transform of cellxgene matrix of raw counts"""
    return np.log(counts / np.expand_dims(np.sum(counts, axis=1), axis=1) + 1)
    
    
def bin_aps(aps, bins):
    _counts, bins = np.histogram(aps, bins)
    bins = bins[1:]
    binned_aps = np.zeros_like(aps)
    for i in range(len(binned_aps)):
        binned_aps[i] = get_ap_bin(aps[i], bins)
    
    return binned_aps
    

def get_ap_bin(this_ap, bins):
    this_bin = np.where(np.abs(this_ap - bins) == np.nanmin(np.abs(this_ap - bins)))[0][0]

    return this_bin


def bin_exp(exp, binned_ys, y_bins, n_bootstraps=300):
    binned_exp = np.zeros_like(y_bins)
    std_exp = np.zeros_like(y_bins)
    for i in range(len(y_bins)):
        these_expression_levels = exp[y_bins[binned_ys.astype('int')] == y_bins[i]]
        binned_exp[i] = np.nanmean(these_expression_levels)
        bootstrapped_means = np.zeros(n_bootstraps)
        for n in range(n_bootstraps):
            these_expression_levels = np.random.choice(these_expression_levels, len(these_expression_levels))
            bootstrapped_means[n] = np.mean(these_expression_levels)
        std_exp[i] = np.nanstd(bootstrapped_means)
    return binned_exp, std_exp


def get_filtered_index(index, gene_list, filtered_gene_list):
    this_gene = gene_list[index]
    filtered_index = np.where(np.array([g == this_gene for g in filtered_gene_list]))[0][0]
    
    return filtered_index
    
    
def get_binned_line_dist(index, residuals, binned_ys, y_bins, gene_list, filtered_gene_list):
    filtered_index = get_filtered_index(index, gene_list, filtered_gene_list)
    expression = residuals[:, filtered_index]
    binned_expression, _ = bin_exp(expression, binned_ys, y_bins)
    
    return binned_expression
    

def get_line_dist_uncertainty(index, residuals, binned_ys, y_bins, gene_list, filtered_gene_list, n_bootstraps=100):
    line_dist_arr = np.zeros((n_bootstraps, len(y_bins)))
    for i in range(n_bootstraps):
        scrambled_ys = np.random.choice(binned_ys, len(binned_ys))
        line_dist_arr[i] = get_binned_line_dist(index, residuals, scrambled_ys, y_bins, gene_list, filtered_gene_list)

    return np.nanmean(line_dist_arr, axis=0), np.nanstd(line_dist_arr, axis=0)
        

def compute_anterior_middle_difference(gene_id, X, 
                                  binned_ys, y_bins, 
                                  gene_list, filtered_gene_list, 
                                  n_bootstraps=300, a_start=-113.0, 
                                  a_end=-63, m_start=0, m_end=50):
    
    exp = X[:, get_filtered_index(gene_id, gene_list, 
                                          filtered_gene_list)]
    binned_exp, std_exp = bin_exp(exp, binned_ys, y_bins, 
                                  n_bootstraps=n_bootstraps)
    mean_scramble, std_scramble = get_line_dist_uncertainty(
                gene_id, X, binned_ys, y_bins, gene_list, 
                filtered_gene_list, n_bootstraps=n_bootstraps)
    
    anterior_ids = (y_bins >= a_start) & (y_bins < a_end)
    middle_ids = (y_bins >= m_start) & (y_bins < m_end)

    anterior_signal = np.mean(binned_exp[anterior_ids])
    middle_signal = np.mean(binned_exp[middle_ids])
    
    difference = anterior_signal - middle_signal
    sigma_anterior = np.sqrt(np.mean(std_exp[anterior_ids] ** 2) / np.sum(anterior_ids))
    sigma_middle = np.sqrt(np.mean(std_exp[middle_ids] ** 2))
    sigma_difference = np.sqrt((sigma_anterior ** 2 + sigma_middle **2) / np.sum(middle_ids))
    

    anterior_scramble = np.mean(mean_scramble[anterior_ids])
    middle_scramble = np.mean(mean_scramble[middle_ids])
    
    scramble_difference = anterior_scramble - middle_scramble
    sigma_scramble_anterior = np.sqrt(np.mean(std_scramble[anterior_ids] ** 2) / np.sum(anterior_ids))
    sigma_scramble_middle = np.sqrt(np.mean(std_scramble[middle_ids] ** 2) / np.sum(middle_ids))
    sigma_scramble_difference = np.sqrt((sigma_scramble_anterior ** 2 + sigma_scramble_middle **2))
    
    
    
    return difference, sigma_difference, scramble_difference, sigma_scramble_difference

        

    

In [5]:
"""load the anndata file"""
file_path = r'/media/brandon/Data2/Brandon/fly_immune/Flysta3d/L3_b_count_normal_stereoseq.h5ad'
f = h5py.File(file_path, 'r')
f.keys()

<KeysViewHDF5 ['X', 'layers', 'obs', 'obsm', 'uns', 'var']>

In [6]:
"""extract the raw reads and processes"""
raw_reads =  np.array(f['layers'].get('raw_counts'))

# filter reads to 5% detection
detection_percent = np.sum(raw_reads > 0, axis=0) / len(raw_reads)
filter_sel = detection_percent > 0.05
filtered_reads = raw_reads[:, filter_sel]

# convert to pearson residuals
residuals = compute_pearson_residuals(filtered_reads, theta=10)

  raw_reads =  np.array(f['layers'].get('raw_counts'))
  residuals = diff / np.sqrt(mu + mu**2 / theta)


In [8]:
"""get selection of just fat body cells"""
a = np.array(f['obs'].get('annotation'))
sel = (a == 4)

  a = np.array(f['obs'].get('annotation'))


In [9]:
"""get the y coordinates of fat body cells"""
spatial = np.array(f['obsm'].get('spatial'))
spatial = spatial[sel, :]

ys = spatial[:, 1]
print(f'fat body cells range from y={np.min(ys)} to y={np.max(ys)}')

fat body cells range from y=-178.21475 to y=184.0076


  spatial = np.array(f['obsm'].get('spatial'))


In [10]:
"""load the gene names and convert to list"""
gene_ids = f['var'].get('geneID')
gene_indices = np.arange(len(gene_ids))

"""convert gene_ids to more convenient lists"""
gene_list = [gene.decode() for gene in gene_ids]
filtered_gene_list = [gene.decode() for gene in gene_ids[filter_sel]]

In [12]:
# parameters for ap bins
n_y_bins = 30
y_bins = np.linspace(np.min(ys), np.max(ys), n_y_bins)
binned_ys = bin_aps(ys, y_bins)



# Identify genes with positive or negative anterior bias

In [27]:
run = False

In [28]:
"""y coordinates that define the start and end of anterior and middle regions"""
a_start = -113
a_end = -113 + 50

m_start = 0
m_end = 50


In [15]:
"""run the computation of residual differences. Takes ~2.5 hrs"""
if run:
    all_differences = np.zeros(len(filtered_gene_list))
    all_sigma_differences = np.zeros(len(filtered_gene_list))
    all_scramble_differences = np.zeros(len(filtered_gene_list))
    all_sigma_scramble_differences = np.zeros(len(filtered_gene_list))


    for i in tqdm(range(len(filtered_gene_list))):
        this_id = np.where(np.array([g == filtered_gene_list[i] for g in gene_list]))[0][0]
        all_differences[i], 
        all_sigma_differences[i], 
        all_scramble_differences[i], 
        all_sigma_scramble_differences[i] = compute_anterior_middle_difference(this_id, residuals[sel], 
                                      binned_ys, y_bins, 
                                      gene_list, filtered_gene_list, 
                                      n_bootstraps=10, a_start=a_start, 
                                      a_end=a_end, m_start=m_start, m_end=m_end)

NameError: name 'filtered_gene_list' is not defined

In [34]:
"""load the results"""
with open(r'/media/brandon/Data2/Brandon/fly_immune/Flysta3d/anterior-middle-differences.pkl', 'rb') as file:
    all_differences, all_sigma_differences, all_scramble_differences, all_sigma_scramble_differences  =  pickle.load(file)

In [51]:
"""filter the results"""
# filter by error bars
high_expressing = all_differences - all_sigma_differences > (all_scramble_differences + all_sigma_scramble_differences)
low_expressing = all_differences + all_sigma_differences < (all_scramble_differences - all_sigma_scramble_differences)

# filter by effect size
difference_thresh = 2
high_expressing = high_expressing * (all_differences > difference_thresh)
low_expressing = low_expressing * (all_differences < -1 * difference_thresh)

# extract the gene names that survived the filter
high_expressing_genes = [filtered_gene_list[i] for i in range(len(filtered_gene_list)) if high_expressing[i]]
low_expressing_genes = [filtered_gene_list[i] for i in range(len(filtered_gene_list)) if low_expressing[i]]

In [52]:
"""print the gene names"""
for g in high_expressing_genes:
    this_id = np.where(np.array([gg == g for gg in gene_list]))[0][0]

    print(f'{gene_indices[this_id]}, {g}')

1902, CG14302
2329, CG15369
4737, CG42587
5979, CG6870
6276, CG7953
7808, Fbp1
8682, LysS
10822, TotA
12109, deltaTry
12787, lncRNA:CR33938
12797, lncRNA:CR34335
12802, lncRNA:CR40469
15171, regucalcin


In [53]:
"""print the gene names"""
for g in low_expressing_genes:
    this_id = np.where(np.array([gg == g for gg in gene_list]))[0][0]

    print(f'{gene_indices[this_id]}, {g}')

1104, CG12116
1211, CG12522
1254, CG12699
1628, CG13641
2548, CG16713
2603, CG16926
2729, CG17376
3296, CG30430
3409, CG31226
3440, CG31313
3648, CG31988
3672, CG32073
4216, CG34166
4218, CG34168
5341, CG46059
5418, CG4716
5700, CG5773
6918, CR43186
8671, Lsp1beta
8672, Lsp1gamma
8673, Lsp2
8896, Mst84Db
8898, Mst84Dd
8900, Mst87F
9229, ORY
10150, S-Lap1
10153, S-Lap4
11345, asRNA:CR11538
14921, ocn
15101, pre-rRNA:CR45856


In [44]:
#this_id = 10822    # totA
#this_id = 15171    # regulacin
#this_id = 8673 
this_id = 7604      # dptA
expression = residuals[sel, get_filtered_index(this_id, gene_list, filtered_gene_list)]
binned_exp, std_exp = bin_exp(expression, binned_ys, y_bins)
mean_scramble, std_scramble = get_line_dist_uncertainty(this_id, residuals[sel], binned_ys, y_bins, gene_list, filtered_gene_list, n_bootstraps=10)
m = mean_scramble
s = std_scramble
l = m - s
u = m + s

plt.figure()
plt.fill_between(y_bins, l, u, color='k', alpha=0.3)
plt.errorbar(y_bins, binned_exp, std_exp, linewidth=4, color='c')
#plt.errorbar(y_bins, binned_dptA, bootstrapped_uncertainty)

plt.xlabel('ap')
plt.ylabel(f'{gene_list[this_id]}')
#plt.xlim([-113, np.max(ys)])
#plt.ylim([-1.0, 2.5])

  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  return np.nanmean(line_dist_arr, axis=0), np.nanstd(line_dist_arr, axis=0)


Text(0, 0.5, 'DptA')

## Make plots of all up and down regulated gene line dists

In [57]:
"""up regulated"""
plt.figure()

for i, g in tqdm(enumerate(high_expressing_genes)):
    this_id = np.where(np.array([gg == g for gg in gene_list]))[0][0]
    expression = residuals[sel, get_filtered_index(this_id, gene_list, filtered_gene_list)]
    binned_exp, _ = bin_exp(expression, binned_ys, y_bins, n_bootstraps=0)
    plt.plot(y_bins, binned_exp - np.nanmin(binned_exp), linewidth=4)



  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_e

In [58]:
"""down regulated"""
plt.figure()

for i, g in tqdm(enumerate(low_expressing_genes)):
    this_id = np.where(np.array([gg == g for gg in gene_list]))[0][0]
    expression = residuals[sel, get_filtered_index(this_id, gene_list, filtered_gene_list)]
    binned_exp, _ = bin_exp(expression, binned_ys, y_bins, n_bootstraps=0)
    plt.plot(y_bins, binned_exp - np.nanmin(binned_exp), linewidth=4)



  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_e

  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
30it [00:00, 235.71it/s]


## Standard approach
Normalize by read depth per cell and logp1 transform

In [13]:
run = True

In [14]:
"""construct array of normalized expression levels filtered for detection and fat body markers"""
raw_reads =  np.array(f['layers'].get('raw_counts'))

norm_reads = raw_reads / np.expand_dims(np.sum(raw_reads, axis=1), axis=1)

X = logxp1(raw_reads)

# filter reads to 5% detection
detection_percent = np.sum(raw_reads > 0, axis=0) / len(raw_reads)
filter_sel = detection_percent > 0.05
filtered_reads = raw_reads[:, filter_sel]

X_filtered = X[:, filter_sel]
X_fb = X_filtered[sel]
norm_reads_filtered = norm_reads[:, filter_sel]
norm_reads_fb = norm_reads_filtered[sel]

  raw_reads =  np.array(f['layers'].get('raw_counts'))


In [16]:
# check variance-mean relationship.
plt.figure()
plt.plot(np.mean(X_fb, axis=0), np.var(X_fb, axis=0), 'ko', markerfacecolor='c', markersize=4, alpha=0.1)
x = np.logspace(-5, -1)
plt.plot(x, x, 'm--', linewidth=2)
plt.xscale('log')
plt.yscale('log')
plt.xlabel('mean count')
plt.ylabel('var count')

Text(0, 0.5, 'var count')

In [15]:
"""y coordinates that define the start and end of anterior and middle regions"""
a_start = -113
a_end = a_start + 70

m_start = 0
m_end = m_start + 70

anterior_ids = (ys >= a_start) & (ys < a_end)
middle_ids = (ys >= m_start) & (ys < m_end)

print(np.sum(anterior_ids))
print(np.sum(middle_ids))

1079
1470


In [16]:
"""run the computation of differential expression analysis. """
n_bootstraps = 10
if run:
    all_logfc = np.zeros(len(filtered_gene_list))
    all_pvalues = np.zeros(len(filtered_gene_list))
    all_means = np.zeros(len(filtered_gene_list))
    all_uncertainties = np.zeros(len(filtered_gene_list))
    all_scrambled_means = np.zeros(len(filtered_gene_list))
    all_scrambled_uncertainties = np.zeros(len(filtered_gene_list))
    all_middle_means = np.zeros(len(filtered_gene_list))
    all_middle_uncertainties = np.zeros(len(filtered_gene_list))
    for i in tqdm(range(len(filtered_gene_list))):
        anterior_expression = X_fb[anterior_ids, i]
        middle_expression = X_fb[middle_ids, i]
        _, all_pvalues[i] = ranksums(anterior_expression, middle_expression)
        #all_logfc[i] = np.log(np.mean(anterior_expression) / np.mean(middle_expression))
        
        anterior_norm = norm_reads_fb[anterior_ids, i]
        middle_norm = norm_reads_fb[middle_ids, i]
        all_logfc[i] = np.log2(np.mean(anterior_norm) / np.mean(middle_norm))
        
        # real mean
        all_means[i] = np.mean(anterior_norm)

        # get real uncertainty
        bootstrapped_mean = np.zeros(n_bootstraps)
        for n in range(n_bootstraps):
            bootstrapped_mean[n] = np.mean(np.random.choice(anterior_norm, len(anterior_ids)))
        
        all_uncertainties[i] = np.std(bootstrapped_mean)

        # real middle mean
        all_middle_means[i] = np.mean(middle_norm)

        # get real middle uncertainty
        bootstrapped_mean = np.zeros(n_bootstraps)
        for n in range(n_bootstraps):
            bootstrapped_mean[n] = np.mean(np.random.choice(middle_norm, len(anterior_ids)))
        
        all_middle_uncertainties[i] = np.std(bootstrapped_mean)
        
        # get scrambled mean and uncertainty
        bootstrapped_mean = np.zeros(n_bootstraps)
        for n in range(n_bootstraps):
            bootstrapped_mean[n] = np.mean(np.random.choice(norm_reads_fb[:, i], len(anterior_ids)))
        
        all_scrambled_means[i] = np.mean(bootstrapped_mean)
        all_scrambled_uncertainties[i] = np.std(bootstrapped_mean)
        
            




100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 6505/6505 [00:16<00:00, 401.84it/s]


In [17]:
lfc_thresh = 1#0.5#np.log2(2)
p_thresh = 0.05 / len(filtered_gene_list)
all_corrected_pvalues = all_pvalues #false_discovery_control(all_pvalues)
up_sel = (all_corrected_pvalues < p_thresh) & (all_logfc > lfc_thresh)
down_sel = (all_corrected_pvalues < p_thresh) & (all_logfc < -lfc_thresh)
diffex_sel = (all_corrected_pvalues < p_thresh) & (np.abs(all_logfc) > lfc_thresh)

In [18]:
plt.figure()
plot_x = all_logfc[diffex_sel]
plot_y = all_pvalues[diffex_sel]
plt.plot(plot_x, -np.log10(plot_y), 'ko', markersize=6, alpha=0.5)
plt.yscale('linear')


  plt.plot(plot_x, -np.log10(plot_y), 'ko', markersize=6, alpha=0.5)


In [23]:
"""assemble results into a dataframe"""
df = pd.DataFrame()
df['gene'] = filtered_gene_list
df['p_val'] = all_pvalues
df['log2fc'] = all_logfc
df['anterior_mean'] = all_means
df['anterior_uncertainty'] = all_uncertainties
df['middle_mean'] = all_middle_means
df['middle_uncertainty'] = all_middle_uncertainties
df['scrambled_mean'] = all_scrambled_means
df['scrambled_uncertainty'] = all_scrambled_uncertainties

In [26]:
"""filter on p value, fold change, and expression relative to a uniform null"""
anterior_df = df[(df.p_val < p_thresh) & (df.log2fc > lfc_thresh) & (df.anterior_mean - df.anterior_uncertainty > 1.5 * (df.scrambled_mean + df.scrambled_uncertainty))]
middle_df = df[(df.p_val < p_thresh) & (df.log2fc < -lfc_thresh) & (df.middle_mean - df.middle_uncertainty > 1.5 * (df.scrambled_mean + df.scrambled_uncertainty))]


In [27]:
"""futher filter by cross referencing a list of genes expressed in the larval fat body by bulk RNA seq"""
bulk_df = pd.read_excel(r'/home/brandon/Downloads/GSE95800_GEO_upload_processed_data.xlsx')
bulk_df

Unnamed: 0,Gene,r4gal4_v60100_0_15m_1,r4gal4_v60100_0_15m_2,r4gal4_v60100_0_15m_3,r4gal4_svpi_0_15m_1,r4gal4_svpi_0_15m_2,r4gal4_svpi_0_15m_3,r4gal4_v60100_0_7m_1,r4gal4_v60100_0_7m_2,r4gal4_v60100_0_7m_3,r4gal4_svpi_0_7m_1,r4gal4_svpi_0_7m_2,r4gal4_svpi_0_7m_3
0,Alas,288.58000,163.1200,186.1700,226.95000,205.9600,177.2200,320.1900,199.9600,319.3900,193.55000,222.2200,172.1100
1,alc,130.61000,146.0300,155.2300,163.78000,163.0600,161.8100,185.6400,155.1600,141.8000,218.14000,202.9800,215.9600
2,AlCR2,15.39500,1.3285,6.2577,0.96298,9.4032,10.1400,10.6350,7.7955,8.2266,2.83440,2.3485,2.5456
3,ald,48.98900,34.5790,19.3240,36.49200,41.1770,45.5720,35.2910,44.9340,33.9270,38.86500,26.7900,22.7610
4,Ald,48.98900,34.5790,19.3240,36.49200,41.1770,45.5720,2595.8000,1966.5000,2609.0000,1851.70000,1638.1000,1829.1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522,zuc,0.82583,1.1566,4.0080,2.95120,5.0812,4.9883,3.2505,3.8929,7.1624,0.87707,7.8542,4.2576
9523,Zw,85.34800,111.5100,145.1500,67.09700,88.9270,37.2600,200.5100,139.5100,150.0000,154.82000,140.5700,186.4200
9524,zwilch,35.36500,30.2110,36.8080,18.72800,29.7150,21.3560,24.6340,35.1830,24.6810,28.38700,26.5840,19.3860
9525,zye,339.24000,365.5200,306.0800,259.56000,183.2200,110.6100,247.6900,231.2900,230.1000,146.30000,148.5000,172.6400


In [31]:
"""extract list of genes in the bulk data set. replace known name discrepencies"""
bulk_genes = bulk_df.Gene.to_list()
bulk_genes = list(map(lambda x: x.replace('Dpt', 'DptA'), bulk_genes))

In [34]:
anterior_df = anterior_df[[g in bulk_genes for g in anterior_df.gene.to_list()]]
middle_df = middle_df[[g in bulk_genes for g in middle_df.gene.to_list()]]

In [35]:
anterior_df

Unnamed: 0,gene,p_val,log2fc,anterior_mean,anterior_uncertainty,middle_mean,middle_uncertainty,scrambled_mean,scrambled_uncertainty
99,Adh,1.606079e-23,1.636547,0.000104,2.250335e-06,0.000034,1.436296e-06,0.000065,0.000002
141,Amun,3.333921e-07,1.399808,0.000047,1.936108e-06,0.000018,1.245992e-06,0.000026,0.000002
230,B52,5.202520e-20,1.012013,0.000129,2.444933e-06,0.000064,2.490844e-06,0.000078,0.000002
240,Bace,3.310686e-25,2.262314,0.000103,1.920798e-06,0.000022,1.207054e-06,0.000059,0.000001
244,Bap55,4.707239e-06,1.424649,0.000036,5.932932e-07,0.000013,7.158474e-07,0.000020,0.000001
...,...,...,...,...,...,...,...,...,...
6449,vkg,2.098643e-10,1.100035,0.000061,2.100971e-06,0.000029,1.209043e-06,0.000036,0.000002
6482,x16,4.566345e-11,1.321541,0.000060,1.845769e-06,0.000024,2.343774e-06,0.000031,0.000002
6493,yps,1.593800e-25,1.124519,0.000148,3.077616e-06,0.000068,1.905147e-06,0.000079,0.000001
6495,ytr,9.760874e-08,1.197079,0.000049,1.226140e-06,0.000021,1.645635e-06,0.000022,0.000001


In [40]:
for g in anterior_df.gene.to_list():
    print(g + ',')

Adh,
Amun,
B52,
Bace,
Bap55,
Bin1,
Brd,
CG11300,
CG11370,
CG11459,
CG12115,
CG12310,
CG13044,
CG13460,
CG13461,
CG13947,
CG14265,
CG14302,
CG14332,
CG14852,
CG15093,
CG15201,
CG17278,
CG17362,
CG2310,
CG30122,
CG31460,
CG31789,
CG32198,
CG33333,
CG42748,
CG42823,
CG42834,
CG43117,
CG5789,
CG6870,
CG7290,
CG7637,
CG7953,
CG8087,
CG8289,
CG8661,
CG8664,
CG8929,
CG8997,
CG9135,
CG9672,
CG9682,
CG9686,
Chrac-16,
Cirl,
Cont,
CtBP,
D1,
Df31,
DptA,
EndoG,
Ent2,
Fas1,
Fas3,
Fbp1,
GstD1,
HP4,
Hel25E,
His2Av,
His4r,
HmgD,
Hsp67Ba,
ImpL2,
Lam,
LysS,
Nxt1,
O-fut1,
Obp44a,
Obp56d,
Pebp1,
Phae1,
Phae2,
Pig1,
Prp8,
RpII140,
Set,
Sin3A,
SmD2,
SmF,
Srp54,
Tom,
Top2,
TotA,
Tsp42Ee,
Uba2,
baf,
bnb,
bowl,
brat,
cg,
crol,
deltaTry,
drl,
e(y)2,
ed,
emc,
fax,
hdc,
jigr1,
kuk,
kune,
l(3)neo38,
miple2,
mod,
ng3,
nocte,
ogre,
pck,
pnut,
pot,
pros,
psq,
pzg,
regucalcin,
serp,
smt3,
stai,
vkg,
x16,
yps,
ytr,
zf30C,


In [36]:
middle_df

Unnamed: 0,gene,p_val,log2fc,anterior_mean,anterior_uncertainty,middle_mean,middle_uncertainty,scrambled_mean,scrambled_uncertainty
16,7SLRNA:CR32864,1.302461e-18,-2.362382,2.6e-05,6.589371e-07,0.000133,3e-06,8.1e-05,4e-06
17,7SLRNA:CR42652,1.425579e-13,-2.297837,2.4e-05,1.508762e-06,0.000119,4e-06,6.8e-05,3e-06
573,CG12229,6.53944e-07,-1.776069,2.1e-05,6.789324e-07,7.4e-05,2e-06,4.6e-05,2e-06
606,CG12605,5.901773e-06,-1.799067,2.2e-05,8.576129e-07,7.7e-05,3e-06,4.7e-05,2e-06
935,CG14995,1.284945e-06,-1.456249,4.1e-05,2.083878e-06,0.000112,3e-06,6.9e-05,3e-06
1088,CG16719,8.069228e-07,-1.907928,2e-05,9.124538e-07,7.3e-05,2e-06,4.4e-05,2e-06
1276,CG18568,6.744778e-09,-1.473717,3.6e-05,1.507184e-06,9.9e-05,3e-06,6.2e-05,2e-06
1320,CG2127,1.124712e-15,-1.509781,8.4e-05,1.411644e-06,0.000239,6e-06,0.000148,3e-06
1446,CG31029,1.79869e-16,-2.015389,4.1e-05,2.036882e-06,0.000164,4e-06,9e-05,3e-06
1517,CG31538,4.990743999999999e-19,-1.969059,6.2e-05,1.911879e-06,0.000243,5e-06,0.000136,5e-06


In [37]:
len(middle_df)

30

In [41]:
np.where([g == 'jigr1' for g in gene_list])[0][0]

12588

In [101]:
"""plot example genes"""
this_id = np.where([gg == 'pros' for gg in gene_list])[0][0]
#this_id = 12588
#expression = X[sel, this_id]
expression = norm_reads[sel, this_id]

plt.figure()
plt.plot(ys, expression, 'ko', alpha=0.5)

[<matplotlib.lines.Line2D at 0x7f431a108520>]

In [102]:
"""plot example binned dist"""

binned_exp, std_exp = bin_exp(expression, binned_ys, y_bins)
#mean_scramble, std_scramble = get_line_dist_uncertainty(this_id, X_fb, binned_ys, y_bins, gene_list, filtered_gene_list, n_bootstraps=10)
mean_scramble, std_scramble = get_line_dist_uncertainty(this_id, norm_reads[sel], binned_ys, y_bins, gene_list, filtered_gene_list, n_bootstraps=10)

m = mean_scramble
s = std_scramble
l = m - s
u = m + s

plt.figure()
plt.fill_between(y_bins, l, u, color='k', alpha=0.3)
plt.errorbar(y_bins, binned_exp, std_exp, linewidth=4, color='c')
#plt.errorbar(y_bins, binned_dptA, bootstrapped_uncertainty)

plt.xlabel('ap')
plt.ylabel('binned expression')
#plt.xlim([-113, np.max(ys)])

  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  return np.nanmean(line_dist_arr, axis=0), np.nanstd(line_dist_arr, axis=0)


Text(0, 0.5, 'binned expression')

In [57]:
panther_df = pd.read_csv('/home/brandon/Downloads/pantherGeneList(2).txt', sep="\t", header=None)
panther_df = panther_df.rename({0: 'gene_id', 1: 'gene', 2: 'long_name', 3: 'protein_family', 4: 'protein_class'}, axis='columns')
panther_df

Unnamed: 0,gene_id,gene,long_name,protein_family,protein_class
0,DROME|FlyBase=FBgn0004362|UniProtKB=Q05783,HmgD,High mobility group protein D;HmgD;PTN00071477...,"TRANSCRIPTION FACTOR A, MITOCHONDRIAL (PTHR481...",HMG box transcription factor(PC00024)
1,DROME|FlyBase=FBgn0035829|UniProtKB=Q8IQ92,HP4,FI01426p;HP4;;orthologs,,
2,DROME|FlyBase=FBgn0014163|UniProtKB=Q95RI5,fax,Failed axon connections;fax;PTN000265169;ortho...,FAILED AXON CONNECTIONS (PTHR12289:SF78),transporter(PC00227)
3,DROME|FlyBase=FBgn0004587|UniProtKB=P26686,B52,Serine-arginine protein 55;B52;PTN002751804;or...,SERINE-ARGININE PROTEIN 55 (PTHR23003:SF51),RNA splicing factor(PC00148)
4,DROME|FlyBase=FBgn0029506|UniProtKB=Q7KJ73,Tsp42Ee,Tetraspanin;Tsp42Ee;PTN000445347;orthologs,IP01817P-RELATED (PTHR19282:SF521),scaffold/adaptor protein(PC00226)
...,...,...,...,...,...
84,DROME|FlyBase=FBgn0033313|UniProtKB=A1Z7G7,Cirl,Latrophilin Cirl;Cirl;PTN000244766;orthologs,LATROPHILIN CIRL (PTHR12011:SF347),G-protein coupled receptor(PC00021)
85,DROME|FlyBase=FBgn0000055|UniProtKB=P00334,Adh,Alcohol dehydrogenase;Adh;PTN000672297;orthologs,ALCOHOL DEHYDROGENASE (PTHR42901:SF1),dehydrogenase(PC00092)
86,DROME|FlyBase=FBgn0034390|UniProtKB=Q9V8M5,CG15093,"Probable 3-hydroxyisobutyrate dehydrogenase, m...","3-HYDROXYISOBUTYRATE DEHYDROGENASE, MITOCHONDR...",dehydrogenase(PC00092)
87,DROME|FlyBase=FBgn0263234|UniProtKB=Q9VKA9,Phae1,AT26814p;Phae1;PTN000668564;orthologs,AT26814P-RELATED (PTHR24276:SF91),serine protease(PC00203)


In [64]:
def is_TF(x):
    return 'transcription' in x

In [72]:
'transcription' in classes[0]

True

In [82]:
classes = panther_df.protein_class.to_list()
classes = [str(c) for c in classes]
TFs = ['transcription' in c for c in classes]

tf_df = panther_df[TFs]
tf_df

Unnamed: 0,gene_id,gene,long_name,protein_family,protein_class
0,DROME|FlyBase=FBgn0004362|UniProtKB=Q05783,HmgD,High mobility group protein D;HmgD;PTN00071477...,"TRANSCRIPTION FACTOR A, MITOCHONDRIAL (PTHR481...",HMG box transcription factor(PC00024)
9,DROME|FlyBase=FBgn0020309|UniProtKB=O61360,crol,CROL ALPHA;crol;PTN000694590;orthologs,ZINC FINGER PROTEIN 239-LIKE ISOFORM X1 (PTHR2...,C2H2 zinc finger transcription factor(PC00248)
12,DROME|FlyBase=FBgn0259785|UniProtKB=Q9VP57,pzg,LD15904p;pzg;PTN001971830;orthologs,ENHANCER OF VARIEGATION 3-9-RELATED (PTHR24403...,C2H2 zinc finger transcription factor(PC00248)
15,DROME|FlyBase=FBgn0020496|UniProtKB=O46036,CtBP,C-terminal-binding protein;CtBP;PTN000107759;o...,C-TERMINAL-BINDING PROTEIN (PTHR46029:SF7),transcription cofactor(PC00217)
20,DROME|FlyBase=FBgn0039350|UniProtKB=Q9VBP5,jigr1,"Jing interacting gene regulatory 1, isoform A;...","JING INTERACTING GENE REGULATORY 1, ISOFORM A ...",DNA-binding transcription factor(PC00218)
31,DROME|FlyBase=FBgn0000618|UniProtKB=Q9VYX1,e(y)2,Enhancer of yellow 2 transcription factor;e(y)...,TRANSCRIPTION AND MRNA EXPORT FACTOR ENY2 (PTH...,DNA-binding transcription factor(PC00218)
35,DROME|FlyBase=FBgn0265276|UniProtKB=A0A0B4KGA3,l(3)neo38,"Lethal (3) neo38, isoform L;l(3)neo38;PTN00222...","LETHAL (3) NEO38, ISOFORM L (PTHR23235:SF75)",C2H2 zinc finger transcription factor(PC00248)
40,DROME|FlyBase=FBgn0263102|UniProtKB=A0A6M3Q7H0,psq,"Pipsqueak, isoform O;psq;PTN002350197;orthologs","PIPSQUEAK, ISOFORM O (PTHR23110:SF109)",DNA-binding transcription factor(PC00218)
41,DROME|FlyBase=FBgn0000289|UniProtKB=A8DYD1,cg,"Combgap, isoform L;cg;PTN002811048;orthologs","COMBGAP, ISOFORM L-RELATED (PTHR24388:SF53)",C2H2 zinc finger transcription factor(PC00248)
42,DROME|FlyBase=FBgn0000575|UniProtKB=P18491,emc,Protein extra-macrochaetae;emc;PTN000197694;or...,PROTEIN EXTRA-MACROCHAETAE (PTHR11723:SF17),DNA-binding transcription factor(PC00218)


In [100]:
tf_genes = tf_df.gene.to_list()
anterior_genes = anterior_df.gene.to_list()

plt.figure()
for i, g in enumerate(tf_genes):
    this_id = np.where([gg == g for gg in gene_list])[0][0]
    expression = norm_reads[sel, this_id]

    binned_exp, _ = bin_exp(expression, binned_ys, y_bins)
    
    plt.plot(y_bins, binned_exp / np.nanmax(binned_exp), linewidth=2, alpha=0.8)

plt.xlabel('ap')
plt.ylabel('binned expression')
plt.xlim([-130, 185])
plt.yscale('linear')

  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, 

  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  binned_exp[i] = np.nanmean(these_expression_levels)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [98]:
ys.max()

184.0076