In [3]:
# !pip uninstall -y screenpro2
# # !pip install git+https://github.com/ArcInstitute/screenpro2.git@dev
# !pip install ScreenPro2

In [10]:
# Short condition codes (as they appear in the sample names) mapped to the
# display name of each drug treatment. Commented-out entries are conditions
# that are currently excluded.
drug_names = dict(
    Pi='PARPi',
    Ri='ATRi',
    Wi='WEE1i',
    # Mi='ATMi',
    # Ki='DNAPKi',
    PiRi='PARPi+ATRi',
    PiWi='PARPi+WEE1i',
    # PiMi='PARPi+ATMi',
    # PiKi='PARPi+DNAPKi',
)

In [5]:
from glob import glob

import numpy as np
import pandas as pd 
import anndata as ad
import scanpy as sc

import screenpro as scp

import matplotlib.pyplot as plt

# Configure scanpy's global figure parameters for this notebook.
sc.settings.set_figure_params(
    dpi=150,
    figsize=(3, 3),
    format='svg',
    color_map='RdGy',
    facecolor='white',
    frameon=False,
    vector_friendly=True,
)

In [6]:
import datetime

import matplotlib
import matplotlib.ticker as ticker

from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import font_manager as fm
from matplotlib import rcParams, rc_context

from screenpro.plotting._utils import almost_black, dark2


# Render figures through the cairo backend.
matplotlib.use('cairo')

# Register every .ttf font found in the environment's font directory so that
# Helvetica can be selected below.
# NOTE(review): absolute, machine-specific path — this cell only finds fonts
# on the original author's machine; consider making it configurable.
font_files = fm.findSystemFonts(
    fontpaths='/home/abea/miniconda3/envs/screenpro2/fonts/',
    fontext='ttf',
)
for font_file in font_files:
    fm.fontManager.addfont(font_file)


# {f.name for f in matplotlib.font_manager.fontManager.ttflist}

rcParams.update({
    'font.sans-serif': 'Helvetica',
    'font.family': ['Helvetica'],
    'figure.dpi': 140,
    # Type 42 (TrueType) font embedding for PDF/PS output.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

## Step 1: FASTQ processing


In [7]:
# Read the sgRNA library annotation table (tab-separated) into a DataFrame.
library_table = pd.read_csv(
    'CRISPRa_v2_human_librarytable.txt.gz',
    sep='\t',
    index_col=False,
    low_memory=False,
)

In [8]:
# Sample names are the FASTQ file names with the directory part and the
# '.fastq.gz' extension removed, in sorted order.
samples = sorted(
    fastq_path.split('/')[-1].replace('.fastq.gz', '')
    for fastq_path in glob('fastq/A549_CRISPRa*.fastq.gz')
)

In [9]:
samples

['A549_CRISPRa_DMSO_rep1',
 'A549_CRISPRa_PiRi_rep1',
 'A549_CRISPRa_PiWi_rep1',
 'A549_CRISPRa_Pi_rep1',
 'A549_CRISPRa_Ri_rep1',
 'A549_CRISPRa_T0_rep1',
 'A549_CRISPRa_Wi_rep1']

### Run `GuideCounter`

In [11]:
# Create the ScreenPro2 guide counter for a Cas9, single-guide library design.
counter = scp.GuideCounter(cas_type = 'cas9', library_type = 'single_guide_design')

In [12]:
library_table

Unnamed: 0,sgID,sublibrary,gene,transcripts,sequence
0,A1BG_-_58859113.23-P1,h3_top5,A1BG,P1,GCGCGCCTGCGCCTCAGCCC
1,A1BG_+_58859204.23-P1,h3_top5,A1BG,P1,GGTGCGGGGACACTCACGTG
2,A1BG_+_58859110.23-P1,h3_top5,A1BG,P1,GAAGACAGGGAAGATGAAGC
3,A1BG_-_58859158.23-P1,h3_top5,A1BG,P1,GTGGGCGCAGAGGGCTCCTC
4,A1BG_+_58859199.23-P1,h3_top5,A1BG,P1,GGGGACACTCACGTGTGGCG
...,...,...,...,...,...
209075,non-targeting_03785,h7_supp5,negative_control,na,GCGCCGGATGCCTCTTCCAT
209076,non-targeting_03786,h7_supp5,negative_control,na,GATTAGAAGCTTGGGCACTG
209077,non-targeting_03787,h7_supp5,negative_control,na,GGTAACAGAATGCGTTGCGT
209078,non-targeting_03788,h7_supp5,negative_control,na,GGAGACCCTCGGATTCGTAT


In [13]:
# Load the sgRNA library into the counter from the same tab-separated file
# that was read into `library_table`. With verbose=True the call logs that
# protospacer sequences are trimmed and reports the number of sgRNAs loaded.
counter.load_library(
    'CRISPRa_v2_human_librarytable.txt.gz', sep = '\t', index_col=False,
    # 'JR_V3lib_top2_v2.csv', sep = ',', 
    verbose = True,
    low_memory=False
)

Trimming protospacer sequences in 'protospacer' column.
Library table successfully loaded.
total # of cas9 sgRNAs: 201530


In [14]:
# Count guide reads for every sample in `samples` from the `fastq/` directory.
# With verbose=True the run log reports, per sample, the elapsed time, that a
# count file was written, and the percentage of mapped reads.
counter.get_counts_matrix(
    fastq_dir = 'fastq',
    samples = samples,
    # write='force',
    trim_first_g=True,  # presumably strips a leading G from reads/protospacers before matching — TODO confirm
    verbose = True
)

[1;32mA549_CRISPRa_DMSO_rep1[0m
done in 33.904s
count file written ...
% mapped reads 84.93409136295905
[1;32mA549_CRISPRa_PiRi_rep1[0m
done in 26.545s
count file written ...
% mapped reads 86.54296460195145
[1;32mA549_CRISPRa_PiWi_rep1[0m
done in 28.636s
count file written ...
% mapped reads 85.55822885890126
[1;32mA549_CRISPRa_Pi_rep1[0m
done in 31.766s
count file written ...
% mapped reads 86.63087651772959
[1;32mA549_CRISPRa_Ri_rep1[0m
done in 27.962s
count file written ...
% mapped reads 85.710806394241
[1;32mA549_CRISPRa_T0_rep1[0m
done in 36.812s
count file written ...
% mapped reads 86.0935950393324
[1;32mA549_CRISPRa_Wi_rep1[0m
done in 26.471s
count file written ...
% mapped reads 86.54402108381132


In [15]:
# Total counts per sample, in millions (column sums of the counts matrix).
counter.counts_mat.sum() / 10**6

A549_CRISPRa_DMSO_rep1    26.760140
A549_CRISPRa_PiRi_rep1    29.742765
A549_CRISPRa_PiWi_rep1    29.187205
A549_CRISPRa_Pi_rep1      33.923629
A549_CRISPRa_Ri_rep1      30.462140
A549_CRISPRa_T0_rep1      39.063599
A549_CRISPRa_Wi_rep1      28.989469
dtype: float64

In [16]:
# Package the counts held by the counter into an AnnData object
# (samples in `.obs`, guides in `.var`, as used by the cells that follow).
adata = counter.build_counts_anndata()

In [19]:
# Derive sample metadata from the sample names, which look like
# 'A549_CRISPRa_PiRi_rep1': the condition is the second-to-last '_' field.
adata.obs['condition'] = adata.obs.index.str.split('_').str[-2]
# Capture every digit after 'rep' so that multi-digit replicates (rep10, ...)
# are parsed in full rather than reduced to their last character.
adata.obs['replicate'] = adata.obs.index.str.extract(r'rep(\d+)$', expand=False).astype(int)
# adata.obs['pop_doublings'] = [1 if cond != 'T0' else 0 for cond in adata.obs['condition']]

# Flag each guide as a negative control or a gene-targeting guide.
adata.var['targetType'] = [
    'negative_control' if target == 'negative_control' else 'gene'
    for target in adata.var.target
]

In [20]:
adata.obs

Unnamed: 0,condition,replicate
A549_CRISPRa_DMSO_rep1,DMSO,1
A549_CRISPRa_PiRi_rep1,PiRi,1
A549_CRISPRa_PiWi_rep1,PiWi,1
A549_CRISPRa_Pi_rep1,Pi,1
A549_CRISPRa_Ri_rep1,Ri,1
A549_CRISPRa_T0_rep1,T0,1
A549_CRISPRa_Wi_rep1,Wi,1


In [22]:
# Persist the counts and metadata to a gzip-compressed h5ad file so the FASTQ
# counting step does not have to be repeated.
adata.write_h5ad('A549_CRISPRa_screens.h5ad.gz', compression='gzip')

### load counts and metadata

In [23]:
# Reload the counts object from the h5ad file written in the previous step.
adata = ad.read_h5ad('A549_CRISPRa_screens.h5ad.gz')

# adata.obs = meta 
# adata.obs.treatment = adata.obs.treatment.str.replace('-','T0')
# adata.obs = adata.obs.rename(columns={'treatment':'condition','rep':'replicate'})

# adata.obs['pop_doublings'] = meta['pop doublings'].replace('-',np.nan).astype(float)

In [26]:
# Annotate every guide with its transcript label, looked up by sgID in the
# library table (a KeyError is raised if a guide is missing from the table).
transcript_by_sgid = library_table.set_index('sgID')['transcripts']
adata.var['transcript'] = transcript_by_sgid.loc[adata.var.index]

# Make sure the replicate column is integer-typed after the reload.
adata.obs['replicate'] = adata.obs['replicate'].astype(int)

___
<!-- - filter low counts -->

In [27]:
# Keep a separate copy of the annotated counts object before it is handed to
# the screen analysis below.
# NOTE(review): presumably a backup in case later steps modify `adata` in place — confirm it is still needed.
adata0 = adata.copy()

___

## Step 2: Phenotype calculation

- [ ] 

### run phenoscore

In [32]:
# Wrap the counts in a ScreenPro2 PooledScreens object. n_reps=1 matches the
# sample sheet, where every condition has a single replicate (rep1).
screen = scp.PooledScreens(adata, verbose=True, n_reps=1)

# Per the verbose log, this reports the number of variables with fewer than
# 1 read (filter_type: 'all').
screen.filterLowCounts()

# Per the verbose log, this adds a pseudocount and normalizes the counts by
# sequencing depth.
screen.countNormalization()

99878 variables with less than 1 reads (filter_type: 'all')
Pseudocount added to counts.
Counts normalized by sequencing depth.


In [33]:
# Score the drug screen with DMSO as the untreated arm and T0 as the baseline.
# Per the run log, this evaluates DMSO vs T0 and, for every treated condition,
# both a vs-T0 and a vs-DMSO comparison.
# NOTE(review): results are presumably stored under the run name
# 'compare_guides', since later cells query that run_name — confirm.
# NOTE(review): the repeated "var *= np.divide(n, n-ddof)" warnings in the
# output suggest a variance is being computed on too few values somewhere —
# confirm the reported p-values are meaningful with single replicates.
screen.calculateDrugScreen(
    score_level='compare_guides',
    untreated='DMSO', treated=[
        'Pi','Ri','PiRi',
        # 'Mi','PiMi',
        'Wi','PiWi',
        # 'Ki','PiKi'
    ],
    t0='T0', 
    # count_filter_type='either',
    keep_top_n = 3,  # presumably keeps the top 3 guides per target — TODO confirm
    var_names=['target','transcript'],
    collapse_var='target'
    # run_name='compare_guides_top_3',
)



	DMSO vs T0


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	Pi vs T0


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	Pi vs DMSO


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	Ri vs T0


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	Ri vs DMSO


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	PiRi vs T0


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	PiRi vs DMSO


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	Wi vs T0


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	Wi vs DMSO


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	PiWi vs T0


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

	PiWi vs DMSO


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error 

___

In [34]:
# Save the scored screen object; the log reports the file name
# "A549_CRISPRa_screens.pkl".
# NOTE(review): `_write_screen_pkl` is underscore-prefixed, i.e. not part of
# the package's public API — it may change between ScreenPro2 versions.
scp.load._write_screen_pkl(screen,'A549_CRISPRa_screens')

Object successfully saved to "A549_CRISPRa_screens.pkl"


### extract result tables

In [35]:
# Reload the saved screen object so the cells below can run without
# recomputing the phenotype scores.
# NOTE(review): presumably unpickles "A549_CRISPRa_screens.pkl" — only load
# pickle files from a trusted source. `_read_screen_pkl` is also a private helper.
screen = scp.load._read_screen_pkl('A549_CRISPRa_screens')

In [42]:
# One score table per 'rho' phenotype of the 'compare_guides' run, with
# negative-control rows removed and indexed by (target, transcript).
result_tables = {
    phenotype_name: (
        screen.getPhenotypeScores(
            run_name='compare_guides',
            phenotype_name=phenotype_name,
            pvalue_col='ttest pvalue',
            threshold=6,
        )
        .query('target!="negative_control"')
        .set_index(['target', 'transcript'])
    )
    for phenotype_name in screen.listPhenotypeScores(run_name='compare_guides')
    if 'rho' in phenotype_name
}

  result = getattr(ufunc, method)(*inputs, **kwargs)


### get result tables

In [43]:
def getAnnotatedTables(screen, threshold, run_name='compare_guides', pvalue_col='ttest pvalue'):
    """Return one annotated phenotype-score table per phenotype of a run.

    Parameters
    ----------
    screen : object
        Screen object exposing ``listPhenotypeScores(run_name=...)`` and
        ``getPhenotypeScores(run_name=..., phenotype_name=..., threshold=...,
        pvalue_col=...)``; the latter must return a DataFrame with ``target``
        and ``transcript`` columns.
    threshold : number
        Forwarded unchanged as ``threshold`` to ``screen.getPhenotypeScores``.
    run_name : str, optional
        Name of the scoring run to read from. Defaults to ``'compare_guides'``.
    pvalue_col : str, optional
        Forwarded unchanged as ``pvalue_col`` to ``screen.getPhenotypeScores``.
        Defaults to ``'ttest pvalue'``.

    Returns
    -------
    dict
        Maps each phenotype name to its score table with ``negative_control``
        targets removed, indexed by ``(target, transcript)``.
    """
    tables = {}
    for phenotype_name in screen.listPhenotypeScores(run_name=run_name):
        scores = screen.getPhenotypeScores(
            run_name=run_name,
            phenotype_name=phenotype_name,
            threshold=threshold,
            pvalue_col=pvalue_col,
        )
        tables[phenotype_name] = (
            scores
            .query('target!="negative_control"')
            .set_index(['target', 'transcript'])
        )
    return tables

In [44]:
# Combine the 'rho' score tables side by side, one column group per drug.
# The drug code is the text between ':' and '_vs_' in the phenotype name and
# is translated to its display name via `drug_names`. Rows with a missing
# value in any column are dropped.
annotated_result_table = pd.concat(
    {
        drug_names[name.split(':')[1].split('_vs_')[0]]: table
        for name, table in getAnnotatedTables(screen, threshold=6).items()
        if 'rho' in name
    },
    axis=1,
).dropna()

  result = getattr(ufunc, method)(*inputs, **kwargs)


## Pathway analysis

In [55]:
import blitzgsea as blitz

In [56]:
# NOTE(review): absolute, user-specific paths — these only resolve on the
# original author's machine; consider a configurable base directory.
pager_dir = "/home/abea/tools/pager/"
pager_annotation_path = f'{pager_dir}annotations/human'

# Gene-set library read from the MSigDB v7.4 C5 GO-BP symbols GMT file.
c5_gobp_gmt = blitz.enrichr.read_gmt(
    f'{pager_annotation_path}/msigdb_v7.4_c5.go.bp/c5.go.bp.v7.4.symbols.gmt'
)

In [57]:
def run_rho_gsea_directional(df, var_col, gmt, min_size=15, max_size=150):
    """Run blitzgsea on one score column of an indexed result table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``var_col``; its index levels are turned into columns
        and the resulting ``transcript`` column is dropped.
    var_col : str
        Name of the column in ``df`` holding the values passed to GSEA.
    gmt : object
        Gene-set library, passed to ``blitz.gsea`` as ``library``.
    min_size, max_size : int, optional
        Passed unchanged to ``blitz.gsea``.

    Returns
    -------
    tuple
        ``(signature, result)``: the signature table given to ``blitz.gsea``
        and the value it returned.
    """
    # Index levels become ordinary columns; the transcript label is discarded.
    flattened = df[var_col].reset_index()
    signature = flattened.drop(columns='transcript')

    gsea_options = {
        'signature': signature,
        'library': gmt,
        'min_size': min_size,
        'max_size': max_size,
        'verbose': True,
    }
    result = blitz.gsea(**gsea_options)

    return signature, result

### Run GSEA

In [67]:
# GSEA result per single-agent drug, keyed by the drug's display name.
gsea_results = {}

for drug_name in drug_names.values():
    # Display names containing '+' are combination treatments; skip them.
    if "+" in drug_name:
        continue

    _, res = run_rho_gsea_directional(
        annotated_result_table[drug_name],
        var_col='score',
        gmt=c5_gobp_gmt,
    )
    gsea_results[drug_name] = res

Use cached anchor parameters


Enrichment : 100%|██████████| 7481/7481 [00:01<00:00, 6684.86it/s]
  pvals_corrected = -np.expm1(ntests * np.log1p(-pvals))


Use cached anchor parameters


Enrichment : 100%|██████████| 7481/7481 [00:01<00:00, 6708.87it/s]


Use cached anchor parameters


Enrichment : 100%|██████████| 7481/7481 [00:01<00:00, 6728.55it/s]


### save to file

In [70]:
# Export the sample sheet, count matrices, gene scores and GSEA results to a
# single Excel workbook, one sheet each.
with pd.ExcelWriter('A549_CRISPRa_screen_analysis.xlsx', engine='openpyxl') as writer:
    screen.adata.obs.to_excel(writer, sheet_name='sample sheet')

    # Count layers are cast to int and transposed so guides are rows.
    # NOTE(review): `.astype(int)` discards any fractional part of the
    # depth-normalized counts — confirm this is intended.
    for layer, sheet_name in (
        ('raw_counts', 'raw counts'),
        ('seq_depth_norm', 'normalized counts'),
    ):
        screen.adata.to_df(layer=layer).astype(int).T.to_excel(writer, sheet_name=sheet_name)

    annotated_result_table.to_excel(writer, sheet_name='gene scores')
    pd.concat(gsea_results, axis=1).to_excel(writer, sheet_name='GSEA')

___

# Session info

In [50]:
%reload_ext watermark

In [51]:
%watermark

Last updated: 2024-09-21T22:49:21.118336-07:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.27.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 5.15.0-119-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 64
Architecture: 64bit



In [52]:
%watermark --iversions

scanpy    : 1.10.3
anndata   : 0.10.9
pandas    : 1.5.3
numpy     : 1.26.4
screenpro : 0.4.14
matplotlib: 3.6.2



___