Original code by Dede et al. (https://figshare.com/articles/software/enCas12a_screen_analysis_pipeline/12275642), licensed under CC BY 4.0
https://creativecommons.org/licenses/by/4.0/
Modified by Hamda Ajmal, March 2025

Changes: Extracted relevant portions of the code and applied them to different datasets.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as clust
import scipy.stats as stats
import seaborn as sns

In [None]:
# Load the Chymera table; the first CSV column becomes the row index.
chymera_csv = '../InputData/Chymera/Chymera.csv'
reads = pd.read_csv(chymera_csv, sep=',', index_col=0)


In [None]:
# Keep the two gene columns, the T18/T24 replicate columns and T0; the
# earlier time points are not needed.
keep_cols = ['gene1', 'gene2']
keep_cols += ['RPE1.T18A', 'RPE1.T18B', 'RPE1.T18C']
keep_cols += ['RPE1.T24A', 'RPE1.T24B', 'RPE1.T24C']
keep_cols += ['RPE1.T0']
reads = reads[keep_cols]


In [None]:
# Keep only rows in which both gene columns are present (non-NA).
has_both_genes = reads['gene1'].notna() & reads['gene2'].notna()
reads = reads[has_both_genes]


In [None]:
# Drop every row in which either position carries the "NT" label.
reads = reads[(reads['gene1'] != "NT") & (reads['gene2'] != "NT")]

# Rows where both positions are 'NegControl' are kept aside in bothNegs and
# then removed from reads.
both_neg_mask = (reads['gene1'] == 'NegControl') & (reads['gene2'] == 'NegControl')
bothNegs = reads[both_neg_mask]
# .copy() gives reads its own data, so later column assignments on it do not
# raise pandas' SettingWithCopyWarning.
reads = reads[~both_neg_mask].copy()
reads.shape

In [None]:
# Row count and column count of reads. Note that numSamples is the total
# number of columns, i.e. it still includes the two gene columns here.
numGuides = reads.shape[0]
numSamples = reads.shape[1]

numGuides, numSamples

# Preview the first ten rows (shown as the cell output).
reads.head(10)

In [None]:
# Total reads per count column (columns 0 and 1 are the gene columns).
sample_sum = reads.iloc[:, 2:numSamples].sum(axis=0)


In [None]:
import matplotlib.pyplot as plt
%pylab inline
highlight_index = -1
colors = ['tab:blue'] * len(sample_sum)
colors[highlight_index] = 'tab:red'
figure(figsize = (6,4))
barh( arange(len(sample_sum)), sample_sum, align='center', color = colors)
ylim(-1, len(sample_sum)) # len(sample_sum) = 10
yticks(arange(len(sample_sum)), reads.columns.values[2:], rotation=0)

show()

In [None]:
# Add a pseudocount to every count column (all columns after the two gene
# columns).
pseudo = 5
count_cols = reads.columns[2:]
reads[count_cols] = reads[count_cols] + pseudo


In [None]:
# Mean read count of each count column (column-wise mean).
meanReads = reads.iloc[:, 2:numSamples].mean(axis=0)
meanReads

In [None]:
# Build one "gene1|gene2" label per row with whole-column string
# concatenation instead of a per-row apply. astype(str) keeps the result
# textual even if a gene column was parsed as a non-string type.
reads['GENE'] = reads['gene1'].astype(str) + '|' + reads['gene2'].astype(str)
# The separate gene columns are no longer needed once GENE exists.
reads.drop(['gene1', 'gene2'], axis=1, inplace=True)

In [None]:
# Put the GENE label first, followed by the T18 and T24 replicate columns and
# finally T0.
ordered_cols = [
    'GENE',
    'RPE1.T18A', 'RPE1.T18B', 'RPE1.T18C',
    'RPE1.T24A', 'RPE1.T24B', 'RPE1.T24C',
    'RPE1.T0',
]
reads = reads[ordered_cols]
reads


In [None]:
numGuides, numSamples = reads.shape

numGuides, numSamples
# Empty frame with the same row and column labels as reads; filled below.
normed = pd.DataFrame(index=reads.index, columns=reads.columns)
normed['GENE'] = reads.iloc[:, 0]

# Divide every count column by its entry in meanReads and multiply by 500
# (normalise to a mean of 500 reads). The 1-D array of means is broadcast
# across the rows.
sample_cols = normed.columns[1:]
normed[sample_cols] = reads.iloc[:, 1:numSamples] / meanReads.values * 500
normed.head()

In [None]:
# Empty fold-change table: same rows as reads, every column except 'RPE1.T0'.
fc_cols = [name for name in reads.columns if name != 'RPE1.T0']
fc = pd.DataFrame(index=reads.index, columns=fc_cols)
fc['GENE'] = reads['GENE']  # first column is the gene label
fc
numFCsamples = len(fc.columns) - 1  # number of columns that receive a fold change
numFCsamples
pseudo = 0  # remember: the pseudocount is already in the data
fc.head()


In [None]:
# log2 ratio of each normalised sample column (all but the first and last
# column of normed) to the last column of normed, row by row.
baseline = normed[normed.columns[-1]] + pseudo
sample_values = normed[normed.columns[1:-1]] + pseudo
fc[fc.columns[1:]] = np.log2(sample_values.div(baseline, axis=0))
fc

In [None]:
# Copy fc into a fresh frame, then split the "gene1|gene2" label back into
# two columns.
fc_base = pd.DataFrame(index=fc.index, columns=fc.columns)
fc_base.iloc[:] = fc.iloc[:]
fc_base
gene_parts = fc_base['GENE'].str.split("|", expand=True)
fc_base[['GENE1', 'GENE2']] = gene_parts
fc_base.drop(columns=['GENE'], inplace=True)
fc_base
# Any gene name containing 'NegControl' is relabelled as 'control'.
for gene_col in ('GENE1', 'GENE2'):
    is_neg = fc_base[gene_col].str.contains('NegControl')
    fc_base.loc[is_neg, gene_col] = 'control'

fc_base

In [None]:
# One output column per entry of cells; each is the row-wise mean of the fc
# columns whose name contains that entry (e.g. 'RPE1.T18' matches
# 'RPE1.T18A', 'RPE1.T18B', 'RPE1.T18C').
cells = ['RPE1.T18', 'RPE1.T24']
cols = ['GENE1', 'GENE2'] + cells
cols
fc_merge = pd.DataFrame(columns=cols, index=fc_base.index, dtype=float)
fc_merge['GENE1'] = fc_base['GENE1']
fc_merge['GENE2'] = fc_base['GENE2']
fc_merge
for cell in cells:
    samples = [name for name in fc_base.columns if cell in name]
    fc_merge[cell] = fc[samples].mean(axis=1)

fc_merge

In [None]:
# Positional row indices of fc_merge where a 'control' sits in either
# position, in GENE1, or in GENE2.
gene1_is_ctrl = fc_merge.GENE1 == 'control'
gene2_is_ctrl = fc_merge.GENE2 == 'control'

is_ctrl = np.where(gene1_is_ctrl | gene2_is_ctrl)[0]
is_ctrl1 = np.where(gene1_is_ctrl)[0]
is_ctrl2 = np.where(gene2_is_ctrl)[0]

# Group the rows with a control in the *other* position by the gene name.
smf_guide1 = fc_merge.iloc[is_ctrl2].groupby('GENE1')
smf_guide2 = fc_merge.iloc[is_ctrl1].groupby('GENE2')

# Mean over all rows that share the same gene.
smf_gene1 = smf_guide1.mean(numeric_only=True)
smf_gene2 = smf_guide2.mean(numeric_only=True)

In [None]:
# Put the two single-gene tables side by side, one row per gene. Columns
# from smf_gene1 get the '_Aposn' suffix and columns from smf_gene2 get
# '_Bposn' (e.g. 'RPE1.T18_Aposn', 'RPE1.T18_Bposn').
# NOTE(review): this is a left join (the DataFrame.join default, spelled out
# here), so genes present only in smf_gene2 are dropped - confirm that this
# is intended.
smf_gene = smf_gene1.join(smf_gene2, how='left', lsuffix='_Aposn', rsuffix='_Bposn')
smf_gene

smf_gene

In [None]:
# For each entry of cells: scatter the '_Aposn' value against the '_Bposn'
# value per gene, draw the y = x reference line from -4 to 1, and print the
# Pearson correlation of the two columns.
for cell in cells:
    a_col = cell + '_Aposn'
    b_col = cell + '_Bposn'
    f, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=smf_gene, x=a_col, y=b_col, ax=ax)
    ax.plot([-4, 1], [-4, 1], 'r--')
    corr = stats.pearsonr(smf_gene[a_col], smf_gene[b_col])
    print(f"{cell}: {corr!s}")

In [None]:
# Collapse the two position-specific columns of each entry of cells into a
# single column holding their row-wise mean, then drop the originals.
for cell in cells:
    posn_cols = [cell + '_Aposn', cell + '_Bposn']
    smf_gene[cell] = smf_gene[posn_cols].mean(axis=1)
    smf_gene.drop(columns=posn_cols, inplace=True)

smf_gene

In [None]:
# All (GENE1, GENE2) combinations in which neither position is 'control',
# one row per ordered combination.
no_control = (fc_merge['GENE1'] != "control") & (fc_merge['GENE2'] != "control")
pairs = fc_merge.loc[no_control, ["GENE1", "GENE2"]].drop_duplicates(keep='first')

# Order-independent key: the two gene names sorted alphabetically and joined
# with "_", so that A/B and B/A map to the same key. Built in one pass over
# the two columns; assign() returns a new frame rather than writing into a
# filtered one.
pair_keys = [
    "_".join(sorted((g1, g2)))
    for g1, g2 in zip(pairs["GENE1"], pairs["GENE2"])
]
pairs = pairs.assign(GENE1_GENE2=pair_keys)

# Keep the first row for each unordered pair; the two printed counts are the
# number of rows before and after this de-duplication.
print(len(pairs))
pairs = pairs.drop_duplicates(subset="GENE1_GENE2", keep="first")
print(len(pairs))
pairs

In [None]:
fc_merge.columns
# Empty float table: one row per "GENE1_GENE2" label from pairs, one column
# per value column of fc_merge (everything after GENE1 and GENE2).
pair_labels = (pairs.GENE1 + "_" + pairs.GENE2).tolist()
dLFC = pd.DataFrame(index=pair_labels, columns=fc_merge.columns[2:], dtype=float)
dLFC


In [None]:
# For every gene pair: observed value = median over all fc_merge rows that
# target the pair in either orientation; expected value = sum of the two
# single-gene rows of smf. dLFC stores observed minus expected.
smf = smf_gene
for pair_idx in pairs.index:
    pair_row = pairs.loc[pair_idx]
    g1 = pair_row.GENE1
    g2 = pair_row.GENE2

    forward = (fc_merge.GENE1 == g1) & (fc_merge.GENE2 == g2)
    reverse = (fc_merge.GENE1 == g2) & (fc_merge.GENE2 == g1)
    expt_idx = list(np.where(forward | reverse)[0])

    # No matching rows: leave this pair's dLFC row unset.
    if len(expt_idx) == 0:
        continue

    expt = fc_merge.iloc[expt_idx]
    smf_sum = smf.loc[g1] + smf.loc[g2]
    genepair = g1 + "_" + g2
    dLFC.loc[genepair] = expt.median(0, numeric_only=True) - smf_sum

In [None]:
# Z-score each dLFC column against the mean and standard deviation of its
# central values, i.e. those strictly between the `percentile`-th and the
# (100 - percentile)-th percentile, so the tails do not influence the
# reference distribution.
zdLFC = pd.DataFrame(index=dLFC.index, columns=dLFC.columns, dtype=float)

percentile = 2.5
for col in zdLFC.columns:
    values = dLFC[col]
    # nanpercentile ignores missing entries; plain percentile would return
    # NaN for the cut-offs and turn the whole column into NaN.
    lower = np.nanpercentile(values, percentile)
    upper = np.nanpercentile(values, 100 - percentile)
    # Positional boolean mask of the central values, computed once.
    in_core = ((values > lower) & (values < upper)).to_numpy()
    core = values[in_core]
    mu = core.mean()
    std = core.std()
    zdLFC[col] = (values - mu) / std

In [None]:
# Kernel density estimate of each zdLFC column, plotted over 500 points in
# [-10, 10] together with the standard normal density for comparison.
xx = np.linspace(-10, 10, 500)
kde_RPE1T18 = stats.gaussian_kde(zdLFC['RPE1.T18'])
kde_RPE1T24 = stats.gaussian_kde(zdLFC['RPE1.T24'])

plt.plot(xx, stats.norm.pdf(xx), label='normal', linewidth=4)
plt.plot(xx, kde_RPE1T18.evaluate(xx), label='RPE1.T18')
plt.plot(xx, kde_RPE1T24.evaluate(xx), label='RPE1.T24')
# The curves carry labels, so draw the legend that actually displays them.
plt.legend()
plt.show()


In [None]:
def reindex_alphbetically(df):
    """Return the index labels of *df* with each pair in alphabetical order.

    Every index label must consist of exactly two names joined by a single
    underscore, e.g. ``"B_A"``. The two names are compared as strings and
    re-joined with the smaller one first (``"A_B"``).

    Args:
        df: Object whose ``index`` yields such string labels (for example a
            pandas DataFrame). Only the index is read.

    Returns:
        A list of re-ordered labels, in the same order as ``df.index``.

    Raises:
        ValueError: If a label does not split into exactly two parts on
            ``"_"`` (for instance when a name itself contains an underscore).
    """
    result = []
    # Only the labels are needed, so walk the index directly instead of
    # materialising every row.
    for label in df.index:
        a, b = label.split('_')
        result.append(f'{a}_{b}' if a < b else f'{b}_{a}')
    return result


# Rewrite every pair label with its two names in alphabetical order, then
# save the table; index=True writes the pair label as the first CSV column.
zdLFC.index = reindex_alphbetically(zdLFC)
output_csv = "zdLFC Output/ChymeraRPE1.csv"
zdLFC.to_csv(output_csv, index=True)