Original code by Dede et al. (https://figshare.com/articles/software/enCas12a_screen_analysis_pipeline/12275642), licensed under CC BY 4.0
https://creativecommons.org/licenses/by/4.0/
Modified by Hamda Ajmal, March 2025

Changes: Extracted relevant portions of the code and applied them to different datasets.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as clust
import scipy.stats as stats
import seaborn as sns

In [None]:
# Load the tab-separated count table; the file's first column becomes the row index.
reads = pd.read_csv(
    '../InputData/Dede/counts.txt',
    sep='\t',
    index_col=0,
)
reads.head()


In [None]:
%pylab inline
numGuides, numSamples = reads.shape
# Column 0 of `reads` is skipped here (later cells read it as the GENE label),
# so numSamples counts that column as well as the sample columns.
# Original author's note for this dataset: 12328 rows, 11 columns.
sample_sum = reads.iloc[:, 1:numSamples].sum(0)  # total reads per sample column
bar_positions = np.arange(len(sample_sum))

# Give the size by keyword. The pylab helper figsize(6, 3) returns None and only
# appeared to work because it rewrites the global rcParams default as a side effect.
plt.figure(figsize=(6, 3))
plt.bar(bar_positions, sample_sum, align='center')
# Red dashed ('r--') reference line at numGuides * 500 total reads,
# i.e. an average of 500 reads per row of the table.
plt.plot(
    [-0.5, len(sample_sum) - 0.5],
    [numGuides * 500, numGuides * 500],
    'r--',
)

plt.xlim(-1, len(sample_sum))
plt.xticks(bar_positions, reads.columns.values[1:], rotation=90)

plt.show()

In [None]:
# Add a pseudocount to every column after the first.
# NOTE: not idempotent - re-running this cell adds the pseudocount again.
pseudo = 5
count_cols = reads.columns[1:]
reads[count_cols] = reads[count_cols] + pseudo


In [None]:
# Display the current `reads` table (the preceding cell adds the pseudocount to it).
reads

In [None]:
# Per-column mean read count over all rows (column 0 is skipped).
meanReads = reads.iloc[:, 1:numSamples].mean(axis=0)
meanReads

In [None]:
# Normalised copy of `reads`: same index and columns, GENE labels carried over unchanged.
normed = pd.DataFrame(index=reads.index, columns=reads.columns)
normed['GENE'] = reads.iloc[:, 0]

# The pseudocount is already part of `reads`. Each column is divided by its own
# mean and scaled so that every column averages 500 reads; the 1-D array of
# column means is broadcast across the rows.
sample_counts = reads.iloc[:, 1:numSamples]
normed[normed.columns[1:]] = sample_counts / meanReads.values * 500
normed.shape


In [None]:
# Fold-change table: same rows as `reads`, without its last column, which
# serves as the reference every other column is compared against.
fc = pd.DataFrame(index=reads.index, columns=reads.columns[:-1])
fc['GENE'] = reads.iloc[:, 0]  # first column holds the gene label

numFCsamples = fc.shape[1] - 1  # number of columns that receive a fold change

pseudo = 0  # remember: the pseudocount is already in the data

# log2 fold change of every remaining column relative to the last column of `normed`.
reference = normed[normed.columns[-1]] + pseudo
compared = normed[normed.columns[1:-1]] + pseudo
fc[fc.columns[1:]] = np.log2(compared.div(reference, axis=0))


In [None]:
# Split every GENE label of the form "<gene1>.<id1>:<gene2>.<id2>" into its two
# gene names. The unpacking below raises ValueError for any label that does not
# have exactly one ':' and exactly one '.' on each side.
# The labels are walked once and the new columns assigned as whole columns,
# instead of reading and writing `fc` row by row through .loc.
genelist = {}  # gene name -> 1, for every gene seen in either position
gene1_names = []
gene2_names = []
for label in fc['GENE']:
    g1cid, g2cid = label.split(':')
    g1, _ = g1cid.split('.')
    g2, _ = g2cid.split('.')
    genelist[g1] = 1
    genelist[g2] = 1
    gene1_names.append(g1)
    gene2_names.append(g2)

fc['GENE1'] = gene1_names
fc['GENE2'] = gene2_names

fc

In [None]:
# Tab-separated non-essential control gene list; its first column becomes the index,
# which the next cell uses for membership tests.
noness = pd.read_csv(
    '../InputData/Dede/pan-species-control-nonessentials-50genes.txt',
    index_col=0,
    sep='\t',
)
noness.head()

In [None]:
# fc_base is another name for the same object as fc (not a copy), so the
# relabelling below is also visible through `fc`; later cells read
# fc.GENE1 / fc.GENE2 and compare them with 'CTRL'.
fc_base = fc
# Wherever a non-essential gene is used, replace its name with CTRL.
for gene_col in ('GENE1', 'GENE2'):
    is_nonessential = fc_base[gene_col].isin(noness.index)
    fc_base.loc[is_nonessential, gene_col] = 'CTRL'
fc_base.head()

In [None]:
# Collapse the per-sample fold changes into one column per cell line by
# averaging every fc column whose name contains the cell-line name.
cells = ['A549', 'HT29', 'OVCAR8']
cols = ['GENE'] + cells
fc_merge = pd.DataFrame(index=fc.index, columns=cols, dtype=float)
fc_merge.GENE = fc.GENE
for cell in cells:
    samples = [col_name for col_name in fc.columns if cell in col_name]
    fc_merge[cell] = fc[samples].mean(axis=1)
fc_merge['GENE1'] = fc.GENE1
fc_merge['GENE2'] = fc.GENE2
fc_merge.head()


In [None]:
# Get SMF for each gene (gene-CTRL pairs): find the row positions where CTRL
# sits in the first slot, in the second slot, or in either.
# (Original author's note: 5124 rows involve a CTRL.)
ctrl_in_gene1 = fc_merge.GENE1 == 'CTRL'
ctrl_in_gene2 = fc_merge.GENE2 == 'CTRL'
is_ctrl = np.where(ctrl_in_gene1 | ctrl_in_gene2)[0]
is_ctrl1 = np.where(ctrl_in_gene1)[0]
is_ctrl2 = np.where(ctrl_in_gene2)[0]


In [None]:
# Mean fold change per gene over all rows that pair it with CTRL:
# smf_gene1 - gene in GENE1, CTRL in GENE2; smf_gene2 - CTRL in GENE1, gene in GENE2.
rows_ctrl_second = fc_merge.iloc[is_ctrl2]
rows_ctrl_first = fc_merge.iloc[is_ctrl1]
smf_gene1 = rows_ctrl_second.groupby('GENE1').mean(numeric_only=True)
smf_gene2 = rows_ctrl_first.groupby('GENE2').mean(numeric_only=True)
smf_gene1

In [None]:
# Un-aggregated groupings of the same gene-CTRL rows, kept as GroupBy objects.
smf_guide2 = fc_merge.iloc[is_ctrl1].groupby('GENE2')
smf_guide1 = fc_merge.iloc[is_ctrl2].groupby('GENE1')

smf_gene1

In [None]:
# Put both orientations side by side, joined on the gene name (the index).
# Columns from smf_gene1 (gene in GENE1, CTRL in GENE2) get the suffix _Aposn;
# columns from smf_gene2 (CTRL in GENE1, gene in GENE2) get the suffix _Bposn.
smf_gene = smf_gene1.join(smf_gene2, lsuffix='_Aposn', rsuffix='_Bposn')
smf_gene  # e.g. A549_Aposn holds GENE-CTRL values and A549_Bposn holds CTRL-GENE values

In [None]:
# For each cell line, plot the A-position value against the B-position value
# with a dashed red identity line, and print their Pearson correlation.
for cell in cells:
    a_col = cell + '_Aposn'
    b_col = cell + '_Bposn'
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.scatterplot(data=smf_gene, x=a_col, y=b_col, ax=ax)
    ax.plot([-4, 1], [-4, 1], 'r--')
    correlation = stats.pearsonr(smf_gene[a_col], smf_gene[b_col])
    print(cell + ': ' + str(correlation))

In [None]:
# Replace the two position-specific columns of every cell line with their row-wise mean.
for cell in cells:
    position_cols = [cell + '_Aposn', cell + '_Bposn']
    smf_gene[cell] = smf_gene[position_cols].mean(axis=1)
    smf_gene.drop(columns=position_cols, inplace=True)

smf_gene


In [None]:
# Unique gene pairs among the rows where neither gene is CTRL.
non_ctrl = (fc_merge['GENE1'] != "CTRL") & (fc_merge['GENE2'] != "CTRL")
pairs = fc_merge.loc[non_ctrl, ["GENE1", "GENE2"]].drop_duplicates(keep='first')

# Orientation-independent label: the two gene names joined in alphabetical
# order, so A-B and B-A map to the same key. It is built in one pass instead of
# filling a placeholder column row by row through .loc.
ordered_labels = [
    "_".join(sorted(gene_pair))
    for gene_pair in zip(pairs["GENE1"], pairs["GENE2"])
]
pairs.insert(2, "GENE1_GENE2", ordered_labels, True)

print(len(pairs))  # before collapsing A-B / B-A duplicates
pairs = pairs.drop_duplicates(subset="GENE1_GENE2", keep="first")
print(len(pairs))  # after: one row per unordered pair
pairs

In [None]:
# Empty (all-NaN) float table: one row per unique pair labelled "GENE1_GENE2",
# one column per fc_merge column except GENE and the trailing GENE1/GENE2.
pair_labels = list(pairs.GENE1 + "_" + pairs.GENE2)
dLFC = pd.DataFrame(
    index=pair_labels,
    columns=fc_merge.columns[:-2],
    dtype=float,
).drop(columns='GENE')
dLFC

In [None]:
# NOTE: from here on `fc` is rebound to the merged per-cell-line table (fc_merge).
fc = fc_merge

# Index the row positions of every unordered gene pair once, instead of
# rescanning the whole table with a boolean mask for each pair.
rows_by_pair = {}
for row_pos, (gene_a, gene_b) in enumerate(zip(fc.GENE1, fc.GENE2)):
    pair_key = (gene_a, gene_b) if gene_a <= gene_b else (gene_b, gene_a)
    rows_by_pair.setdefault(pair_key, []).append(row_pos)

# dLFC = median fold change over all rows of the pair (either orientation)
# minus the sum of the two genes' smf_gene values.
for g1, g2 in zip(pairs.GENE1, pairs.GENE2):
    expt_idx = rows_by_pair.get((g1, g2) if g1 <= g2 else (g2, g1), [])
    if not expt_idx:
        continue
    smf_sum = smf_gene.loc[g1] + smf_gene.loc[g2]
    expt = fc.iloc[expt_idx]
    genepair = g1 + "_" + g2
    dLFC.loc[genepair] = expt.median(0, numeric_only=True) - smf_sum

In [None]:
# Drop the pairs whose A549 value is still NaN, printing the row count before and after.
# (Original author's note: 2 pairs are missing because not all pairs were
# listed in the file given by the authors.)
print(len(dLFC))
dropme = np.where(dLFC['A549'].isna())[0]
dLFC.drop(index=dLFC.index[dropme], inplace=True)
print(len(dLFC))


In [None]:
# Empty float table with the same rows and columns as dLFC, filled by the next cell.
zdLFC = pd.DataFrame(dtype=float, columns=dLFC.columns, index=dLFC.index)
zdLFC.head()

In [None]:
#To compare across screens, convert dLFC scores to a Z score, zdLFC, by truncating
#the top and bottom 2.5% of dLFC scores. At a zdLFC score < − 3, all three
#screens showed high concordance, with 19 of 24 (79%) synthetic lethals present in at
#least two out of three cell lines and 14 of 24 (58%) present in all three (Fig. 4a, b)

#Z-transformation of distribution of dLFC (zdLFC) after truncating top/bottom 2.5%
#of values approximates a normal distribution.

percentile = 2.5
for col in zdLFC.columns:
    scores = dLFC[col]
    lower = np.percentile(scores, percentile)
    upper = np.percentile(scores, 100 - percentile)
    # Mean and standard deviation come only from the values strictly inside
    # the (lower, upper) band, i.e. from the truncated distribution ...
    trimmed = scores[(scores > lower) & (scores < upper)]
    mu = trimmed.mean()
    std = trimmed.std()
    # ... but every score, including those outside the band, is then z-transformed.
    zdLFC[col] = (scores - mu) / std

In [None]:
# Display the z-scored dLFC table filled in by the preceding cell.
zdLFC

In [None]:
# stats.gaussian_kde builds a kernel-density estimate using Gaussian kernels. Kernel density estimation is a
# non-parametric way to estimate the probability density function (PDF) of a random variable. gaussian_kde works
# for both univariate and multivariate data and includes automatic bandwidth determination. The estimate works
# best for a unimodal distribution; bimodal or multimodal distributions tend to be oversmoothed.
# Compare the density of each cell line's zdLFC scores with the standard normal pdf.
xx = np.linspace(-10, 10, 500)
kde_a549 = stats.gaussian_kde(zdLFC.A549)
kde_ht29 = stats.gaussian_kde(zdLFC.HT29)
kde_ovcar8 = stats.gaussian_kde(zdLFC.OVCAR8)

# Give the size by keyword. The pylab helper figsize(5, 4) returns None and only
# appeared to work because it rewrites the global rcParams default as a side effect.
plt.figure(figsize=(5, 4))
plt.plot(xx, stats.norm.pdf(xx), label='normal', linewidth=4)
for cell_label, kde in (
    ('A549', kde_a549),
    ('HT29', kde_ht29),
    ('OVCAR8', kde_ovcar8),
):
    plt.plot(xx, kde.evaluate(xx), label=cell_label)

plt.legend(loc=2)
#plt.savefig('normfit-of-zdLFC.pdf')
plt.show()


In [None]:
# A549 only: density of the raw dLFC scores versus the z-transformed scores
# (kde_a549 and xx come from the preceding cell), with the standard normal pdf.
kde_a549_dlfc = stats.gaussian_kde(dLFC.A549)

# Give the size by keyword. The pylab helper figsize(5, 4) returns None and only
# appeared to work because it rewrites the global rcParams default as a side effect.
plt.figure(figsize=(5, 4))
plt.plot(xx, stats.norm.pdf(xx), label='normal', linewidth=4)
plt.plot(xx, kde_a549.evaluate(xx), label='A549')
plt.plot(xx, kde_a549_dlfc.evaluate(xx), label='A549-Before Z transformation')

plt.legend(loc=2)
#plt.savefig('normfit-of-zdLFC.pdf')
plt.show()

In [None]:
def reindex_alphbetically(df):
    """Return the index labels of *df* with each pair in alphabetical order.

    Every label must have the form ``"<a>_<b>"``. The result holds, in the
    original row order, ``"<a>_<b>"`` when ``a < b`` and ``"<b>_<a>"``
    otherwise.

    Only ``df.index`` is read, so no per-row Series is built and any object
    with an ``index`` of strings (DataFrame or Series) is accepted.

    Raises:
        ValueError: if a label does not contain exactly one underscore.
    """
    ordered = []
    for label in df.index:
        # Two-name unpacking keeps the strict "exactly one underscore" check.
        first, second = label.split('_')
        ordered.append('_'.join(sorted((first, second))))
    return ordered


# Relabel every pair alphabetically, print the table and write it to CSV.
zdLFC.index = reindex_alphbetically(zdLFC)
print(zdLFC)
# index=True keeps the pair labels as the first CSV column (index=False would omit them).
zdLFC.to_csv(path_or_buf="zdLFC Output/DeDe_zdLFC.csv", index=True)
