Original code by Dede et al. (https://figshare.com/articles/software/enCas12a_screen_analysis_pipeline/12275642), licensed under CC BY 4.0
https://creativecommons.org/licenses/by/4.0/
Modified by Hamda Ajmal, March 2025

Changes: Extracted relevant portions of the code and applied them to different datasets.

In [None]:
%matplotlib inline

#%pylab inline
import pandas as pd
import scipy.stats as stats
import scipy.cluster.hierarchy as clust
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the Ito read-count spreadsheet and index its rows by the
# 'Left-sgRNA_Right-sgRNA' column.
ito_counts_path = '../InputData/Ito/Count_data_ParalogV1.xlsx'
reads = pd.read_excel(ito_counts_path).set_index('Left-sgRNA_Right-sgRNA')
reads.columns

In [None]:
# Table dimensions. Note that numSamples counts every column of reads,
# including the first two columns that the slice below skips.
numGuides, numSamples = reads.shape

# Column totals of the count columns (column position 2 onwards).
sample_sum = reads.iloc[:, 2:numSamples].sum(axis=0)


In [None]:
from matplotlib import pyplot
%pylab inline
#import numpy as np
#import matplotlib.pyplot


# Horizontal bar chart of the per-column read totals held in sample_sum.
# The bar at position highlight_index is drawn in red, all others in blue.
figure(figsize(6,8))

highlight_index = 0
n_bars = len(sample_sum)
bar_positions = arange(n_bars)

colors = ['tab:blue'] * n_bars
colors[highlight_index] = 'tab:red'

barh(bar_positions, sample_sum, align='center', color=colors)

# One tick per bar, labelled with the matching column name of reads.
ylim(-1, n_bars)
yticks(bar_positions, reads.columns.values[2:], rotation=0)

show()

In [None]:
# Add a pseudocount to every count column (column position 2 onwards).
# NOTE: reads is modified in place, so running this cell again adds the
# pseudocount a second time.
pseudo = 5
count_columns = reads.columns[2:]
reads[count_columns] = reads[count_columns] + pseudo
reads.head()


In [None]:
# Column-wise mean of the count columns (column position 2 onwards).
meanReads = reads.iloc[:, 2:numSamples].mean(axis=0)


In [None]:
# normed has the same row and column labels as reads. The two gene columns
# are copied over unchanged; every count column is divided by its own mean
# and multiplied by 500, i.e. rescaled to a mean of 500 reads.
normed = pd.DataFrame(index=reads.index, columns=reads.columns)
for gene_column in ('Aureus_gene', 'Pyogenes_gene'):
    normed[gene_column] = reads.loc[:, gene_column]

count_block = reads.iloc[:, 2:numSamples]
# The 1-D array of column means is broadcast across the rows, so each column
# is divided by the mean at the same column position.
normed[normed.columns[2:]] = count_block / meanReads.values * 500


In [None]:
# log2 fold change of every sample relative to the 'pDNA' column of the
# mean-normalised counts. fc holds the two gene columns followed by one
# column per non-pDNA column of reads.
fc = pd.DataFrame(index=reads.index, columns=reads.columns[reads.columns != 'pDNA'])
# Copy the gene columns by name rather than by position, so a different
# column order in the input cannot silently mislabel them.
fc['Aureus_gene'] = reads['Aureus_gene']
fc['Pyogenes_gene'] = reads['Pyogenes_gene']

numFCsamples = fc.shape[1] - 2  # number of columns for which to calculate FC

# Pick the sample columns and the pDNA reference by label instead of assuming
# that 'pDNA' is at column position 2 of normed. When it is at position 2 the
# result is identical to the positional version.
fc_sample_cols = fc.columns[2:]
# normed was created without a dtype, so cast to float before taking logs.
sample_norm = normed[fc_sample_cols].astype(float)
reference_norm = normed['pDNA'].astype(float)
# axis=0 aligns the reference with the rows, dividing each row by its own
# pDNA value.
fc[fc_sample_cols] = np.log2(sample_norm.div(reference_norm, axis=0))
fc.head()


In [None]:
## AAVS1 is control in this study
# Element-wise copy of fc into a fresh frame with the same labels.
fc_base = pd.DataFrame(index=fc.index, columns=fc.columns)
fc_base.iloc[:] = fc.iloc[:]

# Relabel the AAVS1 control as 'control' in both gene columns.
for gene_column in ('Pyogenes_gene', 'Aureus_gene'):
    is_aavs1 = fc_base[gene_column] == "AAVS1"
    fc_base.loc[is_aavs1, gene_column] = 'control'

# The first two columns are renamed to GENE1 and GENE2.
first_col, second_col = fc_base.columns[:2]
fc_base = fc_base.rename(columns={first_col: 'GENE1', second_col: 'GENE2'})
fc_base.head()


In [None]:
## Merge replicates by mean
cells = [
    'Meljuso', 'GI1_004', 'MEL202_003', 'PK1', 'MEWO', 'HS944T',
    'IPC298', 'A549', 'HSC5', 'HS936T', 'PATU8988S',
]
cols = ['GENE1', 'GENE2'] + cells

# One row per construct: the two gene labels plus one float column per cell line.
fc_merge = pd.DataFrame(columns=cols, index=fc_base.index, dtype=float)
fc_merge['GENE1'] = fc_base['GENE1']
fc_merge['GENE2'] = fc_base['GENE2']

for cell in cells:
    # Replicate columns are those whose name contains the cell-line name.
    samples = [column for column in fc_base.columns if cell in column]
    fc_merge[cell] = fc[samples].mean(axis=1)

fc_merge

In [None]:
# Boolean masks marking a 'control' label in either gene column.
gene1_is_ctrl = fc_merge.GENE1 == 'control'
gene2_is_ctrl = fc_merge.GENE2 == 'control'

# Positional row indices of the matching rows.
is_ctrl = where(gene1_is_ctrl | gene2_is_ctrl)[0]
is_ctrl1 = where(gene1_is_ctrl)[0]
is_ctrl2 = where(gene2_is_ctrl)[0]

# Rows with a control in one column, grouped by the label in the other column.
smf_guide1 = fc_merge.iloc[is_ctrl2].groupby('GENE1')
smf_guide2 = fc_merge.iloc[is_ctrl1].groupby('GENE2')

# Per-gene mean over the grouped rows (numeric columns only).
smf_gene1 = smf_guide1.mean(numeric_only=True)
smf_gene2 = smf_guide2.mean(numeric_only=True)


In [None]:
# Left-join the two per-gene tables on their index. Overlapping columns from
# smf_gene1 (rows with 'control' in GENE2, i.e. GENE-CTRL) get the suffix
# '_Aposn'; those from smf_gene2 (rows with 'control' in GENE1, i.e.
# CTRL-GENE) get '_Bposn'. For example A549_Aposn and A549_Bposn.
smf_gene = smf_gene1.join(
    smf_gene2,
    how='left',
    lsuffix='_Aposn',
    rsuffix='_Bposn',
)
smf_gene

In [None]:
# For each cell line, scatter the '_Aposn' values against the '_Bposn'
# values, add a dashed red y = x reference line and print their Pearson
# correlation.
for cell in cells:
    a_col = cell + '_Aposn'
    b_col = cell + '_Bposn'
    f, ax = pyplot.subplots(figsize=(10,5))
    sns.scatterplot(data=smf_gene, x=a_col, y=b_col, ax=ax)
    ax.plot([-4,1],[-4,1],'r--')
    correlation = stats.pearsonr(smf_gene[a_col], smf_gene[b_col])
    print(f"{cell}: {correlation}")

In [None]:
# Replace the two positional columns of each cell line by a single column
# holding their row-wise mean.
for cell in cells:
    positional_cols = [cell + '_Aposn', cell + '_Bposn']
    smf_gene[cell] = smf_gene[positional_cols].mean(axis=1)
    smf_gene = smf_gene.drop(columns=positional_cols)

smf_gene

In [None]:
# Unique gene pairs among the rows where neither gene is the control.
non_control = (fc_merge['GENE1'] != "control") & (fc_merge['GENE2'] != "control")
pairs = fc_merge.loc[non_control, ["GENE1", "GENE2"]].drop_duplicates(keep='first')

# GENE1_GENE2 joins the two gene names in alphabetical order, so a pair gets
# the same label whichever gene is in which column. The labels are built in
# one pass instead of writing a placeholder column and overwriting it row by
# row through .loc.
pair_labels = [
    "_".join(sorted((g1, g2)))
    for g1, g2 in zip(pairs["GENE1"], pairs["GENE2"])
]
pairs.insert(2, "GENE1_GENE2", pair_labels, True)

print(len(pairs))  # distinct (GENE1, GENE2) combinations
# Keep only the first row of every order-independent label.
pairs = pairs.drop_duplicates(subset="GENE1_GENE2", keep="first")
print(len(pairs))  # distinct pairs regardless of gene order


In [None]:
# Empty (all-NaN) float table for the dLFC values: one row per entry of
# pairs, labelled GENE1 + "_" + GENE2, and one column per column of fc_merge
# after the two gene columns.
pair_row_labels = (pairs.GENE1 + "_" + pairs.GENE2).tolist()
dLFC = pd.DataFrame(index=pair_row_labels, columns=fc_merge.columns[2:], dtype=float)


In [None]:
# For every gene pair: take the column-wise median over all fc_merge rows
# that carry the pair (in either gene order) and subtract the sum of the two
# genes' rows in smf. The result is stored in the pair's row of dLFC.
smf = smf_gene
for pair_idx in pairs.index:
    pair_row = pairs.loc[pair_idx]
    g1, g2 = pair_row.GENE1, pair_row.GENE2

    forward = (fc_merge.GENE1 == g1) & (fc_merge.GENE2 == g2)
    reverse = (fc_merge.GENE1 == g2) & (fc_merge.GENE2 == g1)
    expt_idx = list(where(forward | reverse)[0])
    if not expt_idx:
        # No row carries this pair; its dLFC row stays NaN.
        continue

    expt = fc_merge.iloc[expt_idx]
    smf_sum = smf.loc[g1] + smf.loc[g2]
    genepair = g1 + "_" + g2
    dLFC.loc[genepair] = expt.median(axis=0, numeric_only=True) - smf_sum

In [None]:
# All-NaN float table with the same row and column labels as dLFC.
zdLFC = pd.DataFrame(float('nan'), index=dLFC.index, columns=dLFC.columns, dtype=float)


In [None]:

# Z-score each column of dLFC using the mean and standard deviation of its
# central values only: those strictly between the 2.5th and the 97.5th
# percentile of the column.
percentile = 2.5
for col in zdLFC.columns:
    scores = dLFC[col]
    lower_cut = np.percentile(scores, percentile)
    upper_cut = np.percentile(scores, 100 - percentile)

    # Original author's note: the distribution is treated as a truncated normal.
    in_centre = ((scores > lower_cut) & (scores < upper_cut)).to_numpy()
    central = scores[in_centre]
    mu = central.mean()
    std = central.std()

    zdLFC[col] = (scores - mu) / std

In [None]:
# Display the zdLFC table as the cell output.
zdLFC

In [None]:
## Compare the distribution of the zdLFC values of each cell line with the
# standard normal density, using Gaussian kernel-density estimates.
# Kernel density estimation is a non-parametric way to estimate the
# probability density function of a random variable. gaussian_kde includes
# automatic bandwidth determination; it works best for unimodal
# distributions, while bimodal or multi-modal ones tend to be oversmoothed.
xx = linspace(-10,10,500)

# Cell lines to draw, in plotting order (the order determines which colour
# each curve gets from the colour cycle).
# NOTE(review): 'Meljuso' is listed in `cells` but is not plotted here;
# confirm whether that omission is intentional.
kde_cell_lines = [
    'A549', 'MEL202_003', 'GI1_004', 'PK1', 'MEWO',
    'HS944T', 'IPC298', 'PATU8988S', 'HSC5', 'HS936T',
]

# Fit every estimate once, before the figure is created.
kdes = {name: stats.gaussian_kde(zdLFC[name]) for name in kde_cell_lines}

figure( figsize(5,4) )
plot( xx, stats.norm.pdf( xx), label='normal', linewidth=4 )
for name in kde_cell_lines:
    plot( xx, kdes[name].evaluate(xx), label=name)

legend(loc=2)
#savefig('normfit-of-zdLFC.pdf')
show()


In [None]:
def reindex_alphbetically(df):
    """Return the index labels of *df* with their two parts sorted.

    Every index label is expected to be a string of the form ``"A_B"``.
    The two parts are put in ascending string order, so ``"B_A"`` and
    ``"A_B"`` both become ``"A_B"``.

    Args:
        df: DataFrame whose index holds the ``"A_B"`` labels. Only the
            index is read; the frame itself is not modified.

    Returns:
        A list of reordered labels, one per row, in row order.

    Raises:
        ValueError: If a label does not split into exactly two parts
            on ``"_"``.
    """
    result = []
    # Only the labels are needed, so iterate the index directly instead of
    # materialising every row with iterrows().
    for label in df.index:
        # Unpacking into two names keeps the strict "exactly two parts" check.
        first, second = label.split('_')
        result.append('_'.join(sorted((first, second))))
    return result


# Relabel the rows of zdLFC with the alphabetically ordered labels, then
# write the table to CSV with the index as the first column.
zdLFC.index = reindex_alphbetically(zdLFC)
output_path = "zdLFC Output/ITO.csv"
zdLFC.to_csv(output_path, index=True)  # index=False would leave the labels out