In [44]:
import re
import pandas as pd

In [45]:
# Directorio con la carpeta donde se encuentran los datos. Cambiar si fuera necesario.
work_folder = '/Directorio/SL_Data/'

In [46]:
# Genes esenciales (Selección negativa) y supresores (Selección positiva) inferidos del análisis de MAGeCKFlute.

# PatuT. Genes esenciales:
PatuTEss = ['PRMT3','ASXL1','HELLS','SMARCE1','KDM4B','EPC2']
# PatuT. Genes supresores:
PatuTSup = ['WHSC1','SUV39H2']

# PatuS. Genes esenciales:
PatuSEss = ['CHD7','BRD4','ING5']
# PatuS. Genes supresores:
PatuSSup = ['PHF23','UBE2A']

In [47]:
# Función que comprueba si de dentre los genes de una lista de parejas de SL se encuentran los introducidos, y devuelve la lista de parejas completa.
def SLfinder(df, gene):
    n2 = df.loc[df['n1.name'] == gene, 'n2.name']
    n1 = df.loc[df['n2.name'] == gene, 'n1.name']
    n = list(n2) + list(n1)
    if len(n) > 0:
        return n
    else:
        return 'No SL pairs Found'

In [48]:
# Función que aplica la función SLfiner() a la lista de genes definidos anteriormente 
# usando los datos introducidos de una base de datos concreta. Después guarda las parejas
# encontradas por cada gen en un archivo .csv.
def SLcsv(df,db):
    SL_results = {}

    print(f'-------SL pairs for Essential PatuT Genes in {db}:')
    print()
    for gene in PatuTEss:
        SLpairsTEssAll = SLfinder(df, gene)
        print(f'SL pairs in {db} for {gene}: {SLpairsTEssAll}')
        if isinstance(SLpairsTEssAll, list):
            SL_results[gene] = SLpairsTEssAll
        print()
    
    print(f'-------SL pairs for Suppressor PatuT Genes in {db}:')
    print()
    for gene in PatuTSup:
        SLpairsTSupAll = SLfinder(df, gene)
        print(f'SL pairs in {db} for {gene}: {SLpairsTSupAll}')
        if isinstance(SLpairsTSupAll, list):
            SL_results[gene] = SLpairsTSupAll
        print()

    print(f'-------SL pairs for Essential PatuS Genes in {db}:')
    print()
    for gene in PatuSEss:
        SLpairsSEssAll = SLfinder(df, gene)
        print(f'SL pairs in {db} for {gene}: {SLpairsSEssAll}')
        if isinstance(SLpairsSEssAll, list):
            SL_results[gene] = SLpairsSEssAll
        print()

    print(f'-------SL pairs for Suppressor PatuS Genes in {db}:')
    print()
    for gene in PatuSSup:
        SLpairsSSupAll = SLfinder(df, gene)
        print(f'SL pairs in {db} for {gene}: {SLpairsSSupAll}')
        if isinstance(SLpairsSSupAll, list):
            SL_results[gene] = SLpairsSSupAll
        print()

    if len(SL_results) != 0:

        SL_results = pd.DataFrame.from_dict(SL_results, orient = 'index').T
        SL_results = SL_results.map(lambda x: '' if x is None else x)
        SL_results.to_csv(f'{work_folder}Candidate_SL_Pairs/{db}_SLPairs.csv', header = True, index = False)

# SynLethDB

In [49]:
# Carga de los datos de SynLethDB.
SLdb = pd.read_csv(f'{work_folder}SynLethDB/Human_SL.csv')
print(SLdb)

       n1.name  n1.identifier  n2.name  n2.identifier r.cell_line r.pubmed_id  \
0      TMEM35A          59353     PTEN           5728         NaN    26427375   
1      TMEM35A          59353    BRCA2            675         NaN    26427375   
2        CIDEA           1149     NAE1           8883        A375    23100467   
3         FHL5           9457    OPRK1           4986         NaN    25171417   
4         FHL5           9457    GHRHR           2692         NaN    25171417   
...        ...            ...      ...            ...         ...         ...   
35938    PRKCQ           5588  SLC22A1           6580   K562;K562    20609354   
35939   RNF19A          25897     TP53           7157         NaN    26427375   
35940  SLC10A7          84068      ATM            472         NaN    26427375   
35941     SVIL           6840     PTEN           5728         NaN    26427375   
35942     SVIL           6840     CAV2            858         NaN    25171417   

                       r.so

In [50]:
# Obtención de una lista con todos los posibles orígenes de cada pareja registarda en SynLethDB.
SLdbSource = SLdb['r.source'].astype('category')
Source_list = SLdbSource.cat.categories.tolist()

Source_types = set()

# División de aquellas entradas con dos o más fuentes para obtenerlas por separado.
for source in Source_list:
    
    sources = re.split(r'[;,]', source)
    
    Source_types.update(s.strip() for s in sources)

for s in Source_types:
    print(s)

Text Mining
CRISPR/CRISPRi
GenomeRNAi
Decipher
High Throughput|Low Throughput
Low Throughput
Synlethality
Drug Screen
High Throughput
Computational Prediction
Daisy
RNAi Screen


In [51]:
# Clasificación de las fuentes computacionales:
CompSources = ['Computational Prediction','Decipher;Computational Prediction','Text Mining;Daisy']

# Clasificación de las fuentes bibliográficas y/o experimentales:
ExpSources = ['CRISPR/CRISPRi','Decipher', 'Decipher;Text Mining', 'Drug Screen', 'GenomeRNAi', 'GenomeRNAi;Decipher', 'GenomeRNAi;Text Mining', 'High Throughput', 
              'High Throughput|Low Throughput', 'Low Throughput', 'RNAi Screen', 'Synlethality', 'Synlethality;Decipher', 'Synlethality;GenomeRNAi', 'Synlethality;Text Mining', 
              'Text Mining','Text Mining;Synlethality']

# Filtrado de los datos de SynLethDB según el origen de sus datos.
# Fuentes computacionales:
SLdbComp = SLdb[SLdb['r.source'].isin(CompSources)]
print(SLdbComp)
print(SLdbComp['r.source'].astype('category'))
# Fuentes bibliográficas/experimentales:
SLdbExp = SLdb[SLdb['r.source'].isin(ExpSources)]
print(SLdbExp)
print(SLdbExp['r.source'].astype('category'))

       n1.name  n1.identifier  n2.name  n2.identifier r.cell_line r.pubmed_id  \
0      TMEM35A          59353     PTEN           5728         NaN    26427375   
1      TMEM35A          59353    BRCA2            675         NaN    26427375   
3         FHL5           9457    OPRK1           4986         NaN    25171417   
4         FHL5           9457    GHRHR           2692         NaN    25171417   
5         FHL5           9457  SLC13A1           6561         NaN    25171417   
...        ...            ...      ...            ...         ...         ...   
35934    NUP88           4927    DHX36         170506         NaN    31340155   
35939   RNF19A          25897     TP53           7157         NaN    26427375   
35940  SLC10A7          84068      ATM            472         NaN    26427375   
35941     SVIL           6840     PTEN           5728         NaN    26427375   
35942     SVIL           6840     CAV2            858         NaN    25171417   

                       r.so

In [52]:
SLcsv(SLdbComp,'SynLethDB_Comp')

-------SL pairs for Essential PatuT Genes in SynLethDB_Comp:

SL pairs in SynLethDB_Comp for PRMT3: No SL pairs Found

SL pairs in SynLethDB_Comp for ASXL1: ['BRCA1']

SL pairs in SynLethDB_Comp for HELLS: ['RFC2', 'TOPBP1', 'PCNA', 'ZNF107', 'SFPQ', 'FEN1']

SL pairs in SynLethDB_Comp for SMARCE1: ['LRRC37A3']

SL pairs in SynLethDB_Comp for KDM4B: ['SMARCC2', 'MED16', 'POGZ', 'DAZAP1']

SL pairs in SynLethDB_Comp for EPC2: No SL pairs Found

-------SL pairs for Suppressor PatuT Genes in SynLethDB_Comp:

SL pairs in SynLethDB_Comp for WHSC1: No SL pairs Found

SL pairs in SynLethDB_Comp for SUV39H2: ['ZWINT', 'RAD51', 'RBM17', 'KIF11', 'GTPBP4', 'ANKRD16', 'ATP5F1C']

-------SL pairs for Essential PatuS Genes in SynLethDB_Comp:

SL pairs in SynLethDB_Comp for CHD7: ['KCNJ8', 'HBG1', 'PRTN3']

SL pairs in SynLethDB_Comp for BRD4: ['PRKCSH', 'GGA3', 'CCNT1', 'SIN3B', 'TP53', 'TNK2']

SL pairs in SynLethDB_Comp for ING5: ['MRPS25']

-------SL pairs for Suppressor PatuS Genes in SynLethDB

In [53]:
SLcsv(SLdbExp,'SynLethDB_Exp')

-------SL pairs for Essential PatuT Genes in SynLethDB_Exp:

SL pairs in SynLethDB_Exp for PRMT3: No SL pairs Found

SL pairs in SynLethDB_Exp for ASXL1: ['MUS81']

SL pairs in SynLethDB_Exp for HELLS: No SL pairs Found

SL pairs in SynLethDB_Exp for SMARCE1: No SL pairs Found

SL pairs in SynLethDB_Exp for KDM4B: No SL pairs Found

SL pairs in SynLethDB_Exp for EPC2: No SL pairs Found

-------SL pairs for Suppressor PatuT Genes in SynLethDB_Exp:

SL pairs in SynLethDB_Exp for WHSC1: No SL pairs Found

SL pairs in SynLethDB_Exp for SUV39H2: ['KRAS', 'NRAS', 'HRAS']

-------SL pairs for Essential PatuS Genes in SynLethDB_Exp:

SL pairs in SynLethDB_Exp for CHD7: ['KRAS', 'NRAS', 'HRAS']

SL pairs in SynLethDB_Exp for BRD4: ['MYC', 'PTAR1', 'PARP1', 'NAE1', 'KMT2D', 'MSH6', 'KDM5C', 'PTEN', 'MAP2K1', 'FGFR3', 'RB1', 'DHFR', 'CHEK2', 'CDC7', 'BRCA1', 'CDK9', 'CHEK1', 'WEE1']

SL pairs in SynLethDB_Exp for ING5: ['CDC7', 'TUBA1A', 'MAP2K1', 'CDK9']

-------SL pairs for Suppressor PatuS Gen

# Slorth

In [54]:
# Carga de los datos de Slorth:
SLslorth = pd.read_csv(f'Slorth/h.sapiens_ssl_predictions.csv', sep = '\t', header = None)
print(SLslorth)

               0       1                2                3           4  \
0          CHEK1    MTOR  ENSG00000149554  ENSG00000198793  H. sapiens   
1            ADA   CHEK1  ENSG00000196839  ENSG00000149554  H. sapiens   
2          BRCA1   PARP1  ENSG00000012048  ENSG00000143799  H. sapiens   
3          BRCA2   PARP1  ENSG00000139618  ENSG00000143799  H. sapiens   
4       C10orf76   PTAR1  ENSG00000120029  ENSG00000188647  H. sapiens   
...          ...     ...              ...              ...         ...   
518631       TPR   U2AF1  ENSG00000047410  ENSG00000160201  H. sapiens   
518632       TPR   VTI1A  ENSG00000047410  ENSG00000151532  H. sapiens   
518633      TSC2   U2AF1  ENSG00000103197  ENSG00000160201  H. sapiens   
518634      TSC2   VTI1A  ENSG00000103197  ENSG00000151532  H. sapiens   
518635      TSC2  ZBTB16  ENSG00000103197  ENSG00000109906  H. sapiens   

              5                          6         7  
0       BioGRID  2208318, 2342099, 2342170  3.000000  
1

In [55]:
# Filtrado de las parejas de Slorth con un origen experimental (BioGRID).
SLslorthExp = SLslorth[SLslorth[5] == 'BioGRID']
print(SLslorthExp)

# Filtrado de las parejas de Slorth con un origen computacional.
SLslorthComp = SLslorth[SLslorth[5] == 'Slorth']
print(SLslorthComp)

            0      1                2                3           4        5  \
0       CHEK1   MTOR  ENSG00000149554  ENSG00000198793  H. sapiens  BioGRID   
1         ADA  CHEK1  ENSG00000196839  ENSG00000149554  H. sapiens  BioGRID   
2       BRCA1  PARP1  ENSG00000012048  ENSG00000143799  H. sapiens  BioGRID   
3       BRCA2  PARP1  ENSG00000139618  ENSG00000143799  H. sapiens  BioGRID   
4    C10orf76  PTAR1  ENSG00000120029  ENSG00000188647  H. sapiens  BioGRID   
..        ...    ...              ...              ...         ...      ...   
928    TUBA1A  YWHAZ  ENSG00000167552  ENSG00000164924  H. sapiens  BioGRID   
929     VEGFA    VHL  ENSG00000112715  ENSG00000134086  H. sapiens  BioGRID   
930      WEE1   WEE1  ENSG00000166483  ENSG00000166483  H. sapiens  BioGRID   
931      WEE1    WRN  ENSG00000166483  ENSG00000165392  H. sapiens  BioGRID   
932      WEE1  XRCC3  ENSG00000166483  ENSG00000126215  H. sapiens  BioGRID   

                             6    7  
0    2208318,

In [56]:
# Función que ajusta el formato de las parejas para la función SLcsv().
def SLcolumns(SLdf):
    SLdf = SLdf.iloc[:,[0,1,7]]
    SLdf.columns = ['n1.name','n2.name','score']
    return SLdf

SLslorthExp = SLcolumns(SLslorthExp)
print(SLslorthExp)

SLslorthComp = SLcolumns(SLslorthComp)
print(SLslorthComp)

      n1.name n2.name  score
0       CHEK1    MTOR    3.0
1         ADA   CHEK1    2.0
2       BRCA1   PARP1    2.0
3       BRCA2   PARP1    2.0
4    C10orf76   PTAR1    2.0
..        ...     ...    ...
928    TUBA1A   YWHAZ    1.0
929     VEGFA     VHL    1.0
930      WEE1    WEE1    1.0
931      WEE1     WRN    1.0
932      WEE1   XRCC3    1.0

[933 rows x 3 columns]
       n1.name n2.name     score
933      CDC27   MYLIP  0.735260
934      CDC27  NCAPD2  0.853135
935      CDC27  POLR3B  0.861576
936      CDC27    CDH1  0.783209
937      CDC27   VAMP3  0.706373
...        ...     ...       ...
518631     TPR   U2AF1  0.822218
518632     TPR   VTI1A  0.814406
518633    TSC2   U2AF1  0.816285
518634    TSC2   VTI1A  0.855900
518635    TSC2  ZBTB16  0.743188

[517703 rows x 3 columns]


In [57]:
# Filtrado de las fuentes computacionales de Slorth en base a su score de calidad
# (>= 0.75 --- High Quality Score)
SLslorthComp = SLslorthComp[SLslorthComp['score'] >= 0.75]
print(SLslorthComp)

       n1.name n2.name     score
934      CDC27  NCAPD2  0.853135
935      CDC27  POLR3B  0.861576
936      CDC27    CDH1  0.783209
938      CDC27  ARID1B  0.757375
939      CDC27  ANAPC4  0.898875
...        ...     ...       ...
518630     TPR    TSC2  0.823523
518631     TPR   U2AF1  0.822218
518632     TPR   VTI1A  0.814406
518633    TSC2   U2AF1  0.816285
518634    TSC2   VTI1A  0.855900

[364720 rows x 3 columns]


In [58]:
SLcsv(SLslorthExp,'SlorthExp')

-------SL pairs for Essential PatuT Genes in SlorthExp:

SL pairs in SlorthExp for PRMT3: No SL pairs Found

SL pairs in SlorthExp for ASXL1: No SL pairs Found

SL pairs in SlorthExp for HELLS: No SL pairs Found

SL pairs in SlorthExp for SMARCE1: No SL pairs Found

SL pairs in SlorthExp for KDM4B: No SL pairs Found

SL pairs in SlorthExp for EPC2: No SL pairs Found

-------SL pairs for Suppressor PatuT Genes in SlorthExp:

SL pairs in SlorthExp for WHSC1: No SL pairs Found

SL pairs in SlorthExp for SUV39H2: ['KRAS']

-------SL pairs for Essential PatuS Genes in SlorthExp:

SL pairs in SlorthExp for CHD7: No SL pairs Found

SL pairs in SlorthExp for BRD4: ['CDC7', 'CDK9', 'CHEK1', 'CHEK2', 'DHFR', 'FGFR3', 'KDM5C', 'MAP2K1', 'PTAR1', 'PTEN', 'RB1', 'WEE1', 'BRCA1']

SL pairs in SlorthExp for ING5: ['MAP2K1', 'TUBA1A', 'CDC7', 'CDK9']

-------SL pairs for Suppressor PatuS Genes in SlorthExp:

SL pairs in SlorthExp for PHF23: No SL pairs Found

SL pairs in SlorthExp for UBE2A: No SL pai

In [59]:
SLcsv(SLslorthComp,'SlorthComp')

-------SL pairs for Essential PatuT Genes in SlorthComp:

SL pairs in SlorthComp for PRMT3: ['DCTN1', 'PDLIM7']



SL pairs in SlorthComp for ASXL1: ['CDH1', 'RHOA', 'POLA1', 'SMAD4', 'PHB', 'CANX', 'SIRT1', 'PDCD6IP', 'ACTB', 'UBE2K', 'CHD4', 'POLD1', 'TXN', 'HSPA5', 'ASF1B', 'HIPK2', 'MAPK14', 'RAC1', 'SIN3B', 'PRKCB', 'HSPB1', 'MTA2', 'KDM1A', 'NFE2L2', 'HNRNPA2B1', 'TSG101', 'BMI1', 'KEAP1', 'JUND', 'EZR', 'RBL2', 'JUNB', 'SKI', 'E2F1', 'POU2F1', 'SFPQ']

SL pairs in SlorthComp for HELLS: ['PSEN1', 'MAP3K1', 'ACTB', 'HSPA8', 'PRKACA', 'UBE2K', 'APEX1', 'HSPA5', 'RPS6KA1', 'CSNK1A1', 'IKBKB', 'HIPK2', 'SUPT16H', 'AURKA', 'HSPB1', 'SMARCE1', 'KDM1A', 'NFE2L2', 'TNK2', 'TSG101', 'AHR', 'SFPQ', 'TNFAIP3']

SL pairs in SlorthComp for SMARCE1: ['PTPN23', 'THOC1', 'PSEN1', 'XPO1', 'HUWE1', 'SNAP23', 'MAP3K1', 'JAK2', 'SMARCB1', 'GRAP2', 'SNW1', 'PSMB5', 'HCK', 'NOP56', 'POLA1', 'NUP93', 'RNF40', 'SNRNP70', 'FBL', 'CDK6', 'MET', 'UBE2R2', 'EIF3A', 'UBTF', 'RPS6KB1', 'UBE2D3', 'FBXW7', 'ARPC3', 'CDKN1B', 'RRP9', 'NCL', 'SPTBN1', 'SOS1', 'MSH6', 'RPF1', 'ARID1A', 'RPA2', 'TJP2', 'COPS5', 'TUBA1B', 'CNOT1

# DAISY

In [60]:
# Carga de los datos de DAISY.
SLdaisy = pd.read_csv(f'{work_folder}DAISY/DAISY_SLpairs.csv')
print(SLdaisy)

      Gene A   Gene B
0      ACAP1     DEF6
1      ACAP1   GIMAP1
2      ACAP1   MAP4K1
3      ACAP1   SEMA4A
4        ACD  SMARCC2
...      ...      ...
2811  ZNF407    MEOX2
2812  ZC3H7B    MUTYH
2813  ZNF593  MYBBP1A
2814  ZC3H7B    RNF40
2815   ZWINT     SKP2

[2816 rows x 2 columns]


In [61]:
SLdaisy = SLdaisy.rename(columns = {'Gene A':'n1.name','Gene B':'n2.name'})
print(SLdaisy)

     n1.name  n2.name
0      ACAP1     DEF6
1      ACAP1   GIMAP1
2      ACAP1   MAP4K1
3      ACAP1   SEMA4A
4        ACD  SMARCC2
...      ...      ...
2811  ZNF407    MEOX2
2812  ZC3H7B    MUTYH
2813  ZNF593  MYBBP1A
2814  ZC3H7B    RNF40
2815   ZWINT     SKP2

[2816 rows x 2 columns]


In [62]:
SLcsv(SLdaisy,'DAISY')

-------SL pairs for Essential PatuT Genes in DAISY:

SL pairs in DAISY for PRMT3: No SL pairs Found

SL pairs in DAISY for ASXL1: No SL pairs Found

SL pairs in DAISY for HELLS: ['PCNA', 'RFC2', 'SFPQ', 'TOPBP1', 'ZNF107', 'FEN1']

SL pairs in DAISY for SMARCE1: No SL pairs Found

SL pairs in DAISY for KDM4B: ['POGZ', 'SMARCC2']

SL pairs in DAISY for EPC2: No SL pairs Found

-------SL pairs for Suppressor PatuT Genes in DAISY:

SL pairs in DAISY for WHSC1: ['EZH2', 'CDCA3']

SL pairs in DAISY for SUV39H2: ['RAD51']

-------SL pairs for Essential PatuS Genes in DAISY:

SL pairs in DAISY for CHD7: No SL pairs Found

SL pairs in DAISY for BRD4: ['CCNT1', 'GGA3', 'TNK2']

SL pairs in DAISY for ING5: ['MRPS25']

-------SL pairs for Suppressor PatuS Genes in DAISY:

SL pairs in DAISY for PHF23: No SL pairs Found

SL pairs in DAISY for UBE2A: No SL pairs Found



# BioGrid

In [63]:
# Carga de los datos de BioGRID-
Databg = pd.read_csv(f'{work_folder}BIOGRID-ORGANISM-4.4.227.tab3/BIOGRID-ORGANISM-Homo_sapiens-4.4.227.tab3.txt',sep = '\t')
print(Databg)

  Databg = pd.read_csv(f'{work_folder}BIOGRID-ORGANISM-4.4.227.tab3/BIOGRID-ORGANISM-Homo_sapiens-4.4.227.tab3.txt',sep = '\t')


         #BioGRID Interaction ID Entrez Gene Interactor A  \
0                            103                     6416   
1                            117                    84665   
2                            183                       90   
3                            278                     2624   
4                            418                     6118   
...                          ...                      ...   
1184828                  3585521                    22866   
1184829                  3585522                      273   
1184830                  3585523                    84445   
1184831                  3585524                    23032   
1184832                  3585525                    51562   

        Entrez Gene Interactor B  BioGRID ID Interactor A  \
0                           2318                   112315   
1                             88                   124185   
2                           2339                   106605   
3                      

In [64]:
print(Databg.columns)

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Publication Source',
       'Organism ID Interactor A', 'Organism ID Interactor B', 'Throughput',
       'Score', 'Modification', 'Qualifications', 'Tags', 'Source Database',
       'SWISS-PROT Accessions Interactor A', 'TREMBL Accessions Interactor A',
       'REFSEQ Accessions Interactor A', 'SWISS-PROT Accessions Interactor B',
       'TREMBL Accessions Interactor B', 'REFSEQ Accessions Interactor B',
       'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories',
       'Ontology Term Qualifier IDs', 'Ontology Term Qualifier Names',
       'Ontology

In [65]:
# Filtrado de las columnas que contienen las parejas de la interacción y la columna que contiene el tipo de interacción.
Databg = Databg[['Official Symbol Interactor A','Official Symbol Interactor B','Experimental System']]
print(Databg)

        Official Symbol Interactor A Official Symbol Interactor B  \
0                             MAP2K4                         FLNC   
1                               MYPN                        ACTN2   
2                              ACVR1                         FNTA   
3                              GATA2                          PML   
4                               RPA2                        STAT3   
...                              ...                          ...   
1184828                       CNKSR2                         EGFR   
1184829                         AMPH                         EGFR   
1184830                        LZTS2                         EGFR   
1184831                        USP33                         EGFR   
1184832                         MBIP                         EGFR   

              Experimental System  
0                      Two-hybrid  
1                      Two-hybrid  
2                      Two-hybrid  
3                      Two-

In [66]:
# Obtención de una lista con los tipos de interacción disponibles en BioGRID.
Exp_System = Databg['Experimental System'].astype('category')
Exp_System = Exp_System.unique().tolist()
print(Exp_System)

['Two-hybrid', 'Affinity Capture-Luminescence', 'Affinity Capture-Western', 'Reconstituted Complex', 'Biochemical Activity', 'FRET', 'Dosage Rescue', 'Co-purification', 'Protein-peptide', 'Co-localization', 'Affinity Capture-MS', 'Affinity Capture-RNA', 'Co-crystal Structure', 'Far Western', 'Phenotypic Enhancement', 'Phenotypic Suppression', 'Co-fractionation', 'Protein-RNA', 'Synthetic Rescue', 'Synthetic Lethality', 'Synthetic Growth Defect', 'PCA', 'Dosage Lethality', 'Negative Genetic', 'Dosage Growth Defect', 'Proximity Label-MS', 'Positive Genetic']


In [67]:
# Obtención de las interacciones de BioGRID correspondientes a SL.
SLbg = (((Databg[Databg['Experimental System'] == 'Synthetic Lethality']).reset_index(drop = True))\
        [['Official Symbol Interactor A', 'Official Symbol Interactor B']]).rename\
            (columns = {'Official Symbol Interactor A':'n1.name', 'Official Symbol Interactor B':'n2.name'})
print(SLbg)

     n1.name  n2.name
0      SEC14     SNCA
1       CKI1     SNCA
2      FBXW7    BUB1B
3      FBXW7   FANCD2
4      FBXW7    RAB4B
...      ...      ...
2162    TP53    AP3B2
2163    TP53   ANXA11
2164     ME2      ME3
2165    LRP8  SLC31A1
2166    LRP8  SLC7A11

[2167 rows x 2 columns]


In [68]:
SLcsv(SLbg,'BioGrid')

-------SL pairs for Essential PatuT Genes in BioGrid:

SL pairs in BioGrid for PRMT3: No SL pairs Found

SL pairs in BioGrid for ASXL1: No SL pairs Found

SL pairs in BioGrid for HELLS: No SL pairs Found

SL pairs in BioGrid for SMARCE1: No SL pairs Found

SL pairs in BioGrid for KDM4B: No SL pairs Found

SL pairs in BioGrid for EPC2: No SL pairs Found

-------SL pairs for Suppressor PatuT Genes in BioGrid:

SL pairs in BioGrid for WHSC1: No SL pairs Found

SL pairs in BioGrid for SUV39H2: ['KRAS']

-------SL pairs for Essential PatuS Genes in BioGrid:

SL pairs in BioGrid for CHD7: No SL pairs Found

SL pairs in BioGrid for BRD4: ['PTAR1', 'CHEK2', 'CHEK1', 'WEE1', 'CDC7', 'CDK9']

SL pairs in BioGrid for ING5: ['MAP2K1', 'CDC7', 'CDK9', 'TUBA1A']

-------SL pairs for Suppressor PatuS Genes in BioGrid:

SL pairs in BioGrid for PHF23: No SL pairs Found

SL pairs in BioGrid for UBE2A: No SL pairs Found



# GenomeRNAi

In [69]:
# Carga de los datos de GenomeRNAi
DataGRNAi = pd.read_csv(f'{work_folder}GenomeRNAi_v17_AllScreens/GenomeRNAi_v17_Homo_sapiens_clean.txt',sep = '\t')
print(DataGRNAi)

  DataGRNAi = pd.read_csv(f'{work_folder}GenomeRNAi_v17_AllScreens/GenomeRNAi_v17_Homo_sapiens_clean.txt',sep = '\t')


          #Stable ID Entrez ID Gene ID Gene Symbol  Reagent ID  \
0          GR00016-A    113510     NaN      HEL308          np   
1          GR00016-A     57531   57531       HACE1          np   
2          GR00016-A      9493    9493       KIF23          np   
3          GR00016-A     57410   57410       SCYL1          np   
4          GR00016-A     83858   83858      ATAD3B          np   
...              ...       ...     ...         ...         ...   
2416177  GR00402-S-2     85478   85478    NYD-SP28  SIR0014272   
2416178  GR00402-S-2      9402    9402       GRAP2  SIR0003731   
2416179  GR00402-S-2      4089    4089       SMAD4  SIR0001447   
2416180  GR00402-S-2    116337  116337       PANX3  SIR0014648   
2416181           //       NaN     NaN         NaN         NaN   

                        Score  \
0                        2.86   
1                        2.54   
2                        2.04   
3                        1.54   
4                        1.74   
...      

In [70]:
# Filtrado de las columnas que contienen el gen y el fenotipo inferido para cada uno como resultado de su screening.
DataGRNAi = DataGRNAi[['Gene Symbol','Phenotype']]
print(DataGRNAi)

        Gene Symbol                                          Phenotype
0            HEL308  Upregulation of Wnt/beta-catenin pathway after...
1             HACE1  Upregulation of Wnt/beta-catenin pathway after...
2             KIF23  Upregulation of Wnt/beta-catenin pathway after...
3             SCYL1                                               none
4            ATAD3B                                               none
...             ...                                                ...
2416177    NYD-SP28                                          no effect
2416178       GRAP2                                          no effect
2416179       SMAD4                                          no effect
2416180       PANX3                                          no effect
2416181         NaN                                                NaN

[2416182 rows x 2 columns]


In [71]:
# Lista con todos los fenotipos individuales de GenomeRNAi.
Phenotypes = (DataGRNAi['Phenotype'].astype('category')).unique().tolist()
for Phe in Phenotypes:
    print(Phe)

Upregulation of Wnt/beta-catenin pathway after WNT3A stimulation
none
nan
Phenotype
Wnt reporter downregulated
Wnt reporter upregulated
Synthetic lethal with Ras
Increased gamma-H2AX phosphorylation
Decreased viability with paclitaxel
Increased cell migration
Decreased cell migration
Decreased viability
Decreased melanin production
Upregulation of Wnt pathway
Downregulation of Wnt pathway after Wnt3A stimulation
Upregulation of Wnt pathway after Wnt3A stimulation
Increased apoptosis
Apoptosis resistance
Increased proliferation
Colony formation
Decreased nuclei size in G2M
Increased cell number in G2M, increased nuclei size in G2M
Increased cell number in G1, average or increased nuclei size in G1
Increased cell number in G1, small nuclei in G1
Increased cell number in G2M, increased number of polyploid cells, increased number of cells with high perimeter-to-area ratio nuclei
Increased cell number in G2M, increased number of polyploid cells
Increased cell number in S and G2M
Increased c

In [72]:
# Recopilación de los fenotipos implicados en algún tipo de interacción de SL.
Phenos = []
for Phe in Phenotypes:
    if (type(Phe) is str) and ('Synthetic lethal' in Phe):
        Phenos.append(Phe)
        print(Phe[22:])

print(Phenos)

Ras
cisplatin
gemcitabine
paclitaxel
imatinib mesylate
c-Myc after tamoxifen stimulation
MLN4924 (a NAE inhibitor)
vaccinia virus (VACV) infection
['Synthetic lethal with Ras', 'Synthetic lethal with cisplatin', 'Synthetic lethal with gemcitabine', 'Synthetic lethal with paclitaxel', 'Synthetic lethal with imatinib mesylate', 'Synthetic lethal with c-Myc after tamoxifen stimulation', 'Synthetic lethal with MLN4924 (a NAE inhibitor)', 'Synthetic lethal with vaccinia virus (VACV) infection']


In [73]:
# Filtrado de los datos de GenomeRNAi basado en los fenotipos de SL extraídos previamente.
SLGRNAi = DataGRNAi[DataGRNAi['Phenotype'].isin(Phenos)].reset_index(drop = True)
print(SLGRNAi)

     Gene Symbol                                          Phenotype
0           UBA2                          Synthetic lethal with Ras
1          DIDO1                          Synthetic lethal with Ras
2           AMFR                          Synthetic lethal with Ras
3         WDR42C                          Synthetic lethal with Ras
4         TMEM17                          Synthetic lethal with Ras
...          ...                                                ...
7249       PLCD1  Synthetic lethal with vaccinia virus (VACV) in...
7250        TBCA  Synthetic lethal with vaccinia virus (VACV) in...
7251      NHP2L1  Synthetic lethal with vaccinia virus (VACV) in...
7252       FOXM1  Synthetic lethal with vaccinia virus (VACV) in...
7253      PCOLCE  Synthetic lethal with vaccinia virus (VACV) in...

[7254 rows x 2 columns]


In [74]:
SLGRNAi['Phenotype'] = SLGRNAi['Phenotype'].apply(lambda x: x[22:])
SLGRNAi = (SLGRNAi.drop_duplicates()).rename(columns = {'Gene Symbol':'n1.name','Phenotype':'n2.name'})
print(SLGRNAi)

     n1.name                          n2.name
0       UBA2                              Ras
1      DIDO1                              Ras
2       AMFR                              Ras
3     WDR42C                              Ras
4     TMEM17                              Ras
...      ...                              ...
7249   PLCD1  vaccinia virus (VACV) infection
7250    TBCA  vaccinia virus (VACV) infection
7251  NHP2L1  vaccinia virus (VACV) infection
7252   FOXM1  vaccinia virus (VACV) infection
7253  PCOLCE  vaccinia virus (VACV) infection

[4219 rows x 2 columns]


In [75]:
SLcsv(SLGRNAi,'GenomeRNAi')

-------SL pairs for Essential PatuT Genes in GenomeRNAi:

SL pairs in GenomeRNAi for PRMT3: No SL pairs Found

SL pairs in GenomeRNAi for ASXL1: No SL pairs Found

SL pairs in GenomeRNAi for HELLS: No SL pairs Found

SL pairs in GenomeRNAi for SMARCE1: No SL pairs Found

SL pairs in GenomeRNAi for KDM4B: No SL pairs Found

SL pairs in GenomeRNAi for EPC2: No SL pairs Found

-------SL pairs for Suppressor PatuT Genes in GenomeRNAi:

SL pairs in GenomeRNAi for WHSC1: No SL pairs Found

SL pairs in GenomeRNAi for SUV39H2: ['Ras']

-------SL pairs for Essential PatuS Genes in GenomeRNAi:

SL pairs in GenomeRNAi for CHD7: ['Ras']

SL pairs in GenomeRNAi for BRD4: ['c-Myc after tamoxifen stimulation', 'MLN4924 (a NAE inhibitor)']

SL pairs in GenomeRNAi for ING5: No SL pairs Found

-------SL pairs for Suppressor PatuS Genes in GenomeRNAi:

SL pairs in GenomeRNAi for PHF23: No SL pairs Found

SL pairs in GenomeRNAi for UBE2A: ['cisplatin', 'MLN4924 (a NAE inhibitor)']



# Gold Standard

In [76]:
# Carga de los datos del GS:
SLgs = pd.read_csv(f'{work_folder}Gold_Standard/41467_2018_4647_MOESM4_ESM.csv')
print(SLgs)

        gene1       gene2 gene1 perturbation gene2 perturbation      PMID  \
0       MUS81       TTC31                mut              shRNA  24104479   
1       MUS81       ESPL1                mut              shRNA  24104479   
2       MUS81  ST6GALNAC3                mut              shRNA  24104479   
3       MUS81       MTMR6                mut              shRNA  24104479   
4       MUS81       OVCH1                mut              shRNA  24104479   
...       ...         ...                ...                ...       ...   
154702   ALAD      NDUFB1              sgRNA              sgRNA  28319085   
154703  MST1R      PFKFB4              sgRNA              sgRNA  28319085   
154704  CNTFR        NPR2              sgRNA              sgRNA  28319085   
154705    BSG         GPI              sgRNA              sgRNA  28319085   
154706    CA9        NPR2              sgRNA              sgRNA  28319085   

       cancer type tested  SL  
0                    COAD   0  
1          

  SLgs = pd.read_csv(f'{work_folder}Gold_Standard/41467_2018_4647_MOESM4_ESM.csv')


In [78]:
SLgs = ((SLgs[SLgs['SL'] == 1])[['gene1', 'gene2']]).rename(columns = {'gene1':'n1.name','gene2':'n2.name'})
print(SLgs)

       n1.name  n2.name
1        MUS81    ESPL1
6        MUS81    TRAF6
9        MUS81    TRIP6
17       MUS81  PLEKHH1
20       MUS81   CREBBP
...        ...      ...
135394  CAMK2A      OXT
135395   BIRC5    MMP14
135396   KCNQ4    KDM1A
135397  CAMK1G   PIK3CA
135398  PIK3R2    STMN1

[6033 rows x 2 columns]


In [79]:
SLcsv(SLgs,'Gold_Standard')

-------SL pairs for Essential PatuT Genes in Gold_Standard:

SL pairs in Gold_Standard for PRMT3: No SL pairs Found

SL pairs in Gold_Standard for ASXL1: ['MUS81']

SL pairs in Gold_Standard for HELLS: No SL pairs Found

SL pairs in Gold_Standard for SMARCE1: No SL pairs Found

SL pairs in Gold_Standard for KDM4B: No SL pairs Found

SL pairs in Gold_Standard for EPC2: No SL pairs Found

-------SL pairs for Suppressor PatuT Genes in Gold_Standard:

SL pairs in Gold_Standard for WHSC1: No SL pairs Found

SL pairs in Gold_Standard for SUV39H2: ['KRAS']

-------SL pairs for Essential PatuS Genes in Gold_Standard:

SL pairs in Gold_Standard for CHD7: No SL pairs Found

SL pairs in Gold_Standard for BRD4: ['MYC', 'MAP2K1', 'DHFR', 'FGFR3', 'KDM5C', 'PTEN', 'RB1', 'PTAR1', 'WEE1', 'CDC7', 'CDK9', 'CHEK1', 'CHEK2', 'WEE1', 'CDC7', 'CDK9', 'CHEK1', 'CHEK2', 'BRCA1']

SL pairs in Gold_Standard for ING5: ['CDC7', 'CDK9', 'MAP2K1', 'TUBA1A', 'CDC7', 'CDK9', 'MAP2K1', 'TUBA1A']

-------SL pairs for