In [1]:
# Import all packages
import os
from multiprocessing.pool import ThreadPool
import numpy as np
import pandas as pd
from Bio import Entrez
from IPython.display import display

In [2]:
# Global parameters / values

# Directories: raw input data and processed (auxiliary) output data
data_dir = "../data/"
dataAux_dir = "../data_aux/"

# CSV file names; the same name is used for the input file and the processed output file
Klaeger_filename, Huang_filename = "Klaeger.csv", "Huang.csv"
Annes100_filename, Annes500_filename = "Annes100.csv", "Annes500.csv"

# Email address handed to NCBI Entrez with each query (replace with an address registered with NCBI)
EntrezEmail = "example@gmail.com"

# Name of the column of official gene symbols that is added to every data frame
geneSymbolColumn = "GeneSymbol"

# Number of threads used for looking up official gene symbols via Entrez
# - Possible values
#     None: use a ThreadPool with the default number of threads, as returned by os.cpu_count()
#     1:    do the lookup on the main thread, without a ThreadPool
#     > 1:  use a ThreadPool of nThreads threads
# - Large values (say, > 50) may trigger server errors such as "HTTP Error 429: Too Many Requests"
nThreads = 50

In [3]:
# Gene symbol lookup functions

def searchGeneNames(term, email, useSingleIndirectMatch = True):
    '''
    Search official human gene names and aliases in the NCBI Gene database for a match to term,
    returning official names and IDs.
    Dependencies: Biopython

    Args:
    - term: str
        gene name / alias
    - email: str
        email registered with NCBI
    - useSingleIndirectMatch: bool
        If the Entrez Gene database query returns exactly 1 NCBI Gene ID, use that gene even if the term
        does not exactly match its official symbol or one of its aliases.

    Returns: dict: str -> list
        "names": list of matched official gene name(s)
        "ids": list of NCBI Gene IDs corresponding to matched official gene name(s)
    '''

    Entrez.email = email
    query = '(' + term + '[gene]) AND (Homo sapiens[orgn]) AND alive[prop] NOT newentry[gene]'

    # Close every Entrez handle as soon as it has been parsed so connections are not leaked
    handle = Entrez.esearch(db="gene", term=query)
    try:
        idList = Entrez.read(handle)['IdList']
    finally:
        handle.close()

    names, ids = [], []
    officialNameById = {}  # official symbol of every gene ID returned by the search
    for geneId in idList:
        handle = Entrez.esummary(db='gene', id=geneId)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()
        summary = record['DocumentSummarySet']['DocumentSummary'][0]
        name = summary['Name']
        aliases = summary['OtherAliases'].split(', ')
        officialNameById[geneId] = name
        # Keep only genes whose official symbol or one of whose aliases equals the term exactly
        if term in [name] + aliases:
            names.append(name)
            ids.append(geneId)

    # Fallback: no exact match, but the search was unambiguous (a single gene ID).
    # The summary fetched in the loop above is reused instead of being requested again.
    if useSingleIndirectMatch and len(names) == 0 and len(idList) == 1:
        onlyId = idList[0]
        ids.append(onlyId)
        names.append(officialNameById[onlyId])
    return({"names": names, "ids": ids})

def geneSymbolLookupFromSeries(series, email):
    '''
    Look up the official human gene symbol for the value stored under the 'Name' index of a pandas Series,
    by searching official gene names and aliases in the NCBI Gene database.

    Args
    - series: pandas.Series
        must have a 'Name' index
    - email: str
        email registered with NCBI

    Returns: str
      The empty string if nothing matched. If the queried name is itself one of the matched official
      symbols, that name; otherwise the first matched official gene symbol.
    '''

    query = series['Name']
    officialNames = searchGeneNames(query, email)["names"]
    if not officialNames:
        return("")
    return(query if query in officialNames else officialNames[0])

def multiThreadedSearchGeneNames(terms, email, nThreads = None):
    '''
    Search official human gene names and aliases in NCBI Gene database for terms, using a thread pool.

    Args
    - terms: list of str
        terms to lookup
    - email: str
        email registered with NCBI
    - nThreads: int
        None: Uses a ThreadPool of a default number of threads as returned by os.cpu_count()
        1+: Uses a ThreadPool of nThreads

    Returns: list of str
      One entry per term, in the same order as terms. Where no match was found, the entry is the empty
      string. Where the term is itself a matched official symbol, the entry is the term. Otherwise it is
      the first matched official gene symbol.
    '''

    def lookupSymbol(term):
        # Runs on a worker thread: query Entrez for one term and pick the symbol to report
        matchedNames = searchGeneNames(term, email)["names"]
        if len(matchedNames) == 0:
            return ""
        if term in matchedNames:
            return term
        return matchedNames[0]

    # Resolve the default explicitly so the reported count does not depend on a private pool attribute
    if nThreads is None:
        nThreads = os.cpu_count() or 1

    # The context manager releases the worker threads even if a lookup raises
    with ThreadPool(nThreads) as pool:
        print("Using {:d} threads...".format(nThreads))
        # pool.map blocks until every lookup has finished and preserves the order of terms
        gene_symbols = pool.map(lookupSymbol, terms)
    return(gene_symbols)

## Process data from Klaeger et al.

In [4]:
# Parameters
# - highConfidenceOnly: bool
#     If True, keep only "high confidence" protein-drug interactions.
#       "A protein was considered a high-confidence target if the binding curve showed a sigmoidal shape
#        with a dose-dependent decrease in binding to the Kinobeads." (Klaeger et al., Supplementary Materials)
# - na_values_klaeger: list of str
#     Values in the CSV file that are read in as np.nan
# - dtype_klaeger: dict: str -> dtype
#     dtype of each column of the data

highConfidenceOnly = True
na_values_klaeger = ["n.d."]
dtype_klaeger = dict(Name='str', OTSSP167=np.float64, CC401=np.float64)

In [5]:
# Read in data
# - The dtypes of the OTSSP167 and CC401 columns cannot be set yet because they still contain the string
#   "n.i." (not inhibited); they are cast to np.float64 in a later cell, once "n.i." has been replaced
df1 = pd.read_csv(os.path.join(data_dir, Klaeger_filename), na_values=na_values_klaeger)

In [6]:
# Drop every row (target) that has an NA value in any column
df1 = df1.dropna(axis=0, how='any')

In [7]:
# Remove low-confidence values if specified
# - Low-confidence values are the ones written in parentheses, e.g. "(123)"
# - The "(" is matched literally (regex=False); the previous pattern '\(' was an invalid escape sequence
#   in a non-raw string literal
if highConfidenceOnly:
    # Drop every row whose value for either drug is parenthesized
    for drug in ('OTSSP167', 'CC401'):
        isLowConfidence = df1[drug].str.contains('(', regex=False, na=False)
        df1.drop(df1.index[isLowConfidence], axis=0, inplace=True)
else:
    # Keep low-confidence rows, but strip the parentheses so the values can be parsed as numbers
    for drug in ('OTSSP167', 'CC401'):
        df1[drug] = df1[drug].str.strip('()')

In [8]:
# Replace "n.i." (not inhibited) with np.inf, then cast the columns to their final dtypes
df1 = df1.replace('n.i.', np.inf).astype(dtype_klaeger)

In [9]:
# Split combined target names into individual target names with their own row
# - Ex: 
#             Name  CC401  OTSSP167          Name  CC401  OTSSP167
#  CSNK2A1;CSNK2A3  747.0      11.0  -->  CSNK2A1  747.0      11.0
#                                         CSNK2A3  747.0      11.0
# - Implemented by repeating row positions instead of DataFrame.append (removed in pandas 2.0); this also
#   handles a combined name that occurs in more than one row
nameParts = df1['Name'].str.split(';')
partCounts = nameParts.str.len().values.astype(int)

# Rows with a single name come first (in their original order), followed by the rows produced from
# combined names; a stable sort on the "is combined" flag gives exactly that order
rowOrder = np.argsort(partCounts > 1, kind='mergesort')

# Repeat each row once per individual name and give every copy its own name
rowPositions = np.repeat(rowOrder, partCounts[rowOrder])
splitNames = [subName for parts in nameParts.iloc[rowOrder] for subName in parts]
df1 = df1.iloc[rowPositions].assign(Name=splitNames).reset_index(drop=True)

In [10]:
# Add official gene names as a new column
# - A thread pool is used unless exactly one thread was requested
useThreadPool = nThreads is None or nThreads > 1
if useThreadPool:
    klaegerSymbols = multiThreadedSearchGeneNames(df1['Name'].tolist(), EntrezEmail, nThreads)
else:
    klaegerSymbols = df1.apply(geneSymbolLookupFromSeries, axis=1, email=EntrezEmail)
df1[geneSymbolColumn] = klaegerSymbols

# Show rows where no official gene symbol was found
symbolMissing = df1[geneSymbolColumn] == ""
display(df1[symbolMissing])

Using 50 threads...


Unnamed: 0,Name,CC401,OTSSP167,GeneSymbol
267,Q6ZSR9,2529.0,155.0,


In [11]:
# For genes with missing official gene symbols, manually add official gene symbols
# NOTE(review): the original name is reused as the symbol here; Q6ZSR9 looks like a protein accession
# rather than a gene symbol — confirm this is intended
df1.loc[df1['Name'] == 'Q6ZSR9', geneSymbolColumn] = 'Q6ZSR9'

In [12]:
# Sort by official gene symbol, renumber the rows, and put the columns in their final order
klaegerColumns = [geneSymbolColumn, "Name", "OTSSP167", "CC401"]
df1 = df1.sort_values(by=geneSymbolColumn).reset_index(drop=True).loc[:, klaegerColumns]

In [13]:
# Quick verification of gene symbol lookups

# Rows whose official gene symbol differs from the original gene name
symbolDiffersFromName = df1['Name'] != df1[geneSymbolColumn]
display(df1[symbolDiffersFromName])

# Count rows that are exact duplicates of an earlier row (all columns equal)
nDuplicatedRows = df1.duplicated().sum()
print("Number of duplicated rows: " + str(nDuplicatedRows))

Unnamed: 0,GeneSymbol,Name,OTSSP167,CC401
81,COQ8A,ADCK3,inf,inf
116,EIF2S3B,EIF2S3L,inf,inf
150,GRK2,ADRBK1,inf,inf
186,MAP3K20,ZAK,101.0,inf
218,MTREX,SKIV2L2,inf,inf
338,SRPRA,SRPR,inf,inf
359,TMEM94,KIAA0195,inf,inf


Number of duplicated rows: 0


In [14]:
# Show the processed Klaeger et al. table
display(df1)

Unnamed: 0,GeneSymbol,Name,OTSSP167,CC401
0,AAK1,AAK1,172.000000,2608.000000
1,ABL1,ABL1,280.000000,inf
2,ABL2,ABL2,187.000000,inf
3,ACAD10,ACAD10,inf,inf
4,ACOX3,ACOX3,inf,inf
5,ACP1,ACP1,inf,inf
6,ACTR2,ACTR2,inf,inf
7,ACTR3,ACTR3,inf,inf
8,ACVR1,ACVR1,222.000000,inf
9,ACVR1B,ACVR1B,195.000000,inf


In [15]:
# Write the processed Klaeger et al. table to dataAux_dir, without the row index
df1.to_csv(os.path.join(dataAux_dir, Klaeger_filename), index=False)

## Process data from Huang et al.

In [16]:
# Read in the Huang et al. data
df2 = pd.read_csv(os.path.join(data_dir, Huang_filename))

In [17]:
# Drop every row (target) that has an NA value in any column
df2 = df2.dropna(axis=0, how='any')

In [18]:
# Add official gene names as a new column
# - A thread pool is used unless exactly one thread was requested
useThreadPool = nThreads is None or nThreads > 1
if useThreadPool:
    huangSymbols = multiThreadedSearchGeneNames(df2['Name'].tolist(), EntrezEmail, nThreads)
else:
    huangSymbols = df2.apply(geneSymbolLookupFromSeries, axis=1, email=EntrezEmail)
df2[geneSymbolColumn] = huangSymbols

# Show rows where no official gene symbol was found
symbolMissing = df2[geneSymbolColumn] == ""
display(df2[symbolMissing])

Using 50 threads...


Unnamed: 0,Name,OTSSP167,HTH01091,GeneSymbol
9,p38 alpha,10,84.0,
10,p38 beta,6,85.0,
11,p38 gamma,12,86.0,
12,p38 delta,4,89.0,
18,PKB beta,24,88.0,
21,PKA,12,112.0,
35,CAMKK beta,2,91.0,
42,GSK3 beta,0,86.0,
43,CDK2-Cyclin A,2,67.0,
44,CDK9-Cyclin T1,4,92.0,


In [19]:
# For genes with missing official gene symbols, manually add official gene symbols
# - Maps the name used in the data set to the symbol to store; every row whose 'Name' is a key of this
#   table has its gene symbol overwritten with the mapped value
manualGeneSymbols = {
    'p38 alpha': 'MAPK14',
    'p38 beta': 'MAPK11',
    'p38 gamma': 'MAPK12',
    'p38 delta': 'MAPK13',
    'PKB beta': 'AKT2',
    'PKA': 'PRKACA',
    'CAMKK beta': 'CAMKK2',
    'GSK3 beta': 'GSK3B',
    'CDK2-Cyclin A': 'CDK2',
    'CDK9-Cyclin T1': 'CDK9',
    'Aurora A': 'AURKA',
    'Aurora B': 'AURKB',
    'AMPK (hum)': 'PRKAA1',
    'CK1 gamma 2': 'CSNK1G2',
    'CK1 delta': 'CSNK1D',
    'CK2': 'CSNK2A1',
    'IKK epsilon': 'IKBKE',
    'EF2K': 'EEF2K', # based on UniProt
    'MPSK1': 'STK16',
    'EPH-A2': 'EPHA2',
    'EPH-A4': 'EPHA4',
    'EPH-B1': 'EPHB1',
    'EPH-B2': 'EPHB2',
    'EPH-B3': 'EPHB3',
    'EPH-B4': 'EPHB4',
    'FGF-R1': 'FGFR1',
    'IGF-1R': 'IGF1R',
    'IR': '', # unknown
    'PINK': 'PINK1',
}

hasManualSymbol = df2['Name'].isin(list(manualGeneSymbols))
df2.loc[hasManualSymbol, geneSymbolColumn] = df2.loc[hasManualSymbol, 'Name'].map(manualGeneSymbols)

In [20]:
# Sort by official gene symbol, renumber the rows, and put the columns in their final order
huangColumns = [geneSymbolColumn, "Name", "OTSSP167", "HTH01091"]
df2 = df2.sort_values(by=geneSymbolColumn).reset_index(drop=True).loc[:, huangColumns]

In [21]:
# Quick verification of gene symbol lookups

# Rows whose official gene symbol differs from the original gene name (shown without row truncation)
symbolDiffersFromName = df2['Name'] != df2[geneSymbolColumn]
with pd.option_context('display.max_rows', None):
    display(df2[symbolDiffersFromName])

# Count rows that are exact duplicates of an earlier row (all columns equal)
nDuplicatedRows = df2.duplicated().sum()
print("Number of duplicated rows: " + str(nDuplicatedRows))

Unnamed: 0,GeneSymbol,Name,OTSSP167,HTH01091
0,,IR,2,100.0
1,ABL1,ABL,1,70.0
2,AKT1,PKB alpha,28,101.0
3,AKT2,PKB beta,24,88.0
4,AURKA,Aurora A,10,98.0
5,AURKB,Aurora B,14,35.0
10,CAMKK2,CAMKK beta,2,91.0
11,CDK2,CDK2-Cyclin A,2,67.0
12,CDK9,CDK9-Cyclin T1,4,92.0
13,CHEK1,CHK1,11,86.0


Number of duplicated rows: 0


In [22]:
# Show the processed Huang et al. table
display(df2)

Unnamed: 0,GeneSymbol,Name,OTSSP167,HTH01091
0,,IR,2,100.0
1,ABL1,ABL,1,70.0
2,AKT1,PKB alpha,28,101.0
3,AKT2,PKB beta,24,88.0
4,AURKA,Aurora A,10,98.0
5,AURKB,Aurora B,14,35.0
6,BRSK1,BRSK1,2,85.0
7,BRSK2,BRSK2,2,125.0
8,BTK,BTK,7,45.0
9,CAMK1,CAMK1,15,93.0


In [23]:
# Write the processed Huang et al. table to dataAux_dir, without the row index
df2.to_csv(os.path.join(dataAux_dir, Huang_filename), index=False)

## Process data from Annes et al.

In [24]:
# Parameters
# - condense_func: function
#     function used to condense the values of multiple variants (different phosphorylation states, mutants)
#     of the same kinase into a single value
condense_func = np.mean

In [25]:
# Read in the two Annes et al. data sets (Annes100 -> df3, Annes500 -> df4)
df3 = pd.read_csv(os.path.join(data_dir, Annes100_filename))
df4 = pd.read_csv(os.path.join(data_dir, Annes500_filename))

In [26]:
# Drop every row (target) that has an NA value in any column
df3 = df3.dropna(axis=0, how='any')
df4 = df4.dropna(axis=0, how='any')

In [27]:
def condenseDuplicatesByKey(df, keyCol, valueCol, func):
    '''
    Condense duplicate rows (based on the keyCol column) by applying a specified function to elements in the valueCol column.

    For every key that occurs in more than one row, the rows are replaced by a single row: a copy of the
    first of those rows with its valueCol entry set to func(values of valueCol over all of those rows).
    The input data frame is not modified.

    Args
    - df: pandas.DataFrame
    - keyCol: str
        column in which to look for duplicates
    - valueCol: str
        name of column of values to condense
    - func: function
        function to apply to duplicate values. Must take a pandas.Series (list-like) and return a single element

    Return: pandas.DataFrame
        If no key is duplicated, df itself is returned unchanged. Otherwise a new data frame with a fresh
        0..n-1 index: rows with unique keys first (original order), followed by one condensed row per
        duplicated key (ordered by first occurrence of the key).
    '''
    # Rows whose key occurs more than once; missing keys never compare equal, so they are left alone
    isDuplicated = df[keyCol].duplicated(keep=False) & df[keyCol].notna()
    if not isDuplicated.any():
        return(df)

    duplicatedRows = df[isDuplicated]

    # One condensed value per duplicated key
    condensedValues = {
        key: func(values)
        for key, values in duplicatedRows.groupby(keyCol, sort=False)[valueCol]
    }

    # Keep the first row of each duplicated key and overwrite its value with the condensed one
    condensedRows = duplicatedRows.drop_duplicates(subset=keyCol, keep='first').copy()
    condensedRows[valueCol] = condensedRows[keyCol].map(condensedValues)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    return(pd.concat([df[~isDuplicated], condensedRows], ignore_index=True))

In [28]:
# Collapse rows that share the same 'Name' into one row, condensing their OTSSP167 values with condense_func
df3 = condenseDuplicatesByKey(df3, "Name", "OTSSP167", condense_func)
df4 = condenseDuplicatesByKey(df4, "Name", "OTSSP167", condense_func)

In [29]:
# Add official gene names as a new column
# - A thread pool is used unless exactly one thread was requested
useThreadPool = nThreads is None or nThreads > 1
if useThreadPool:
    annes100Symbols = multiThreadedSearchGeneNames(df3['Name'].tolist(), EntrezEmail, nThreads)
    annes500Symbols = multiThreadedSearchGeneNames(df4['Name'].tolist(), EntrezEmail, nThreads)
else:
    annes100Symbols = df3.apply(geneSymbolLookupFromSeries, axis=1, email=EntrezEmail)
    annes500Symbols = df4.apply(geneSymbolLookupFromSeries, axis=1, email=EntrezEmail)
df3[geneSymbolColumn] = annes100Symbols
df4[geneSymbolColumn] = annes500Symbols

# Show rows where no official gene symbol was found
display(df3[df3[geneSymbolColumn] == ""])
display(df4[df4[geneSymbolColumn] == ""])

Using 50 threads...
Using 50 threads...


Unnamed: 0,DiscoveRx_Name,Name,OTSSP167,STF1285,GeneSymbol
231,NIM1,MGC42105,100.0,96.0,
250,PFCDPK1(P.falciparum),CDPK1,0.0,96.0,
251,PFPK5(P.falciparum),MAL13P1.279,100.0,99.0,
275,PKNB(M.tuberculosis),pknB,15.0,96.0,
294,QSK,KIAA0999,15.0,80.0,


Unnamed: 0,DiscoveRx_Name,Name,OTSSP167,STF1285,GeneSymbol
231,NIM1,MGC42105,100.0,100.0,
250,PFCDPK1(P.falciparum),CDPK1,0.0,56.0,
251,PFPK5(P.falciparum),MAL13P1.279,100.0,91.0,
275,PKNB(M.tuberculosis),pknB,15.0,78.0,
294,QSK,KIAA0999,15.0,70.0,


In [30]:
# For genes with missing official gene symbols, manually add official gene symbols
geneSymbolsMap = {
    "MGC42105": "NIM1K",
    "CDPK1": "PF3D7_0217500", # Genus/species: Plasmodium falciparum 3D7 - https://www.ncbi.nlm.nih.gov/gene/812762
    "MAL13P1.279": "PF3D7_1356900", # Genus/species: Plasmodium falciparum 3D7 - https://www.ncbi.nlm.nih.gov/gene/813841
    "pknB": "pknB", # Genus/species: Mycobacterium tuberculosis H37Rv - https://www.ncbi.nlm.nih.gov/gene/887072
    "KIAA0999": "SIK3"}

# Each data frame is matched against its own 'Name' column, and each row gets the symbol mapped from
# its own name, so the result does not depend on row order or on df3 and df4 sharing an index
for annesDf in (df3, df4):
    hasManualSymbol = annesDf["Name"].isin(list(geneSymbolsMap))
    annesDf.loc[hasManualSymbol, geneSymbolColumn] = annesDf.loc[hasManualSymbol, "Name"].map(geneSymbolsMap)

In [31]:
# Sort by official gene symbol, renumber the rows, and put the columns in their final order
annesColumns = [geneSymbolColumn, "Name", "OTSSP167", "STF1285"]
df3 = df3.sort_values(by=geneSymbolColumn).reset_index(drop=True).loc[:, annesColumns]
df4 = df4.sort_values(by=geneSymbolColumn).reset_index(drop=True).loc[:, annesColumns]

In [32]:
# Quick verification of gene symbol lookups

# Rows whose official gene symbol differs from the original gene name
annes100Differs = df3['Name'] != df3[geneSymbolColumn]
annes500Differs = df4['Name'] != df4[geneSymbolColumn]
display(df3[annes100Differs])
display(df4[annes500Differs])

# Count rows that are exact duplicates of an earlier row (all columns equal)
print("Number of duplicated rows: " + str(df3.duplicated().sum()))
print("Number of duplicated rows: " + str(df4.duplicated().sum()))

Unnamed: 0,GeneSymbol,Name,OTSSP167,STF1285
42,CDK11A,CDC2L2,11.0,100.0
70,COQ8A,CABC1,93.0,98.0
71,COQ8B,ADCK4,86.0,100.0
131,GRK2,ADRBK1,74.0,100.0
132,GRK3,ADRBK2,100.0,100.0
137,HASPIN,GSG2,1.2,0.4
184,MAP3K20,ZAK,41.0,97.0
242,NIM1K,MGC42105,100.0,96.0
254,PAK5,PAK7,8.2,70.0
259,PF3D7_0217500,CDPK1,0.0,96.0


Unnamed: 0,GeneSymbol,Name,OTSSP167,STF1285
42,CDK11A,CDC2L2,11.0,100.0
70,COQ8A,CABC1,93.0,85.0
71,COQ8B,ADCK4,86.0,97.0
131,GRK2,ADRBK1,74.0,100.0
132,GRK3,ADRBK2,100.0,66.0
137,HASPIN,GSG2,1.2,0.2
184,MAP3K20,ZAK,41.0,66.0
242,NIM1K,MGC42105,100.0,100.0
254,PAK5,PAK7,8.2,28.0
259,PF3D7_0217500,CDPK1,0.0,56.0


Number of duplicated rows: 0
Number of duplicated rows: 0


In [33]:
# Show the processed Annes100 table
display(df3)

Unnamed: 0,GeneSymbol,Name,OTSSP167,STF1285
0,AAK1,AAK1,6.200000,74.0
1,ABL1,ABL1,4.533333,59.0
2,ABL2,ABL2,72.000000,100.0
3,ACVR1,ACVR1,43.000000,100.0
4,ACVR1B,ACVR1B,82.000000,100.0
5,ACVR2A,ACVR2A,100.000000,100.0
6,ACVR2B,ACVR2B,100.000000,96.0
7,ACVRL1,ACVRL1,89.000000,100.0
8,AKT1,AKT1,97.000000,100.0
9,AKT2,AKT2,89.000000,100.0


In [34]:
# Show the processed Annes500 table
display(df4)

Unnamed: 0,GeneSymbol,Name,OTSSP167,STF1285
0,AAK1,AAK1,6.200000,21.0
1,ABL1,ABL1,4.533333,13.0
2,ABL2,ABL2,72.000000,78.0
3,ACVR1,ACVR1,43.000000,63.0
4,ACVR1B,ACVR1B,82.000000,100.0
5,ACVR2A,ACVR2A,100.000000,92.0
6,ACVR2B,ACVR2B,100.000000,78.0
7,ACVRL1,ACVRL1,89.000000,96.0
8,AKT1,AKT1,97.000000,99.0
9,AKT2,AKT2,89.000000,97.0


In [35]:
# Write the processed Annes tables to dataAux_dir, without the row index
df3.to_csv(os.path.join(dataAux_dir, Annes100_filename), index=False)
df4.to_csv(os.path.join(dataAux_dir, Annes500_filename), index=False)