In [1]:
# Import all packages
import numpy as np
import pandas as pd
from Bio import Entrez

In [8]:
## Parameters
highConfidenceOnly = True

In [9]:
## Read in data
df1 = pd.read_csv('Klaeger(2017)_Science.csv', na_values=["n.d."])

In [10]:
## remove n/a values
df1.drop(df1.index[pd.isnull(df1['OTS-167'])], axis=0, inplace=True)

In [12]:
## remove low-confidence values if necessary
if highConfidenceOnly:
    df1.drop(df1.index[df1['OTS-167'].str.contains('\(') == True], axis=0, inplace=True)
    df1.drop(df1.index[df1['CC-401'].str.contains('\(') == True], axis=0, inplace=True)
else:
    df1['OTS-167'] = df1['OTS-167'].str.strip('()')
    df1['CC-401'] = df1['CC-401'].str.strip('()')

## convert n.i. values to np.inf
df1.replace('n.i.', np.inf, inplace=True)
df1 = df1.astype({'OTS-167':np.float, 'CC-401':np.float})
df1.dtypes

Name              object
Kinase            object
Direct binder     object
CC-401           float64
OTS-167          float64
dtype: object

In [6]:
## check for duplicates
df1.index[df1.duplicated() == True]

Int64Index([], dtype='int64')

In [7]:
# separate combined names into individual names
for name in df1.loc[:,'Name']:
    names = name.split(';')
    if (len(names) > 1):
        series = df1.loc[df1.index[df1['Name'] == name]].squeeze()
        for sub_name in names:
            series['Name'] = sub_name
            df1 = df1.append(series, ignore_index=True)
        df1.drop(df1.index[df1['Name'] == name], axis=0, inplace=True)
df1

Unnamed: 0,Name,Kinase,Direct binder,CC-401,OTS-167
0,BMPR1B,yes,yes,inf,2.000000
1,GAK,yes,yes,1464.000000,5.000000
2,RIPK2,yes,yes,inf,5.000000
3,CSNK2A2,yes,yes,584.000000,10.000000
4,DYRK1A,yes,yes,161.000000,10.000000
5,CLK1,yes,yes,inf,11.000000
6,CSNK2B,no,no,741.000000,11.000000
7,DCAF7,no,no,28.000000,11.000000
8,IRAK3,yes,yes,inf,12.000000
9,BCR,no,yes,inf,14.000000


In [8]:
## check for duplicates
df1.index[df1.duplicated() == True]
df1.shape

(413, 5)

In [9]:
## get official names and IDs

for name in df1['Name']:
    Entrez.email = 'bentyeh@gmail.com'
    handle = Entrez.esearch(db="gene", term='(' + name + '[gene]) AND (Homo sapiens[orgn]) AND alive[prop] NOT newentry[gene]')
    record_search = Entrez.read(handle)
    if len(record_search['IdList']) == 1:
        handle = Entrez.esummary(db='gene', id=record_search['IdList'][0])
        record = Entrez.read(handle)
        df1.loc[df1['Name'] == name, 'id'] = record_search['IdList'][0]
        df1.loc[df1['Name'] == name, 'Name_official'] = record['DocumentSummarySet']['DocumentSummary'][0]['Name']
    else:
        flag_found = False
        for ID in record_search['IdList']:
            handle = Entrez.esummary(db='gene', id=ID)
            record = Entrez.read(handle)
            
            ## official symbol matches name
            if record['DocumentSummarySet']['DocumentSummary'][0]['Name'] == name:
                df1.loc[df1['Name'] == name, 'id'] = ID
                df1.loc[df1['Name'] == name, 'Name_official'] = name
                flag_found = True
                break
        
        if not flag_found:
            print(name + ' ' + ','.join(record_search['IdList']))

Q6ZSR9 


In [12]:
## manually fix names
df1.loc[df1['Name'] == 'Q6ZSR9', 'id'] = 'Q6ZSR9'
df1.loc[df1['Name'] == 'Q6ZSR9', 'Name_official'] = 'Q6ZSR9'

In [13]:
print(df1['Name_official'][df1['Name_official'] != df1['Name']])
print(df1['Name'][df1['Name_official'] != df1['Name']])
df1.index[df1.duplicated() == True]

45     MAP3K20
129      COQ8A
132       GRK2
244     TMEM94
354      MTREX
366      SRPRA
405    EIF2S3B
Name: Name_official, dtype: object
45          ZAK
129       ADCK3
132      ADRBK1
244    KIAA0195
354     SKIV2L2
366        SRPR
405     EIF2S3L
Name: Name, dtype: object


Int64Index([], dtype='int64')

In [14]:
df1

Unnamed: 0,Name,Kinase,Direct binder,CC-401,OTS-167,id,Name_official
0,BMPR1B,yes,yes,inf,2.000000,658,BMPR1B
1,GAK,yes,yes,1464.000000,5.000000,2580,GAK
2,RIPK2,yes,yes,inf,5.000000,8767,RIPK2
3,CSNK2A2,yes,yes,584.000000,10.000000,1459,CSNK2A2
4,DYRK1A,yes,yes,161.000000,10.000000,1859,DYRK1A
5,CLK1,yes,yes,inf,11.000000,1195,CLK1
6,CSNK2B,no,no,741.000000,11.000000,1460,CSNK2B
7,DCAF7,no,no,28.000000,11.000000,10238,DCAF7
8,IRAK3,yes,yes,inf,12.000000,11213,IRAK3
9,BCR,no,yes,inf,14.000000,613,BCR


In [15]:
df1.to_csv('Klaeger(2017)_Science_processed.csv', index=False)

In [None]:
df2 = pd.read_csv('Huang(2017)_eLife_Fig1source.csv')

In [30]:
## get official names and IDs

for name in df2['Kinase']:
    Entrez.email = 'bentyeh@gmail.com'
    handle = Entrez.esearch(db="gene", term='(' + name + '[gene]) AND (Homo sapiens[orgn]) AND alive[prop] NOT newentry[gene]')
    record_search = Entrez.read(handle)
    if len(record_search['IdList']) == 0:
        handle = Entrez.esearch(db="gene", term='(' + name.replace('-', '') + '[gene]) AND (Homo sapiens[orgn]) AND alive[prop] NOT newentry[gene]')
        record_search = Entrez.read(handle)
    if len(record_search['IdList']) == 1:
        handle = Entrez.esummary(db='gene', id=record_search['IdList'][0])
        record = Entrez.read(handle)
        df2.loc[df2['Kinase'] == name, 'id'] = record_search['IdList'][0]
        df2.loc[df2['Kinase'] == name, 'Name_official'] = record['DocumentSummarySet']['DocumentSummary'][0]['Name']
    else:
        flag_found = False
        for ID in record_search['IdList']:
            handle = Entrez.esummary(db='gene', id=ID)
            record = Entrez.read(handle)
            
            ## official symbol matches name
            if record['DocumentSummarySet']['DocumentSummary'][0]['Name'] == name:
                df2.loc[df2['Kinase'] == name, 'id'] = ID
                df2.loc[df2['Kinase'] == name, 'Name_official'] = name
                flag_found = True
                break
        
        if not flag_found:
            print(name + ' ' + ','.join(record_search['IdList']))

MPSK1 
PKA 
IR 
IKK epsilon 
CAMKK beta 
CDK9-Cyclin T1 
Aurora A 
GSK3 beta 
p38 delta 
p38 beta 
p38 alpha 
p38 gamma 
CK1 gamma 2 
ABL 25,4547
TSSK1 83942,23752
CDK2-Cyclin A 
PKB beta 
AMPK (hum) 
CK1 delta 
CK2 
Aurora B 
TAK1 6885,7182
TAO1 57551,9344
PINK 
AMPK 5562,5564,5563
EF2K 


In [34]:
## manually fix names
df2.loc[df2['Kinase'] == 'MPSK1', 'id'] = '8576'
df2.loc[df2['Kinase'] == 'MPSK1', 'Name_official'] = 'STK16'

df2.loc[df2['Kinase'] == 'PKA', 'id'] = '5566'
df2.loc[df2['Kinase'] == 'PKA', 'Name_official'] = 'PRKACA'

df2.loc[df2['Kinase'] == 'IR', 'id'] = '' # unknown
df2.loc[df2['Kinase'] == 'IR', 'Name_official'] = '' # unknown

df2.loc[df2['Kinase'] == 'IKK epsilon', 'id'] = '9641' 
df2.loc[df2['Kinase'] == 'IKK epsilon', 'Name_official'] = 'IKBKE'

df2.loc[df2['Kinase'] == 'CAMKK beta', 'id'] = '10645'
df2.loc[df2['Kinase'] == 'CAMKK beta', 'Name_official'] = 'CAMKK2'

df2.loc[df2['Kinase'] == 'CDK9-Cyclin T1', 'id'] = '1025'
df2.loc[df2['Kinase'] == 'CDK9-Cyclin T1', 'Name_official'] = 'CDK9'

df2.loc[df2['Kinase'] == 'Aurora A', 'id'] = '6790'
df2.loc[df2['Kinase'] == 'Aurora A', 'Name_official'] = 'AURKA'

df2.loc[df2['Kinase'] == 'GSK3 beta', 'id'] = '2932'
df2.loc[df2['Kinase'] == 'GSK3 beta', 'Name_official'] = 'GSK3B'

df2.loc[df2['Kinase'] == 'p38 delta', 'id'] = '5603'
df2.loc[df2['Kinase'] == 'p38 delta', 'Name_official'] = 'MAPK13'

df2.loc[df2['Kinase'] == 'p38 beta', 'id'] = '5600'
df2.loc[df2['Kinase'] == 'p38 beta', 'Name_official'] = 'MAPK11'

df2.loc[df2['Kinase'] == 'p38 alpha', 'id'] = '1432'
df2.loc[df2['Kinase'] == 'p38 alpha', 'Name_official'] = 'MAPK14'

df2.loc[df2['Kinase'] == 'p38 gamma', 'id'] = '6300'
df2.loc[df2['Kinase'] == 'p38 gamma', 'Name_official'] = 'MAPK12'

df2.loc[df2['Kinase'] == 'CK1 gamma 2', 'id'] = '1455'
df2.loc[df2['Kinase'] == 'CK1 gamma 2', 'Name_official'] = 'CSNK1G2'

df2.loc[df2['Kinase'] == 'ABL', 'id'] = '25'
df2.loc[df2['Kinase'] == 'ABL', 'Name_official'] = 'ABL1'

df2.loc[df2['Kinase'] == 'TSSK1', 'id'] = '83942'
df2.loc[df2['Kinase'] == 'TSSK1', 'Name_official'] = 'TSSK1B'

df2.loc[df2['Kinase'] == 'CDK2-Cyclin A', 'id'] = '1017'
df2.loc[df2['Kinase'] == 'CDK2-Cyclin A', 'Name_official'] = 'CDK2'

df2.loc[df2['Kinase'] == 'PKB beta', 'id'] = '208'
df2.loc[df2['Kinase'] == 'PKB beta', 'Name_official'] = 'AKT2'

df2.loc[df2['Kinase'] == 'AMPK (hum)', 'id'] = '5562'
df2.loc[df2['Kinase'] == 'AMPK (hum)', 'Name_official'] = 'PRKAA1'

df2.loc[df2['Kinase'] == 'CK1 delta', 'id'] = '1453'
df2.loc[df2['Kinase'] == 'CK1 delta', 'Name_official'] = 'CSNK1D'

df2.loc[df2['Kinase'] == 'CK2', 'id'] = '1457'
df2.loc[df2['Kinase'] == 'CK2', 'Name_official'] = 'CSNK2A1'

df2.loc[df2['Kinase'] == 'Aurora B', 'id'] = '9212'
df2.loc[df2['Kinase'] == 'Aurora B', 'Name_official'] = 'AURKB'

df2.loc[df2['Kinase'] == 'TAK1', 'id'] = '6885'
df2.loc[df2['Kinase'] == 'TAK1', 'Name_official'] = 'MAP3K7'

df2.loc[df2['Kinase'] == 'TAO1', 'id'] = '57551'
df2.loc[df2['Kinase'] == 'TAO1', 'Name_official'] = 'TAOK1'

df2.loc[df2['Kinase'] == 'PINK', 'id'] = '65018' 
df2.loc[df2['Kinase'] == 'PINK', 'Name_official'] = 'PINK1'

df2.loc[df2['Kinase'] == 'AMPK', 'id'] = '65248' # rat
df2.loc[df2['Kinase'] == 'AMPK', 'Name_official'] = 'Prkaa1' # rat

df2.loc[df2['Kinase'] == 'EF2K', 'id'] = '29904' # based on UniProt
df2.loc[df2['Kinase'] == 'EF2K', 'Name_official'] = 'EEF2K' # based on UniProt

In [36]:
print(df2['Name_official'][df2['Name_official'] != df2['Kinase']])
print(df2['Kinase'][df2['Name_official'] != df2['Kinase']])
df2.index[df2.duplicated() == True]

1         EPHB1
4         STK16
5         EPHB4
7        PRKACA
9              
11        STK26
12        MKNK2
13        IKBKE
14       CAMKK2
15       MAPK10
16         CDK9
17        AURKA
18        STK24
19         PTK6
20        IGF1R
21        MAPK8
22        GSK3B
24       MAPK13
25        MAPK9
26     MAPKAPK2
29       MAP3K5
32        MKNK1
35        EPHB3
36        ERBB4
37       MAPK11
38      RPS6KB1
42        FGFR1
43        PRKCZ
45         PKN2
46        CHEK1
         ...   
77       MAP2K1
78        EPHA2
81      MAP3K11
82        NTRK1
84          SRC
86      RPS6KA1
89         FLT1
90         NEK2
91         STK3
96          LCK
97       MAP3K9
98        ROCK2
99        CHEK2
100      PRKAA1
101       MAPK3
102      CSNK1D
106      MAPK15
107     CSNK2A1
109       AURKB
111      MAP3K7
113      MAP2K6
114       STK11
118      MAP3K1
119       TAOK1
128       PINK1
129       MAPK1
131        MYLK
135      Prkaa1
136    MAPKAPK3
140       EEF2K
Name: Name_official, Len

Int64Index([], dtype='int64')

In [27]:
df2

Unnamed: 0,Kinase,OTSSP167 (1 μM) % AR,OTSSP167 (1 μM) SD,HTH-01-091 (1 μM) % AR,HTH-01-091 (1 μM) SD,Diff,SD Diff,id,Name_official
0,BRSK2,2,2,125.0,2.0,123.0,2.828427,,
1,EPH-B1,11,2,122.0,7.0,111.0,7.280110,,
2,EIF2AK3,7,0,114.0,18.0,107.0,18.000000,,
3,SYK,7,1,112.0,4.0,105.0,4.123106,,
4,MPSK1,5,0,107.0,11.0,102.0,11.000000,8576,STK16
5,EPH-B4,2,0,104.0,17.0,102.0,17.000000,,
6,ULK2,4,0,105.0,15.0,101.0,15.000000,,
7,PKA,12,0,112.0,24.0,100.0,24.000000,5566,PRKACA
8,PDK1,4,1,102.0,0.0,98.0,1.000000,,
9,IR,2,0,100.0,3.0,98.0,3.000000,,


In [37]:
df2.to_csv('Huang(2017)_eLife_Fig1source_processed.csv', index=False)

In [2]:
df3 = pd.read_csv('Annes(2018)_Kinome_1081.csv')
df4 = pd.read_csv('Annes(2018)_Kinome_1285.csv')

In [3]:
# average if same gene symbol

for gene in df3['Entrez Gene Symbol']:
    data = df3.loc[df3['Entrez Gene Symbol'] == gene, :]
    if data.shape[0] > 1:
        series = data.iloc[0].squeeze()
        series['Percent Control'] = data['Percent Control'].mean()
        series['DiscoveRx Gene Symbol'] = gene + '_mean'
        df3.drop(df3.index[df3['Entrez Gene Symbol'] == gene], axis=0, inplace=True)
        df3 = df3.append(series, ignore_index=True)

df3.to_csv('Annes(2018)_Kinome_1081_mean.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [4]:
# average if same gene symbol

for gene in df4['Entrez Gene Symbol']:
    data = df4.loc[df4['Entrez Gene Symbol'] == gene, :]
    if data.shape[0] > 1:
        series = data.iloc[0].squeeze()
        series['Percent Control'] = data['Percent Control'].mean()
        series['DiscoveRx Gene Symbol'] = gene + '_mean'
        df4.drop(df4.index[df4['Entrez Gene Symbol'] == gene], axis=0, inplace=True)
        df4 = df4.append(series, ignore_index=True)

df4.to_csv('Annes(2018)_Kinome_1285_mean.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [2]:
df5 = pd.read_csv('Annes(2018)_Kinome500_1285.csv')

for gene in df5['Entrez Gene Symbol']:
    data = df5.loc[df5['Entrez Gene Symbol'] == gene, :]
    if data.shape[0] > 1:
        series = data.iloc[0].squeeze()
        series['Percent Control'] = data['Percent Control'].mean()
        series['DiscoveRx Gene Symbol'] = gene + '_mean'
        df5.drop(df5.index[df5['Entrez Gene Symbol'] == gene], axis=0, inplace=True)
        df5 = df5.append(series, ignore_index=True)

df5.to_csv('Annes(2018)_Kinome500_1285_mean.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
