# Comparison of alternative methods for kinase enrichment analyses

In [1]:
import pandas as pd
cd Kinase_Enrichment_Comparisons

/Users/schilder/Desktop/X2K_RandomSearch/Kinase_Enrichment_Comparisons


# :: X2K ::

## Import list of all kinases

In [1]:
# Standardize genes to HGNC symbols
# Lookup table consulted by standardizeGeneSymbol. It is read without a header row,
# so its columns are addressed by position (0 and 1).
mapping = pd.read_table('../X2K_Summaries/General_Resources/Moshe_mapping/mappingFile_2017.txt', header=None)
# Two-column table; the columns are named 'Greek' and 'Abbrev' here
# (presumably Greek-letter spellings and their short forms — confirm against the CSV).
greekLetters = pd.read_csv('../X2K_Summaries/General_Resources/GreekLetter_Converter.csv', names=['Greek', 'Abbrev'])
# Strip non-breaking spaces (\xa0) from both ends of every entry.
greekLetters = greekLetters.apply(lambda x: x.str.strip('\xa0'))

def standardizeGeneSymbol(gene):
    """Convert a kinase name to its standardized symbol.

    Steps, in order:
      1. Names containing 'AURORA' become 'AURK' plus the last character of
         the name.
      2. Otherwise every entry of ``greekLetters['Greek']`` found in the name
         is replaced by the matching ``greekLetters['Abbrev']`` entry.
      3. If the result appears in column 0 of ``mapping``, the value from
         column 1 of the first matching row is returned instead.

    Parameters
    ----------
    gene : str
        Kinase name to standardize.

    Returns
    -------
    str
        The standardized symbol, or the (possibly rewritten) input when the
        mapping table has no entry for it.
    """
    if 'AURORA' in gene:
        HGNC = 'AURK' + gene[-1]
    else:
        HGNC = gene
        # Apply replacements cumulatively so a name holding several Greek
        # letters has all of them converted, not only the last one found.
        for letter, abbrev in zip(greekLetters['Greek'], greekLetters['Abbrev']):
            if letter in HGNC:
                HGNC = HGNC.replace(letter, abbrev)

    # Compare against the column VALUES (`x in series` would test the index)
    # and return a scalar rather than a Series.
    matches = mapping.loc[mapping[0] == HGNC, 1]
    if len(matches) > 0 and pd.notna(matches.iloc[0]):
        HGNC = matches.iloc[0]
    return HGNC

# Get list of all kinases in KEA2018
import pandas as pd
# NOTE(review): the first row is consumed as a header here, whereas the disabled
# get_kinase_groups_families variant reads UberKeaFile.csv with header=None —
# confirm which is right for this file.
KEA2018 = pd.read_csv("Data/UberKeaFile.csv")
# Third column holds the kinase names; blank cells are dropped because
# standardizeGeneSymbol only accepts strings.
allKinases = KEA2018.iloc[:,2].dropna().unique().tolist()

# Get list of unique kinases in KEA2018, standardized to HGNC symbols
with open('allKinases_KEA2018.txt','w') as kinase_out:
    # A set gives constant-time membership checks; each symbol is written the
    # first time it is seen, so the output order follows allKinases.
    seen = set()
    for k in allKinases:
        HGNC = standardizeGeneSymbol(k)
        if HGNC not in seen:
            kinase_out.write(HGNC+"\n")
            seen.add(HGNC)

## Run X2K Web with default parameters

In [2]:
import PythonScripts.X2Kweb_API as xweb

## GEO: Up genes

In [3]:
# Input file for the "up genes" run (file format is not shown in this notebook — TODO confirm).
kinase_file = 'Kinase_Perturbations_from_GEO_up.txt'
# File name handed to run_X2K_allGenes as its second argument.
save_file = 'X2K_kinaseRanks_UP.txt'

# Run X2K Web through the project helper module; the return value is kept as X2K_UP.
X2K_UP = xweb.run_X2K_allGenes(kinase_file, save_file, verbose=False)

BRAF_druginhibition_175_GSE42872
BRAF_druginhibition_38_GDS5085
FGFR3_druginhibition_36_GDS5023
ERBB3_knockdown_65_GSE19921
BRAF_knockdown_193_GSE5481
AURKA_druginhibition_196_GSE57810
HUNK_knockout_240_GSE14226
TGFBR2_knockout_293_GSE46211
TGFBR2_knockout_292_GSE46150
TGFBR2_knockout_295_GSE45968
TGFBR2_knockout_296_GSE22989
PDK1_knockout_265_GSE42187
TRIM28_knockout_302_GSE32224
TRIM28_knockout_303_GSE32224
RAF1_activemutant_219_GSE42964
CDK4_knockdown_225_GSE8866
IGF1R_druginhibition_46_GSE14024
IRAK4_defectivemutant_200_GSE6789
RET_knockout_270_GSE32093
GSK3A_knockdown_201_GDS4305
GSK3B_knockdown_202_GDS4305
GSK3A_knockdown_207_GDS4305
GSK3B_knockdown_208_GDS4305
GSK3A_knockdown_203_GDS4305
GSK3B_knockdown_204_GDS4305
SYK_knockdown_189_GSE54065
SYK_knockdown_190_GSE54065
SYK_knockdown_191_GSE54065
ABL1_druginhibition_77_GSE24493
ILK_activemutant_78_GSE25729
ALK_druginhibition_187_GSE50803
TYK2_knockdown_176_GSE44652
TYK2_knockdown_35_GDS4754
PIK3CA_druginhibition_57_GSE17785
SYK_dr

## GEO: Down genes

In [4]:
# Input file for the "down genes" run.
kinase_file = 'Kinase_Perturbations_from_GEO_down.txt'
# Follow the naming used for the UP run ('X2K_kinaseRanks_UP.txt'); the
# fill-NA step later reads 'X2K_kinaseRanks_DN.txt', which the previous
# name 'X2Koutput_DN.csv' never produced.
save_file = 'X2K_kinaseRanks_DN.txt'
X2K_DN = xweb.run_X2K_allGenes(kinase_file, save_file, verbose=False)

BRAF_druginhibition_175_GSE42872
BRAF_druginhibition_38_GDS5085
FGFR3_druginhibition_36_GDS5023
ERBB3_knockdown_65_GSE19921
BRAF_knockdown_193_GSE5481
AURKA_druginhibition_196_GSE57810
HUNK_knockout_240_GSE14226
TGFBR2_knockout_293_GSE46211
TGFBR2_knockout_292_GSE46150
TGFBR2_knockout_295_GSE45968
TGFBR2_knockout_296_GSE22989
PDK1_knockout_265_GSE42187
TRIM28_knockout_302_GSE32224
TRIM28_knockout_303_GSE32224
RAF1_activemutant_219_GSE42964
CDK4_knockdown_225_GSE8866
IGF1R_druginhibition_46_GSE14024
IRAK4_defectivemutant_200_GSE6789
RET_knockout_270_GSE32093
GSK3A_knockdown_201_GDS4305
GSK3B_knockdown_202_GDS4305
GSK3A_knockdown_207_GDS4305
GSK3B_knockdown_208_GDS4305
GSK3A_knockdown_203_GDS4305
GSK3B_knockdown_204_GDS4305
SYK_knockdown_189_GSE54065
SYK_knockdown_190_GSE54065
SYK_knockdown_191_GSE54065
ABL1_druginhibition_77_GSE24493
ILK_activemutant_78_GSE25729
ALK_druginhibition_187_GSE50803
TYK2_knockdown_176_GSE44652
TYK2_knockdown_35_GDS4754
PIK3CA_druginhibition_57_GSE17785
SYK_dr

# :: KEA ::

## Alex's version


In [6]:
def run_KEA(input_line):
    """Send one gene list to the KEA socket server and return its reply.

    The request is the parameter string ``run;pvalue;humanarchs4;KP;10000``,
    a newline, the comma-joined gene data, and the ``messageComplete``
    terminator. The reply is read until it ends with the same terminator,
    then ``kill`` is sent and the connection is closed. The cleaned reply is
    also written to ``KEA/output/kea_out.txt``.

    Parameters
    ----------
    input_line : list of str
        Element 0 and elements 2 onward are joined with commas to form the
        payload; element 1 is skipped.

    Returns
    -------
    str
        The server reply with the ``messageComplete`` terminator removed.

    Raises
    ------
    ConnectionError
        If the server closes the connection before sending the terminator.
    OSError
        If the connection to localhost:5002 cannot be established.
    """
    import os
    import socket
    import time

    directory = 'KEA/'
    HOST = "localhost"
    PORT3 = 5002
    buffer_size = 1024
    end_marker = "messageComplete\n"
    start_time = time.time()

    gene_data = ','.join([input_line[0]]+input_line[2:])

    print("Running KEA...")
    kea_string = ';'.join(["run", 'pvalue', 'humanarchs4', 'KP',  '10000'])
    print("KEA Parameters:   "+kea_string)
    request_body = kea_string + "\n" + gene_data
    kea_parameters = request_body + end_marker

    # Collect raw bytes in their own buffer (not appended to the request data)
    # and decode once at the end, so a multi-byte character split across two
    # recv() calls cannot break decoding.
    marker_bytes = end_marker.encode('utf-8')
    received = b""
    sock2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock2.connect((HOST, PORT3))
        sock2.sendall(bytes(kea_parameters+"\n", 'utf-8'))
        while not received.endswith(marker_bytes):
            chunk = sock2.recv(buffer_size)
            if not chunk:
                # recv() returns b'' once the peer has closed the connection;
                # without this check the loop would spin forever.
                raise ConnectionError(
                    "KEA server closed the connection before sending 'messageComplete'")
            received += chunk
        sock2.sendall(bytes("kill\n", 'utf-8'))
    finally:
        # Always release the socket, including on errors.
        sock2.close()

    allData3 = received.decode("utf-8").replace(end_marker, "")
    # The original code tried to remove the request text from the reply;
    # presumably the server may echo it back — harmless when it does not.
    allData3 = allData3.replace(request_body, "")

    os.makedirs(directory+"output", exist_ok=True)
    with open(directory+"output/kea_out.txt", "w") as text_file:
        text_file.write(allData3)

    print("KEA finished in %.1f s" % (time.time() - start_time))
    return allData3

## X2K Web Version

In [2]:
## KEA CSV OUTPUT FORMAT

"""
    1. name of the kinase
    2. number of substrates in the input gene-list
    3. number of genes that are substrates of the kinase
    4. the fraction of genes that are substrates compared to total number of genes in gene-list
    5. the fraction of genes that are substrates compared to total number of genes in background
    6. difference between the background fraction and the substrate-list fraction
    7. p-value computed using the Fisher Test
    8. rank computed using z-test
    9. combined score computed from p-value and rank
    10. list of substrates separated by a semi-colon
""";

In [None]:
import pandas as pd
import os 
from time import sleep

def create_geneList_file(geneList):
    """Write the genes to 'KEA/geneList.txt', one gene per line.

    Any existing file at that path is overwritten.
    """
    with open('KEA/geneList.txt', 'w') as handle:
        handle.writelines(gene + '\n' for gene in geneList)
            
"""
For X2K and KEA, whenever the kinase-enrichment couldn't find any overlap between the substrates of a given kinase 
and the genes in the input list, KEA returns 'NaN' for that kinase.
-Alternatively, you can randomly assign a remaining rank 
"""
import math
def choose_random_rank(input, remainingRanks=None):
    """Return ``input`` unchanged, or an unused rank when ``input`` is NaN.

    Parameters
    ----------
    input : number
        A rank value, possibly NaN.
    remainingRanks : list, optional
        Pool of ranks not yet assigned. One entry is removed from the end of
        this list and returned for each NaN, so no rank is handed out twice.

    Raises
    ------
    ValueError
        If ``input`` is NaN and no unused rank is available.
    """
    if not math.isnan(input):
        return input
    if not remainingRanks:
        raise ValueError("no unused ranks left to assign to a missing value")
    return remainingRanks.pop()

def replace_NAs_with_random_rank(DF, outputName=None):
    """Replace NaN ranks in every column after the first with random unused ranks.

    For each column, the pool of unused ranks runs from one above the largest
    rank present up to ``len(DF)`` (extended if that would not cover every
    NaN). The pool is shuffled and each NaN receives a distinct rank from it.
    Columns are converted to ``int``.

    Parameters
    ----------
    DF : pandas.DataFrame
        First column is skipped; all other columns hold numeric ranks with
        NaN for missing values. Modified in place.
    outputName : str, optional
        When given, the filled table is also written to this path as a
        tab-separated file.

    Returns
    -------
    pandas.DataFrame
        The same ``DF`` object, with NaNs filled.
    """
    from random import shuffle
    for col in DF.columns[1:]:
        print(col)
        ranked = DF[col].dropna()
        # A column with no ranks at all starts its pool at 1.
        maxRank = int(ranked.max()) if len(ranked) else 0
        n_missing = len(DF) - len(ranked)
        # Guarantee at least one unused rank per missing value.
        upper = max(len(DF), maxRank + n_missing)
        remainingRanks = list(range(maxRank+1, upper+1))
        shuffle(remainingRanks)
        DF[col] = DF[col].apply(lambda rank: choose_random_rank(rank, remainingRanks)).astype(int)
    if outputName is not None:
        DF.to_csv(outputName, sep='\t', header=True, index=None, na_rep='NA')
    return DF
            
def create_DF_from_KEAoutput(expt, finalDF):
    """Rank the kinases in 'KEA/KEA_output.csv' by p-value and merge them into ``finalDF``.

    The KEA output is read without a header, given descriptive column names,
    sorted by ascending 'pvalue', and ranked 0, 1, 2, ... in that order. The
    ranks are stored in a column named ``expt`` and outer-merged into
    ``finalDF`` on 'Kinase'.

    Parameters
    ----------
    expt : str
        Experiment name; becomes the name of the new rank column.
    finalDF : pandas.DataFrame
        Accumulated table with a 'Kinase' column.

    Returns
    -------
    pandas.DataFrame
        ``finalDF`` with the additional ``expt`` column; kinases missing on
        either side get NaN.
    """
    KEAout = pd.read_csv('KEA/KEA_output.csv', header=None, index_col=False)
    KEAout.columns = ['Kinase','number of substrates in the input gene-list', 'number of genes that are substrates of the kinase',\
                      'fraction of genes that are substrates compared to total number of genes in gene-list',\
                      'fraction of genes that are substrates compared to total number of genes in background',\
                      'difference between the background fraction and the substrate-list fraction',\
                      'pvalue', 'ztest_rank', 'combined_score','substrates']
    # 'Kinase' is deliberately kept as a plain column only: naming the index
    # 'Kinase' as well makes merge(on='Kinase') ambiguous in current pandas.
    KEA_sort = KEAout.sort_values(by='pvalue').reset_index(drop=True)
    KEA_sort[expt] = range(len(KEA_sort))
    newDF = KEA_sort[['Kinase', expt]]
    return finalDF.merge(newDF, on='Kinase', how='outer')
    
    
#cd Kinase_Enrichment_Comparisons
def run_KEA_old(inputGMT, KEA_summary_file, replaceNAs=True,
                javaPath='/Library/Java/JavaVirtualMachines/1.6.0.jdk/Contents/Home/bin/java'):
    """Run the KEA command-line jar on every line of a tab-separated gene-set file.

    For each line: the gene list is written to 'KEA/geneList.txt', the KEA
    jar is run to produce 'KEA/KEA_output.csv', and the p-value ranks are
    merged into a growing table (one column per experiment). The table is
    finally written to ``KEA_summary_file`` as tab-separated text.

    Parameters
    ----------
    inputGMT : str
        Path to the input file. Field 0 of each tab-separated line is the
        experiment name, field 1 is skipped, and the remaining fields are
        genes, each optionally followed by ',<weight>' (e.g. 'AKT1,1.0').
    KEA_summary_file : str
        Path of the tab-separated summary to write.
    replaceNAs : bool, default True
        When true, missing ranks are filled by replace_NAs_with_random_rank
        after each experiment is merged.
    javaPath : str
        Java executable used to launch the KEA jar.

    Returns
    -------
    pandas.DataFrame
        The summary table ('Kinase' plus one rank column per experiment).

    Raises
    ------
    RuntimeError
        If KEA finishes without producing 'KEA/KEA_output.csv'.
    """
    import subprocess

    def _parse_GMT_line(line):
        # Split one input line into (experiment name, gene symbols).
        fields = line.rstrip('\r\n').split('\t')
        # Keep only the text before the first comma. str.strip(',1.0') would
        # remove ANY of the characters ',', '1', '.', '0' from both ends and
        # so turn 'AKT1,1.0' into 'AKT'.
        symbols = [field.split(',')[0].strip() for field in fields[2:]]
        # Empty fields (e.g. from a trailing tab) are discarded rather than
        # blindly dropping the last field, which may be a real gene.
        return fields[0], [symbol for symbol in symbols if symbol]

    with open(inputGMT) as file:
        input_GMT = file.readlines()

    finalDF = pd.DataFrame(columns=['Kinase'])
    for line in input_GMT:
        if not line.strip():
            continue  # skip blank lines
        expt, genes = _parse_GMT_line(line)
        print("Processing: "+expt)

        # Delete leftovers from a previous run. Each file is handled on its
        # own so that one missing file does not leave the other behind.
        for stale in ('KEA/KEA_output.csv', 'KEA/geneList.txt'):
            if os.path.exists(stale):
                os.remove(stale)

        # Create gene list txt file
        print(expt+': Creating genList file')
        create_geneList_file(genes)

        # Run KEA command line. subprocess.run blocks until Java exits, so no
        # polling for the output file is needed afterwards.
        print('Running KEA')
        result = subprocess.run([javaPath, '-jar',
                                 'KEA/KEA-1.5-SNAPSHOT-jar-with-dependencies.jar',
                                 'KEA/UberKeaFile.csv',
                                 'KEA/geneList.txt', 'KEA/KEA_output.csv'])
        if not os.path.exists('KEA/KEA_output.csv'):
            # Fail loudly instead of waiting forever for a file that will
            # never appear.
            raise RuntimeError("KEA produced no output for %s (exit code %s)"
                               % (expt, result.returncode))

        # Read in KEA output and process
        print(expt+' : Creating dataframe')
        finalDF = create_DF_from_KEAoutput(expt, finalDF)
        if replaceNAs:
            finalDF = replace_NAs_with_random_rank(finalDF)

    finalDF.to_csv(KEA_summary_file, sep='\t', header=True, index=None, na_rep='NA')
    return finalDF

###  Fill NAs in X2K results files

In [None]:
# Fill the missing ranks, then save each completed table under the name the
# "Import Previously Processed Results" cell reads back. The file is written
# explicitly here instead of passing the path as a second positional argument.
X2K_UP = pd.read_table('X2K_kinaseRanks_UP.txt', index_col=False)
X2K_filled_UP = replace_NAs_with_random_rank(X2K_UP)
X2K_filled_UP.to_csv('X2K_output_NAsfilled_UP.txt', sep='\t', header=True, index=None, na_rep='NA')

X2K_DN = pd.read_table('X2K_kinaseRanks_DN.txt', index_col=False)
X2K_filled_DN = replace_NAs_with_random_rank(X2K_DN)
X2K_filled_DN.to_csv('X2K_output_NAsfilled_DN.txt', sep='\t', header=True, index=None, na_rep='NA')




### Run KEA (while filling NAs)

In [None]:
# With replaceNAs=True the summaries are already NA-filled, so they are saved
# under the 'KEA_output_NAsfilled_*' names that the import cell below reads.
KEA_UP = run_KEA_old(inputGMT='Kinase_Perturbations_from_GEO_up.txt', KEA_summary_file='KEA_output_NAsfilled_UP.txt', replaceNAs=True)
KEA_DN = run_KEA_old(inputGMT='Kinase_Perturbations_from_GEO_down.txt', KEA_summary_file='KEA_output_NAsfilled_DN.txt', replaceNAs=True)

## Import Previously Processed Results

In [2]:
%cd /Users/schilder/Desktop/X2K_RandomSearch/Kinase_Enrichment_Comparisons

/Users/schilder/Desktop/X2K_RandomSearch/Kinase_Enrichment_Comparisons


In [3]:
import pandas as pd
# Import corrected X2K data
# index_col=False stops pandas from using the first column as the index, so it
# stays available as a regular column.
X2K_UP = pd.read_table('X2K_output_NAsfilled_UP.txt', index_col=False)
X2K_DN = pd.read_table('X2K_output_NAsfilled_DN.txt', index_col=False)

# Import KEA data
KEA_UP = pd.read_table('KEA_output_NAsfilled_UP.txt', index_col=False)
KEA_DN = pd.read_table('KEA_output_NAsfilled_DN.txt', index_col=False)

# Method comparison

## Heatmaps across all kinases

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(font="Arial")

# Color heatmap by kinase groups/families
def get_kinase_groups_families():
    """Load kinase group/family/subfamily annotations from the two Kinase.com spreadsheets.

    Both spreadsheets are reduced to name, 'Group', 'Family' and 'Subfamily',
    upper-cased, and stacked (the 'Hsap' sheet first). When a kinase name
    occurs more than once, only its first row is kept.

    Returns
    -------
    pandas.DataFrame
        Columns 'Kinase', 'Kinase_Group', 'Kinase_Family', 'Kinase_Subfamily',
        indexed by kinase name. Missing subfamilies read '[No Info.]'.
    """
    # NOTE(review): these paths start with '../../' while the mapping files at
    # the top of the notebook use '../X2K_Summaries/...' — confirm which is
    # correct for the working directory.
    def _load_kinome(path, name_col):
        # Read one spreadsheet and normalize it to Name/Group/Family/Subfamily.
        table = pd.read_excel(path).loc[:, [name_col, 'Group', 'Family', 'Subfamily']]
        table = table.rename(columns={name_col: 'Name'})
        # Remember which subfamilies are missing BEFORE the string conversion:
        # astype(str) turns NaN into the text 'nan', after which fillna() can
        # no longer find anything to fill.
        no_subfamily = table['Subfamily'].isna()
        # Capitalize everything
        table = table.apply(lambda x: x.astype(str).str.upper())
        table.loc[no_subfamily, 'Subfamily'] = '[No Info.]'
        return table

    homo = _load_kinome('../../X2K_Summaries/General_Resources/Kinase.com/Kinome_Hsap_updated.xls', 'Name')
    mus = _load_kinome('../../X2K_Summaries/General_Resources/Kinase.com/Kinome_Mmus.xls', 'Gene Name')

    # One row per kinase name: the result is used as a name-indexed lookup,
    # which needs unique index labels.
    both = pd.concat([homo, mus]).drop_duplicates(subset='Name', keep='first')
    both.columns = ['Kinase','Kinase_Group','Kinase_Family','Kinase_Subfamily']
    both.index = both['Kinase']
    return both

"""
def get_kinase_groups_families():
    KEA = pd.read_csv('KEA/UberKeaFile.csv', header=None, index_col=False).iloc[:,0:3]
    KEA.drop_duplicates(inplace=True)
    #KEA.fillna('OTHER', inplace=True)
    KEA.columns = ['Kinase_Family','Kinase_Group','Name']
    KEA.index = KEA['Name']
    return KEA
"""
    
def category_colors_dict(category):
    """Assign one "hls" palette color to each distinct value of an annotation column.

    Returns a tuple ``(row_colors, colorDict)``: the chosen annotation column
    with every value replaced by its color, and the value-to-color mapping.
    """
    annotation = get_kinase_groups_families()[category]
    distinct_values = annotation.unique()
    palette = sns.color_palette("hls", len(distinct_values))
    colorDict = {value: shade for value, shade in zip(distinct_values, palette)}
    return annotation.map(colorDict), colorDict

DF = X2K_UP.copy()
def plotHeatmap(DF, method='', z_score=None, category='Kinase_Group', saveFig=True):
    """Draw a seaborn clustermap of kinase ranks with rows colored by kinase category.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain a 'Kinase' column; every column after the first is
        plotted. Rows containing NaN are excluded. ``DF`` itself is not
        modified.
    method : str
        Label used in the saved file name.
    z_score : {None, 0, 1}
        Passed to ``sns.clustermap``; 0 standardizes rows, 1 columns.
    category : str
        Column of the kinase annotation table used for the row colors.
    saveFig : bool
        When true, the figure is saved as
        'Figures/<method>_<scale>_clustermap.png'.
    """
    import os

    scaleKey = {None:'raw', 0:'zscore-row', 1:'zscore-col'}
    # Work on a filtered copy: dropping rows in place would silently alter the
    # caller's table (e.g. the notebook-level X2K_UP / KEA_UP frames).
    complete = DF.dropna()
    plotDF = complete.iloc[:,1:].copy()
    plotDF.index = complete['Kinase']
    # Title reflects the z-score transformation applied, if any
    if z_score is None:
        title = 'Pvalue-sorted Rank'
    else:
        title = scaleKey[z_score]

    # CLUSTERMAP
    row_colors, colorDict = category_colors_dict(category)
    g = sns.clustermap(plotDF, z_score=z_score, row_colors=row_colors, cmap="RdBu") #"inferno", "hot"

    # Set position of main colorbar
    g.cax.set_position([.05, .2, .03, .45])
    # Draw legend for classes: zero-size bars exist only to create one legend
    # entry per category.
    for label in colorDict.keys():
        g.ax_row_dendrogram.bar(0, 0, color=colorDict[label], label=label, linewidth=0)
    g.ax_row_dendrogram.legend(loc="upper right", ncol=2, bbox_to_anchor=(.35, 1.35), borderaxespad=1).set_title(category)
    # Change label params
    plt.title(title)

    # Save fig
    if saveFig:
        os.makedirs('Figures', exist_ok=True)
        g.savefig('Figures/'+method+'_'+scaleKey[z_score]+'_clustermap.png')


dfList = ['X2K_UP','X2K_DN','KEA_UP','KEA_DN']
def iterate_clustermaps(dfList, z_score=None, category='Kinase_Family', saveFig=False):
    """Draw one clustermap per entry of ``dfList``.

    Parameters
    ----------
    dfList : list of str
        Names of notebook-level DataFrames; each name also labels its figure.
    z_score, category, saveFig
        Forwarded unchanged to ``plotHeatmap``.

    Raises
    ------
    KeyError
        If a name in ``dfList`` is not defined at notebook level.
    """
    for df in dfList:
        print(df)
        # Look the table up by name instead of eval(): only plain variable
        # names are accepted, never arbitrary expressions.
        plotHeatmap(globals()[df], method=df, z_score=z_score, category=category, saveFig=saveFig)
iterate_clustermaps(dfList, z_score=0, category='Kinase_Group', saveFig=True)

## Clustergrammer

In [6]:
from clustergrammer import Network
# net = Network()
# Build the matrix to visualize: rows labelled by kinase, rows with missing
# values removed, and the first ('Kinase') column dropped.
DF = X2K_UP.copy()
DF.index =  DF['Kinase']
DF.dropna(inplace=True)
DF = DF.iloc[:,1:]

# Disabled alternative that writes the visualization JSON to a file:
# net.load_df()
#
# # Z-score normalize the rows
# net.normalize(axis='row', norm_type='zscore', keep_orig=True)
# # filter for the top 100 columns based on their absolute value sum
# ## net.filter_N_top('col', 100, 'sum')
# # cluster using default parameters
# net.cluster()
# # save visualization JSON to file for use by front end
# net.write_json_to_file('viz', 'mult_view.json')



## USING WIDGET
from clustergrammer_widget import *
net = Network(clustergrammer_widget)
# load DataFrame
net.load_df(DF)
# z-score normalize along the column axis

net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# filter for the top 20 rows based on their absolute value sum
net.filter_N_top('row', 20, 'sum') #test
# cluster using default parameters
net.cluster()

# make interactive widget
net.widget()

In [6]:
%pwd

'/Users/schilder/Desktop/X2K_RandomSearch/Kinase_Enrichment_Comparisons'

## KDE Plot for Target Kinases

In [None]:
import numpy as np
def getTargetKinaseRanks(DF, method):
    """Look up, for every experiment column, the rank of the kinase named in the column.

    The target kinase is the text before the first underscore of the column
    name (e.g. 'BRAF' for 'BRAF_druginhibition_175_GSE42872').

    Parameters
    ----------
    DF : pandas.DataFrame
        First column is skipped; must contain a 'Kinase' column, with one
        rank column per experiment.
    method : str
        Label stored in the 'Method' column of every output row.

    Returns
    -------
    pandas.DataFrame
        One row per experiment with columns 'Method', 'Experiment',
        'Target Kinase' and 'Rank'. 'Rank' is the string 'NA' when the target
        kinase does not occur in ``DF['Kinase']``.
    """
    columns = ['Method', 'Experiment', 'Target Kinase', 'Rank']
    # Membership must be tested against the column VALUES; `x in series`
    # checks the index labels instead.
    known_kinases = set(DF['Kinase'])
    records = []
    for col in DF.columns[1:]:
        target = col.split("_")[0]
        if target in known_kinases:
            targetRank = DF.loc[DF['Kinase']==target, col].values[0]
        else:
            targetRank = 'NA'
        records.append((method, col, target, targetRank))
    # Build the table once rather than appending row by row
    # (DataFrame.append no longer exists in pandas 2).
    return pd.DataFrame(records, columns=columns)


import seaborn as sns

def KDEplot(DF, method, color='m'):
    """Add a kernel-density curve of target-kinase ranks to the current axes.

    Parameters
    ----------
    DF : pandas.DataFrame
        Either a rank table (a 'Kinase' column followed by one column per
        experiment) or a table already produced by ``getTargetKinaseRanks``
        (recognized by its 'Experiment', 'Target Kinase' and 'Rank' columns).
    method : str
        Label of the curve.
    color : str
        Accepted for backward compatibility; currently not used.
    """
    # Accept an already-summarised table so that summarising twice cannot
    # happen by accident.
    if {'Experiment', 'Target Kinase', 'Rank'}.issubset(DF.columns):
        summaryDF = DF
    else:
        summaryDF = getTargetKinaseRanks(DF, method)

    # Select and convert in one step; assigning into a filtered slice triggers
    # pandas' SettingWithCopyWarning.
    ranks = pd.to_numeric(summaryDF.loc[summaryDF['Rank']!='NA', 'Rank']).dropna()
    if ranks.empty:
        print(method + ': no target-kinase ranks to plot')
        return
    # `shade` is deprecated in seaborn; an unfilled curve is the default.
    sns.kdeplot(ranks, label=method) #color[i]
    #sns.distplot(summaryDF['Rank'])
    #sns.distplot(summaryDF['Rank'])


# One density curve per result table, all drawn on the same figure.
plt.figure()
for df in dfList:
    # KDEplot builds the target-kinase summary itself, so it is given the raw
    # rank table; handing it an already-summarised table would summarise twice.
    KDEplot(globals()[df], df)
plt.xlabel('Rank of target kinase')
# The per-curve labels are only displayed once a legend is drawn.
plt.legend()