# Comparison of alternative methods for kinase enrichment analyses

In [1]:
import X2Kweb_API as xweb

import pandas as pd
%pwd
%cd Kinase_Enrichment_Comparisons

import pickle
with open("../../X2K_Summaries/General_Resources/synDict.pkl", 'rb') as handle:
    synDict = pickle.load(handle)

/Users/schilder/Desktop/X2K_Web/Kinase_Enrichment_Comparisons


# :: X2K ::

## Import list of all kinases

In [31]:
# Standardize genes to HGNC symbols
mapping = pd.read_table('../../X2K_Summaries/General_Resources/Moshe_mapping/mappingFile_2017.txt', header=None)
greekLetters = pd.read_csv('../../X2K_Summaries/General_Resources/GreekLetter_Converter.csv', names=['Greek', 'Abbrev'], header=0 )
greekLetters = greekLetters.apply(lambda x: x.str.strip('\xa0'))

def standardizeGeneSymbol(gene):
    if gene.__contains__('AURORA'):
        HGNC = 'AURK' + gene[-1]
    elif any(substring in gene for substring in greekLetters['Greek']):
        for letter in greekLetters['Greek']:
            LETTER = letter.upper()
            if gene.__contains__(LETTER):
                HGNC = gene.replace(LETTER, greekLetters.loc[greekLetters['Greek']==letter,'Abbrev'].values[0] )
    else:
        HGNC = gene
    if HGNC in mapping[0]:
        HGNC = mapping.iloc[mapping[0]==HGNC, 1]
    return HGNC

# Get list of all kinases in KEA2018
import pandas as pd
KEA2018 = pd.read_csv('../../X2k_Databases/KINASE/KEA_2018/KEA2018_KINASES.csv', header=None)#pd.read_csv("KEA/UberKeaFile.csv")
KEA2018.head()
allKinases = KEA2018.iloc[:,2].unique().tolist()

# # Get list of unique kinases in KEA2018, standardized to HGNC symbols
# def get_all_kinases():
#     with open('allKinases_KEA2018.txt','w') as file:
#         seen=[]
#         for k in allKinases:
#             HGNC = standardizeGeneSymbol(k)
#             if HGNC not in seen:
#                 file.write(HGNC+"\n")
#             seen.append(HGNC)
#     return allKinases
# allKinases = get_all_kinases()

## Run X2K Web with default parameters

In [33]:
"""
# UP genes
kinase_file = 'Kinase_Perturbations_from_GEO_up.txt'
save_file = 'X2K_kinasePvalues_UP.txt'
X2K_UP = xweb.run_X2K_allGenes(kinase_file, save_file, verbose=True, replaceNAs=False, outputValues='pvalue')

# DN Genes
kinase_file = 'Kinase_Perturbations_from_GEO_down.txt'
save_file = 'X2K_kinasePvalues_DN.txt'
X2K_DN = xweb.run_X2K_allGenes(kinase_file, save_file, verbose=True, replaceNAs=False, outputValues='pvalue')
"""
print("OFF")

OFF


In [34]:
# Run X2K Web with ranks from z-scores of -log(val)

"""
# UP genes
kinase_file = 'Kinase_Perturbations_from_GEO_up.txt'
save_file = 'X2K_kinaseRanks_negLogZRank_UP.txt'
#X2K_UP_negLog = xweb.run_X2K_allGenes(kinase_file, save_file, verbose=True, replaceNAs=False, rankingMethod='-log(pvalue)')

X2K_UP_negLog = pd.read_table('X2K_kinaseRanks_negLogZRank_UP.txt', index_col=False).fillna(0)
# Add absent perturbed kinases (also with -log(pvalue)==0)
X2K_UP_negLog = add_absent_perturbed(X2K_UP_negLog)



# DN genes
kinase_file = 'Kinase_Perturbations_from_GEO_down.txt'
save_file = 'X2K_kinaseRanks_negLogZRank_DN.txt'
#X2K_DN_negLog = xweb.run_X2K_allGenes(kinase_file, save_file, verbose=True, replaceNAs=False, rankingMethod='-log(pvalue)')

X2K_DN_negLog = pd.read_table('X2K_kinaseRanks_negLogZRank_DN.txt', index_col=False)
# Fill Nas with -log(pvalue)==0
X2K_DN_negLog.fillna(0, inplace=True)
# Add absent perturbed kinases (also with -log(pvalue)==0)
X2K_DN_negLog = add_absent_perturbed(X2K_DN_negLog)

"""
print("OFF")

OFF


# :: KEA ::

## Alex's version


In [35]:
"""
def run_KEA(input_line):
    directory='KEA/'
    import time
    import socket
    HOST = "localhost"
    PORT3 = 5002
    start_time = time.time()
    
    sock2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock2.connect((HOST, PORT3))

    buffer_size = 1024
    allData3 = ','.join([input_line[0]]+input_line[2:])

    print("Running KEA...")
    kea_string = ';'.join(["run", 'pvalue', 'humanarchs4', 'KP',  '10000'])
    print("KEA Parameters:   "+kea_string)
    kea_parameters = kea_string + "\n"+allData3+"messageComplete\n"
    kea_parameters.replace("messageComplete\n", "")
    sock2.sendall(bytes(kea_parameters+"\n", 'utf-8'))

    while 1:
        #print("d: "+data)
        data = sock2.recv(buffer_size).decode("utf-8")
        allData3 = allData3 + data
        if allData3.endswith("messageComplete\n"):
            break
    
    allData3.replace("messageComplete\n", "")
    sock2.send(bytes("kill\n", 'utf-8'))
    sock2.close()
    
    allData3 = allData3.replace("messageComplete\n", "").replace(kea_parameters, "")
    
    text_file = open(directory+"output/kea_out.txt", "w")
    text_file.write(allData3)
    text_file.close()

    time.time() - start_time
    return allData3
"""

'\ndef run_KEA(input_line):\n    directory=\'KEA/\'\n    import time\n    import socket\n    HOST = "localhost"\n    PORT3 = 5002\n    start_time = time.time()\n    \n    sock2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n    sock2.connect((HOST, PORT3))\n\n    buffer_size = 1024\n    allData3 = \',\'.join([input_line[0]]+input_line[2:])\n\n    print("Running KEA...")\n    kea_string = \';\'.join(["run", \'pvalue\', \'humanarchs4\', \'KP\',  \'10000\'])\n    print("KEA Parameters:   "+kea_string)\n    kea_parameters = kea_string + "\n"+allData3+"messageComplete\n"\n    kea_parameters.replace("messageComplete\n", "")\n    sock2.sendall(bytes(kea_parameters+"\n", \'utf-8\'))\n\n    while 1:\n        #print("d: "+data)\n        data = sock2.recv(buffer_size).decode("utf-8")\n        allData3 = allData3 + data\n        if allData3.endswith("messageComplete\n"):\n            break\n    \n    allData3.replace("messageComplete\n", "")\n    sock2.send(bytes("kill\n", \'utf-8\'))\n    s

## X2K Web Version

In [36]:
## KEA CSV OUTPUT FORMAT

"""
    1. name of the kinase
    2. number of substrates in the input gene-list
    3. number of genes that are substrates of the kinase
    4. the fraction of genes that are substrates compared to total number of genes in gene-list
    5. the fraction of genes that are substrates compared to total number of genes in background
    6. difference between the background fraction and the substrate-list fraction
    7. p-value computed using the Fisher Test
    8. rank computed using z-test
    9. combined score computed from p-value and rank
    10. list of substrates separated by a semi-colon
"""

'\n    1. name of the kinase\n    2. number of substrates in the input gene-list\n    3. number of genes that are substrates of the kinase\n    4. the fraction of genes that are substrates compared to total number of genes in gene-list\n    5. the fraction of genes that are substrates compared to total number of genes in background\n    6. difference between the background fraction and the substrate-list fraction\n    7. p-value computed using the Fisher Test\n    8. rank computed using z-test\n    9. combined score computed from p-value and rank\n    10. list of substrates separated by a semi-colon\n'

In [37]:
"""
For X2K and KEA, whenever the kinase-enrichment couldn't find any overlap between the substrates of a given kinase 
and the genes in the input list, KEA returns 'NaN' for that kinase.
-Alternatively, you can randomly assign a remaining rank 
"""
 

# For both X2K and KEA
def absent_perturbed_kinases(DF):
    perturbedKinases = set([x.split('_')[0] for x in DF.columns[1:]])
    absentPerturbed=[]
    for pert in perturbedKinases:
        syns = synDict[pert]
        overlap = set(syns).intersection(set(DF['Kinase']))
        if len(overlap)==0:
            absentPerturbed.append(pert)
    return absentPerturbed



def replace_NAs_with_random_rank(DF):
    # Get list of perturbed kinases that are absent from the output file 
    absentPerturbed = absent_perturbed_kinases(DF)
    newDF = pd.concat([DF['Kinase'], pd.Series(absentPerturbed).rename('Kinase')]).reset_index()['Kinase']
    # Assign ranks
    import math
    from random import shuffle
    for col in DF.iloc[:,1:]:
        newRanks=[]
        DFcol = DF[col]
        print(col)
        # Create lists of remaining ranks
        maxRank = int(max(DFcol.dropna()))
        remainingRanks = list(range(maxRank+1,len(newDF)))
        shuffle(remainingRanks)
        # Replace any NAs with random rank
        NAcount=0
        for inputRank in DFcol:
            if math.isnan(inputRank):
                #newRanks.append( np.random.choice(remainingRanks, 1, replace=False)[0] )
                newRanks.append( remainingRanks[NAcount] )
                NAcount+=1
            else:
                newRanks.append( int(inputRank) )
        for ap in absentPerturbed:
            newRanks.append(remainingRanks[NAcount])
            NAcount+=1
        newCol = pd.Series(newRanks, name=col)
        newDF = pd.concat([newDF, newCol], axis=1) 
    return newDF 



#----------------------
# KEA-specific
import pandas as pd
import os 
from time import sleep

def create_geneList_file(geneList):
    with open('KEA/geneList.txt','w') as file:
        for g in geneList:
            file.write(g+'\n')

#create_DF_from_KEAoutput  
def create_DF_from_KEAoutput(expt, finalDF):
    KEAout = pd.read_csv('KEA/KEA_output.csv', header=None, index_col=False)
    KEAout.head()
    KEAout.columns = ['Kinase','number of substrates in the input gene-list', 'number of genes that are substrates of the kinase',\
                      'fraction of genes that are substrates compared to total number of genes in gene-list',\
                      'fraction of genes that are substrates compared to total number of genes in background',\
                      'difference between the background fraction and the substrate-list fraction',\
                      'pvalue', 'ztest_rank', 'combined_score','substrates']
    KEAout.index = KEAout['Kinase']
    KEA_sort = KEAout.sort_values(by='pvalue')
    KEA_sort[expt] = range(0,len(KEA_sort))
    newDF = KEA_sort[['Kinase',expt]]
    finalDF = finalDF.merge(newDF, on='Kinase', how='outer')
    return finalDF

def return_negLog(expt, finalDF):
    KEAout = pd.read_csv('KEA/KEA_output.csv', header=None, index_col=False)
    KEAout.columns = ['Kinase','number of substrates in the input gene-list', 'number of genes that are substrates of the kinase',\
                      'fraction of genes that are substrates compared to total number of genes in gene-list',\
                      'fraction of genes that are substrates compared to total number of genes in background',\
                      'difference between the background fraction and the substrate-list fraction',\
                      'pvalue', 'ztest_rank', 'combined_score','substrates']
    KEAout.index = KEAout['Kinase']
    KEAout.head()
    # Create -log(pvalue)
    import numpy as np
    KEAout['-log(pvalue)'] = -np.log(pd.to_numeric(KEAout['pvalue']))
    KEA_sort = KEAout.sort_values(by=['-log(pvalue)'])[['Kinase','-log(pvalue)']] #** Select values to put in newDF
    KEA_sort.columns = ['Kinase', expt]
    finalDF = finalDF.merge(KEA_sort, on='Kinase', how='outer')
    return finalDF
 
   
    
def run_KEA_old(inputGMT, KEA_summary_file, replaceNAs=True):
    with open(inputGMT) as file:
        input_GMT = file.readlines()
    #input_GMT = input_GMT[0:10]
    finalDF = pd.DataFrame(columns=['Kinase'],dtype=float)
    for line in input_GMT:
        # Delete old files
        try:
            os.remove('KEA/KEA_output.csv')
            os.remove('KEA/geneList.txt')
        except:
            print("No files to delete")
        while os.path.exists('KEA/geneList.txt') or os.path.exists('KEA/KEA_output.csv'):
            sleep(.5)
        
        # Create gene list
        lineSp = line.split('\t')
        expt = lineSp[0]
        genes = [x.strip(',1.0') for x in lineSp[2:-1]]
        print("Processing: "+expt)
        # Create gene list txt file
        print(expt+': Creating genList file')
        create_geneList_file(genes)
        print('Waiting for KEA_output')
        while not os.path.exists('KEA/geneList.txt'):
            sleep(.5) 
        # Run KEA command line
        # result = subprocess.run(['/Library/Java/JavaVirtualMachines/1.6.0.jdk/Contents/Home/bin/java','-jar',\
        #                          'KEA/KEA-1.5-SNAPSHOT-jar-with-dependencies.jar','KEA/UberKeaFile.csv',\
        #                          'KEA/geneList.txt KEA/KEA_output.csv'] )
        # result.stdout.decode('utf-8')
        print('Running KEA')
        os.system('/Library/Java/JavaVirtualMachines/1.6.0.jdk/Contents/Home/bin/java '+\
                  ' -jar'+' KEA/KEA-1.5-SNAPSHOT-jar-with-dependencies.jar'+' ../../X2K_Databases/KINASE/KEA_2018/KEA2018_KINASES.csv'+\
                  ' KEA/geneList.txt'+ ' KEA/KEA_output.csv')
        # Sleep until the file is ready
        print('Waiting for KEA_output')
        while not os.path.exists('KEA/KEA_output.csv'):
            sleep(.5)
            
        # Read in KEA output and process
        print(expt+' : Creating dataframe')
        
        finalDF = return_negLog(expt, finalDF) #*** Control whether you what values are in final DF
        if replaceNAs==True:
            finalDF = replace_NAs_with_random_rank(finalDF)
        
    finalDF.to_csv(KEA_summary_file, sep='\t', header=True, index=None, na_rep='NA')
    return finalDF

OFF


## Run KEA with Each Kinase Database

In [None]:
run_KEA_old

###  Fill NAs in X2K results files

In [38]:
"""
# UP
X2K_UP = pd.read_table('X2K_kinaseRanks_UP.txt', index_col=False)
X2K_filled_UP = replace_NAs_with_random_rank(X2K_UP) 
X2K_filled_UP.to_csv('X2K_output_NAsfilled_UP.txt', sep='\t', header=True, index=None, na_rep='NA')

# DN
X2K_DN = pd.read_table('X2K_kinaseRanks_DN.txt', index_col=False)
X2K_filled_DN = replace_NAs_with_random_rank(X2K_DN)
X2K_filled_DN.to_csv('X2K_output_NAsfilled_DN.txt', sep='\t', header=True, index=None, na_rep='NA')
"""

"\n# UP\nX2K_UP = pd.read_table('X2K_kinaseRanks_UP.txt', index_col=False)\nX2K_filled_UP = replace_NAs_with_random_rank(X2K_UP) \nX2K_filled_UP.to_csv('X2K_output_NAsfilled_UP.txt', sep='\t', header=True, index=None, na_rep='NA')\n\n# DN\nX2K_DN = pd.read_table('X2K_kinaseRanks_DN.txt', index_col=False)\nX2K_filled_DN = replace_NAs_with_random_rank(X2K_DN)\nX2K_filled_DN.to_csv('X2K_output_NAsfilled_DN.txt', sep='\t', header=True, index=None, na_rep='NA')\n"

### Run KEA (and fill NAs)

In [39]:
# KEA_UP = run_KEA_old(inputGMT='Kinase_Perturbations_from_GEO_up.txt', KEA_summary_file='KEA_output_NAsfilled_UP.txt', replaceNAs=True)
# KEA_DN = run_KEA_old(inputGMT='Kinase_Perturbations_from_GEO_down.txt', KEA_summary_file='KEA_output_NAsfilled_DN.txt', replaceNAs=True)

"""
# Get -log(pvalues)
## UP
KEA_UP = run_KEA_old(inputGMT='Kinase_Perturbations_from_GEO_up.txt', KEA_summary_file='KEA_results_negLogPval_UP.txt', replaceNAs=False)
KEA_UP.fillna(0, inplace=True)
## DN
KEA_DN = run_KEA_old(inputGMT='Kinase_Perturbations_from_GEO_down.txt', KEA_summary_file='KEA_results_negLogPval_DN.txt', replaceNAs=False)
KEA_DN.fillna(0, inplace=True)
"""
print("OFF")

OFF


## !!!!!!! Import Previously Processed Results !!!!!!!

In [40]:
import pandas as pd
import numpy as np

def absent_perturbed_kinases(DF):
        perturbedKinases = set([x.split('_')[0] for x in DF.columns[1:]])
        absentPerturbed=[]
        for pert in perturbedKinases:
            syns = synDict[pert]
            overlap = set(syns).intersection(set(DF['Kinase']))
            if len(overlap)==0:
                absentPerturbed.append(pert)
        return absentPerturbed


def add_absent_perturbed(DF):
    absent = absent_perturbed_kinases(DF)
    emptyDF = pd.DataFrame(0.0, index= np.arange(len(absent)), columns=DF.columns)
    emptyDF['Kinase'] = absent
    newDF = pd.concat([DF, emptyDF])
    newDF.index = range(0,len(newDF))
    return newDF

# DF=X2K_UP.copy()
# def add_allKinases(DF): 
#     allKinases = get_all_kinases()
#     newDF=pd.DataFrame()
#     missingKinases=[]
#     for k in allKinases:
#         syns = synDict[k]
#         overlap = set(syns).intersection(set(DF['Kinase']))
#         if len(overlap)==0:
#             missingKinases.append(k)
#     emptyDF = pd.DataFrame(0.0, index= np.arange(len(missingKinases)), columns=DF.columns)
#     emptyDF['Kinase'] = absent
#     newDF = pd.concat([DF, emptyDF])
#     newDF.index = range(0,len(newDF))


def standardize_geneSymbols(DF):
    newGenes=[]
    for g in DF['Kinase'].tolist():
        newGenes.append(standardizeGeneSymbol(g))
    DF['Kinase'] = newGenes
    return DF 

def import_fillNA_addMissingKinases(fileName,  standardize_genes=True, fillNAs=True, addAbsentKinases=True):
    file = pd.read_table(fileName, index_col=False)
    if standardize_genes==True:
        file = standardize_geneSymbols(file)
    if fillNAs==True:
        file.fillna(0, inplace=True)
    if addAbsentKinases==True:
        file = add_absent_perturbed(file)
    return file
        
        
    
# Import and correct data at the same time
# X2K
X2K_UP = import_fillNA_addMissingKinases('Results/X2K_results_negLogPval_UP.txt')
X2K_DN = import_fillNA_addMissingKinases('Results/X2K_results_negLogPval_DN.txt')

## KEA
KEA_UP = import_fillNA_addMissingKinases('Results/KEA_results_negLogPval_UP.txt')
KEA_DN = import_fillNA_addMissingKinases('Results/KEA_results_negLogPval_DN.txt')

# Convert Ranks to Zscore Ranks

In [41]:
"""
def zscore_Ranks(DF):
    newRanks_DF=pd.DataFrame()
    from scipy import stats
    rankDF =  DF.iloc[:,1:].copy()
    zscoreDF = pd.DataFrame(stats.zscore(rankDF, axis=1), columns=rankDF.columns)
    for col in zscoreDF:
        orderedCol = zscoreDF[col]
        orderedCol.index = DF['Kinase']
        # Assign new ranks based on zscores of old ranks
        orderedCol = orderedCol.sort_values()
        newRanks = pd.Series(data=range(0,len(orderedCol)), name=col, index=orderedCol.index)
        #Merge DFs on index
        newRanks_DF = pd.concat([newRanks_DF, newRanks], axis=1)
    #Add kinase col 
    newKinaseCol = pd.Series(newRanks_DF.index, name='Kinase', index=newRanks_DF.index)
    zscoreRanks_DF = pd.concat([newKinaseCol, newRanks_DF ], axis=1)
    zscoreRanks_DF = zscoreRanks_DF.reset_index()
    del zscoreRanks_DF['index']
    return zscoreRanks_DF

# Create Zscore DFs
X2K_UP_Zscore = zscore_Ranks(X2K_UP)
X2K_DN_Zscore = zscore_Ranks(X2K_DN)
KEA_UP_Zscore = zscore_Ranks(KEA_UP)
KEA_DN_Zscore = zscore_Ranks(KEA_DN)
"""

"\ndef zscore_Ranks(DF):\n    newRanks_DF=pd.DataFrame()\n    from scipy import stats\n    rankDF =  DF.iloc[:,1:].copy()\n    zscoreDF = pd.DataFrame(stats.zscore(rankDF, axis=1), columns=rankDF.columns)\n    for col in zscoreDF:\n        orderedCol = zscoreDF[col]\n        orderedCol.index = DF['Kinase']\n        # Assign new ranks based on zscores of old ranks\n        orderedCol = orderedCol.sort_values()\n        newRanks = pd.Series(data=range(0,len(orderedCol)), name=col, index=orderedCol.index)\n        #Merge DFs on index\n        newRanks_DF = pd.concat([newRanks_DF, newRanks], axis=1)\n    #Add kinase col \n    newKinaseCol = pd.Series(newRanks_DF.index, name='Kinase', index=newRanks_DF.index)\n    zscoreRanks_DF = pd.concat([newKinaseCol, newRanks_DF ], axis=1)\n    zscoreRanks_DF = zscoreRanks_DF.reset_index()\n    del zscoreRanks_DF['index']\n    return zscoreRanks_DF\n\n# Create Zscore DFs\nX2K_UP_Zscore = zscore_Ranks(X2K_UP)\nX2K_DN_Zscore = zscore_Ranks(X2K_DN)\nKEA_U

## Convert values to zscore

In [42]:
def drop_Zeros(DF):
    # Drop all rows that ONLY have (0). Never appeared across any experiment
    # Keeping the all 0s messes up the zscore
    DFcols = DF.iloc[:,1:]
    filteredDF = DF[(DFcols.T != 0).any()]
    return filteredDF

#DF = X2K_UP.copy()
def values_to_zscores(DF, dropZeros=True):
    if dropZeros==True:
        df = drop_Zeros(DF).copy()
    else:
        df = DF.copy()
    from scipy import stats
    #df.index = df['Kinase']
    df.index = range(0,len(df))
    # 1. Shuffle around only the rows with -log(pval)==0 
    ## to make sure there's no bias in their zscores (e.g. based on alphabetical order)
    for col in df.iloc[:,1:]:
        dfCol = df.loc[:,col]
        #print(col)
        # Get all the zeros
        df0 = dfCol.loc[dfCol==0]
        df0 = df0.sample(frac=1) # Shuffle 
        # Get all the non-zeros
        dfNum = dfCol.loc[dfCol!=0]
        # Recombine them and put back into df
        df.loc[:,col] = pd.concat([dfNum, df0]) ###### here
        
    # 2. Now take zscore
    DFsub  = df.iloc[:,1:]
    zscoreDF = pd.DataFrame(stats.zscore(DFsub, axis=1), columns=DFsub.columns, index=DFsub.index)
    # 3. Add kinase col 
    newKinaseCol = pd.Series(df['Kinase'], name='Kinase', index=df.index)
    zscoreDF = pd.concat([newKinaseCol, zscoreDF ], axis=1)
    zscoreDF = zscoreDF.reset_index()
    del zscoreDF['index']
    return zscoreDF


# Convert to zscores
# X2K
X2K_UP_nLog_zscore = values_to_zscores(X2K_UP)
X2K_DN_nLog_zscore = values_to_zscores(X2K_DN)
# KEA
KEA_UP_nLog_zscore = values_to_zscores(KEA_UP)
KEA_DN_nLog_zscore = values_to_zscores(KEA_DN)
# sns.distplot(X2K_UP_nLog_zscore.iloc[:,6])

## Convert Values to Ranks

In [None]:
def values_to_ranks(DF):
    #newRanks_DF=pd.DataFrame(index=DF.index)
    DF.index = DF['Kinase']
    Ranks=[]
    # assign ranks based on given value (could be pvalue, -log(pvalue), ranks, etc)
    for col in DF.columns[1:]:
        #print(col)
        # Since zscore comes from -log(pvalue), flip the rank order so that low numbered ranks are still the best
        orderedCol = DF[col].sort_values(ascending=False)
        
        newRanks = pd.Series(data=range(0,len(orderedCol)), name=col, index=orderedCol.index)
        newRanks.sort_index(inplace=True) # Sort by index
        Ranks.append(newRanks.values)
        #newRanks = pd.DataFrame(data=list(range(0,len(orderedCol))), columns=[col], index=orderedCol.index)
        # Merge DFs on index
        #newRanks_DF = pd.concat([newRanks_DF, newRanks_DF], axis=1)
        #newRanks_DF = pd.merge(newRanks_DF, newRanks, left_index=True, right_index=True)
        #newRanks_DF.loc[:,col] = newRanks
    newRanks_DF = pd.DataFrame(data=np.column_stack(Ranks), columns=DF.columns[1:], index=DF.index)
    # Add back kinase col 
    newKinaseCol = pd.Series(newRanks_DF.index, name='Kinase', index=newRanks_DF.index)
    ranks_DF = pd.concat([newKinaseCol, newRanks_DF ], axis=1)
    ranks_DF.index = range(0,len(ranks_DF))
    return ranks_DF

### -log(pvalues) to Rank

In [45]:
X2K_UP_nLog_ranks = values_to_ranks(X2K_UP)
X2K_DN_nLog_ranks = values_to_ranks(X2K_DN)
# KEA
KEA_UP_nLog_ranks = values_to_ranks(KEA_UP)
KEA_DN_nLog_ranks = values_to_ranks(KEA_DN)

### Zscores to Ranks

In [46]:
X2K_UP_nLog_zscore_ranks = values_to_ranks(X2K_UP_nLog_zscore)
X2K_DN_nLog_zscore_ranks = values_to_ranks(X2K_DN_nLog_zscore)
# KEA
KEA_UP_nLog_zscore_ranks = values_to_ranks(KEA_UP_nLog_zscore)
KEA_DN_nLog_zscore_ranks = values_to_ranks(KEA_DN_nLog_zscore)

# Method comparison

In [47]:
# Data group lists
negLogPvals = ['X2K_UP','X2K_DN','KEA_UP','KEA_DN']
negLogPval_Ranks = ['X2K_UP_nLog_ranks', 'X2K_DN_nLog_ranks', 'KEA_UP_nLog_ranks', 'KEA_DN_nLog_ranks']
negLogPval_zScore_Ranks = ['X2K_UP_nLog_zscore_ranks','X2K_DN_nLog_zscore_ranks','KEA_UP_nLog_zscore_ranks','KEA_DN_nLog_zscore_ranks']

In [48]:
"""
# Saving the objects:
negLogPvals_VARS = [X2K_DN, X2K_DN, X2K_DN, KEA_UP, KEA_DN]
negLogPval_Ranks_VARS = [X2K_UP_nLog_ranks, X2K_DN_nLog_ranks, KEA_UP_nLog_ranks, KEA_DN_nLog_ranks]
negLogPval_zScore_Ranks_VARS = [X2K_UP_nLog_zscore_ranks,X2K_DN_nLog_zscore_ranks,KEA_UP_nLog_zscore_ranks,KEA_DN_nLog_zscore_ranks]
allVars = negLogPvals_VARS+negLogPval_Ranks_VARS+negLogPval_zScore_Ranks_VARS

# def eval_list(varList):
#     evalList = []
#     for var in varList:
#         evalList.append(eval(var))
#     return evalList
# 
# negLogPvals_vars = eval_list(negLogPvals)
# negLogPval_Ranks_vars = eval_list(negLogPval_Ranks)
# negLogPval_zScore_Ranks_vars = eval_list(negLogPval_zScore_Ranks)


# Save
def save_pickle(name, varList):
    with open('Saved_Variables/'+name, 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump(varList, f)
save_pickle('allVars.pkl', allVars)
save_pickle('negLogPvals_vars.pkl', negLogPvals_vars)
save_pickle('negLogPval_Ranks_vars.pkl', negLogPval_Ranks_vars)
save_pickle('negLogPval_zScore_Ranks_vars.pkl', negLogPval_zScore_Ranks_vars)

# Load
def load_pickle(name, varList):
    with open('objs.pkl','rb') as f:  # Python 3: open(..., 'rb')
       allVars = pickle.load(f)

# import processed data
import pickle
"""
print("OFF")

OFF


## Clustermaps of Kinase Predictions

In [49]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(font="Arial")

# Color heatmap by kinase groups/families
def get_kinase_groups_families():
    homo = pd.read_excel('../../X2K_Summaries/General_Resources/Kinase.com/Kinome_Hsap_updated.xls').loc[:,['Name','Group','Family','Subfamily']]
    mus = pd.read_excel('../../X2K_Summaries/General_Resources/Kinase.com/Kinome_Mmus.xls').loc[:,['Gene Name','Group','Family','Subfamily']]
    mus = mus.rename(columns={'Gene Name':'Name'})
    # Fill missing Subfamily info with Family
    ##homo['Subfamily'] = homo['Subfamily'].fillna(homo['Family'])
    ##mus['Subfamily'] = mus['Subfamily'].fillna(mus['Family'])
    
    # Capitalize everything
    homo = homo.apply(lambda x: x.astype(str).str.upper())
    mus = mus.apply(lambda x: x.astype(str).str.upper())
    both = pd.concat([homo, mus]).drop_duplicates()
    both['Subfamily'] = both['Subfamily'].fillna('[No Info.]')
    both.columns = ['Kinase','Kinase_Group','Kinase_Family','Kinase_Subfamily']
    both.index = both['Kinase']
    return both
    
def category_colors_dict(category):
    KEA_gf = get_kinase_groups_families()
    uniqueCats = KEA_gf[category].unique()
    colorCodes = sns.color_palette("hls", len(uniqueCats))
    colorDict = dict(zip(uniqueCats, colorCodes))
    row_colors = KEA_gf[category].map(colorDict)
    return row_colors, colorDict

def plotHeatmap(DF, method='', z_score=None, category='Kinase_Group', saveFig=True):
    DF.dropna(inplace=True)
    scaleKey = {None:'raw', 0:'zscore-row', 1:'zscore-col'}
    plotDF = DF.iloc[:,1:]
    plotDF.index = DF['Kinase']
    # Apply z-score tranformation
    if z_score==None:
        rankMethod = 'Rank'
    else:
        rankMethod = scaleKey[z_score]
        
    # CLUSTERMAP
    row_colors, colorDict = category_colors_dict(category)
    g = sns.clustermap(plotDF, z_score=z_score, row_colors=row_colors, cmap="RdBu" ) #"inferno", "hot"
    # Change label params
    g.ax_heatmap.set_title(method +" : " + rankMethod, pad=130)
    # Set position of main colorbar
    g.cax.set_position([.05, .2, .03, .45])
    # Draw legend for classes
    for label in colorDict.keys():
        g.ax_row_dendrogram.bar(0, 0, color=colorDict[label], label=label, linewidth=0)
    g.ax_row_dendrogram.legend(loc="upper right", ncol=2, bbox_to_anchor=(.35, 1.35), borderaxespad=1).set_title(category)
    # Save fig
    if saveFig==True:
        g.savefig('Figures/Clustermaps/'+method+'_'+scaleKey[z_score]+'_clustermap.png')


def iterate_clustermaps(dfList, z_score=None, category='Kinase_Family', saveFig=False):
    for df in dfList:
        print(df)
        plotHeatmap(DF=eval(df), method=df, z_score=z_score, category=category, saveFig=saveFig)
        
        
# Plot/save clustermap
iterate_clustermaps(negLogPval_Ranks, z_score=None, category='Kinase_Group', saveFig=True)
iterate_clustermaps(negLogPval_Ranks, z_score=0, category='Kinase_Group', saveFig=True)

X2K_UP_nLog_ranks


X2K_DN_nLog_ranks


KEA_UP_nLog_ranks


KEA_DN_nLog_ranks


X2K_UP_nLog_ranks


X2K_DN_nLog_ranks


KEA_UP_nLog_ranks


KEA_DN_nLog_ranks


## Layered Rank Distribution Plots 

### Every kinase across all experiments

In [50]:
"""

import seaborn as sns  
import matplotlib.pyplot as plt

def kinase_layered_KDE_subplot(dfList):
    f, (ax1, ax2) = plt.subplots(2, 2, sharex='all', sharey='all')
    for i,df in enumerate(dfList):
        print('Plotting all kinase distributions for... ' + df)
        DF =  eval(df)
        if i<2:
            ax=ax1[i]
        else:
            ax=ax2[i]
        for i,row in DF.iterrows():
            sns.kdeplot(row.values[1:], label=row[0], cut=0, ax=ax)
            plt.title('Kinase rank distributions: '+df)
            plt.ylabel('Frequency')
            plt.xlabel('Raw Rank')
            
negLogPval_Ranks
kinase_layered_KDE_subplot(dfList=['X2K_UP_nLog_ranks']) #['X2K_UP','X2K_DN','KEA_UP', 'KEA_DN']

"""
print("OFF")

OFF


## Red-cluster kinases from heatmaps

In [51]:
# nLog_ranks = X2K_DN_nLog_ranks.copy()
def get_red_cluster(nLog_ranks_name, threshold=50, save=True):
    nLog_ranks = eval(nLog_ranks_name)
    nLog_ranks.index = nLog_ranks['Kinase']
    # get row/kinase average rank
    meanRanks = nLog_ranks.mean(axis=1)
    topKinases = list(meanRanks[meanRanks<threshold].sort_values().index)
    # Subet DF to just top ranked kinases
    topKinases_DF = nLog_ranks[nLog_ranks['Kinase'].isin(topKinases)]
    if save==True:
        topKinases_DF.to_csv('Results/Red_Clusters/'+nLog_ranks_name+"_redCluster.csv", index=False )
    return topKinases_DF

# X2K 
X2K_UP_redCluster = get_red_cluster('X2K_UP_nLog_ranks',100)
X2K_DN_redCluster = get_red_cluster('X2K_DN_nLog_ranks', 100)
#KEA
KEA_UP_redCluster = get_red_cluster('KEA_UP_nLog_ranks', 100)
KEA_DN_redCluster = get_red_cluster('KEA_DN_nLog_ranks', 100)

### Heatmap: Ranks of Predicted-Kinases (y-axis) vs. Perturbed-Kinases

## Clustergrammer

In [56]:
from clustergrammer_widget import *

def clustergrammer_widget(DF):
    ## USING WIDGET
    net = Network(clustergrammer_widget)
    # load DataFrame
    net.load_df(DF)
    
    # cluster using default parameters
    ##net.normalize(axis='row', norm_type='zscore', keep_orig=True)
    
    # filter for the top 200 rows based on their absolute value sum
    ## net.filter_N_top('row', 200, 'sum')
    
    # cluster using default parameters
    net.cluster()
    # make interactive widget
    net.widget()
    
clustergrammer_widget( DF=create_adjacency_matrix(X2K_DN_nLog_ranks).iloc[:,1:] )

  del sys.path[0]


TypeError: clustergrammer_widget() got an unexpected keyword argument 'network'

## KDE Plot for Target Kinases

In [None]:
import numpy as np
import seaborn as sns
from random import shuffle
import matplotlib.pyplot as plt

def rescale_ranks(summaryDF):
    # Rescale the ranks from 0-1
    ranks = summaryDF[summaryDF['Rank']!='NA']['Rank']
    ranks = pd.to_numeric(ranks)
    ranks -= ranks.min() 
    ranks /= ranks.max()
    #ranks = ranks.sort_values(ascending=False)
    ranks.index = range(0,len(ranks))
    summaryDF.index = range(0,len(summaryDF))
    summaryDF['Scaled_Ranks'] = ranks
    return summaryDF

# DF = X2K_DN_negLog_zRank.copy()
# method='X2K_DN_negLog_zRank'
def targetKinaseRank_SummaryDF(DF, method):
    summaryDF = pd.DataFrame()
    for col in DF.columns[1:]:
        target = col.split("_")[0]
        expt = col
        syns = synDict[target]
        overlap =  set(syns).intersection(set(DF['Kinase']))
        # GET SUMMARY TABLE
        ## If the target is in the kinse list, put info there
        if target in DF['Kinase'].tolist():
            targetRank = DF.loc[DF['Kinase']==target, col].values[0]
        # If the target's synonym is in the kinase list, put info there
        elif len(overlap)>0:
            targetRank = DF.loc[DF['Kinase']==list(overlap)[0], col].values[0]
        else:
            targetRank ="NA"
            print(expt+': COULD NOT FIND KINASE RANK')
        # else:
        #     targetRank = 'NA'
        newDF = pd.DataFrame(np.column_stack([method, expt, target, targetRank]), columns=['Method','Experiment','Target_Kinase', 'Rank'])
        summaryDF = summaryDF.append(newDF)
    summaryDF = rescale_ranks(summaryDF)
    return summaryDF

def getShuffledRanks(DF, method):
    shuffledDF = pd.DataFrame()
    shuffledCols = list(DF.copy().columns[1:])
    shuffle(shuffledCols)
    i = 0
    for col in DF.columns[1:]:
        expt = col
        target =  shuffledCols[i].split("_")[0] # FAKE/RANDOM TARGET
        syns = synDict[target]
        overlap =  set(syns).intersection(set(DF['Kinase']))
       
        # Get shuffled summary
        # GET SUMMARY TABLE
        if target in DF['Kinase'].tolist():
            targetRank = DF.loc[DF['Kinase']==target, col].values[0]
        # Get Synonym's rank
        elif len(overlap)>0:
            targetRank = DF.loc[DF['Kinase']==list(overlap)[0], col].values[0]
        # Otherwise, return NA
        else:
            targetRank = 'NA'
        
        #print(target +" : "+str(targetRank))
        newDF = pd.DataFrame(np.column_stack([method, expt, target, targetRank]), columns=['Method','Experiment','Target_Kinase', 'Rank'])
        shuffledDF = shuffledDF.append(newDF)
        i+=1
    shuffledDF = rescale_ranks(shuffledDF)
    return shuffledDF

def getPerturbedKinaseRanks_for_every_expt(DF):
    oneCol=pd.DataFrame()
    df = DF.copy()
    perturbedKinases = list(set([x.split('_')[0] for x in list(df.columns)]))
    df = df[df['Kinase'].isin(perturbedKinases)]
    for col in df.columns[1:]:
        actualTarget = col.split('_')[0]
        subset = df[df['Kinase']!=actualTarget]
        oneCol = pd.concat([oneCol, subset[col]])
    # Rescale ranks from 0-1
    oneCol -= oneCol.min() 
    oneCol /= oneCol.max()
    return oneCol

def allKinase_Ranks(DF):
    allRanks=pd.DataFrame()
    for col in DF.columns[1:]:
        allRanks = pd.concat([allRanks, DF[col]])
        #print('Columns Rank Sum = '+str(sum(DF[col])) )
    # Rescale ranks from 0-1
    allRanks -= allRanks.min() 
    allRanks /= allRanks.max()
    return allRanks


def KDE_subplot(dfList, ax, scaledRanks):
    if scaledRanks==True:
        var = 'Scaled_Ranks'
    else:
        var = 'Rank'
    colorSets = [['magenta', 'm'], ['aqua', 'c'],['r', 'tomato']]
    for i,df in enumerate(dfList):
        print('Plotting.... '+ df)
        DF = eval(df)
        # Real Data
        summaryDF = targetKinaseRank_SummaryDF(DF, df)
        if scaledRanks==False:
            summaryDF = summaryDF[summaryDF['Rank']!='NA']
            summaryDF = summaryDF[summaryDF['Scaled_Ranks']!='NA']
        sns.kdeplot(summaryDF[var], shade=False, label=df, linestyle='-', cut=0, ax=ax, color=colorSets[i][0])
        # Shuffled data
        shuffledDF = getShuffledRanks(DF, df)
        sns.kdeplot(shuffledDF[var], shade=False, label=df+": Shuffled", linestyle="--", cut=0, ax=ax, color=colorSets[i][1])
        # Just the kinases that were perturbed in the input dataset
        sns.kdeplot(getPerturbedKinaseRanks_for_every_expt(DF)[0], shade=False, label=df+': All Perturbed Kinases', linestyle="-", cut=0, ax=ax,\
                    color=colorSets[2][i])
    # All kinase ranks (should be uniform)
    sns.kdeplot(allKinase_Ranks(DF)[0], shade=False, label='All Kinases', linestyle='-', cut=0, ax=ax, color='black')
    ax.legend(loc='center right', ncol=1, fontsize=8) #bbox_to_anchor=(1.25, 0.5) #bbox_to_anchor=(1.25, 0.5), ncol=1)


def layered_negLogPval_KDE(saveFig=True):
    f, (ax1, ax2) = plt.subplots(2, 1, sharex='all', sharey='all')
    # Make separate X2K and KEA subplots
    KDE_subplot(dfList=['X2K_DN'], ax=ax1, scaledRanks=True)#'X2K_UP', 
    KDE_subplot(dfList=['KEA_DN'], ax=ax2, scaledRanks=True)#'KEA_UP', 
    if saveFig==True:
        plt.draw()
        f.savefig('Figures/Distribution_Plots/'+ 'predictedKinase_negLogPval_Distributions.png')
        
        
def layered_KDE_plots(saveFig=True, scaledRanks=False):
    f, (ax1, ax2) = plt.subplots(2, 3, sharex='all', sharey=False)#'all')
    # Make separate X2K and KEA subplots
    KDE_subplot(dfList=['X2K_DN_nLog_ranks'], ax=ax1[0], scaledRanks=scaledRanks) #, 'X2K_DN_Zscore'
    KDE_subplot(dfList=['KEA_DN_nLog_ranks'], ax=ax2[0], scaledRanks=scaledRanks) #, 'KEA_DN_Zscore'
    KDE_subplot(dfList=['X2K_DN_nLog_zscore_ranks'], ax=ax1[1], scaledRanks=scaledRanks)
    KDE_subplot(dfList=['KEA_DN_nLog_zscore_ranks'], ax=ax2[1], scaledRanks=scaledRanks)
    KDE_subplot(dfList=['X2K_UP_redCluster'], ax=ax1[2], scaledRanks=scaledRanks)
    KDE_subplot(dfList=['X2K_DN_redCluster'], ax=ax2[2], scaledRanks=scaledRanks)
    if saveFig==True:
        plt.draw()
        f.savefig('Figures/Distribution_Plots/'+ 'predictedKinaseRank_Distributions.png')

negLogPvals = ['X2K_UP','X2K_DN','KEA_UP','KEA_DN']
negLogPval_Ranks = ['X2K_UP_nLog_ranks', 'X2K_DN_nLog_ranks', 'KEA_UP_nLog_ranks', 'KEA_DN_nLog_ranks']
negLogPval_zScore_Ranks = ['X2K_UP_nLog_zscore_ranks','X2K_DN_nLog_zscore_ranks','KEA_UP_nLog_zscore_ranks','KEA_DN_nLog_zscore_ranks']

# Run
## Plot the ranks directly from the sorted -log(pvalue)
layered_negLogPval_KDE(saveFig=True)
# Plot the ranks from 
layered_KDE_plots(saveFig=True)

NameError: name 'synDict' is not defined