# X2K

In [None]:
% cd Kinase_Enrichment_Comparisons
import pandas as pd

# Standardize genes to HGNC symbols
# Lookup tables read by standardizeGeneSymbol below.
# `mapping` is read without a header row, so its columns are labelled 0, 1, ...
mapping = pd.read_table('../../X2K_Summaries/General_Resources/Moshe_mapping/mappingFile_2017.txt', header=None)
# The CSV's own header row (header=0) is replaced by the column names 'Greek' and 'Abbrev'.
greekLetters = pd.read_csv('../../X2K_Summaries/General_Resources/GreekLetter_Converter.csv', names=['Greek', 'Abbrev'], header=0 )
# Strip leading/trailing non-breaking spaces (\xa0) from every cell.
# `.str` is applied to each column, so both columns are assumed to hold strings.
greekLetters = greekLetters.apply(lambda x: x.str.strip('\xa0'))

def standardizeGeneSymbol(gene):
    """Convert a gene/kinase alias into a standardized symbol.

    Relies on the module-level tables ``greekLetters`` (columns 'Greek' and
    'Abbrev') and ``mapping`` (columns 0 and 1).

    Steps:
      1. A name containing 'AURORA' becomes 'AURK' followed by the last
         character of the name.
      2. Otherwise, if any entry of ``greekLetters['Greek']`` occurs in the
         name, every upper-cased Greek entry found in the name is replaced
         by its 'Abbrev'. Longer entries are tried first so that e.g. 'BETA'
         is handled before the 'ETA' it contains.
      3. If the result equals a value in column 0 of ``mapping``, the value
         from column 1 of the first matching row is returned instead.
         (Presumably column 0 holds aliases and column 1 the HGNC symbol;
         verify against the mapping file.)

    Args:
        gene: Gene name as a string. Greek names are matched in upper case,
            so the name is presumably expected to be upper-cased already.

    Returns:
        The standardized symbol, or the (possibly Greek-substituted) input
        when the mapping table has no entry for it.
    """
    if 'AURORA' in gene:
        HGNC = 'AURK' + gene[-1]
    else:
        # Start from the input so the name is always bound, even when the
        # guard below passes but no upper-cased letter actually matches.
        HGNC = gene
        if any(substring in gene for substring in greekLetters['Greek']):
            pairs = sorted(
                zip(greekLetters['Greek'], greekLetters['Abbrev']),
                key=lambda pair: len(pair[0]),
                reverse=True,
            )
            for letter, abbrev in pairs:
                LETTER = letter.upper()
                if LETTER in HGNC:
                    # Accumulate on HGNC; restarting from `gene` each time
                    # would keep only the last substitution.
                    HGNC = HGNC.replace(LETTER, abbrev)
    # `x in series` tests the index labels, not the values, so compare the
    # column values explicitly and take the first hit as a scalar.
    matches = mapping.loc[mapping[0] == HGNC, 1]
    if len(matches) > 0:
        HGNC = matches.iloc[0]
    return HGNC

# Get list of all kinases in KEA2018
import pandas as pd
# Read without a header row, so columns are labelled by position (0, 1, 2, ...).
KEA2018 = pd.read_csv('../../X2k_Databases/KINASE/KEA_2018/KEA2018_KINASES.csv', header=None)#pd.read_csv("KEA/UberKeaFile.csv")
# Preview only: the result is displayed in an interactive session and discarded otherwise.
KEA2018.head()
# Unique values of the third column (position 2) -- presumably the kinase names; verify against the file.
allKinases = KEA2018.iloc[:,2].unique().tolist()

## Import Results and Convert to Pvalue Matrices

In [None]:
import pandas as pd
import pickle
# Load the pickled X2K results for the up- and down-regulated gene sets.
# SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
# NOTE(review): the UP results were previously read from the *_DN.pkl file, which made
# both variables identical. The *_UP.pkl name follows the file-naming pattern -- confirm it exists.
with open("x2kResults_syns_eachKinaseDB_UP.pkl", "rb") as handle:
    x2kResults_eachKinaseDB_UP = pickle.load(handle)
with open("x2kResults_syns_eachKinaseDB_DN.pkl", "rb") as handle:
    x2kResults_eachKinaseDB_DN = pickle.load(handle)

 
def pvalue_matrix(dataType, results):
    """Build a (predicted name x experiment) matrix of p-values.

    Args:
        dataType: Key of the per-run result list to read; 'ChEA' entries are
            named by their 'simpleName' field and 'KEA' entries by 'name'.
        results: Mapping of run id -> result dict. Each result dict must hold
            an 'Experiment' label and, under ``dataType``, a list of dicts
            that carry the name field and a 'pvalue'.

    Returns:
        pandas.DataFrame with one column per experiment label and one row per
        predicted name; names not predicted for an experiment are NaN.
    """
    name_field_by_type = {'ChEA': 'simpleName', 'KEA': 'name'}
    pvals_by_experiment = {}
    for run_id in results:
        run = results[run_id]
        hits = run[dataType]
        label = run['Experiment']
        # Later duplicates of the same name overwrite earlier ones.
        pvals_by_experiment[label] = {
            hit[name_field_by_type[dataType]]: hit['pvalue'] for hit in hits
        }
    return pd.DataFrame(pvals_by_experiment)

def all_matrices(x2k_kinase_db_results):
    """Build the KEA p-value matrix for every kinase database.

    Args:
        x2k_kinase_db_results: Mapping of database name -> results object
            accepted by ``pvalue_matrix``.

    Returns:
        Dict of database name -> p-value DataFrame (``dataType='KEA'``).
    """
    matrices = {}
    for db_name, db_results in x2k_kinase_db_results.items():
        print("Processing matrix for: " + db_name)
        matrices[db_name] = pvalue_matrix(dataType='KEA', results=db_results)
    return matrices

# Build one p-value matrix per kinase database for each direction (UP / DN).
# Each raw results object is deleted as soon as its matrices exist -- presumably to free memory.
x2k_matrixDict_UP = all_matrices(x2kResults_eachKinaseDB_UP)
del x2kResults_eachKinaseDB_UP
x2k_matrixDict_DN = all_matrices(x2kResults_eachKinaseDB_DN)
del x2kResults_eachKinaseDB_DN

## Fill in NAs

In [None]:
import pandas as pd
import numpy as np

def add_absent_perturbed(DF):
    """Append a row of 1.0 for every perturbed kinase missing from the index.

    The perturbed kinase of an experiment is taken to be the part of the
    column label before the first underscore.

    Args:
        DF: DataFrame with predicted names as the index and experiment
            labels (strings) as columns.

    Returns:
        A new DataFrame: ``DF`` followed by one all-1.0 row per absent
        kinase. Added rows are in sorted order so the output is identical
        from run to run (a plain set difference has no stable order).
    """
    perturbed = DF.columns.str.split("_").str[0]
    absent = sorted(set(perturbed) - set(DF.index))
    if not absent:
        # Nothing to add; still return a new object, like the concat below does.
        return DF.copy()
    emptyDF = pd.DataFrame(1.0, index=absent, columns=DF.columns)
    return pd.concat([DF, emptyDF], axis=0)

def import_fillNA_addMissingKinases(DF, fillNAs=True, addAbsentKinases=True, negLogPval=True):
    """Clean a p-value matrix and optionally convert it to -log(p).

    Args:
        DF: p-value DataFrame (predicted names x experiments).
        fillNAs: Replace missing p-values with 1.0.
        addAbsentKinases: Append all-1.0 rows for perturbed kinases that are
            missing from the index (see ``add_absent_perturbed``).
        negLogPval: Return ``-log(p)`` (natural log) instead of p.

    Returns:
        The processed DataFrame. The input frame is not modified.
        A p-value of exactly 0 becomes ``inf`` after the log transform.
    """
    if fillNAs:
        # Non-inplace fill so the caller's frame is left untouched.
        DF = DF.fillna(1.0)
    if addAbsentKinases:
        DF = add_absent_perturbed(DF)
    if negLogPval:
        DF = np.negative(np.log(DF))
    return DF
        
# Import and correct data at the same time
def preprocess_pvalMatrices(matrixDict):
    """Apply ``import_fillNA_addMissingKinases`` (default options) per database.

    Args:
        matrixDict: Mapping of database name -> p-value DataFrame.

    Returns:
        New dict of database name -> processed DataFrame. Each matrix is
        copied before processing.
    """
    return {
        db_name: import_fillNA_addMissingKinases(matrix.copy())
        for db_name, matrix in matrixDict.items()
    }


# Processed matrices for both directions, using the default options of
# import_fillNA_addMissingKinases (fill NAs, add absent kinases, -log transform).
x2k_matrixDict_negLog_UP = preprocess_pvalMatrices(x2k_matrixDict_UP)
x2k_matrixDict_negLog_DN = preprocess_pvalMatrices(x2k_matrixDict_DN)

## Convert Values to Ranks 

In [None]:
def values_to_ranks(DF, ascending=False, random_state=None):
    """Convert every column of values into 0-based ranks.

    Within each column the non-zero values are ordered by ``sort_values``
    and ranked first. Entries equal to 0 are always ranked after them,
    whatever the sign of the other values, in random order so that tied
    zeros get no systematic ordering.

    Args:
        DF: DataFrame of values (p-values, -log(p), z-scores, ...).
        ascending: Sort direction for the non-zero values. With the default
            ``False`` the largest value receives rank 0.
        random_state: Optional seed / RandomState forwarded to
            ``Series.sample`` to make the zero shuffle reproducible.

    Returns:
        DataFrame of integer ranks with the same columns as ``DF``.
    """
    Ranks = {}
    for col in DF:
        orderedCol = DF[col].sort_values(ascending=ascending)
        isZero = orderedCol == 0
        zeros = orderedCol[isZero]
        if len(zeros) > 1:
            # Shuffle only when there is something to shuffle. This replaces a
            # bare `except:` that hid any error raised while sampling.
            zeros = zeros.sample(frac=1, random_state=random_state)
        shuffledCol = pd.concat([orderedCol[~isZero], zeros])
        # Position in the combined ordering is the rank.
        newRanks = pd.Series(data=range(len(shuffledCol)), name=col, index=shuffledCol.index)
        newRanks = newRanks.sort_index()
        Ranks[col] = dict(zip(newRanks.index, newRanks.values))
    return pd.DataFrame(Ranks)
    
def values_to_ranks_over_kinaseDBs(matrixDict_filled, ascending):
    """Rank the values of every kinase-database matrix.

    Args:
        matrixDict_filled: Mapping of database name -> value DataFrame.
        ascending: Sort direction forwarded to ``values_to_ranks``.

    Returns:
        Dict of database name -> rank DataFrame.
    """
    matrixDict_ranks = {}
    for db, DF in matrixDict_filled.items():
        print(db)
        matrixDict_ranks[db] = values_to_ranks(DF, ascending)
    # The return was indented by 5 spaces, which is an IndentationError.
    return matrixDict_ranks

# When getting ranks directly from the processed values:
# ascending=False, so the largest value in each experiment gets rank 0.
x2k_matrixDict_ranks_UP = values_to_ranks_over_kinaseDBs(x2k_matrixDict_negLog_UP, ascending=False)
x2k_matrixDict_ranks_DN = values_to_ranks_over_kinaseDBs(x2k_matrixDict_negLog_DN, ascending=False)

## Values to Zscores to Ranks

In [None]:
from scipy import stats

def values_to_zscores(DF, dropZeros=True):
    """Z-score every row across its columns.

    Args:
        DF: DataFrame of numeric values (rows x experiments).
        dropZeros: Drop rows that are 0 in every column before scoring.
            Such rows have zero variance and would yield NaN z-scores.

    Returns:
        DataFrame of row-wise z-scores (``scipy.stats.zscore`` with
        ``axis=1``) carrying the index and columns of the retained rows.
        Any other constant row still produces NaN.
    """
    if dropZeros:
        # Keep rows with at least one non-zero entry.
        df = DF[(DF != 0).any(axis=1)]
    else:
        df = DF.copy()
    zscoreDF = pd.DataFrame(stats.zscore(df.values, axis=1), columns=df.columns, index=df.index)
    return zscoreDF

def values_to_zscores_to_Ranks_over_kinaseDBs(matrixDict_filled):
    """Z-score each database matrix row-wise, then rank the z-scores.

    Args:
        matrixDict_filled: Mapping of database name -> value DataFrame.

    Returns:
        Dict of database name -> DataFrame of ranks, computed with the
        default options of ``values_to_zscores`` and ``values_to_ranks``.
    """
    zscore_ranks_by_db = {}
    for db_name, matrix in matrixDict_filled.items():
        print(db_name)
        zscore_ranks_by_db[db_name] = values_to_ranks(values_to_zscores(matrix))
    return zscore_ranks_by_db
        
# Ranks of the row-wise z-scores of the processed matrices, for both directions.
x2k_matrixDict_zscoreRanks_UP = values_to_zscores_to_Ranks_over_kinaseDBs(x2k_matrixDict_negLog_UP)
x2k_matrixDict_zscoreRanks_DN = values_to_zscores_to_Ranks_over_kinaseDBs(x2k_matrixDict_negLog_DN)

## Clustermaps of Kinase Predictions

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme with Arial as the font for every later plot.
sns.set(font="Arial")

# Color heatmap by kinase groups/families
def get_kinase_groups_families():
    """Load the human and mouse kinome tables as one classification table.

    Reads the Kinase.com spreadsheets, keeps the name / Group / Family /
    Subfamily columns, upper-cases every value and removes fully identical
    rows.

    Returns:
        DataFrame indexed by kinase name with columns 'Kinase',
        'Kinase_Group', 'Kinase_Family' and 'Kinase_Subfamily'. A missing
        subfamily is reported as '[No Info.]'.

    NOTE(review): only fully identical rows are dropped, so a name listed in
    both species with different annotations stays duplicated in the index.
    """
    wanted = ['Group', 'Family', 'Subfamily']
    homo = pd.read_excel('../../X2K_Summaries/General_Resources/Kinase.com/Kinome_Hsap_updated.xls').loc[:, ['Name'] + wanted]
    mus = pd.read_excel('../../X2K_Summaries/General_Resources/Kinase.com/Kinome_Mmus.xls').loc[:, ['Gene Name'] + wanted]
    mus = mus.rename(columns={'Gene Name': 'Name'})
    # ignore_index gives unique row labels so the boolean mask below aligns unambiguously.
    both = pd.concat([homo, mus], ignore_index=True)
    # Record missing subfamilies BEFORE stringifying: astype(str) turns NaN into
    # the text 'nan', after which a later fillna() never matches anything.
    noSubfamily = both['Subfamily'].isna()
    # Capitalize everything
    both = both.apply(lambda x: x.astype(str).str.upper())
    both.loc[noSubfamily, 'Subfamily'] = '[No Info.]'
    both = both.drop_duplicates()
    both.columns = ['Kinase', 'Kinase_Group', 'Kinase_Family', 'Kinase_Subfamily']
    both.index = both['Kinase']
    return both
    
def category_colors_dict(category):
    """Assign one 'hls' palette colour to each value of a kinase category.

    Args:
        category: Column of the table returned by
            ``get_kinase_groups_families`` (e.g. 'Kinase_Group').

    Returns:
        Tuple ``(row_colors, colorDict)``: a Series giving each table row its
        colour, and the dict of category value -> colour.
    """
    kinase_table = get_kinase_groups_families()
    labels = kinase_table[category].unique()
    palette = sns.color_palette("hls", len(labels))
    colorDict = {label: colour for label, colour in zip(labels, palette)}
    return kinase_table[category].map(colorDict), colorDict

def plotClustermap(DF, plotTitle='', z_score=None, category='Kinase_Group', saveFig=True):
    """Draw a clustermap of ``DF`` with rows colour-coded by kinase category.

    Args:
        DF: Matrix to cluster.
        plotTitle: Heatmap title; also used as the output file stem.
        z_score: Forwarded to ``sns.clustermap``.
        category: Kinase classification column used for the row colours.
        saveFig: When equal to True, save the figure as
            'Figures/Clustermaps/<plotTitle>_clustermap.png'.
    """
    kinase_colors, palette = category_colors_dict(category)
    grid = sns.clustermap(DF, z_score=z_score, row_colors=kinase_colors, cmap="RdBu")
    grid.ax_heatmap.set_title(plotTitle, pad=130)
    # Move the main colour bar to the left-hand side of the figure.
    grid.cax.set_position([.05, .2, .03, .45])
    # Zero-sized bars act as legend handles, one per category value.
    dendrogram_ax = grid.ax_row_dendrogram
    for group_label, colour in palette.items():
        dendrogram_ax.bar(0, 0, color=colour, label=group_label, linewidth=0)
    category_legend = dendrogram_ax.legend(loc="upper right", ncol=2, bbox_to_anchor=(.35, 1.35), borderaxespad=1)
    category_legend.set_title(category)
    if saveFig == True:
        grid.savefig('Figures/Clustermaps/'+plotTitle+'_clustermap.png')


def iterate_clustermaps(matrixDict, method , z_score=None, category='Kinase_Family', saveFig=False):
    """Plot one clustermap per kinase database.

    Args:
        matrixDict: Mapping of database name -> matrix to plot.
        method: Label placed before the database name in each plot title.
        z_score, category, saveFig: Forwarded to ``plotClustermap``.
    """
    for db_name, matrix in matrixDict.items():
        print(db_name)
        plotClustermap(DF=matrix, plotTitle=method+" - "+db_name, z_score=z_score, category=category, saveFig=saveFig)
        
        
# Plot/save Clustermap
# One clustermap per kinase database, for the direct ranks and for the z-score ranks,
# coloured by 'Kinase_Group' and saved to disk (saveFig=True).
## UP
iterate_clustermaps(x2k_matrixDict_ranks_UP, method="X2K_Pvalue_Ranks_UP", z_score=None, category='Kinase_Group', saveFig=True)
iterate_clustermaps(x2k_matrixDict_zscoreRanks_UP, method="X2K_PvalueZscore_Ranks_UP", z_score=None, category='Kinase_Group', saveFig=True)

#iterate_clustermaps(x2k_matrixDict_ranks_UP, z_score=0, category='Kinase_Group', saveFig=True)
## DN
iterate_clustermaps(x2k_matrixDict_ranks_DN, method="X2K_Pvalue_Ranks_DN", z_score=None, category='Kinase_Group', saveFig=True)
iterate_clustermaps(x2k_matrixDict_zscoreRanks_DN, method="X2K_PvalueZscore_Ranks_DN", z_score=None, category='Kinase_Group', saveFig=True)
#iterate_clustermaps(x2k_matrixDict_ranks_UP, z_score=0, category='Kinase_Group', saveFig=True)

## Identify Red Clusters (Kinases that are always predicted)  

In [None]:
def get_red_cluster(nLog_ranks, method, threshold=50, save=True):
    """Select the rows whose mean rank across all columns is below a threshold.

    Args:
        nLog_ranks: Rank DataFrame (kinases x experiments).
        method: Label used as the output file stem.
        threshold: Rows with a mean rank strictly below this value are kept.
        save: Write the selection to
            'Results/Red_Clusters/<method>_redCluster.csv'.

    Returns:
        The subset of ``nLog_ranks`` that passed the threshold, in the
        original row order.
    """
    # Row (kinase) average rank
    meanRanks = nLog_ranks.mean(axis=1)
    # Subset DF to just the top-ranked kinases
    topKinases_DF = nLog_ranks[meanRanks < threshold]
    if save:
        # Keep the index in the file: it holds the kinase names, and without
        # it the saved rows cannot be attributed to a kinase.
        topKinases_DF.to_csv('Results/Red_Clusters/' + method + "_redCluster.csv", index=True)
    return topKinases_DF


def get_red_cluster_for_each_kinaseDB(matrixDict_ranks, method, threshold=100):
    """Extract the red cluster of every kinase database and report its size.

    Args:
        matrixDict_ranks: Mapping of database name -> rank DataFrame.
        method: Label for the output files. The database name is appended
            ('<method> - <db>') so each database gets its own CSV instead of
            all of them overwriting one file.
        threshold: Mean-rank cut-off forwarded to ``get_red_cluster``.

    Returns:
        Tuple ``(redClusters, percentage_report)``: dict of database name ->
        red-cluster DataFrame, and dict of database name -> percentage of
        that database's rows in its red cluster (0.0 for an empty matrix).
    """
    redClusters = {}
    percentage_report = {}
    for db, DF in matrixDict_ranks.items():
        redClust = get_red_cluster(DF, method + " - " + db, threshold)
        redClusters[db] = redClust
        # Guard against dividing by zero when a database has no rows.
        percentage_report[db] = len(redClust) / len(DF) * 100 if len(DF) else 0.0
    return redClusters, percentage_report
 
def plot_redCluster_report(percentage_report, title, ax, sort=True):
    """Bar plot of the red-cluster percentage of each kinase database.

    Args:
        percentage_report: Mapping of database name -> percentage.
        title: Title of the axes.
        ax: Matplotlib axes to draw on.
        sort: When equal to True, order the bars from highest to lowest.
    """
    x_label = 'Kinase Database'
    y_label = '% of Predicted Kinases in Red Clusters'
    table = pd.Series(percentage_report).reset_index()
    table.columns = [x_label, y_label]
    if sort == True:
        table = table.sort_values(by=y_label, ascending=False)
    bars = sns.barplot(data=table, x=x_label, y=y_label, ax=ax)
    bars.set_title(title)
    


import matplotlib.pyplot as plt
import seaborn as sns
# X2K
# Red clusters with a mean-rank threshold of 25 (third positional argument), for both directions.
X2K_UP_redClusters, perReport_X2K_UP = get_red_cluster_for_each_kinaseDB(x2k_matrixDict_ranks_UP,'X2K_UP_nLogPval_ranks', 25)
X2K_DN_redClusters, perReport_X2K_DN = get_red_cluster_for_each_kinaseDB(x2k_matrixDict_ranks_DN, 'X2K_DN_nLogPval_ranks', 25)

# Two stacked bar plots sharing the x axis: UP on top (sorted), DN below (unsorted).
f, ax = plt.subplots(2, 1, sharex=True)
# NOTE(review): plt.xticks acts on the current axes, not on ax[0] / ax[1] specifically -- confirm both calls are needed.
plt.xticks(rotation=45)
plot_redCluster_report(perReport_X2K_UP, title="X2K_UP", ax=ax[0], sort=True)
plt.xticks(rotation=45)
plot_redCluster_report(perReport_X2K_DN, title="X2K_DN", ax=ax[1], sort=False)
f.suptitle('% of Predicted Kinases that were non-specific')

## KDE Distribution Plots

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def scaled_ranks(DFstack):
    """Return a copy of ``DFstack`` with its 'Rank' column min-max scaled.

    The minimum rank is subtracted and the result divided by the new
    maximum, mapping the ranks onto [0, 1]. The input frame is left
    unchanged.
    """
    rescaled = DFstack.copy()
    shifted = rescaled['Rank'] - rescaled['Rank'].min()
    rescaled['Rank'] = shifted / shifted.max()
    return rescaled

def KDE_subplots(x2k_matrixDict_ranks, scaledRanks=True, saveFig=False, supTitle=''):
    """Plot rank-distribution KDE curves, one subplot per kinase database.

    Each subplot overlays four curves: all kinases, target kinases (the row
    name equals the experiment label's prefix before the first '_'), a
    random sample of the same size ("shuffled"), and the perturbed kinases
    across all experiments.

    Args:
        x2k_matrixDict_ranks: Mapping of database name -> rank DataFrame.
        scaledRanks: Rescale ranks to [0, 1] with ``scaled_ranks`` first.
        saveFig: Save to 'Figures/Rank_Distribution_Plots/<supTitle>.png'.
        supTitle: Figure title; also the output file stem.

    Returns:
        The matplotlib Figure.
    """
    nDBs = len(x2k_matrixDict_ranks)
    # Round the column count up so an odd number of databases still gets one
    # axis each; int(n / 2) ran out of axes and raised IndexError.
    nCols = max(1, (nDBs + 1) // 2)
    f, AX = plt.subplots(2, nCols, sharex='all', sharey='all')
    AX = AX.ravel()
    for i, db in enumerate(x2k_matrixDict_ranks):
        print(db)

        ax = AX[i]
        DF = x2k_matrixDict_ranks[db]
        DFstack = DF.stack().reset_index()
        DFstack.columns = ['Kinase', 'Experiment', 'Rank']
        if scaledRanks:
            DFstack = scaled_ranks(DFstack)

        print("line 0...")
        # Null distribution (all kinase ranks)
        sns.distplot(DFstack['Rank'], label='All Kinases', rug=False, hist=False, norm_hist=True, ax=ax).set_xlim(0, 1)

        print("line 1...")
        # Target Kinases Only
        DFstack_target = DFstack.loc[DFstack['Kinase'] == DFstack['Experiment'].str.split('_').str[0]]
        sns.distplot(DFstack_target['Rank'], label='Target Kinases', rug=False, hist=False, norm_hist=True, ax=ax).set_xlim(0, 1)

        print("line 2...")
        # Shuffled targets: same number of rows, drawn at random from all rows.
        # Columns are assigned one by one so 'Rank' keeps its numeric dtype
        # (a combined .values array of strings and numbers is object-typed).
        sampled = DFstack.sample(n=len(DFstack_target))
        DFstack_shuffled = DFstack_target.copy()
        DFstack_shuffled['Experiment'] = sampled['Experiment'].values
        DFstack_shuffled['Rank'] = sampled['Rank'].values
        sns.distplot(DFstack_shuffled['Rank'], label='Shuffled Kinases', rug=False, hist=False, norm_hist=True, ax=ax,
                     kde_kws={"linestyle": "--"}).set_xlim(0, 1)

        print("line 3...")
        # Perturbed kinases (regardless of experiment)
        perturbedKinases = set(DFstack['Experiment'].str.split('_').str[0])
        # Explicit copy: the in-place division below must not act on a slice of DFstack.
        DFstack_perturbed = DFstack[DFstack['Kinase'].isin(perturbedKinases)].copy()
        # NOTE(review): DFstack has exactly the three columns named above, so this
        # divides by 3 -- confirm that is the intended "additional samples" correction.
        DFstack_perturbed['Rank'] /= len(DFstack.columns)
        sns.distplot(DFstack_perturbed['Rank'], label='Perturbed Kinases (all experiments)', rug=False, hist=False, norm_hist=True, ax=ax)
        ax.legend(bbox_to_anchor=(-1.5, -0.15), loc='upper center', borderaxespad=0., ncol=4)
        ax.set_xlim(0, 1)
        ax.set_title(db)
        ax.set_xlabel('')

        # Keep a single shared legend: only the last database's subplot retains it.
        if i < nDBs - 1:
            ax.legend_.remove()
    # Hide the spare axis left over when the number of databases is odd.
    for spareAx in AX[nDBs:]:
        spareAx.set_visible(False)
    f.suptitle(supTitle)
    if saveFig:
        plt.draw()
        f.savefig('Figures/Rank_Distribution_Plots/' + supTitle + '.png')
    return f
 

# KDE rank-distribution figures for the direct ranks (fig1/fig2) and the z-score ranks (fig3/fig4); all are saved.
fig1 = KDE_subplots(x2k_matrixDict_ranks_UP, scaledRanks=True, saveFig=True, supTitle='X2K_UP_pvalue_ranks - KDE Distributions') 
fig2 = KDE_subplots(x2k_matrixDict_ranks_DN, scaledRanks=True, saveFig=True, supTitle='X2K_DN_pvalue_ranks - KDE Distributions')

fig3 = KDE_subplots(x2k_matrixDict_zscoreRanks_UP, scaledRanks=True, saveFig=True, supTitle='X2K_UP_pvalueZscore_ranks - KDE Distributions') 
fig4 = KDE_subplots(x2k_matrixDict_zscoreRanks_DN, scaledRanks=True, saveFig=True, supTitle='X2K_DN_pvalueZscore_ranks - KDE Distributions')


# Per-kinase mean and standard deviation of the UP ranks for the 'kea 2018' database.
# Bare expressions: shown in an interactive session, discarded when run as a script.
x2k_matrixDict_ranks_UP['kea 2018'].mean(axis=1).sort_values()
x2k_matrixDict_ranks_UP['kea 2018'].std(axis=1).sort_values()

## Empirical Cumulative Distribution Function (ECDF) Plots

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF
import matplotlib.pyplot as plt

def plot_ECDF(DF, db, ax, supTitle, scaledRanks=True):
    """Draw the ECDFs of target-kinase ranks and of a random sample of ranks.

    Target rows are those whose kinase name equals the experiment label's
    prefix before the first '_'. The "shuffled" sample is an equally sized
    random draw from all ranks.

    The two samples are plotted directly. Merging them on 'Kinase' is
    many-to-many when a kinase is perturbed in several experiments, which
    duplicates rows and skews both curves; for one experiment per kinase the
    curves are the same either way.

    Args:
        DF: Rank DataFrame (kinases x experiments).
        db: Database name, used as the axes title.
        ax: Matplotlib axes to draw on.
        supTitle: Title of the figure that owns ``ax``.
        scaledRanks: Rescale ranks to [0, 1] with ``scaled_ranks`` first.
    """
    DFstack = DF.stack().reset_index()
    DFstack.columns = ['Kinase', 'Experiment', 'Rank']
    if scaledRanks:
        DFstack = scaled_ranks(DFstack)
    # Target Kinases Only
    isTarget = DFstack['Kinase'] == DFstack['Experiment'].str.split('_').str[0]
    targetRanks = DFstack.loc[isTarget, 'Rank']
    # Shuffled targets
    shuffledRanks = DFstack['Rank'].sample(n=len(targetRanks))
    # Plot
    curves = [('Rank_target', targetRanks, 'lime'), ('Rank_shuffled', shuffledRanks, 'red')]
    for label, sample, color in curves:
        ecdf = ECDF(sample)
        x = np.linspace(min(sample), max(sample))
        ax.step(x, ecdf(x), label=label, color=color)
    ax.set_title(db)
    # Legend on this axes; plt.legend would target whichever axes is current.
    ax.legend(loc='lower right')
    ax.set_facecolor('whitesmoke')
    ax.figure.suptitle(supTitle)

def repeat_plot_ECDF(x2k_matrixDict_ranks, supTitle):
    """Draw one ECDF subplot per kinase database on a shared two-row grid.

    Args:
        x2k_matrixDict_ranks: Mapping of database name -> rank DataFrame.
        supTitle: Figure title.

    Returns:
        The matplotlib Figure.
    """
    nDBs = len(x2k_matrixDict_ranks)
    # Round the column count up so an odd number of databases still fits;
    # int(n / 2) left the last database without an axis (IndexError).
    f, AX = plt.subplots(2, max(1, (nDBs + 1) // 2), sharex='all', sharey='all')
    AX = AX.ravel()
    for ax, (db, DF) in zip(AX, x2k_matrixDict_ranks.items()):
        print(db)
        plot_ECDF(DF, db, ax, supTitle)
    # Hide the spare axis left over when the number of databases is odd.
    for spareAx in AX[nDBs:]:
        spareAx.set_visible(False)
    return f

# ECDF figures (target vs. shuffled ranks) for the direct ranks and the z-score ranks, both directions.
repeat_plot_ECDF(x2k_matrixDict_ranks_UP, 'X2K_pvalueRanks_UP')
repeat_plot_ECDF(x2k_matrixDict_ranks_DN, 'X2K_pvalueRanks_DN')
repeat_plot_ECDF(x2k_matrixDict_zscoreRanks_UP, 'X2K_pvalueZscoreRanks_UP')
repeat_plot_ECDF(x2k_matrixDict_zscoreRanks_DN, 'X2K_pvalueZscoreRanks_DN')

### Plot difference between ECDF Lines

In [None]:
def plot_ECDF_difference(x2k_matrixDict_ranks, supTitle, scaledRanks=True):
    """Plot, per database, the ECDF of (target rank - shuffled rank).

    For each database the target rows (kinase name equals the experiment
    label's prefix before the first '_') are paired one-to-one with an
    equally sized random draw of ranks, and the ECDF of the differences is
    drawn as one line in a new figure.

    Pairing is row-wise. Merging on 'Kinase' is many-to-many when a kinase
    is perturbed in several experiments and would add cross-pairs; for one
    experiment per kinase the pairs are the same either way.

    Args:
        x2k_matrixDict_ranks: Mapping of database name -> rank DataFrame.
        supTitle: Plot title.
        scaledRanks: Rescale ranks to [0, 1] with ``scaled_ranks`` first.
    """
    plt.figure()
    for db in x2k_matrixDict_ranks:
        DF = x2k_matrixDict_ranks[db]
        DFstack = DF.stack().reset_index()
        DFstack.columns = ['Kinase', 'Experiment', 'Rank']
        if scaledRanks:
            DFstack = scaled_ranks(DFstack)
        # Target Kinases Only
        isTarget = DFstack['Kinase'] == DFstack['Experiment'].str.split('_').str[0]
        targetRanks = DFstack.loc[isTarget, 'Rank']
        # Shuffled targets
        shuffledRanks = DFstack['Rank'].sample(n=len(targetRanks))
        # .values drops the index so the subtraction pairs rows by position.
        sample = targetRanks.values - shuffledRanks.values
        # Plot for each db
        ecdf = ECDF(sample)
        x = np.linspace(min(sample), max(sample))
        plt.step(x, ecdf(x), label=db)
    # Labelling is the same for every database, so it is done once.
    plt.xlabel('Target vs. Shuffled Rank Difference')
    if x2k_matrixDict_ranks:
        plt.legend(loc='lower right')
    plt.title(supTitle)

# One rank-difference ECDF figure per ranking method and direction; each figure has one line per kinase database.
plot_ECDF_difference(x2k_matrixDict_ranks_UP, 'X2K_pvalueRanks_UP')
plot_ECDF_difference(x2k_matrixDict_ranks_DN, 'X2K_pvalueRanks_DN')
plot_ECDF_difference(x2k_matrixDict_zscoreRanks_UP, 'X2K_pvalueZscoreRanks_UP')
plot_ECDF_difference(x2k_matrixDict_zscoreRanks_DN, 'X2K_pvalueZscoreRanks_DN')