# X2K API Tutorial Notebook
April 9<sup>th</sup>, 2018

This Jupyter Notebook contains an interactive tutorial for **running the Expression2Kinases (X2K) API** using Python 3.

### Table of Contents
The notebook contains the following sections:
1. **<a href="#1">API Documentation</a>** - shows how to programmatically analyze your gene list in Python.
2. **<a href="#2">Using the X2K API</a>** - overview of the input parameters and output of the API.
3. **<a href="#3">Interpreting the results</a>** - gives an overview of the structure and meaning of the analysis results.
    * **<a href="#chea">Transcription Factor Enrichment Analysis</a>** (ChEA)
    * **<a href="#g2n">Protein-Protein Interaction Expansion</a>** (G2N)
    * **<a href="#kea">Kinase Enrichment Analysis</a>** (KEA)
    * **<a href="#x2k">Expression2Kinases</a>** (X2K)

## 1. <span id="1">API Documentation</span>
The X2K API allows for programmatic analysis of an input gene list.

The `run_X2K()` function displayed below can be used to analyze a gene list and load the results in a Python dictionary by performing a **POST request**.

The function requires only one input, `input_genes`, **a list of gene symbols** to be analyzed. Additional optional parameters can be specified with the `options` parameter.

In [2]:
% cd Kinase_Enrichment_Comparisons

import pandas as pd

# Standardize genes to HGNC symbols
# Lookup table used by standardizeGeneSymbol: column 0 is matched against the
# gene name and column 1 is returned. Read without a header row, so the
# columns are labeled 0, 1, ...
mapping = pd.read_table('../../X2K_Summaries/General_Resources/Moshe_mapping/mappingFile_2017.txt', header=None)
# Greek-letter table; the file's own header row (header=0) is replaced by the
# column names 'Greek' and 'Abbrev'.
greekLetters = pd.read_csv('../../X2K_Summaries/General_Resources/GreekLetter_Converter.csv', names=['Greek', 'Abbrev'], header=0 )
# Trim non-breaking spaces (U+00A0) from both ends of every cell.
greekLetters = greekLetters.apply(lambda x: x.str.strip('\xa0'))

def standardizeGeneSymbol(gene):
    """Return the standardized (HGNC) symbol for a gene/kinase name.

    The name is normalized in two steps:

    1. Names containing 'AURORA' become 'AURK' plus the last character of
       the name. Otherwise, if the name contains an entry of
       ``greekLetters['Greek']``, the upper-cased spelling of each matching
       entry is replaced by its ``'Abbrev'`` value (when several entries
       match, the replacement made for the last one wins).
    2. The result is looked up in column 0 of the module-level ``mapping``
       table; on a hit, the value from column 1 of the first matching row
       is returned instead.

    Args:
        gene: Gene name as a string.

    Returns:
        The standardized symbol as a string, or the (possibly rewritten)
        input name when ``mapping`` has no entry for it.
    """
    if 'AURORA' in gene:
        HGNC = 'AURK' + gene[-1]
    elif any(substring in gene for substring in greekLetters['Greek']):
        # Start from the unchanged name so HGNC is always bound, even when
        # no upper-cased spelling below is found in the name.
        HGNC = gene
        for letter in greekLetters['Greek']:
            LETTER = letter.upper()
            if LETTER in gene:
                abbrev = greekLetters.loc[greekLetters['Greek'] == letter, 'Abbrev'].values[0]
                HGNC = gene.replace(LETTER, abbrev)
    else:
        HGNC = gene
    # Compare against the *values* of column 0. `x in series` would test the
    # index labels instead, and `.iloc` does not accept a boolean Series.
    matches = mapping.loc[mapping[0] == HGNC, 1]
    if not matches.empty:
        # Return a plain scalar rather than a one-element Series.
        HGNC = matches.iloc[0]
    return HGNC

# Get list of all kinases in KEA2018
import pandas as pd
# The file is read without a header row, so columns are labeled 0, 1, 2, ...
KEA2018 = pd.read_csv('../../X2k_Databases/KINASE/KEA_2018/KEA2018_KINASES.csv', header=None)#pd.read_csv("KEA/UberKeaFile.csv")
# Preview only; the result is discarded because this is not the cell's last expression.
KEA2018.head()
# Unique values of the third column (position 2), in order of first appearance.
# Presumably this column holds the kinase names; verify against the CSV layout.
allKinases = KEA2018.iloc[:,2].unique().tolist()

/Users/schilder/Desktop/X2K_Web/Kinase_Enrichment_Comparisons


## Optimal X2K Parameters

In [3]:
# Parameter set passed to the X2K runs below.
# NOTE(review): run_X2K only forwards keys that already exist in its own
# `default_options`. Several keys used here ('TF-target gene background
# database used for enrichment', 'min_network_size', 'included organisms in
# the background database' and the derived 'enable_<ppi>' flags) do not appear
# there, so they look like they are silently dropped. Confirm against the API.
best_options = {
    'TF-target gene background database used for enrichment': [
        'ChEA & ENCODE Consensus',
    ],
    'kinase interactions to include': 'kea 2018',
    # Expanded by prepare_options_for_x2k into one 'enable_<name>' flag each.
    'enable_ppi': ['ppid', 'Stelzl', 'IntAct', 'MINT', 'BioGRID'],
    'max_number_of_interactions_per_article': 1000000,
    'max_number_of_interactions_per_protein': 200,
    'min_network_size': 10,
    'min_number_of_articles_supporting_interaction': 0,
    'path_length': 2,
    'included organisms in the background database': 'both',
}

## Run X2K Function

In [4]:
import http.client
import json

def run_X2K(input_genes, options=None):
    """Submit a gene list to the X2K API and return the parsed results.

    A multipart/form-data POST request is sent to ``/X2K/api`` on
    ``localhost:8080`` and the JSON response is decoded.

    Args:
        input_genes: List of gene symbols; they are sent newline-joined as
            the ``'text-genes'`` field.
        options: Optional dict of overrides for the default request fields.
            Only keys that already exist in the defaults are applied
            (``'text-genes'`` is never overridden); any other key is
            ignored.

    Returns:
        dict with the decoded response. ``'ChEA'``, ``'G2N'``, ``'KEA'``
        and ``'X2K'`` are reduced to their ``'tfs'``, ``'network'``,
        ``'kinases'`` and ``'network'`` entries respectively; ``'input'``
        is returned as received.
    """
    # Avoid a shared mutable default argument.
    options = {} if options is None else options
    # Get default options
    default_options = {'text-genes': '\n'.join(input_genes), 'included_organisms': 'both', 'included_database': 'ChEA 2015',
                       'path_length': 2, 'minimum network size': 50, 'min_number_of_articles_supporting_interaction': 2,
                       'max_number_of_interactions_per_protein': 200, 'max_number_of_interactions_per_article': 100,
                       'biocarta': True, 'biogrid': True, 'dip': True, 'innatedb': True, 'intact': True, 'kegg': True, 'mint': True,
                       'ppid': True, 'snavi': True, 'number_of_results': 50, 'sort_tfs_by': 'combined score', 'sort_kinases_by': 'combined score',
                       'kinase interactions to include': 'kea 2018'}
    # Update options (unknown keys are ignored)
    for key, value in options.items():
        if key in default_options and key != 'text-genes':
            default_options[key] = value
    # Get payload: one form-data part per option, then the closing boundary
    boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
    part_template = '--' + boundary + '\r\nContent-Disposition: form-data; name="{}"\r\n\r\n{}\r\n'
    payload = ''.join(part_template.format(key, value) for key, value in default_options.items()) + '--' + boundary + '--'
    # Get Headers
    headers = {
        'content-type': "multipart/form-data; boundary="+boundary,
        'cache-control': "no-cache",
    }
    # Open HTTP connection (an earlier version used the host "amp.pharm.mssm.edu")
    conn = http.client.HTTPConnection("localhost:8080", timeout=20)
    try:
        # Send request and read the whole response
        conn.request("POST", "/X2K/api", payload, headers)
        res = conn.getresponse()
        data = res.read().decode('utf-8')
    finally:
        # Always release the socket, including when the request fails.
        conn.close()
    # Convert to dictionary: every value except 'input' is itself a JSON string
    x2k_results = {key: json.loads(value) if key != 'input' else value for key, value in json.loads(data).items()}
    # Clean results
    x2k_results['ChEA'] = x2k_results['ChEA']['tfs']
    x2k_results['G2N'] = x2k_results['G2N']['network']
    x2k_results['KEA'] = x2k_results['KEA']['kinases']
    x2k_results['X2K'] = x2k_results['X2K']['network']
    # Return results
    return x2k_results


## Run X2K Iteratively over GMT

In [5]:
def parse_GEO_line(line):
    """Parse one tab-separated GMT line into (experiment name, gene list).

    The first field is the experiment name, the second field is skipped,
    and every remaining non-empty field is a gene entry. An entry may carry
    a comma-separated suffix (e.g. ``'STAT1,1.0'``); only the part before
    the first comma is kept.

    Args:
        line: One line of the GMT file, with or without a trailing tab
            and/or newline.

    Returns:
        Tuple ``(expt_name, genes)`` where ``genes`` is a list of strings.
    """
    fields = line.rstrip('\r\n').split('\t')
    expt_name = fields[0]
    # str.strip(',1.0') removes any of the characters ',', '1', '.', '0' from
    # both ends and so mangles symbols such as STAT1 or CDK10. Cut at the
    # comma instead.
    symbols = (field.partition(',')[0].strip() for field in fields[2:])
    # Dropping empty fields handles a trailing tab without discarding the
    # last gene of a line that has none.
    genes = [symbol for symbol in symbols if symbol]
    return expt_name, genes

def prepare_options_for_x2k(input_genes, x2k_options):
    """Flatten an options dict into the string-valued form sent to X2K.

    Works on a shallow copy, so ``x2k_options`` itself is not modified.

    Args:
        input_genes: List of gene symbols, stored under ``'text-genes'``.
        x2k_options: Options dict. An optional ``'enable_ppi'`` list is
            expanded into one ``'enable_<name>': 'true'`` entry per element
            and then removed.

    Returns:
        A new dict in which list values are newline-joined and every other
        value is converted with ``str()``.
    """
    options = x2k_options.copy()
    # Add input_genes
    options['text-genes'] = input_genes
    # Convert ppi into enable flags; tolerate option sets without 'enable_ppi'
    for ppi in options.pop('enable_ppi', []):
        options['enable_' + ppi] = 'true'
    # Convert any lists
    return {
        k: '\n'.join(v) if isinstance(v, list) else str(v)
        for k, v in options.items()
    }

def run_x2k_over_experiments(experiments_GMT, x2k_options, binaryString='NA'):
    """Run X2K once per line of a GMT file and collect the results.

    Args:
        experiments_GMT: Path to the GMT file (one experiment per line).
        x2k_options: Options dict, converted per experiment with
            prepare_options_for_x2k.
        binaryString: Value stored unchanged under ``'binaryString'`` in
            every result.

    Returns:
        dict mapping the line index of each successfully processed
        experiment to its run_X2K result, extended with the keys
        ``'Experiment'``, ``'x2k_options'`` and ``'binaryString'``.
        Experiments whose request failed are skipped; their names are
        printed at the end.
    """
    all_x2k_results = {}
    errors = []
    with open(experiments_GMT) as gmt:
        GMT = gmt.readlines()
    for i, line in enumerate(GMT):
        # Blank lines carry no experiment
        if not line.strip():
            continue
        # Get experiment name and input genes
        expt_name, input_genes = parse_GEO_line(line)
        # Prepare options
        options = prepare_options_for_x2k(input_genes, x2k_options)
        # Run x2k API; a failure of one experiment must not abort the batch.
        # `except Exception` (not a bare except) lets Ctrl-C still interrupt.
        try:
            x2k_results = run_X2K(input_genes=input_genes, options=options)
        except Exception as err:
            print("couldn't process " + expt_name + ": skipping (" + repr(err) + ")")
            errors.append(expt_name)
            continue
        # Modify results
        x2k_results['Experiment'] = expt_name
        x2k_results['x2k_options'] = options
        x2k_results['binaryString'] = binaryString
        all_x2k_results[i] = x2k_results
    print("ERRORS: ")
    print(errors)
    print()
    return all_x2k_results

 # all_x2k_results, errors = run_x2k_over_experiments(experiments_GMT='GEO.txt', x2k_options=best_options)

## Run X2K Web with each database

In [6]:
# Kinase libraries to compare; run_X2K_over_kinase_dbs sets each one in turn
# as the 'kinase interactions to include' option.
# NOTE(review): 'PhosphoPlus' here vs 'PhosphositePlus' in kinase_dbs_KEA.
# Confirm which spelling the X2K API expects.
kinase_dbs = [
    'kea 2018',
    'ARCHS4',
    'iPTMnet',
    'NetworkIN',
    'Phospho.ELM',
    'Phosphopoint',
    'PhosphoPlus',
    'MINT',
]

def run_X2K_over_kinase_dbs(kinase_dbs, best_options, experiments_GMT):
    """Run the whole GMT file through X2K once per kinase database.

    Args:
        kinase_dbs: Names to use, one at a time, as the
            ``'kinase interactions to include'`` option.
        best_options: Base options dict; a shallow copy is modified, the
            caller's dict is left untouched.
        experiments_GMT: Path to the GMT file passed on to
            run_x2k_over_experiments.

    Returns:
        dict mapping each database name to the result dict returned by
        run_x2k_over_experiments for that database.
    """
    x2k_kinase_db_results = {}
    selected_options = best_options.copy()
    for db in kinase_dbs:
        print('Processing: X2K with '+db)
        selected_options["kinase interactions to include"] = db
        db_results = run_x2k_over_experiments(experiments_GMT, x2k_options=selected_options)
        x2k_kinase_db_results[db] = db_results
        # Echo the options actually used, taken from the first successful
        # experiment. Index 0 is absent when that experiment failed or
        # when nothing succeeded at all.
        if db_results:
            first_result = next(iter(db_results.values()))
            print(first_result['x2k_options'])
    return x2k_kinase_db_results

## Save X2K Results

In [None]:
import pickle
## UP
x2k_kinase_db_results_UP = run_X2K_over_kinase_dbs(kinase_dbs, best_options, experiments_GMT='Kinase_Perturbations_from_GEO_up.txt')
# `with` closes (and flushes) the pickle file even if dumping fails
with open("x2kResults_eachKinaseDB_UP.pkl", "wb") as pkl_file:
    pickle.dump(x2k_kinase_db_results_UP, pkl_file)
## DN
x2k_kinase_db_results_DN = run_X2K_over_kinase_dbs(kinase_dbs, best_options, experiments_GMT='Kinase_Perturbations_from_GEO_down.txt')
with open("x2kResults_eachKinaseDB_DN.pkl", "wb") as pkl_file:
    pickle.dump(x2k_kinase_db_results_DN, pkl_file)

# KEA

## Run KEA (X2K_Web version)

In [None]:
% cd Kinase_Enrichment_Comparisons

import pandas as pd
import os 
from time import sleep

def create_geneList_file(geneList):
    """Write the genes to 'KEA/geneList.txt', one per line.

    Any existing file at that path is overwritten.
    """
    with open('KEA/geneList.txt', 'w') as handle:
        handle.writelines(gene + '\n' for gene in geneList)

def return_kea_results():
    """Load 'KEA/KEA_output.csv' and return a {kinase: p-value} dict.

    The file is read without a header row and its ten columns are named
    here; only the 'Kinase' and 'pvalue' columns are used for the result.
    """
    column_names = [
        'Kinase',
        'number of substrates in the input gene-list',
        'number of genes that are substrates of the kinase',
        'fraction of genes that are substrates compared to total number of genes in gene-list',
        'fraction of genes that are substrates compared to total number of genes in background',
        'difference between the background fraction and the substrate-list fraction',
        'pvalue',
        'ztest_rank',
        'combined_score',
        'substrates',
    ]
    kea_table = pd.read_csv('KEA/KEA_output.csv', header=None, index_col=False)
    kea_table.columns = column_names
    return dict(zip(kea_table['Kinase'], kea_table['pvalue']))

 
def _remove_if_exists(path):
    """Delete `path`, printing a notice when there is nothing to delete."""
    try:
        os.remove(path)
    except FileNotFoundError:
        print("No files to delete")


def run_KEA_old(experiments_GMT, KEA_summary_path, kinaseDatabase,
                java_bin='/Library/Java/JavaVirtualMachines/1.6.0.jdk/Contents/Home/bin/java'):
    """Run the command-line KEA jar once per experiment of a GMT file.

    For every line of the GMT file the genes are written to
    'KEA/geneList.txt', the KEA jar is run against
    'KEA/resources/<kinaseDatabase>_KINASES.csv', and the resulting
    'KEA/KEA_output.csv' is read back with return_kea_results().

    Args:
        experiments_GMT: Path to the tab-separated GMT file.
        KEA_summary_path: Path of the tab-separated summary table to write.
        kinaseDatabase: Name used to build the kinase library file name.
        java_bin: Java executable used to run the jar (defaults to the
            previously hard-coded path).

    Returns:
        dict mapping each experiment name to the dict returned by
        return_kea_results(). Experiments for which KEA could not be run
        or produced no output file are skipped with a printed message.
        If the GMT file cannot be read, an empty summary is written and
        an empty dict is returned.
    """
    import subprocess  # local import: only needed here

    finalDict = {}
    try:
        with open(experiments_GMT) as file:
            input_GMT = file.readlines()
    except (OSError, UnicodeError):
        pd.DataFrame(finalDict).to_csv(KEA_summary_path, sep='\t', header=True, index=None, na_rep='NA')
        return finalDict
    for line in input_GMT:
        # Blank lines carry no experiment
        if not line.strip():
            continue
        # Delete old files so stale output can never be mistaken for this
        # run's result. Removal is synchronous, so no polling is needed.
        _remove_if_exists('KEA/KEA_output.csv')
        _remove_if_exists('KEA/geneList.txt')

        # Create gene list
        lineSp = line.rstrip('\r\n').split('\t')
        expt = lineSp[0]
        # Keep the part before the comma of each entry ('STAT1,1.0' ->
        # 'STAT1'). str.strip(',1.0') would also eat trailing 1/0 characters
        # of the symbol itself.
        candidates = (field.partition(',')[0].strip() for field in lineSp[2:])
        genes = [g for g in candidates if g]
        print("Processing: "+expt)
        # Create gene list txt file
        print(expt+': Creating genList file')
        create_geneList_file(genes)
        # Run KEA command line. An argument list (no shell) avoids quoting
        # problems, and the call returns only after KEA has finished.
        command = [java_bin, '-jar', 'KEA/KEA-1.5-SNAPSHOT-jar-with-dependencies.jar',
                   'KEA/resources/' + kinaseDatabase + '_KINASES.csv',
                   'KEA/geneList.txt', 'KEA/KEA_output.csv']
        try:
            status = subprocess.run(command).returncode
        except OSError as err:
            print(expt + ' : could not start KEA (' + repr(err) + '): skipping')
            continue
        # Waiting in a loop for the output file would hang forever when KEA
        # fails; report and move on instead.
        if not os.path.exists('KEA/KEA_output.csv'):
            print(expt + ' : KEA produced no output (exit status ' + str(status) + '): skipping')
            continue
        # Read in KEA output and process
        print(expt+' : Adding to results dict')
        finalDict[expt] = return_kea_results()

    # NOTE(review): index=None omits the row labels (the kinase names) from
    # the summary file. Confirm that this is intended.
    pd.DataFrame(finalDict).to_csv(KEA_summary_path, sep='\t', header=True, index=None, na_rep='NA')
    return finalDict

## Run KEA with each database

In [None]:
# Kinase libraries for the command-line KEA runs; run_KEA_old turns each name
# into the library file 'KEA/resources/<name>_KINASES.csv'.
kinase_dbs_KEA = [
    'KEA_2018',
    'ARCHS4',
    'iPTMnet',
    'NetworkIN',
    'Phospho.ELM',
    'Phosphopoint',
    'PhosphositePlus',
    'MINT',
]

def run_KEA_over_kinase_dbs(kinase_dbs_KEA, experiments_GMT, summary_path_template='KEAsummary_{db}.txt'):
    """Run KEA over the GMT file once per kinase database.

    Args:
        kinase_dbs_KEA: Database names passed to run_KEA_old as
            ``kinaseDatabase``.
        experiments_GMT: Path to the GMT file.
        summary_path_template: Template for the per-database summary file;
            ``{db}`` is replaced by the database name. A separate file per
            database keeps one run from overwriting the summary of the
            previous one. Pass ``'KEAsummary.txt'`` to write every summary
            to that single file.

    Returns:
        dict mapping each database name to the dict returned by
        run_KEA_old for it.
    """
    KEA_kinase_db_results = {}
    for db in kinase_dbs_KEA:
        print(db)
        KEA_kinase_db_results[db] = run_KEA_old(
            experiments_GMT,
            KEA_summary_path=summary_path_template.format(db=db),
            kinaseDatabase=db,
        )
    return KEA_kinase_db_results


import pickle
## UP
KEA_kinase_db_results_UP = run_KEA_over_kinase_dbs(kinase_dbs_KEA, experiments_GMT='Kinase_Perturbations_from_GEO_up.txt')
# `with` closes (and flushes) the pickle file even if dumping fails
with open("KEAresults_eachKinaseDB_UP.pkl", "wb") as pkl_file:
    pickle.dump(KEA_kinase_db_results_UP, pkl_file)
## DN
KEA_kinase_db_results_DN = run_KEA_over_kinase_dbs(kinase_dbs_KEA, experiments_GMT='Kinase_Perturbations_from_GEO_down.txt')
# Save the KEA "down" results (the X2K results were being pickled here by mistake)
with open("KEAresults_eachKinaseDB_DN.pkl", "wb") as pkl_file:
    pickle.dump(KEA_kinase_db_results_DN, pkl_file)