## Plot importances

Plot the importances of the features used in classification

### Files Needed:
1. Position features table as a tab-separated `.csv` file
2. CV splits dictionary as a pickle file (`domain_10_splits_dict.pik`)

### Instructions:
Run cells in order

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import sys
import os

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier

# CV Splitting
import random as random
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.cross_validation import KFold
from sklearn.preprocessing import scale

# Evaluation
from sklearn.metrics import roc_curve, auc, precision_recall_curve,average_precision_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest

# Plotting
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler

#Import utils functions
from utils import create_groups, test_model_iterative_fixed
curr_dir = !pwd
sys.path.append(curr_dir[0]+"/utils")
from neg_pos_funcs import create_negatives_datasets, create_positives_datasets
from CV_funcs import calc_CV_idx_iterative

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")



### Reading the input dataset

In [2]:
# Load the position-features table and the CV splits, and define the
# classifiers and negative-set flags used by the cells below.

# IPython shell capture: curr_dir[0] is the notebook's working directory.
curr_dir = !pwd
input_path = curr_dir[0]+"/../../10.Prediction/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_01.25.18.csv"
# NOTE(review): not referenced in the visible cells -- confirm whether it is still needed.
bind_scores_num = 10
# Ligand names; each one has a "<ligand>_binding_score" column in the table.
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]

# The file is tab-separated despite its .csv extension; the first column is the index.
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
# Feature column names, without the labels (the binding scores)
features_cols = features_all.columns.tolist()
# Remove the per-ligand binding scores, the max binding score and the domain name.
# list.remove raises ValueError if an expected column is missing from the table.
for ligand in ligands:
    score_str = ligand+"_binding_score"
    features_cols.remove(score_str)
features_cols.remove("max_binding_score")
features_cols.remove("domain_name")

print "all samples positions #: "+str(features_all.shape[0])

# CV splits dictionary
# NOTE(review): pickle.load can execute arbitrary code; only load a splits file from a trusted source.
with open(curr_dir[0]+"/../CV_splits/domain_10_splits_dict.pik", 'rb') as handle:
    splits_dict = pickle.load(handle)

# Models to evaluate, keyed by the short name that plot_feature_importances
# uses to pick the importance attribute and the XGB class weighting.
classifiers = {}
classifiers["Logistic"] = LogisticRegression(C=0.001, random_state=0)
classifiers["RF"] = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)
#classifiers["KNN"] = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
#classifiers["SVM"] = SVC(kernel='rbf', probability=True, random_state=0)
#classifiers["ADA"] = AdaBoostClassifier(n_estimators=1000, random_state=0)
classifiers["XGB"] = XGBClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5)

# Flags passed to create_negatives_datasets in the next cell; their exact
# semantics are defined in neg_pos_funcs (not visible here).
ABSOLUTE_NEGATIVES = False
FILTER_DOMAIN = False
FILTER_MAX_SCORE_ZERO = False

all samples positions #: 38944


### Datasets of negative examples

In [3]:
# Build the negative (non-binding) example sets from the full features table.
# The result is indexed by ligand name further down (ligands_negatives_df[ligand]);
# the helper prints the per-ligand counts shown in the output below.
ligands_negatives_df = create_negatives_datasets(FILTER_DOMAIN, ABSOLUTE_NEGATIVES, FILTER_MAX_SCORE_ZERO, features_all, features_cols)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


### Datasets of positive examples by ligand

In [4]:
# Binding-score threshold handed to create_positives_datasets.
# NOTE(review): presumably positions scoring at or above it count as positives -- confirm in neg_pos_funcs.
bind_th = 0.1
# Positive example sets, indexed by ligand name further down (ligands_features_df[ligand]).
ligands_features_df = create_positives_datasets(bind_th, features_all, features_cols)

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


### Plot average feature importance for each group for each ligand

In [5]:
# Map each feature-group name to the members of that group. The values are used
# below to index the flat importance vector, so they are presumably feature
# column positions -- confirm against utils.create_groups.
features_groups = create_groups(ligands_features_df)

------- population --------

avg_maf_all
avg_maf_altered
maf_hist_0-0.001
maf_hist_0.001-0.005
maf_hist_0.005-0.01
maf_hist_0.01-0.02
maf_hist_0.02-0.04
maf_hist_0.04-0.06
maf_hist_0.06-0.08
maf_hist_0.08-0.1
maf_hist_0.1-0.2
maf_hist_0.2-0.5
alter_num_aa
alter_num_aa_norm
alter_num_snp
alter_num_snp_norm
avg_aa_polymorphisms
frac_poly_aa
rare_poly_0.5
rare_poly_0.05
rare_poly_0.005

------- dna-con --------

phastCons1_avg
phastCons2_avg
phastCons3_avg
phyloP1_avg
phyloP2_avg
phyloP3_avg
phastCons1_hist_0.0-0.25
phastCons1_hist_0.25-0.5
phastCons1_hist_0.5-0.75
phastCons1_hist_0.75-0.8
phastCons1_hist_0.8-0.85
phastCons1_hist_0.85-0.9
phastCons1_hist_0.9-0.95
phastCons1_hist_0.95-1.0
phastCons2_hist_0.0-0.25
phastCons2_hist_0.25-0.5
phastCons2_hist_0.5-0.75
phastCons2_hist_0.75-0.8
phastCons2_hist_0.8-0.85
phastCons2_hist_0.85-0.9
phastCons2_hist_0.9-0.95
phastCons2_hist_0.95-1.0
phastCons3_hist_0.0-0.25
phastCons3_hist_0.25-0.5
phastCons3_hist_0.5-0.75
phastCons3_hist_0.75-0.8
phastC

In [6]:
def reject_outliers(data, m=2):
    """Return the values of ``data`` lying within ``m`` standard deviations of the mean.

    Args:
        data: One-dimensional numeric data. Lists and tuples are converted to
            a numpy array; arrays and pandas objects are filtered as they are.
        m: Width of the accepted band, in standard deviations (strict bound).

    Returns:
        The kept values, in their original order. Empty input and input
        whose values are all identical are returned unfiltered.
    """
    # Plain sequences do not support boolean-mask indexing.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)

    # Nothing to filter, and mean/std of an empty array only emit warnings.
    if np.size(data) == 0:
        return data

    spread = np.std(data)
    # With zero spread (single value or constant data) the strict comparison
    # below would discard every value, although none of them is an outlier.
    if spread == 0:
        return data

    return data[abs(data - np.mean(data)) < m * spread]

In [7]:
# Build a forest and compute the feature importances
import heapq
def plot_feature_importances(classifier, groups_dict, ligand, positives, negatives):
    """Fit one classifier on the first CV fold and plot its importance per feature group.

    The model stored under ``classifier`` in the notebook-level ``classifiers``
    dict is fitted on the training part of the first fold returned by
    ``calc_CV_idx_iterative``. The magnitudes of its per-feature importances
    (``coef_`` for "Logistic", ``feature_importances_`` otherwise) are summed
    within each feature group and shown as a plotly scatter plot.

    Args:
        classifier: Key into the global ``classifiers`` dict.
        groups_dict: Maps a group name to an index into the flat importance
            vector (presumably a list of feature column positions -- confirm
            against create_groups).
        ligand: Ligand name; only used in the plot title and error messages.
        positives: DataFrame of positive examples.
        negatives: DataFrame of negative examples with the same columns.

    Returns:
        numpy array with one non-negative summed importance per group,
        ordered by sorted group name.

    Raises:
        ValueError: If no CV folds are available, or if "XGB" is requested
            without any positive example.
    """
    group_names = sorted(groups_dict.keys())
    model = classifiers[classifier]

    # The linear model exposes coefficients; the other models expose importances.
    if classifier == "Logistic":
        importance_attr = "coef_"
    else:
        importance_attr = "feature_importances_"

    if classifier == "XGB":
        n_positives = positives.shape[0]
        if n_positives == 0:
            raise ValueError("No positive examples for ligand: " + ligand)
        # XGBoost documents the typical value as sum(negatives) / sum(positives),
        # which up-weights the rare positive class. The inverse ratio would
        # down-weight it instead.
        model.set_params(scale_pos_weight=negatives.shape[0] / float(n_positives))

    models_req_scaling = ["SVM", "KNN"]

    # Stack positives first, then negatives; the labels below rely on this order.
    X = pd.concat([positives, negatives])
    if classifier in models_req_scaling:
        # z-score the features while keeping the original index and column names
        X = pd.DataFrame(scale(X), index=X.index, columns=X.columns)

    labels = np.concatenate([
        np.ones(positives.shape[0], dtype=int),
        np.zeros(negatives.shape[0], dtype=int),
    ])
    y = pd.Series(labels, index=X.index, name="label")

    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    if len(cv_idx) == 0:
        raise ValueError("No CV folds available for ligand: " + ligand)

    # Importances are taken from the first fold only; no downsampling is applied.
    train_index = cv_idx[0]["train"]
    model.fit(X.loc[train_index, :], y.loc[train_index])

    # Use magnitudes so that logistic coefficients of opposite sign inside one
    # group do not cancel out. Tree importances are already non-negative.
    importances = np.abs(np.array(getattr(model, importance_attr)).flatten())

    group_sums = []
    n_groups = []
    for name in group_names:
        vals = importances[groups_dict[name]]
        # Sum rather than average: feature_importances_ is already normalized to sum to 1.
        group_sums.append(np.sum(vals))
        n_groups.append(len(vals))
    group_sums = np.array(group_sums)

    # Format with plotly
    trace = go.Scatter(
        y = group_sums,
        x = group_names,
        mode='markers',
        marker=dict(
            sizemode = 'diameter',
            sizeref = 1,
            size = 25,
            color = n_groups,
            colorscale = 'Portland',
            # A plain dict instead of the deprecated go.ColorBar class
            colorbar = dict(
                title = 'Number of Features'
            ),
            showscale = True
        ),
        text = group_names
    )
    data = [trace]

    layout= go.Layout(
        autosize = True,
        title = classifier + ": "+ ligand,
        hovermode = 'closest',
        yaxis = dict(
            title = 'Feature Importance',
            ticklen = 5,
            gridwidth = 2
        ),
        showlegend = False
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)
    return group_sums

In [None]:
# For every ligand: plot each classifier's per-group importances, then plot the
# average over the classifiers other than "Logistic".
means = []

# Group names and sizes do not depend on the ligand, so compute them once.
group_names = sorted(features_groups.keys())
n_groups = [len(features_groups[name]) for name in group_names]

for ligand in ligands:
    ligand_sum = np.zeros(len(group_names))
    n_averaged = 0
    for classifier in classifiers:
        # Every classifier gets its own plot.
        group_importances = plot_feature_importances(
            classifier, features_groups, ligand,
            ligands_features_df[ligand], ligands_negatives_df[ligand])
        # "Logistic" is plotted but left out of the average -- presumably because
        # its coefficients are not on the same normalized scale as the other
        # models' importances.
        if classifier == "Logistic":
            continue
        ligand_sum += group_importances
        n_averaged += 1

    # Count the models actually summed instead of assuming that exactly one
    # "Logistic" entry exists in `classifiers`.
    if n_averaged == 0:
        continue
    means = ligand_sum / n_averaged

    # Format with plotly
    data = [go.Bar(
        y = np.abs(means),
        x = group_names,
        width = 0.5,
        marker=dict(
            color = n_groups,
            colorscale = 'Portland',
            # A plain dict instead of the deprecated go.ColorBar class
            colorbar = dict(
                title = 'Number of Features'
            ),
            showscale = True
        ),
        text = group_names
    )]

    layout= go.Layout(
        autosize = True,
        title = "Avg: "+ligand,
        hovermode = 'closest',
        yaxis = dict(
            title = 'Average Feature Importances',
            ticklen = 5,
            gridwidth = 2
        ),
        showlegend = False
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)