In [1]:
from __future__ import division, print_function

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.cm
import scipy

import itertools
from functools import partial
import re
from collections import defaultdict, OrderedDict
import operator

from sklearn.metrics import auc


import json
from loader import loader

In [2]:
from results import transform

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib tk

In [5]:
data_raw = pd.read_csv('data/results.21.06.csv')
data_raw.set_index('dataset', inplace=True)

In [6]:
data_means = data_raw.applymap(partial(transform, array_mapper=np.mean))
data_stds = data_raw.applymap(partial(transform, array_mapper=np.std))

Index([u'5ht2a_ExtFP', u'5ht2a_ExtFP+10%DUD', u'5ht2a_ExtFP+50%DUD',
       u'5ht2a_ExtFP+100%DUD', u'5ht2c_ExtFP', u'5ht2c_ExtFP+10%DUD',
       u'5ht2c_ExtFP+50%DUD', u'5ht2c_ExtFP+100%DUD', u'5ht6_ExtFP',
       u'5ht6_ExtFP+10%DUD', u'5ht6_ExtFP+50%DUD', u'5ht6_ExtFP+100%DUD',
       u'5ht7_ExtFP', u'5ht7_ExtFP+10%DUD', u'5ht7_ExtFP+50%DUD',
       u'5ht7_ExtFP+100%DUD', u'M1_ExtFP', u'M1_ExtFP+10%DUD',
       u'M1_ExtFP+50%DUD', u'M1_ExtFP+100%DUD', u'SERT_ExtFP',
       u'cathepsin_ExtFP', u'cathepsin_ExtFP+10%DUD',
       u'cathepsin_ExtFP+50%DUD', u'cathepsin_ExtFP+100%DUD', u'd2_ExtFP',
       u'h1_ExtFP', u'hERG_ExtFP', u'hERG_ExtFP+10%DUD', u'hERG_ExtFP+50%DUD',
       u'hERG_ExtFP+100%DUD', u'hiv_integrase_ExtFP', u'hiv_protease_ExtFP'],
      dtype='object', name=u'dataset')

In [106]:
data_means.loc[:, 'positive'] = dataset_stats.ix[data_means.index, 'positive_percent']


In [119]:
np.round(data_means['positive'].astype(float),  3) * 100

dataset
5ht2a_ExtFP                68.3
5ht2a_ExtFP+10%DUD         21.4
5ht2a_ExtFP+50%DUD          5.7
5ht2a_ExtFP+100%DUD         3.0
5ht2c_ExtFP                56.6
5ht2c_ExtFP+10%DUD         10.2
5ht2c_ExtFP+50%DUD          2.4
5ht2c_ExtFP+100%DUD         1.2
5ht6_ExtFP                 81.4
5ht6_ExtFP+10%DUD           9.7
5ht6_ExtFP+50%DUD           2.1
5ht6_ExtFP+100%DUD          1.1
5ht7_ExtFP                 67.5
5ht7_ExtFP+10%DUD          10.5
5ht7_ExtFP+50%DUD           2.4
5ht7_ExtFP+100%DUD          1.2
M1_ExtFP                   44.7
M1_ExtFP+10%DUD            17.0
M1_ExtFP+50%DUD             4.9
M1_ExtFP+100%DUD            2.6
SERT_ExtFP                 68.0
cathepsin_ExtFP            20.6
cathepsin_ExtFP+10%DUD     16.2
cathepsin_ExtFP+50%DUD      8.7
cathepsin_ExtFP+100%DUD     5.5
d2_ExtFP                   53.8
h1_ExtFP                   53.8
hERG_ExtFP                 10.1
hERG_ExtFP+10%DUD           6.2
hERG_ExtFP+50%DUD           2.5
hERG_ExtFP+100%DUD          1.4


In [77]:
def get_family_and_metric(model_metric):
    model_family,_, model_metric = re.match('(\w+)(\((\w+)\))?', model_metric).groups()
    model_family = model_family.replace('Theano', '')
    if model_metric is None:
        model_metric = model_family
    return model_family, model_metric

In [8]:
def get_protein_and_percent(dataset_name):
    dataset_protein, _, dataset_percent = re.match('([\w_]+)(\+(\d+)%)?', dataset_name).groups()
    if dataset_percent is None:
        dataset_percent = 0
    dataset_percent = int(dataset_percent)
    return dataset_protein, dataset_percent

In [78]:
data = pd.DataFrame(columns=['model_name',
                             'model_family',
                             'model_metric',
                             'dataset_name',
                             'dataset_percent',
                             'dataset_protein',
                             'score'])

index = 0
for row in data_raw.index:
    for column in data_raw.columns:
        values = transform(data_raw.get_value(row, column), array_mapper=lambda a: a)
        model_name = column
        model_family, model_metric = get_family_and_metric(model_name)
        
        
        dataset_name = row
        dataset_protein, dataset_percent = get_protein_and_percent(dataset_name)
        
        for value in values:
            data.loc[index] = [model_name, model_family, model_metric,dataset_name, dataset_percent,dataset_protein, value]
            index += 1

        

In [11]:
with open(r'data/cache/filenames.json', 'r') as f:
    filenames = json.load(f, object_pairs_hook=OrderedDict)


dataset_stats = pd.DataFrame(columns=['dataset', 'count', 'positive', 'positive_percent'])
dataset_stats.set_index('dataset', inplace=True)

for name, (X_filename, y_filename) in filenames.iteritems():
    y = np.load(y_filename+'.npy').astype(int)
    positive = (y==1).sum()
    dataset_stats.ix[name, ['count', 'positive', 'positive_percent']] = len(y),positive, positive/len(y)

In [79]:
positive_percent =dataset_stats.ix[data['dataset_name'], 'positive_percent']
data.loc[:, 'positive_percent'] = positive_percent.reset_index(drop=True)

In [90]:
def rank(data):
    ds_result = pd.DataFrame(index=data.index, columns=data.columns)
    for ds in data.index:
        models_result = data.ix[ds]
        ranks = scipy.stats.rankdata(models_result.values)
        ds_result.ix[ds] = ranks
    return ds_result


    
ranks = rank(data_means)
ranks.ix['avg'] = ranks.values.mean(axis=0)
ranks

Unnamed: 0_level_0,EEMTheano(tanimoto),EEMTheano(kulczynski2),EEMTheano(kulczynski3),EEMTheano(f1_score),RBFNet,XELMTheano(tanimoto),XELMTheano(kulczynski2),XELMTheano(kulczynski3),XELMTheano(f1_score),RandomForestClassifier,SVC,LogisticRegression
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5ht2a_ExtFP,12.0,9.0,10.0,7.0,5.0,4.0,3.0,2.0,1.0,8.0,11.0,6.0
5ht2a_ExtFP+10%DUD,1.0,5.0,4.0,3.0,2.0,7.0,9.0,8.0,6.0,12.0,11.0,10.0
5ht2a_ExtFP+50%DUD,3.0,1.0,2.0,6.0,4.0,8.0,12.0,11.0,5.0,10.0,9.0,7.0
5ht2a_ExtFP+100%DUD,3.0,1.0,2.0,6.0,4.0,9.0,12.0,11.0,5.0,10.0,8.0,7.0
5ht2c_ExtFP,12.0,4.0,8.0,2.0,9.0,6.0,3.0,7.0,1.0,10.0,11.0,5.0
5ht2c_ExtFP+10%DUD,3.0,2.0,4.0,10.0,1.0,6.0,7.0,8.0,5.0,11.0,12.0,9.0
5ht2c_ExtFP+50%DUD,4.0,1.0,2.0,9.0,3.0,6.0,8.0,7.0,5.0,10.0,12.0,11.0
5ht2c_ExtFP+100%DUD,5.0,1.0,4.0,9.0,2.0,6.0,8.0,7.0,3.0,10.0,12.0,11.0
5ht6_ExtFP,12.0,11.0,10.0,9.0,2.0,3.0,4.0,5.0,1.0,7.0,8.0,6.0
5ht6_ExtFP+10%DUD,3.0,1.0,2.0,4.0,5.0,10.0,9.0,12.0,8.0,6.0,7.0,11.0


In [91]:
ranks_stacked = ranks.stack().reset_index(drop=False)
ranks_stacked.columns = ['dataset_name',  'model_name', 'rank']

protein_percent = ranks_stacked['dataset_name'].apply(get_protein_and_percent)
ranks_stacked.loc[:, 'dataset_protein'] = map(operator.itemgetter(0), protein_percent)
ranks_stacked.loc[:, 'dataset_percent'] = map(operator.itemgetter(1), protein_percent)

family_metrics = ranks_stacked['model_name'].apply(get_family_and_metric)
ranks_stacked.loc[:, 'model_family'] = map(operator.itemgetter(0), family_metrics)
ranks_stacked.loc[:, 'model_metric'] = map(operator.itemgetter(1), family_metrics)

stats = dataset_stats.ix[data['dataset_name']]
positive_percent =stats['positive']/stats['count']

ranks_stacked.loc[:, 'positive_percent'] = dataset_stats.ix[ranks_stacked['dataset_name'], 
                                                            'positive_percent'].reset_index(drop=True)


In [15]:
from matplotlib.font_manager import FontProperties

fontP = FontProperties()
fontP.set_size('small')

In [521]:
n_proteins = len(data['dataset_protein'].unique())

markers = {
    '':'o',
    'tanimoto':'x',
    'kulczynski2':'D',
    'kulczynski3':'*',
    'f1_score':'s',
}

colors = {
    'EEM': 0,
    'XELM':1,
    'RBFNet':2,
    'RandomForestClassifier':3,
    'SVC':4,
    'LogisticRegression':5,
}

for ds_protein, ds_data in data.groupby('dataset_protein'):
    if (data.loc[data['dataset_protein'] == ds_protein, 'dataset_percent'] > 0).sum() == 0:
        print(ds_protein)
        continue
    fig, ax = plt.subplots()
    fig.gca().invert_xaxis()
    i=0
    n_models = len(ds_data['model_name'].unique())
    
    for (model_metric), model_data in ds_data.groupby(['model_metric']):

        ranks = ranks_stacked.ix[#(ranks_stacked['model_family'] == model_name) & \
                                 (ranks_stacked['model_metric'] == model_metric) & \
                                 (ranks_stacked['dataset_protein'] == ds_protein)]
        
        mean_ranks = ranks.groupby('positive_percent').mean()
        mean_ranks.reset_index(drop=False, inplace=True)
        pos_percent = mean_ranks['positive_percent'].values
        rank = mean_ranks['rank'].values
        #print(rank)

#         st= scipy.stats.binned_statistic(pos_percent, rank, bins=10)
#         pos_percent = st.bin_edges
#         pos_percent = pos_percent[:len(rank)]
#         rank = st.statistic

#         pos_percent =pos_percent[~np.isnan(rank)]
#         rank =rank[~np.isnan(rank)]
        
#         mean = model_data.groupby('positive_percent').mean()
#         mean = mean.reset_index(drop=False)
#         print(mean)
#         pos_percent = mean['positive_percent'].values
#         score = mean['score'].values
        
        # print(type(pos_percent), type(score))
        #plt.plot()
        label = model_name       
        if model_metric != '':
            label = '%s(%s)' % (label, model_metric)
            
        
            
        ax.plot(pos_percent, rank,
                color=matplotlib.cm.gist_ncar(colors[model_name]/6), marker=markers[model_metric], label=label)
        #plt.scatter(mean['positive_percent'].values, mean['score'].values, c=matplotlib.cm.jet(i/n_models))
        i+=1
    ax.legend(loc='lower left', prop=fontP)
    #ax.set_ylim([0.5, 1])
    ax.set_title(ds_protein)
    
#plt.show()
i


SERT_ExtFP
d2_ExtFP
h1_ExtFP
hiv_integrase_ExtFP
hiv_protease_ExtFP


5

In [16]:
n_proteins = len(data['dataset_protein'].unique())

markers = {
    '':'o',
    'tanimoto':'x',
    'kulczynski2':'D',
    'kulczynski3':'*',
    'f1_score':'s',
}

colors = {
    'EEM': 0,
    'XELM':1,
    'RBFNet':2,
    'RandomForestClassifier':3,
    'SVC':4,
    'LogisticRegression':5,
}

for ds_protein, ds_data in data.groupby('dataset_protein'):
    if (data.loc[data['dataset_protein'] == ds_protein, 'dataset_percent'] > 0).sum() == 0:
        print(ds_protein)
        continue
    fig, ax = plt.subplots()
    fig.gca().invert_xaxis()
    i=0
    n_models = len(ds_data['model_name'].unique())
    
    for (model_name, model_metric), model_data in ds_data.groupby(['model_family', 'model_metric']):

        mean = model_data.groupby('dataset_percent').mean()
        mean = mean.reset_index(drop=True)
        # print(mean)
        pos_percent = mean['positive_percent'].values
        score = mean['score'].values
        
        # print(type(pos_percent), type(score))
        #plt.plot()
        label = model_name      
        if model_metric != '':
            label = '%s(%s)' % (label, model_metric)
            
        
            
        ax.plot(mean['positive_percent'].values, mean['score'].values,
                color=matplotlib.cm.gist_ncar(colors[model_name]/len(colors)), marker=markers[model_metric], label=label)
        #plt.scatter(mean['positive_percent'].values, mean['score'].values, c=matplotlib.cm.jet(i/n_models))
        i+=1
    ax.legend(loc='lower left', prop=fontP)
    ax.set_ylim([0.5, 1])
    ax.set_title(ds_protein)
    



KeyError: 'positive_percent'

In [21]:
np.sort(data['positive_percent'].unique())

array([0.010832109571513733, 0.012149204277323159, 0.012201885745978924,
       0.013961605584642234, 0.021379785341215634, 0.023788459648088078,
       0.023970853621165175, 0.024521678943985763, 0.025903552779768607,
       0.029855360135365994, 0.048970901348474094, 0.055429864253393663,
       0.057211448525285279, 0.062100913985225992, 0.087375178316690436,
       0.096734402389144977, 0.099507389162561577, 0.10064935064935066,
       0.10184327918525377, 0.10494931425163984, 0.16214427531436135,
       0.17029391967691271, 0.20622895622895623, 0.21429405582155786,
       0.44725987035945786, 0.53773129525341912, 0.53813559322033899,
       0.56647940074906367, 0.67497603068072864, 0.68036704263047221,
       0.68317200297840652, 0.77862783810463965, 0.81376297105406881], dtype=object)

In [95]:
n_proteins = len(data['dataset_protein'].unique())

markers = {
    'tanimoto':0,
    'kulczynski2':1,
    'kulczynski3':2,
    'f1_score':3,
    'RBFNet':4,
    'RandomForestClassifier':5,
    'SVC':6,
    'LogisticRegression':7,
}

colors_metric = {
    'tanimoto':0,
    'kulczynski2':1,
    'kulczynski3':2,
    'f1_score':3,
    'RBFNet':4,
    'RandomForestClassifier':5,
    'SVC':6,
    'LogisticRegression':7,
}
colors_family = {
    'XELM':0,
    'RBFNet':1,
    'EEM':2,
    'RandomForestClassifier':3,
    'SVC':4,
    'LogisticRegression':5,
}

ds_by_experimets = \
data.groupby(['dataset_protein','positive_percent', 'model_family', 'model_metric']).mean()

ds_by_experimets.reset_index(drop=False, inplace=True)

fig, ax = plt.subplots()
fig.gca().invert_xaxis()
for  metric, ds_by_metric in ds_by_experimets.groupby(['model_metric']):
    mean = ds_by_metric.groupby('positive_percent').max()
    mean.reset_index(inplace=True, drop=False)
    pos_percent = mean['positive_percent'].values
    score = mean['score'].values
    
    st= scipy.stats.binned_statistic(pos_percent, score, bins=[0,0.05, 0.1, 0.2, 0.4, 0.6, 1])
    pos_percent = st.bin_edges
    score = st.statistic
    pos_percent = pos_percent[:len(score)]
    
    pos_percent =pos_percent[~np.isnan(score)]
    score =score[~np.isnan(score)]
    
    ax.plot(pos_percent, score,
                color=matplotlib.cm.gist_ncar(colors_metric[metric]/len(colors_metric)), 
            label = 'other' if metric=='' else metric, marker='o')
    
    
ax.legend(loc='lower left', prop=fontP)
ax.set_ylim([0.5, 1])
ax.set_xlabel('positive percent')
ax.set_ylabel('best BAC')
ax.set_title('BAC by metric')

    



<matplotlib.text.Text at 0x7efc41747090>

In [96]:
ds_by_experimets

Unnamed: 0,dataset_protein,positive_percent,model_family,model_metric,dataset_percent,score
0,5ht2a_ExtFP,0.029855,EEM,f1_score,100.0,0.950087
1,5ht2a_ExtFP,0.029855,EEM,kulczynski2,100.0,0.701853
2,5ht2a_ExtFP,0.029855,EEM,kulczynski3,100.0,0.726278
3,5ht2a_ExtFP,0.029855,EEM,tanimoto,100.0,0.786686
4,5ht2a_ExtFP,0.029855,LogisticRegression,LogisticRegression,100.0,0.961564
5,5ht2a_ExtFP,0.029855,RBFNet,RBFNet,100.0,0.814835
6,5ht2a_ExtFP,0.029855,RandomForestClassifier,RandomForestClassifier,100.0,0.969915
7,5ht2a_ExtFP,0.029855,SVC,SVC,100.0,0.966321
8,5ht2a_ExtFP,0.029855,XELM,f1_score,100.0,0.947654
9,5ht2a_ExtFP,0.029855,XELM,kulczynski2,100.0,0.977299


In [120]:
markers = {
    '':4,
    'tanimoto':0,
    'kulczynski2':1,
    'kulczynski3':2,
    'f1_score':3,
}

colors_metric = {
    'tanimoto':0,
    'kulczynski2':1,
    'kulczynski3':2,
    'f1_score':3,
    'RBFNet':4,
    'RandomForestClassifier':5,
    'SVC':6,
    'LogisticRegression':7,
}
colors_family = {
    'XELM':0,
    'RBFNet':1,
    'EEM':2,
    'RandomForestClassifier':3,
    'SVC':4,
    'LogisticRegression':5,
}

ds_by_experiments = \
data.groupby(['dataset_protein','dataset_percent', 'model_family', 'model_metric']).mean()
ds_by_experiments.reset_index(drop=False, inplace=True)

# print(ds_by_experimets['model_family'].unique())#, ranks_stacked['model_name'].unique())

fig, ax = plt.subplots()
fig.gca().invert_xaxis()
fig.gca().invert_yaxis()


for (metric, experiments) in ds_by_experimets.groupby(['model_metric']):

    ranks = ranks_stacked.ix[ranks_stacked['model_metric'] == metric]
    mean_ranks = ranks.groupby('positive_percent').min()
    mean_ranks.reset_index(drop=False, inplace=True)
    pos_percent = mean_ranks['positive_percent'].values
    rank = mean_ranks['rank'].values
    
    
    st= scipy.stats.binned_statistic(pos_percent, rank, bins=[0,0.015, 0.05, 0.1, 0.2, 0.4 ,0.7, 1])
    pos_percent = st.bin_edges
    rank = st.statistic
    pos_percent = pos_percent[:len(rank)]
    
    
    #print(rank, pos_percent)
    
    pos_percent =pos_percent[~np.isnan(rank)]
    rank =rank[~np.isnan(rank)]
    

    
    # print(pos_percent, rank)
    ax.plot(pos_percent,rank ,
                color=matplotlib.cm.gist_ncar(colors_metric[metric]/len(colors_metric)), 
            label = 'other' if metric=='' else metric)
    #break
ax.legend(loc='lower left', prop=fontP)
#ax.set_ylim([12, 1])
ax.set_xlabel('positive percent')
ax.set_ylabel('best rank')
ax.set_title('rank by metric')

<matplotlib.text.Text at 0x7efc415ad210>