In [391]:
from __future__ import division, print_function

import itertools
import json
import operator
import re
from collections import defaultdict, OrderedDict
from functools import partial

import matplotlib.cm
import numpy as np
import pandas as pd
import scipy
import scipy.stats
from matplotlib import pyplot as plt
from sklearn.metrics import auc

from loader import loader

In [3]:
from results import transform

In [165]:
# Reload imported modules automatically before each cell is executed, so that
# edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Select the Tk GUI backend for matplotlib figures.
%matplotlib tk

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Raw experiment results, indexed by the 'dataset' column of the CSV.
data_raw = pd.read_csv('data/results.21.06.csv').set_index('dataset')

In [318]:
# Apply `transform` to every cell of the raw table twice: once reducing each
# cell with np.mean, once with np.std.
data_means, data_stds = (
    data_raw.applymap(partial(transform, array_mapper=reducer))
    for reducer in (np.mean, np.std)
)

In [56]:
# Sanity check of the dataset-name pattern on a literal example, so the cell
# does not depend on a `column` variable left behind by another cell.
re.match(r'([\w_]+)(\+(\d+)%)?', 'a_Qb+10%').groups()

('a_Qb', '+10%', '10')

In [439]:
def get_family_and_metric(model_metric):
    """Split a model column name into its family and its metric.

    The name is expected to look like ``Family`` or ``Family(metric)``,
    e.g. ``'EEMTheano(tanimoto)'`` or ``'RBFNet'``. Every occurrence of the
    substring ``'Theano'`` is removed from the family part.

    Parameters
    ----------
    model_metric : str
        Model name, optionally followed by a metric in parentheses.

    Returns
    -------
    (str, str)
        ``(model_family, metric)``; ``metric`` is ``''`` when the name has no
        parenthesised part.

    Raises
    ------
    ValueError
        If the name does not start with a word character and therefore
        cannot be parsed.
    """
    # Raw string: '\w' and '\(' are regex escapes, not string escapes.
    match = re.match(r'(\w+)(?:\((\w+)\))?', model_metric)
    if match is None:
        raise ValueError('cannot parse model name: %r' % (model_metric,))

    family, metric = match.groups()
    return family.replace('Theano', ''), metric or ''

In [380]:
def get_protein_and_percent(dataset_name):
    """Split a dataset name into its leading identifier and its percentage.

    The name is expected to look like ``<identifier>`` optionally followed by
    ``+<digits>%``, e.g. ``'5ht2a_ExtFP+10%DUD'``. Anything after the ``%``
    sign is ignored.

    Parameters
    ----------
    dataset_name : str
        Dataset name as it appears in the results table index.

    Returns
    -------
    (str, int)
        ``(dataset_protein, dataset_percent)``; the percentage is ``0`` when
        the name has no ``+<digits>%`` part.

    Raises
    ------
    ValueError
        If the name does not start with a word character and therefore
        cannot be parsed.
    """
    # '\w' already includes the underscore, so no extra character class is needed.
    match = re.match(r'(\w+)(?:\+(\d+)%)?', dataset_name)
    if match is None:
        raise ValueError('cannot parse dataset name: %r' % (dataset_name,))

    protein, percent = match.groups()
    return protein, (0 if percent is None else int(percent))

In [440]:
# Long-format table: one row per individual score of every
# (dataset, model) cell of `data_raw`.
#
# Rows are collected in a plain list and the frame is built once at the end;
# appending with `data.loc[index] = ...` re-allocates the frame on every row.
data_columns = ['model_name',
                'model_family',
                'model_metric',
                'dataset_name',
                'dataset_percent',
                'dataset_protein',
                'score']

records = []
for row in data_raw.index:
    dataset_name = row
    dataset_protein, dataset_percent = get_protein_and_percent(dataset_name)

    for column in data_raw.columns:
        model_name = column
        model_family, model_metric = get_family_and_metric(model_name)

        # Identity mapper: keep the cell's individual values instead of
        # reducing them to a single number.
        values = transform(data_raw.at[row, column], array_mapper=lambda a: a)

        for value in values:
            records.append((model_name, model_family, model_metric,
                            dataset_name, dataset_percent, dataset_protein,
                            value))

data = pd.DataFrame(records, columns=data_columns)

        

In [470]:
# Mapping of dataset name -> (X file, y file), in the order stored in the JSON.
with open(r'data/cache/filenames.json', 'r') as f:
    filenames = json.load(f, object_pairs_hook=OrderedDict)

# Per-dataset label statistics: number of samples, number of samples whose
# label equals 1, and the fraction of such samples.
stats_records = []
for name, (X_filename, y_filename) in filenames.items():
    y = np.load(y_filename + '.npy').astype(int)
    positive = int((y == 1).sum())
    # float() keeps this a true division regardless of the division mode.
    stats_records.append((name, len(y), positive, positive / float(len(y))))

dataset_stats = pd.DataFrame(
    stats_records,
    columns=['dataset', 'count', 'positive', 'positive_percent'],
).set_index('dataset')

In [477]:
# Attach each row's dataset-level positive fraction. The lookup is done by
# dataset name, so the result is already aligned with `data`'s own index and
# no positional re-indexing is needed. Names missing from `dataset_stats`
# yield NaN.
positive_percent = data['dataset_name'].map(dataset_stats['positive_percent'])
data['positive_percent'] = positive_percent

In [508]:
def rank(data):
    """Rank the values of every row of ``data`` independently.

    Ranks are computed with ``scipy.stats.rankdata`` using its default
    settings: the smallest value in a row gets rank 1 and tied values share
    the average of their ranks.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per dataset, one column per model.

    Returns
    -------
    pandas.DataFrame
        Float frame with the same index and columns as ``data`` holding the
        within-row ranks.
    """
    # Rows are processed by position rather than by label, so repeated index
    # labels cannot select more than one row at a time.
    row_ranks = [scipy.stats.rankdata(row_values) for row_values in data.values]
    ranked = np.array(row_ranks, dtype=float).reshape(data.shape)
    return pd.DataFrame(ranked, index=data.index, columns=data.columns)


    
# Rank the models within every dataset by mean score, then append an 'avg'
# row with each model's mean rank over all datasets.
ranks = rank(data_means)
ranks.loc['avg'] = ranks.values.mean(axis=0)
ranks

Unnamed: 0_level_0,EEMTheano(tanimoto),EEMTheano(kulczynski2),EEMTheano(kulczynski3),EEMTheano(f1_score),RBFNet,XELMTheano(tanimoto),XELMTheano(kulczynski2),XELMTheano(kulczynski3),XELMTheano(f1_score),RandomForestClassifier,SVC,LogisticRegression
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5ht2a_ExtFP,12.0,9.0,10.0,7.0,5.0,4.0,3.0,2.0,1.0,8.0,11.0,6.0
5ht2a_ExtFP+10%DUD,1.0,5.0,4.0,3.0,2.0,7.0,9.0,8.0,6.0,12.0,11.0,10.0
5ht2a_ExtFP+50%DUD,3.0,1.0,2.0,6.0,4.0,8.0,12.0,11.0,5.0,10.0,9.0,7.0
5ht2a_ExtFP+100%DUD,3.0,1.0,2.0,6.0,4.0,9.0,12.0,11.0,5.0,10.0,8.0,7.0
5ht2c_ExtFP,12.0,4.0,8.0,2.0,9.0,6.0,3.0,7.0,1.0,10.0,11.0,5.0
5ht2c_ExtFP+10%DUD,3.0,2.0,4.0,10.0,1.0,6.0,7.0,8.0,5.0,11.0,12.0,9.0
5ht2c_ExtFP+50%DUD,4.0,1.0,2.0,9.0,3.0,6.0,8.0,7.0,5.0,10.0,12.0,11.0
5ht2c_ExtFP+100%DUD,5.0,1.0,4.0,9.0,2.0,6.0,8.0,7.0,3.0,10.0,12.0,11.0
5ht6_ExtFP,12.0,11.0,10.0,9.0,2.0,3.0,4.0,5.0,1.0,7.0,8.0,6.0
5ht6_ExtFP+10%DUD,3.0,1.0,2.0,4.0,5.0,10.0,9.0,12.0,8.0,6.0,7.0,11.0


In [480]:
# Long-format view of the rank table: one row per (dataset, model) pair.
# Every row of `ranks` is stacked, including an 'avg' row if one was added.
ranks_stacked = ranks.stack().reset_index(drop=False)
ranks_stacked.columns = ['dataset_name', 'model_name', 'rank']

# Explicit lists (not `map(...)`, which is a lazy iterator on Python 3).
protein_percent = ranks_stacked['dataset_name'].apply(get_protein_and_percent)
ranks_stacked['dataset_protein'] = [protein for protein, _ in protein_percent]
ranks_stacked['dataset_percent'] = [percent for _, percent in protein_percent]

family_metrics = ranks_stacked['model_name'].apply(get_family_and_metric)
ranks_stacked['model_family'] = [family for family, _ in family_metrics]
ranks_stacked['model_metric'] = [metric for _, metric in family_metrics]

# Look the positive fraction up by dataset name; names that are missing from
# `dataset_stats` get NaN.
ranks_stacked['positive_percent'] = (
    ranks_stacked['dataset_name'].map(dataset_stats['positive_percent'])
)


In [220]:
from matplotlib.font_manager import FontProperties

# Small font used for the plot legends.
fontP = FontProperties(size='small')

In [521]:
# Per-protein plot: mean rank of each similarity metric (averaged over all
# models using that metric) against the dataset's positive fraction.
n_proteins = len(data['dataset_protein'].unique())

markers = {
    '': 'o',
    'tanimoto': 'x',
    'kulczynski2': 'D',
    'kulczynski3': '*',
    'f1_score': 's',
}

# Colour slot per metric; '' stands for models that have no metric.
colors = {
    '': 0,
    'tanimoto': 1,
    'kulczynski2': 2,
    'kulczynski3': 3,
    'f1_score': 4,
}

for ds_protein, ds_data in data.groupby('dataset_protein'):
    # Proteins whose datasets all have dataset_percent == 0 are only reported.
    if not (ds_data['dataset_percent'] > 0).any():
        print(ds_protein)
        continue

    fig, ax = plt.subplots()
    ax.invert_xaxis()
    i = 0

    # Grouping by a single column name yields plain string keys.
    for model_metric, _ in ds_data.groupby('model_metric'):
        in_group = ((ranks_stacked['model_metric'] == model_metric) &
                    (ranks_stacked['dataset_protein'] == ds_protein))
        # Distinct local names: the global `rank` function and `ranks` table
        # must not be overwritten by this cell.
        metric_ranks = ranks_stacked.loc[in_group,
                                         ['positive_percent', 'rank']].astype(float)
        mean_ranks = metric_ranks.groupby('positive_percent')['rank'].mean()
        pos_percent = mean_ranks.index.values
        mean_rank = mean_ranks.values

        # The curve aggregates every model family, so it is labelled and
        # coloured by metric only.
        label = 'other' if model_metric == '' else model_metric
        ax.plot(pos_percent, mean_rank,
                color=matplotlib.cm.gist_ncar(colors[model_metric] / float(len(colors))),
                marker=markers[model_metric],
                label=label)
        i += 1

    ax.legend(loc='lower left', prop=fontP)
    ax.set_xlabel('fraction of positive samples')
    ax.set_ylabel('mean rank')
    ax.set_title(ds_protein)

# Number of curves drawn on the last figure.
i


SERT_ExtFP
d2_ExtFP
h1_ExtFP
hiv_integrase_ExtFP
hiv_protease_ExtFP


5

In [447]:
# Per-protein plot: mean score of every (model family, metric) pair against
# the dataset's positive fraction, one point per dataset_percent level.
n_proteins = len(data['dataset_protein'].unique())

markers = {
    '': 'o',
    'tanimoto': 'x',
    'kulczynski2': 'D',
    'kulczynski3': '*',
    'f1_score': 's',
}

colors = {
    'EEM': 0,
    'XELM': 1,
    'RBFNet': 2,
    'RandomForestClassifier': 3,
    'SVC': 4,
    'LogisticRegression': 5,
}

for ds_protein, ds_data in data.groupby('dataset_protein'):
    # Proteins whose datasets all have dataset_percent == 0 are only reported.
    if not (ds_data['dataset_percent'] > 0).any():
        print(ds_protein)
        continue

    fig, ax = plt.subplots()
    ax.invert_xaxis()
    i = 0

    for (model_name, model_metric), model_data in ds_data.groupby(['model_family', 'model_metric']):
        # Average only the two plotted columns; the remaining columns are
        # strings and have no meaningful mean.
        mean = (model_data[['dataset_percent', 'positive_percent', 'score']]
                .astype(float)
                .groupby('dataset_percent')
                .mean()
                .reset_index(drop=True))
        pos_percent = mean['positive_percent'].values
        score = mean['score'].values

        label = model_name
        if model_metric != '':
            label = '%s(%s)' % (label, model_metric)

        ax.plot(pos_percent, score,
                color=matplotlib.cm.gist_ncar(colors[model_name] / float(len(colors))),
                marker=markers[model_metric],
                label=label)
        i += 1

    ax.legend(loc='lower left', prop=fontP)
    ax.set_ylim([0.5, 1])
    ax.set_xlabel('fraction of positive samples')
    ax.set_ylabel('mean score')
    ax.set_title(ds_protein)

# Number of curves drawn on the last figure.
i


EEM
EEM
EEM
EEM
LogisticRegression
RBFNet
RandomForestClassifier
SVC
XELM
XELM
XELM
XELM
EEM
EEM
EEM
EEM
LogisticRegression
RBFNet
RandomForestClassifier
SVC
XELM
XELM
XELM
XELM
EEM
EEM
EEM
EEM
LogisticRegression
RBFNet
RandomForestClassifier
SVC
XELM
XELM
XELM
XELM
EEM
EEM
EEM
EEM
LogisticRegression
RBFNet
RandomForestClassifier
SVC
XELM
XELM
XELM
XELM
EEM
EEM
EEM
EEM
LogisticRegression
RBFNet
RandomForestClassifier
SVC
XELM
XELM
XELM
XELM
SERT_ExtFP
EEM
EEM
EEM
EEM
LogisticRegression
RBFNet
RandomForestClassifier
SVC
XELM
XELM
XELM
XELM
d2_ExtFP
h1_ExtFP
EEM
EEM
EEM
EEM
LogisticRegression
RBFNet
RandomForestClassifier
SVC
XELM
XELM
XELM
XELM
hiv_integrase_ExtFP
hiv_protease_ExtFP


12

In [506]:
# One curve per metric: mean score, binned over the positive fraction.
n_proteins = len(data['dataset_protein'].unique())

markers = {
    '': 4,
    'tanimoto': 0,
    'kulczynski2': 1,
    'kulczynski3': 2,
    'f1_score': 3,
}

# Colour slot per metric; '' stands for models that have no metric.
colors = {
    '': 0,
    'tanimoto': 1,
    'kulczynski2': 2,
    'kulczynski3': 3,
    'f1_score': 4,
}

# Mean over the individual scores of every
# (protein, positive fraction, model family, metric) combination.
ds_by_experimets = (
    data.groupby(['dataset_protein', 'positive_percent', 'model_family', 'model_metric'])[['dataset_percent', 'score']]
    .mean()
    .reset_index(drop=False)
)

fig, ax = plt.subplots()
ax.invert_xaxis()

# Grouping by a single column name yields plain string keys.
for metric, ds_by_metric in ds_by_experimets.groupby('model_metric'):
    mean = ds_by_metric.groupby('positive_percent')['score'].mean()

    # Average the per-fraction means inside 10 equal-width bins.
    st = scipy.stats.binned_statistic(mean.index.values.astype(float),
                                      mean.values.astype(float),
                                      bins=10)
    # `bin_edges` has one more entry than `statistic`; use each bin's left
    # edge as its x position and drop bins that received no samples (NaN).
    has_data = ~np.isnan(st.statistic)
    pos_percent = st.bin_edges[:-1][has_data]
    score = st.statistic[has_data]

    ax.plot(pos_percent, score,
            color=matplotlib.cm.gist_ncar(colors[metric] / float(len(colors))),
            label='other' if metric == '' else metric)

ax.legend(loc='lower left', prop=fontP)
ax.set_ylim([0.5, 1])
ax.set_xlabel('fraction of positive samples (left bin edge)')
ax.set_ylabel('mean score')
ax.set_title('metric avg')

    





<matplotlib.text.Text at 0x57485358>

In [407]:
# Preview only the first rows instead of rendering the whole table.
ranks_stacked.head(10)

Unnamed: 0,dataset_name,model_name,rank,dataset_protein,dataset_percent,model_family,model_metric
0,5ht2a_ExtFP,EEMTheano(tanimoto),12,5ht2a_ExtFP,0,EEMTheano,tanimoto
1,5ht2a_ExtFP,EEMTheano(kulczynski2),9,5ht2a_ExtFP,0,EEMTheano,kulczynski2
2,5ht2a_ExtFP,EEMTheano(kulczynski3),10,5ht2a_ExtFP,0,EEMTheano,kulczynski3
3,5ht2a_ExtFP,EEMTheano(f1_score),7,5ht2a_ExtFP,0,EEMTheano,f1_score
4,5ht2a_ExtFP,RBFNet,5,5ht2a_ExtFP,0,RBFNet,
5,5ht2a_ExtFP,XELMTheano(tanimoto),4,5ht2a_ExtFP,0,XELMTheano,tanimoto
6,5ht2a_ExtFP,XELMTheano(kulczynski2),3,5ht2a_ExtFP,0,XELMTheano,kulczynski2
7,5ht2a_ExtFP,XELMTheano(kulczynski3),2,5ht2a_ExtFP,0,XELMTheano,kulczynski3
8,5ht2a_ExtFP,XELMTheano(f1_score),1,5ht2a_ExtFP,0,XELMTheano,f1_score
9,5ht2a_ExtFP,RandomForestClassifier,8,5ht2a_ExtFP,0,RandomForestClassifier,


In [503]:
# One curve per metric: mean rank, binned over the positive fraction.
markers = {
    '': 4,
    'tanimoto': 0,
    'kulczynski2': 1,
    'kulczynski3': 2,
    'f1_score': 3,
}

# Colour slot per metric; '' stands for models that have no metric.
colors = {
    '': 0,
    'tanimoto': 1,
    'kulczynski2': 2,
    'kulczynski3': 3,
    'f1_score': 4,
}

ds_by_experiments = (
    data.groupby(['dataset_protein', 'dataset_percent', 'model_family', 'model_metric'])[['positive_percent', 'score']]
    .mean()
    .reset_index(drop=False)
)

fig, ax = plt.subplots()
ax.invert_xaxis()

# Iterate the frame built in this cell; only its metric keys are needed, the
# ranks themselves come from `ranks_stacked`.
for metric, _ in ds_by_experiments.groupby('model_metric'):
    # Distinct local names: the global `rank` function and `ranks` table
    # must not be overwritten by this cell.
    metric_ranks = ranks_stacked.loc[ranks_stacked['model_metric'] == metric,
                                     ['positive_percent', 'rank']].astype(float)
    mean_ranks = metric_ranks.groupby('positive_percent')['rank'].mean()

    # Average the per-fraction mean ranks inside 10 equal-width bins.
    st = scipy.stats.binned_statistic(mean_ranks.index.values,
                                      mean_ranks.values,
                                      bins=10)
    # `bin_edges` has one more entry than `statistic`; use each bin's left
    # edge as its x position and drop bins that received no samples (NaN).
    has_data = ~np.isnan(st.statistic)
    pos_percent = st.bin_edges[:-1][has_data]
    mean_rank = st.statistic[has_data]

    ax.plot(pos_percent, mean_rank,
            color=matplotlib.cm.gist_ncar(colors[metric] / float(len(colors))),
            label='other' if metric == '' else metric)

ax.legend(loc='lower left', prop=fontP)
ax.set_xlabel('fraction of positive samples (left bin edge)')
ax.set_ylabel('mean rank')
ax.set_title('metric avg')

[ 0.01083211  0.0911252   0.17141828  0.41229754  0.49259063  0.6531768
  0.73346988] [ 7.93333333  7.5         6.875       8.25        8.91666667  7.5         5.625     ]
[ 0.01083211  0.0911252   0.17141828  0.41229754  0.49259063  0.6531768
  0.73346988] [ 6.5         5.07142857  3.          2.5         2.66666667  2.33333333
  4.        ]
[ 0.01083211  0.0911252   0.17141828  0.41229754  0.49259063  0.6531768
  0.73346988] [ 5.56666667  5.78571429  8.          6.5         4.66666667  6.66666667
  7.75      ]
[ 0.01083211  0.0911252   0.17141828  0.41229754  0.49259063  0.6531768
  0.73346988] [ 5.7         6.78571429  8.          6.5         5.33333333  6.33333333
  7.25      ]
[ 0.01083211  0.0911252   0.17141828  0.41229754  0.49259063  0.6531768
  0.73346988] [ 5.36666667  6.35714286  6.25        7.          8.5         8.66666667
  8.75      ]




<matplotlib.text.Text at 0x2366fbe0>