In [53]:
from __future__ import print_function, division

__author__ = 'amrit'

import sys

#sys.dont_write_bytecode = True
import pickle
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.text as mpl_text
import numpy as np

# Working directory at start-up; dump_files() searches ROOT + "/../dump/".
ROOT=os.getcwd()
# Learner identifiers. Not referenced by the visible code; "SVM" and "FFT1"
# are the learner keys the aggregation helpers below index by.
MLS=["DT", "RF", "SVM",  "FFT1"]
#metrics=['accuracy','recall','precision','false_alarm']
# Metric keys read by LDADE_FFT() and SVM_FFT().
metrics=['recall','precision']
# Dataset names; also used to build the pickle file names in the for_*() loaders.
files=["pitsA", "pitsC", "pitsD", "pitsE", "pitsF"]
# Same datasets with the "_1" suffix (matches the keys produced by for_LDA()).
files_1 = ["pitsA_1", "pitsC_1", "pitsD_1",  "pitsE_1", "pitsF_1"]
# Keys of the per-dataset LDA result dicts (used as LDA[f][x]).
# Presumably the number of LDA features/topics -- TODO confirm.
features = ['10', '25', '50', '100']

class AnyObject(object):
    """Legend placeholder pairing a label with the colour it is drawn in.

    Instances are used as legend handles; AnyObjectHandler reads
    ``my_text`` and ``my_color`` from them.
    """

    def __init__(self, text, color):
        self.my_text, self.my_color = text, color

class AnyObjectHandler(object):
    """Custom legend handler that renders an AnyObject as coloured text."""

    def legend_artist(self, legend, orig_handle, fontsize, handlebox):
        """Create the legend artist for ``orig_handle``.

        Builds a matplotlib ``Text`` at (0, 0) showing ``orig_handle.my_text``
        in ``orig_handle.my_color``, adds it to ``handlebox`` and returns it.
        ``legend`` and ``fontsize`` are accepted only because the handler
        protocol passes them; they are not used here.
        """
        # The handlebox descent/size values were read but never used by the
        # original code, so they are no longer fetched.
        patch = mpl_text.Text(x=0, y=0, text=orig_handle.my_text, color=orig_handle.my_color,
                              verticalalignment=u'baseline',
                              horizontalalignment=u'left', multialignment=None,
                              fontproperties=None, linespacing=None,
                              rotation_mode=None)
        handlebox.add_artist(patch)
        return patch

def dump_files(f='',prefix=''):
    """Return the first file name under ROOT/../dump/ starting with ``f + prefix``.

    Directories are visited in ``os.walk`` order. Returns None when nothing
    matches.
    """
    for _root, _dirs, names in os.walk(ROOT + "/../dump/"):
        match = next((name for name in names if name.startswith(f + prefix)), None)
        if match is not None:
            return match
    return None

def LDADE_FFT(LDADE,LDA, res):
    """Collect LDADE and fixed-feature FFT results per metric and dataset.

    Returns ``{metric: {dataset: rows}}`` for every dataset in ``files`` and
    metric in ``metrics``. Each row is ``[label] + values``: first the
    'LDADE_FFT' and 'LDADE_SVM' rows from ``LDADE``, then one '<x>_FFT' row
    per entry of ``features`` taken from ``LDA[dataset + res]``.
    """
    tuned_med = {}
    for f in files:
        for m in metrics:
            per_dataset = tuned_med.setdefault(m, {})
            if f not in per_dataset:
                per_dataset[f] = [
                    ['LDADE_FFT'] + LDADE[f]['FFT1'][m],
                    ['LDADE_SVM'] + LDADE[f]['SVM'][m],
                ]
            rows = per_dataset[f]
            for x in features:
                rows.append([str(x) + '_FFT'] + LDA[f+res][x]['FFT1'][m])
    return tuned_med

def FFT_features(LDA):
    """Collect the 'features' entry of the 10-feature FFT run per dataset.

    Returns ``{'features': {dataset: [['10_FFT'] + values]}}`` for every
    dataset in ``files`` (an empty dict when ``files`` is empty).
    """
    m, x = 'features', '10'
    tuned_med = {}
    for f in files:
        per_dataset = tuned_med.setdefault(m, {})
        rows = per_dataset.setdefault(f, [])
        rows.append([str(x) + '_FFT'] + LDA[f][x]['FFT1'][m])
    return tuned_med

def LDADE_FFT_runtimes(LDADE):
    """Collect the median 'times' value of LDADE+FFT and LDADE+SVM per dataset.

    Returns ``{'times': {dataset: [['LDADE_FFT', median], ['LDADE_SVM', median]]}}``
    for every dataset in ``files``.
    """
    m = 'times'
    tuned_med = {}
    for f in files:
        per_dataset = tuned_med.setdefault(m, {})
        if f in per_dataset:
            continue
        per_dataset[f] = [
            ['LDADE_FFT'] + [np.median(LDADE[f]['FFT1'][m])],
            ['LDADE_SVM'] + [np.median(LDADE[f]['SVM'][m])],
        ]
    return tuned_med

def SVM_FFT(LDA, untuned, res):
    """Collect TFIDF+SVM and fixed-feature FFT results per metric and dataset.

    Returns ``{metric: {dataset: rows}}``. Each row is ``[label] + values``:
    a 'TFIDF_SVM' row from ``untuned`` followed by one '<x>_FFT' row per
    entry of ``features`` taken from ``LDA[dataset + res]``.
    """
    tuned_med = {}
    for f in files:
        for m in metrics:
            per_dataset = tuned_med.setdefault(m, {})
            if f not in per_dataset:
                per_dataset[f] = [['TFIDF_SVM'] + untuned[f]['TFIDF']['SVM'][m]]
            rows = per_dataset[f]
            for x in features:
                rows.append([str(x) + '_FFT'] + LDA[f+res][x]['FFT1'][m])
    return tuned_med

def SVM_FFT_runtimes(LDA,untuned):
    """Collect median 'times' values of TFIDF+SVM and each FFT feature setting.

    Returns ``{'times': {dataset: rows}}`` where rows are
    ``['TFIDF_SVM', median]`` followed by ``['<x>_FFT', median]`` for every
    entry of ``features``.
    """
    m = 'times'
    tuned_med = {}
    for f in files:
        per_dataset = tuned_med.setdefault(m, {})
        if f not in per_dataset:
            per_dataset[f] = [['TFIDF_SVM'] + [np.median(untuned[f]['TFIDF']['SVM'][m])]]
        rows = per_dataset[f]
        for x in features:
            rows.append([str(x) + '_FFT'] + [np.median(LDA[f][x]['FFT1'][m])])
    return tuned_med

def draw(dic):
    """Draw grouped box plots (LDADE vs. fixed-feature FFT) to ../results/graph1.png.

    ``dic`` maps a metric name to ``{dataset: rows}`` where each row is
    ``[label, value, value, ...]``; the label is dropped and the values form
    one box. One subplot is drawn per metric, stacked in a 2-row grid, so at
    most two metrics are supported. Boxes of one dataset sit next to each
    other and datasets are separated by a one-unit gap.

    Fixes over the previous version:
    * the third legend entry referenced the undefined name
      ``TFIDF_SVMcolors`` (NameError); it now uses ``colors[2]``;
    * the second x tick was 11.5 although its group (8..13) is centred on 10.5;
    * box positions/ticks were hard-coded for exactly 6 datasets x 6 rows;
      they are now derived from the data (identical for that 6x6 layout).
    """
    font = {'size': 70}
    plt.rc('font', **font)
    paras = {'lines.linewidth': 70, 'legend.fontsize': 70, 'axes.labelsize': 80, 'legend.frameon': True,
             'figure.autolayout': True, 'axes.linewidth': 8}
    plt.rcParams.update(paras)

    boxprops = dict(linewidth=9, color='black')
    colors = ['red', 'green', 'blue', 'orange', 'cyan', 'purple']
    labels = ['LDADE_FFT', 'LDADE_SVM', '10_FFT', '25_FFT', '50_FFT', '100_FFT']
    whiskerprops = dict(linewidth=5)
    medianprops = dict(linewidth=8, color='firebrick')

    fig = plt.figure(figsize=(80, 60))
    outer = gridspec.GridSpec(1, 1, wspace=0.1, hspace=0.2)
    inner = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=outer[0], wspace=0.05, hspace=0.0)
    for j, b in enumerate(dic.keys()):
        ax = plt.Subplot(fig, inner[j])
        groups = list(dic[b].values())
        temp = [item[1:] for sublist in groups for item in sublist]

        # Lay the boxes out group by group: consecutive integer positions
        # inside a group, one empty slot between groups, tick at the centre.
        positions, ticks, box_colors = [], [], []
        start = 1
        for sublist in groups:
            size = len(sublist)
            positions.extend(range(start, start + size))
            ticks.append(start + (size - 1) / 2.0)
            box_colors.extend(colors[k % len(colors)] for k in range(size))
            start += size + 1

        bplot = ax.boxplot(temp, showmeans=False, showfliers=False, medianprops=medianprops,
                           capprops=whiskerprops, flierprops=whiskerprops, boxprops=boxprops,
                           whiskerprops=whiskerprops, positions=positions)
        for patch, color in zip(bplot['boxes'], box_colors):
            patch.set(color=color)
        ax.set_xticks(ticks)
        ax.set_xticklabels(list(dic[b].keys()))
        ax.set_ylabel(b, labelpad=30)
        # Only the second (bottom) subplot shows the dataset names.
        if j != 1:
            plt.setp(ax.get_xticklabels(), visible=False)
        fig.add_subplot(ax)

    # Text-only legend: each entry is the method name drawn in its box colour.
    handles = [AnyObject(label, color) for label, color in zip(labels, colors)]
    plt.legend(handles, labels,
               handler_map=dict((handle, AnyObjectHandler()) for handle in handles),
               loc='upper center', bbox_to_anchor=(0.5, 2.1),
               fancybox=True, shadow=True, ncol=6, handletextpad=4)

    plt.savefig("../results/graph1.png", bbox_inches='tight')
    plt.close(fig)

def draw1(dic):
    """Draw grouped box plots (TFIDF+SVM vs. fixed-feature FFT) to ../results/graph2.png.

    ``dic`` maps a metric name to ``{dataset: rows}`` where each row is
    ``[label, value, value, ...]``; the label is dropped and the values form
    one box. One subplot is drawn per metric, stacked in a 2-row grid, so at
    most two metrics are supported.

    The previous version hard-coded box positions and ticks for exactly
    6 datasets x 5 rows, which fails whenever the data has another shape
    (boxplot requires one position per box). Positions and ticks are now
    derived from the data and are identical for the 6x5 layout.
    """
    font = {'size': 70}
    plt.rc('font', **font)
    paras = {'lines.linewidth': 70, 'legend.fontsize': 70, 'axes.labelsize': 80, 'legend.frameon': True,
             'figure.autolayout': True, 'axes.linewidth': 8}
    plt.rcParams.update(paras)

    boxprops = dict(linewidth=9, color='black')
    colors = ['red', 'green', 'blue', 'orange', 'purple']
    labels = ['TFIDF_SVM', '10_FFT', '25_FFT', '50_FFT', '100_FFT']
    whiskerprops = dict(linewidth=5)
    medianprops = dict(linewidth=8, color='firebrick')

    fig = plt.figure(figsize=(80, 60))
    outer = gridspec.GridSpec(1, 1, wspace=0.1, hspace=0.2)
    inner = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=outer[0], wspace=0.05, hspace=0.0)
    for j, b in enumerate(dic.keys()):
        ax = plt.Subplot(fig, inner[j])
        groups = list(dic[b].values())
        temp = [item[1:] for sublist in groups for item in sublist]

        # Consecutive positions inside a dataset, one empty slot between
        # datasets, tick label centred under each dataset.
        positions, ticks, box_colors = [], [], []
        start = 1
        for sublist in groups:
            size = len(sublist)
            positions.extend(range(start, start + size))
            ticks.append(start + (size - 1) / 2.0)
            box_colors.extend(colors[k % len(colors)] for k in range(size))
            start += size + 1

        bplot = ax.boxplot(temp, showmeans=False, showfliers=False, medianprops=medianprops,
                           capprops=whiskerprops, flierprops=whiskerprops, boxprops=boxprops,
                           whiskerprops=whiskerprops, positions=positions)
        for patch, color in zip(bplot['boxes'], box_colors):
            patch.set(color=color)
        ax.set_xticks(ticks)
        ax.set_xticklabels(list(dic[b].keys()))
        ax.set_ylabel(b, labelpad=30)
        # Only the second (bottom) subplot shows the dataset names.
        if j != 1:
            plt.setp(ax.get_xticklabels(), visible=False)
        fig.add_subplot(ax)

    # Text-only legend: each entry is the method name drawn in its box colour.
    handles = [AnyObject(label, color) for label, color in zip(labels, colors)]
    plt.legend(handles, labels,
               handler_map=dict((handle, AnyObjectHandler()) for handle in handles),
               loc='upper center', bbox_to_anchor=(0.5, 2.1),
               fancybox=True, shadow=True, ncol=5, handletextpad=4)

    plt.savefig("../results/graph2.png", bbox_inches='tight')
    plt.close(fig)

def for_LDADE(files):
    """Load ``../dump/LDADE<name>.pickle`` for every name in ``files``.

    Returns a dict keyed by the text between the "LDADE" prefix and the
    ".pickle" suffix of each file name. Pickles are read with
    ``encoding='latin1'``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted dumps.
    dic = {}
    for name in ['LDADE' + f + ".pickle" for f in files]:
        with open("../dump/" + name, 'rb') as handle:
            key = name.split(".pickle")[0].split("LDADE")[1]
            dic[key] = pickle.load(handle, encoding='latin1')
    return dic

def for_LDA(files):
    """Load ``../dump/LDA<name>_1.pickle`` for every name in ``files``.

    Returns a dict keyed by the text between the "LDA" prefix and the
    ".pickle" suffix, i.e. ``<name>_1``. Unlike the other loaders no
    ``encoding`` is passed to ``pickle.load``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted dumps.
    dic = {}
    for name in ["LDA" + f + "_1.pickle" for f in files]:
        with open("../dump/" + name, 'rb') as handle:
            key = name.split(".pickle")[0].split("LDA")[1]
            dic[key] = pickle.load(handle)
    return dic

def for_LDA_py2(files):
    """Load ``../dump/LDA<name>.pickle`` for every name in ``files``.

    Returns a dict keyed by the text between the "LDA" prefix and the
    ".pickle" suffix. Pickles are read with ``encoding='latin1'``
    (presumably because they were written by Python 2 -- TODO confirm).
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted dumps.
    dic = {}
    for name in ["LDA" + f + ".pickle" for f in files]:
        with open("../dump/" + name, 'rb') as handle:
            key = name.split(".pickle")[0].split("LDA")[1]
            dic[key] = pickle.load(handle, encoding='latin1')
    return dic

def for_untuned(files):
    """Load ``../dump/untuned<name>.pickle`` for every name in ``files``.

    Returns a dict keyed by the text between the "untuned" prefix and the
    ".pickle" suffix. Pickles are read with ``encoding='latin1'``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted dumps.
    dic = {}
    for name in ["untuned" + f + ".pickle" for f in files]:
        with open("../dump/" + name, 'rb') as handle:
            key = name.split(".pickle")[0].split("untuned")[1]
            dic[key] = pickle.load(handle, encoding='latin1')
    return dic


In [55]:
# Load every pickled result set from ../dump/ for the datasets in `files`.
# LDA_1 is keyed by "<dataset>_1"; the other three are keyed by the dataset name.
LDADE=for_LDADE(files)
LDA_1=for_LDA(files)
LDA=for_LDA_py2(files)
untuned = for_untuned(files)

In [56]:
LDA.keys()

dict_keys(['pitsA', 'pitsC', 'pitsD', 'pitsE', 'pitsF'])

In [58]:
# For each LDA variant: index 0 holds the SVM_FFT() table and index 1 the
# LDADE_FFT() table. dic_1 uses the plain dumps, dic_2 the "_1" dumps.
dic_1 = [
    SVM_FFT(LDA, untuned, ""),
    LDADE_FFT(LDADE, LDA, ""),
]
dic_2 = [
    SVM_FFT(LDA_1, untuned, "_1"),
    LDADE_FFT(LDADE, LDA_1, "_1"),
]

In [59]:
# Copy every TFIDF_SVM row of the SVM_FFT table (dic_1[0]) into the matching
# metric/dataset list of the LDADE_FFT table (dic_1[1]).
for p in dic_1[0]:
    for d, rows in dic_1[0][p].items():
        dic_1[1][p][d].extend([r for r in rows if r[0] == "TFIDF_SVM"])

In [60]:
# Copy every TFIDF_SVM row of the SVM_FFT table (dic_2[0]) into the matching
# metric/dataset list of the LDADE_FFT table (dic_2[1]).
for p in dic_2[0]:
    for d, rows in dic_2[0][p].items():
        dic_2[1][p][d].extend([r for r in rows if r[0] == "TFIDF_SVM"])

In [61]:
# Aliases (not copies) of the two LDADE_FFT tables built above.
ori_dic = dic_1[1]
dic_copy = dic_2[1]

In [None]:
from __future__ import division, print_function
import sys, random, argparse

sys.dont_write_bytecode = True


class o():
    """Anonymous container: keyword arguments become instance attributes."""

    def __init__(i, **fields):
        i.override(fields)

    def override(i, d):
        "Merge the mapping `d` into the attributes and return the container."
        i.__dict__.update(d)
        return i

    def __repr__(i):
        "Render as ClassName{:key value ...} over the keys returned by show()."
        shown = [':%s %s' % (key, i.__dict__[key]) for key in i.show()]
        return i.__class__.__name__ + '{' + ' '.join(shown) + '}'

    def show(i):
        "Sorted attribute names, skipping any name that contains an underscore."
        return sorted(key for key in i.__dict__ if "_" not in key)


# Global settings shared by the statistics code below. Several functions
# capture these values as default arguments when they are defined.
The = o(cohen=0.3, small=3, epsilon=0.01,
        width=50, lo=0, hi=100, conf=0.01, b=1000, a12=0.56)

parser = argparse.ArgumentParser(
    description="Apply Scott-Knot test to data read from standard input")

p = parser.add_argument

# --demo is accepted but not read anywhere in the visible code.
p("--demo", default=False, action="store_true")
p("--cohen", type=float,
  default=0.3, metavar='N',
  help="too small if delta less than N*std of the data)")
p("--small", type=int, metavar="N", default=3,
  help="too small if hold less than N items")
p("--epsilon", type=float, default=0.01, metavar="N",
  help="a range is too small of its hi - lo < N")
p("--width", type=int, default=50, metavar="N",
  help="width of quintile display")
p("--text", type=int, default=12, metavar="N",
  help="width of text display")
p("--conf", type=float, default=0.01, metavar="N",
  help="bootstrap tests with confidence 1-n")
p("--a12", type=float, default=0.56, metavar="N",
  help="threshold for a12 test: disable,small,med,large=0,0.56,0.64,0.71")

# This runs at import time. parse_args() aborts with SystemExit as soon as
# sys.argv holds anything this parser does not know (for example the
# "-f <connection file>" a Jupyter kernel is started with), so unknown
# arguments are tolerated and ignored instead.
args, _unknown_args = parser.parse_known_args()
The.cohen = args.cohen
The.small = args.small
The.epsilon = args.epsilon
The.conf = args.conf
The.width = args.width
The.a12 = args.a12
The.text = args.text

"""
TODO
try:                                
        opts, args = getopt.getopt(argv, "hg:d", ["help", "grammar="]) 2
    except getopt.GetoptError:           3
        usage()                          4
        sys.exit(2)                     
# Analysis of Experimental Data
This page is about the non-parametric statistical tests. It is also a chance for us to discuss a little
statistical theory.
## Before we begin...
Imagine the following examples contain objective scores gained from different optimizers
_x1,x2,x3,x4,...etc_. Which results are ranked one, two, three, etc.?
### Lesson Zero
Some differences are obvious
"""


def rdiv0():
    "Lesson zero demo: two treatments whose scores are obviously different."
    treatments = [
        ["x1", 0.34, 0.49, 0.51, 0.6],
        ["x2", 6, 7, 8, 9],
    ]
    rdivDemo(treatments)


"""
rank ,         name ,    med   ,  iqr 
----------------------------------------------------
   1 ,           x1 ,      51  ,    11 (*              |              ), 0.34,  0.49,  0.51,  0.51,  0.60
   2 ,           x2 ,     800  ,   200 (               |   ----   *-- ), 6.00,  7.00,  8.00,  8.00,  9.00
### Lesson One
Some similarities are obvious...
"""


def rdiv1():
    "Lesson one demo: two identical treatments plus one that is clearly larger."
    treatments = [
        ["x1", 0.1, 0.2, 0.3, 0.4],
        ["x2", 0.1, 0.2, 0.3, 0.4],
        ["x3", 6, 7, 8, 9],
    ]
    rdivDemo(treatments)


"""
rank ,         name ,    med   ,  iqr 
----------------------------------------------------
   1 ,           x1 ,      30  ,    20 (*              |              ), 0.10,  0.20,  0.30,  0.30,  0.40
   1 ,           x2 ,      30  ,    20 (*              |              ), 0.10,  0.20,  0.30,  0.30,  0.40
   2 ,           x3 ,     800  ,   200 (               |   ----   *-- ), 6.00,  7.00,  8.00,  8.00,  9.00
### Lesson Two
Many results often clump into less-than-many ranks.
"""


def rdiv2():
    "Lesson two demo: five treatments that clump into fewer ranks."
    treatments = [
        ["x1", 0.34, 0.49, 0.51, 0.6],
        ["x2", 0.6, 0.7, 0.8, 0.9],
        ["x3", 0.15, 0.25, 0.4, 0.35],
        ["x4", 0.6, 0.7, 0.8, 0.9],
        ["x5", 0.1, 0.2, 0.3, 0.4],
    ]
    rdivDemo(treatments)


"""
rank ,         name ,    med   ,  iqr 
----------------------------------------------------
   1 ,           x5 ,      30  ,    20 (---    *---    |              ), 0.10,  0.20,  0.30,  0.30,  0.40
   1 ,           x3 ,      35  ,    15 ( ----    *-    |              ), 0.15,  0.25,  0.35,  0.35,  0.40
   2 ,           x1 ,      51  ,    11 (        ------ *--            ), 0.34,  0.49,  0.51,  0.51,  0.60
   3 ,           x2 ,      80  ,    20 (               |  ----    *-- ), 0.60,  0.70,  0.80,  0.80,  0.90
   3 ,           x4 ,      80  ,    20 (               |  ----    *-- ), 0.60,  0.70,  0.80,  0.80,  0.90
### Lesson Three
Some results even clump into one rank (the great null result).
"""


def rdiv3():
    "Lesson three demo: four near-identical treatments (the null result)."
    treatments = [
        ["x1", 101, 100, 99, 101, 99.5],
        ["x2", 101, 100, 99, 101, 100],
        ["x3", 101, 100, 99.5, 101, 99],
        ["x4", 101, 100, 99, 101, 100],
    ]
    rdivDemo(treatments)


"""
rank ,         name ,    med   ,  iqr 
----------------------------------------------------
   1 ,           x1 ,    10000  ,   150 (-------       *|              ),99.00, 99.50, 100.00, 101.00, 101.00
   1 ,           x2 ,    10000  ,   100 (--------------*|              ),99.00, 100.00, 100.00, 101.00, 101.00
   1 ,           x3 ,    10000  ,   150 (-------       *|              ),99.00, 99.50, 100.00, 101.00, 101.00
   1 ,           x4 ,    10000  ,   100 (--------------*|              ),99.00, 100.00, 100.00, 101.00, 101.00
#### Lesson Four
Heh? Where's  lesson four?
### Lesson Five
Some things had better clump to one thing (sanity check for the ranker).
"""


def rdiv5():
    "Lesson five demo: three constant, identical treatments (ranker sanity check)."
    treatments = [
        ["x1", 11, 11, 11],
        ["x2", 11, 11, 11],
        ["x3", 11, 11, 11],
    ]
    rdivDemo(treatments)


"""
rank ,         name ,    med   ,  iqr 
----------------------------------------------------
   1 ,           x1 ,    1100  ,     0 (*              |              ),11.00, 11.00, 11.00, 11.00, 11.00
   1 ,           x2 ,    1100  ,     0 (*              |              ),11.00, 11.00, 11.00, 11.00, 11.00
   1 ,           x3 ,    1100  ,     0 (*              |              ),11.00, 11.00, 11.00, 11.00, 11.00
### Lesson Six
Some things had better clump to one thing (sanity check for the ranker).
"""


def rdiv6():
    "Lesson six demo: two constant treatments plus one clearly larger treatment."
    treatments = [
        ["x1", 11, 11, 11],
        ["x2", 11, 11, 11],
        ["x4", 32, 33, 34, 35],
    ]
    rdivDemo(treatments)


"""
rank ,         name ,    med   ,  iqr 
----------------------------------------------------
   1 ,           x1 ,    1100  ,     0 (*              |              ),11.00, 11.00, 11.00, 11.00, 11.00
   1 ,           x2 ,    1100  ,     0 (*              |              ),11.00, 11.00, 11.00, 11.00, 11.00
   2 ,           x4 ,    3400  ,   200 (               |          - * ),32.00, 33.00, 34.00, 34.00, 35.00
### Lesson Seven
All the above scales to succinct summaries of hundreds, thousands, millions of numbers
"""


def rdiv7():
    "Lesson seven demo: three treatments of 256 random numbers each."
    # Generated in the order x1, x2, x3 so a fixed seed gives the same data.
    x1 = ["x1"] + [rand() ** 0.5 for _ in range(256)]
    x2 = ["x2"] + [rand() ** 2 for _ in range(256)]
    x3 = ["x3"] + [rand() for _ in range(256)]
    rdivDemo([x1, x2, x3])





def _ab12():
    """Compare the fast a12() with a naive quadratic version on 5000-item lists.

    For each of three list pairs, prints the run time in milliseconds and
    the result of both implementations.

    Fix: the timings used to be taken on the fixed pair ``(l1, less)`` for
    every row; they are now taken on the pair actually being reported.
    """

    def a12slow(lst1, lst2):
        more = same = 0.0
        for x in sorted(lst1):
            for y in sorted(lst2):
                if x == y:
                    same += 1
                elif x > y:
                    more += 1
        return (more + 0.5 * same) / (len(lst1) * len(lst2))

    random.seed(1)
    l1 = [random.random() for x in range(5000)]
    more = [random.random() * 2 for x in range(5000)]
    l2 = [random.random() for x in range(5000)]
    less = [random.random() / 2.0 for x in range(5000)]
    for tag, one, two in [("1less", l1, more),
                          ("1more", more, less), ("same", l1, l2)]:
        # The lambdas are called immediately, so they see this iteration's pair.
        t1 = msecs(lambda: a12(one, two))
        t2 = msecs(lambda: a12slow(one, two))
        print("\n", tag, "\n", t1, a12(one, two))
        print(t2, a12slow(one, two))


"""
Note that the test code `_ab12` shows that our fast and slow methods generate the same A12 score, but the
fast way does so hundreds of times faster. The following tests show runtimes for lists of 5000 numbers:
    experiment  msecs(fast)  a12(fast)  msecs(slow)  a12(slow)
    1less          13        0.257      9382           0.257
    1more          20        0.868      9869           0.868
    same           11        0.502      9937           0.502
## Significance Tests
### Standard Utils
Didn't we do this before?
"""

"""
Misc functions:
"""
# Short aliases for the random module. Note that `any` shadows the builtin.
rand = random.random
any = random.choice
seed = random.seed


def exp(n):
    "Return e raised to the power n."
    # Imported here because math is not imported at module level in the
    # visible source; the old lambda raised NameError when called.
    import math
    return math.e ** n


def ln(n):
    "Return the natural logarithm of n."
    import math
    return math.log(n, math.e)


def g(n):
    "Round n to two decimal places."
    return round(n, 2)


def median(lst, ordered=False):
    """Return the median of `lst`.

    The list is sorted first unless `ordered` is true. For an even number of
    items the two middle values are averaged.
    """
    values = lst if ordered else sorted(lst)
    size = len(values)
    upper = size // 2
    if size % 2:
        return values[upper]
    lower = max(0, min(upper - 1, size))
    return (values[upper] + values[lower]) / 2


def msecs(f):
    "Call f() once and return the elapsed wall-clock time in milliseconds."
    import time
    started = time.time()
    f()
    elapsed = time.time() - started
    return elapsed * 1000


def pairs(lst):
    """Yield every adjacent pair (lst[i], lst[i+1]) of a list.

    Lists with fewer than two items yield nothing. The previous version
    raised IndexError for an empty list.
    """
    for left, right in zip(lst, lst[1:]):
        yield left, right


def xtile(lst, lo=The.lo, hi=The.hi, width=The.width,
          chops=[0.1, 0.3, 0.5, 0.7, 0.9],
          marks=["-", " ", " ", "-", " "],
          bar="|", star="*", show=" %3.0f"):
    """Render a list of (possibly unsorted) numbers as a one-line ascii chart.

    The chart is `width` characters wide and spans `lo`..`hi`, widened if
    the data falls outside that range. The values at the `chops` percentiles
    are located; the stretch between consecutive chops is filled with the
    corresponding entry of `marks`. `bar` is drawn in the middle column and
    `star` at the median. The result is "(<chart>)," followed by the chop
    values formatted with `show`.
    """
    ordered = sorted(lst)
    lo = min(lo, ordered[0])
    hi = max(hi, ordered[-1])

    def value_at(fraction):
        # Value found `fraction` of the way through the sorted data.
        return ordered[int(len(lst) * fraction)]

    def column(x):
        # Chart column for value x; the small constant avoids division by zero.
        return int(width * float((x - lo)) / (hi - lo + 0.00001))

    what = [value_at(p) for p in chops]
    where = [column(n) for n in what]
    out = [" "] * width
    for k, (start, stop) in enumerate(pairs(where)):
        for i in range(start, stop):
            out[i] = marks[k]
    out[int(width / 2)] = bar
    out[column(value_at(0.5))] = star
    numbers = ', '.join([show % x for x in what])
    return '(' + ''.join(out) + ")," + numbers


def _tileX():
    "Demo: print the xtile chart of 100 squared random numbers (seed 1)."
    import random
    random.seed(1)
    nums = []
    for _ in range(100):
        nums.append(random.random() ** 2)
    chart = xtile(nums, lo=0, hi=1.0, width=25, show=" %5.2f")
    print(chart)


"""````
### Standard Accumulator for Numbers
Note the _lt_ method: this accumulator can be sorted by median values.
Warning: this accumulator keeps _all_ numbers. Might be better to use
a bounded cache.
"""


class Num:
    """An accumulator for numbers.

    Keeps every value added (``all``), plus a running count ``n``, mean
    ``mu`` and sum of squared deviations ``m2``. Instances order by median
    (see ``__lt__``) and can be merged with ``+``.
    """

    def __init__(i, name, inits=()):
        # `inits` defaults to an immutable empty tuple rather than a shared list.
        i.n = i.m2 = i.mu = 0.0
        i.all = []
        i._median = None  # cached median; None means "not computed yet"
        i.name = name
        i.rank = 0
        for x in inits: i.add(x)

    def s(i):
        "Sample standard deviation; 0.0 when fewer than two values are held."
        # Previously raised ZeroDivisionError for a single value (n - 1 == 0).
        if i.n < 2:
            return 0.0
        return (i.m2 / (i.n - 1)) ** 0.5

    def add(i, x):
        "Add one value, updating the running mean and squared deviations."
        i._median = None
        i.n += 1
        i.all += [x]
        delta = x - i.mu
        i.mu += delta * 1.0 / i.n
        i.m2 += delta * (x - i.mu)

    def __add__(i, j):
        "Return a new Num holding the values of both operands."
        return Num(i.name + j.name, i.all + j.all)

    def quartiles(i):
        "Return the values at 25%, 50% and 75% of the sorted data, rounded by g()."
        def p(x): return g(xs[x]) #int()

        i.median()  # called for its side effect: leaves i.all sorted
        xs = i.all
        n = int(len(xs) * 0.25)
        return p(n), p(2 * n), p(3 * n)

    def median(i):
        "Return the (cached) median; sorts i.all when the cache is empty."
        # Compare with None explicitly: a median of 0 is a valid cached value
        # and used to trigger a re-sort on every call.
        if i._median is None:
            i.all = sorted(i.all)
            i._median = median(i.all)
        return i._median

    def __lt__(i, j):
        return i.median() < j.median()

    def spread(i):
        "Return the distance between the 75th and 25th percentile values."
        i.all = sorted(i.all)
        n1 = i.n * 0.25
        n2 = i.n * 0.75
        if len(i.all) <= 1:
            return 0
        if len(i.all) == 2:
            return i.all[1] - i.all[0]
        else:
            return i.all[int(n2)] - i.all[int(n1)]


"""
### The A12 Effect Size Test 
As above
"""


def a12slow(lst1, lst2):
    """Naive A12: fraction of pairs (x, y) with x from lst1 greater than y from lst2.

    Ties count as half. Compares every pair, so it is quadratic.
    """
    more = sum(1 for x in lst1 for y in lst2 if x != y and x > y)
    same = sum(1 for x in lst1 for y in lst2 if x == y)
    return (more + 0.5 * same) / (len(lst1) * len(lst2))


def a12(lst1, lst2):
    """Fast A12 effect-size test: is lst1 'more' than lst2 by at least The.a12?

    Both lists are sorted in descending order and walked together with two
    cursors. From the 'greater than' and 'equal' counts gathered for lst1,
    the score ``gt/(n1*n2) + eq/2/(n1*n2)`` is computed. Note that the
    return value is a boolean (score >= The.a12), not the score itself.
    """

    def loop(t, t1, t2):
        # t is the record whose counts are reported; t1/t2 are the two
        # cursors, whose roles are swapped in the final branch below.
        while t1.j < t1.n and t2.j < t2.n:
            h1 = t1.l[t1.j]
            h2 = t2.l[t2.j]
            # h3: the item after h2 in t2, or None at the end of t2.
            h3 = t2.l[t2.j + 1] if t2.j + 1 < t2.n else None
            if h1 > h2:
                t1.j += 1;
                # Credit t1 with the number of items t2 has left from its cursor on.
                t1.gt += t2.n - t2.j
            elif h1 == h2:
                # NOTE(review): a following item equal to 0 is falsy and
                # skips this branch -- confirm that is intended.
                if h3 and h1 > h3:
                    t1.gt += t2.n - t2.j - 1
                t1.j += 1;
                t1.eq += 1;
                t2.eq += 1
            else:
                # h1 < h2: swap roles so the larger head is advanced next.
                t2, t1 = t1, t2
        return t.gt * 1.0, t.eq * 1.0

    # --------------------------
    lst1 = sorted(lst1, reverse=True)
    lst2 = sorted(lst2, reverse=True)
    n1 = len(lst1)
    n2 = len(lst2)
    # One cursor record per list: l = data, j = position, eq/gt = counts, n = length.
    t1 = o(l=lst1, j=0, eq=0, gt=0, n=n1)
    t2 = o(l=lst2, j=0, eq=0, gt=0, n=n2)
    gt, eq = loop(t1, t1, t2)
    return gt / (n1 * n2) + eq / 2 / (n1 * n2) >= The.a12


def _a12():
    """Demo: time a12slow() against a12() on random lists of growing size.

    For each size prints the size, both results rounded by g(), and the
    integer ratio of the slow to the fast run time.

    Fix: uses range() instead of xrange(), which does not exist on Python 3.
    """
    def f1(): return a12slow(l1, l2)

    def f2(): return a12(l1, l2)

    for n in [100, 200, 400, 800, 1600, 3200, 6400]:
        # f1/f2 read l1 and l2 from this enclosing scope.
        l1 = [rand() for _ in range(n)]
        l2 = [rand() for _ in range(n)]
        t1 = msecs(f1)
        t2 = msecs(f2)
        print(n, g(f1()), g(f2()), int((t1 / t2)))



def sampleWithReplacement(lst):
    """Return a list the same size as `lst`, drawn from it with replacement.

    Each pick uses one ``random.uniform(0, len(lst))`` call truncated to an
    index.
    """
    picks = []
    for _ in lst:
        index = int(random.uniform(0, len(lst)))
        picks.append(lst[index])
    return picks


def testStatistic(y, z):
    """Checks if two means are different, tempered
     by the sample size of 'y' and 'z'"""
    tmp1 = tmp2 = 0
    for y1 in y.all: tmp1 += (y1 - y.mu) ** 2
    for z1 in z.all: tmp2 += (z1 - z.mu) ** 2
    s1 = (float(tmp1) / (y.n - 1)) ** 0.5
    s2 = (float(tmp2) / (z.n - 1)) ** 0.5
    delta = z.mu - y.mu
    if s1 + s2:
        delta = delta / ((s1 / y.n + s2 / z.n) ** 0.5)
    return delta


"""
The rest is just details:
+ Efron advises
  to make the mean of the populations the same (see
  the _yhat,zhat_ stuff shown below).
+ The class _total_ is just a quick and dirty accumulation class.
+ For more details see [the Efron text][efron01].  
"""


def bootstrap(y0, z0, conf=The.conf, b=The.b):
    """The bootstrap hypothesis test from p220 to 223 of Efron's book
    'An introduction to the bootstrap'.

    Both samples are shifted to share the pooled mean, then resampled `b`
    times with replacement. Returns True when the fraction of resamples
    whose test statistic exceeds the observed one is below `conf`.
    """

    class total():
        "quick and dirty data collector"

        def __init__(i, some=()):
            i.sum = i.n = i.mu = 0
            i.all = []
            for one in some:
                i.put(one)

        def put(i, x):
            i.all.append(x)
            i.sum += x
            i.n += 1
            i.mu = float(i.sum) / i.n

        def __add__(i1, i2):
            return total(i1.all + i2.all)

    y = total(y0)
    z = total(z0)
    pooled = y + z
    tobs = testStatistic(y, z)
    # Efron: give both populations the same mean before resampling.
    yhat = [value - y.mu + pooled.mu for value in y.all]
    zhat = [value - z.mu + pooled.mu for value in z.all]
    bigger = 0.0
    for _ in range(b):
        resampled_y = total(sampleWithReplacement(yhat))
        resampled_z = total(sampleWithReplacement(zhat))
        if testStatistic(resampled_y, resampled_z) > tobs:
            bigger += 1
    return bigger / b < conf


"""
#### Examples
"""


def _bootstraped():
    """Demo: run bootstrap() on four pairs of gaussian samples and print the verdicts.

    Each printed tuple is (n, mu1, sigma1, mu2, sigma2, 'different' or 'same').
    """
    def worker(n=1000,
               mu1=10, sigma1=1,
               mu2=10.2, sigma2=1):
        x = [random.gauss(mu1, sigma1) for i in range(n)]
        y = [random.gauss(mu2, sigma2) for i in range(n)]
        verdict = 'different' if bootstrap(x, y) else 'same'
        return n, mu1, sigma1, mu2, sigma2, verdict

    scenarios = [
        # very different means, same std
        dict(mu1=10, sigma1=10, mu2=100, sigma2=10),
        # similar means and std
        dict(mu1=10.1, sigma1=1, mu2=10.2, sigma2=1),
        # slightly different means, same std
        dict(mu1=10.1, sigma1=1, mu2=10.8, sigma2=1),
        # difference in mu eaten by large std
        dict(mu1=10.1, sigma1=10, mu2=10.8, sigma2=1),
    ]
    for scenario in scenarios:
        print(worker(**scenario))


"""
Output:
"""

# _bootstraped()

(1000, 10, 10, 100, 10, 'different')
(1000, 10.1, 1, 10.2, 1, 'same')
(1000, 10.1, 1, 10.8, 1, 'different')
(1000, 10.1, 10, 10.8, 1, 'same')



def different(l1, l2):
    """True when the two lists differ by both the A12 test and the bootstrap test.

    The A12 check runs first; bootstrap() is only called when it passes.
    """
    effect = a12(l2, l1)
    if not effect:
        return effect
    return bootstrap(l1, l2)




def scottknott(data, cohen=The.cohen, small=The.small, useA12=The.a12 > 0, epsilon=The.epsilon):
    """Recursively split data, maximizing delta of the expected value of the
    mean before and after the splits.

    `data` is a collection of Num objects. Every Num gets its `rank`
    attribute set and the collection is returned. A split is rejected when
    either side holds `small` or fewer values, or when the two sides are
    judged the same: by different() when `useA12` is true, otherwise when
    their medians differ by no more than `cohen` standard deviations of all
    the data.

    Fixes: `reduce` is imported from functools (it is not a builtin on
    Python 3), and a one-shot iterable (such as a Python 3 map object) is
    turned into a list first, because reduce() would otherwise use it up
    before rdiv() sees it.
    """
    from functools import reduce

    if not isinstance(data, list):
        data = list(data)
    all = reduce(lambda x, y: x + y, data)

    if useA12:
        def same(l, r):
            return not different(l.all, r.all)
    else:
        def same(l, r):
            return abs(l.median() - r.median()) <= all.s() * cohen

    def big(n):
        return n > small

    return rdiv(data, all, minMu, big, same, epsilon)


def rdiv(data,  # a list of class Nums
         all,  # all the data combined into one num
         div,  # function: find the best split
         big,  # function: rejects small splits
         same,  # function: rejects similar splits
         epsilon):  # small enough to split two parts
    """Rank the Nums in `data` by recursively splitting their sorted order.

    Each group that cannot be split further becomes a leaf and all of its
    members get the same `rank`; groups to the right rank higher. Returns
    `data` (the ranks are stored on its items).
    """

    def recurse(parts, all, rank=0):
        "Split `parts` if possible; return the highest rank handed out."
        cut, left, right = maybeIgnore(div(parts, all, big, epsilon),
                                       same, parts)
        if not cut:
            # Leaf: everything here shares one rank.
            for part in parts:
                part.rank = rank
            return rank
        # The right-hand side starts one rank above the left-hand side.
        next_rank = recurse(parts[:cut], left, rank) + 1
        return recurse(parts[cut:], right, next_rank)

    recurse(sorted(data), all)
    return data


def maybeIgnore(split, same, parts):
    """Discard a proposed split when its two sides are not really different.

    `split` is the ``(cut, left, right)`` tuple returned by the split finder.
    When `cut` is set, everything before and after the cut is summed into
    two Nums and handed to `same`; if it reports them as the same, the
    result is ``(None, None, None)``. Otherwise the split is returned
    unchanged.

    Fix: the tuple is unpacked in the body because tuple parameters in a
    ``def`` are a SyntaxError on Python 3. Positional callers are unaffected.
    """
    cut, left, right = split
    if cut:
        if same(sum(parts[:cut], Num('upto')),
                sum(parts[cut:], Num('above'))):
            cut = left = right = None
    return cut, left, right


def minMu(parts, all, big, epsilon):
    """Find the cut in `parts` that maximizes the expected squared distance
    of the two sides' means from the overall mean.

    Only cuts offered by leftRight() whose two sides both pass `big` are
    considered. Returns ``(cut, left, right)``, or ``(None, None, None)``
    when no cut scores above zero.
    """
    best = (None, None, None)
    best_score = 0
    mu = all.mu
    for i, l, r in leftRight(parts, epsilon):
        if not (big(l.n) and big(r.n)):
            continue
        total = all.n * 1.0
        score = l.n / total * (mu - l.mu) ** 2 + r.n / total * (mu - r.mu) ** 2
        if score > best_score:
            best_score = score
            best = (i, l, r)
    return best


def leftRight(parts, epsilon=The.epsilon):
    """Iterator over candidate cuts of `parts`.

    For each index i > 0 whose cached median exceeds its left neighbour's
    by more than `epsilon`, yields ``(i, left, right)`` where `left` is the
    sum of parts[:i] and `right` the sum of parts[i:]. The right-hand sums
    are computed once, back to front, before iterating.
    """
    last = len(parts) - 1
    rights = {}
    for j in range(last, 0, -1):
        if j == last:
            rights[j] = parts[j]
        else:
            rights[j] = parts[j] + rights[j + 1]
    left = parts[0]
    for i in range(1, len(parts)):
        # Reads the cached _median directly; it has been filled in once the
        # parts have been compared with each other (e.g. by sorting).
        if parts[i]._median - parts[i - 1]._median > epsilon:
            yield i, left, rights[i]
        left = left + parts[i]


"""
## Putting it All Together
Driver for the demos:
"""


def rdivDemo(data):
    """Rank the treatments in `data` with scottknott() and print a report.

    `data` is a list of rows ``[name, value, value, ...]``. One line is
    printed per treatment, ordered by rank and median, showing the rank,
    name, median, inter-quartile range and an xtile chart drawn on a scale
    common to all treatments.

    Fixes: the Nums are built as a list (on Python 3 ``map`` returns a
    one-shot iterator that was used up before ranking), and the unused
    helper and locals were removed.
    """
    data = [Num(lst[0], lst[1:]) for lst in data]
    print("")
    ranks = sorted((x.rank, x.median(), x)
                   for x in scottknott(data, useA12=True))
    # The chart scale runs from the smallest to the largest value overall.
    values = sorted(v for _, __, x in ranks for v in x.all)
    lo, hi = values[0], values[-1]
    line = "----------------------------------------------------"
    formatStr = '%%4s , %%%ss ,    %%s   , %%4s ' % The.text
    print((formatStr % ('rank', 'name', 'med', 'iqr')) + "\n" + line)
    for _, __, x in ranks:
        q1, q2, q3 = x.quartiles()
        print((formatStr % (x.rank + 1, x.name, q2, q3 - q1)) +
              xtile(x.all, lo=lo, hi=hi, width=30, show="%5.2f"))


def rdivDemo1(data):
    """Rank the treatments in 'data' and return {name: rank}.

    'data' is an iterable of lists whose first item is the treatment name
    and whose remaining items are its values. Each list becomes a Num and
    the Nums are ranked by scottknott(..., useA12=True). The returned rank
    is inverted: the treatment with the largest scottknott rank maps to 1,
    the next to 2, and so on. Nothing is printed.

    Returns an empty dict when there is nothing to rank.
    """
    # Build a real list: on Python 3 'map' yields a single-pass iterator,
    # which is exhausted if the ranking code walks the data more than once.
    nums = [Num(lst[0], lst[1:]) for lst in data]
    ranks = [(x.rank, x.median(), x) for x in scottknott(nums, useA12=True)]
    if not ranks:
        # max() of an empty sequence would raise ValueError.
        return {}

    max_rank = max(rank for rank, _, __ in ranks)

    ret = {}
    # Sort on (rank, median) only, so ties never compare the Num objects
    # themselves (a TypeError on Python 3 unless Num defines an ordering).
    ordered = sorted(ranks, key=lambda entry: entry[:2], reverse=True)
    for _, __, x in ordered:
        ret[x.name] = max_rank - x.rank + 1
    return ret


def _rdivs():
    """Seed the random number generator with 1 and run the rdiv demos.

    Runs rdiv0, rdiv1, rdiv2, rdiv3, rdiv5 and rdiv6 in that order,
    prints a '###' separator line, then runs rdiv7.
    """
    seed(1)
    for demo in (rdiv0, rdiv1, rdiv2, rdiv3, rdiv5, rdiv6):
        demo()
    print("###")
    rdiv7()


####################################

def thing(x):
    """Coerce x to an int if possible, otherwise to a float.

    When neither conversion accepts x (both raise ValueError), x is
    returned unchanged.
    """
    for convert in (int, float):
        try:
            return convert(x)
        except ValueError:
            continue
    return x


def main():
    """Read treatments from standard input and print their ranking.

    Input is split on whitespace. Every token is passed through thing():
    a token that stays a string starts (or resumes) the treatment of that
    name, and every numeric token is appended to the treatment most
    recently named. The collected [name, value, ...] lists are handed to
    rdivDemo.
    """
    treatments = {}
    # Numbers that appear before any name are collected here and never used.
    current = []
    for line in sys.stdin:
        for token in map(thing, line.split()):
            if not isinstance(token, str):
                current.append(token)
            else:
                current = treatments.setdefault(token, [])
    rdivDemo([[label] + values for label, values in treatments.items()])


def another_main(filename):
    """Read treatments from the file 'filename' and return their ranks.

    The file is split on whitespace. Every token is passed through
    thing(): a token that stays a string starts (or resumes) the
    treatment of that name, and every numeric token is appended to the
    treatment most recently named. The collected [name, value, ...] lists
    are handed to rdivDemo1, whose result is returned.
    """
    treatments = {}
    # Numbers that appear before any name are collected here and never used.
    now = []
    # 'with' guarantees the file is closed, including when parsing raises.
    with open(filename) as handle:
        for line in handle:
            for word in line.split():
                word = thing(word)
                if isinstance(word, str):
                    now = treatments.setdefault(word, [])
                else:
                    now.append(word)
    return rdivDemo1([[k] + v for k, v in treatments.items()])


def concatenate_results(data, perf_measures):
    """Tally rank-1 learners and optimizers per group and pickle the tallies.

    'data' maps a group key to an iterable of dataset names. For each
    dataset and each measure in 'perf_measures', the ranks are read with
    another_main from "stats_files/<measure>_<dataset>" (the "time"
    measure reads "stats_files/time_f1_<dataset>" instead). Result keys
    are split on "_": the first part is taken as the optimizer and the
    second as the learner.

    A learner or optimizer is counted for a dataset only when it has rank
    1 for both "precision" and "f1". When no optimizer qualifies, the
    "default" optimizer is counted instead.

    The tallies are written to "collect_v4.1.p" with pickle and also
    returned.
    """
    collect = {}
    for k, v in data.items():
        collect[k] = {'learners': {"SVM": 0, "KNN": 0, "DTC": 0, "RF": 0},
                      'optimizers': {"default": 0, "de": 0, "random": 0, "smac": 0, "grid": 0}}
        for dataset in v:
            temp = {"precision": {"learners": set(), "optimizers": set()},
                    "f1": {"learners": set(), "optimizers": set()},
                    "time": []}
            for perf_m in perf_measures:
                if perf_m == "time":
                    filename = "stats_files/time_f1" + "_" + dataset
                else:
                    filename = "stats_files/" + perf_m + "_" + dataset
                temp_results = another_main(filename)
                if perf_m == "time":
                    # Stored as-is. No sort is attempted: sorting a
                    # temporary list built from the dict would be thrown
                    # away, and nothing below reads temp["time"].
                    temp[perf_m] = temp_results
                    continue
                for key, rank in temp_results.items():
                    if rank == 1:
                        parts = key.split("_")
                        temp[perf_m]['optimizers'].add(parts[0])
                        temp[perf_m]['learners'].add(parts[1])
            same_learners = temp["precision"]["learners"] & temp["f1"]["learners"]
            same_optimizers = temp["precision"]["optimizers"] & temp["f1"]["optimizers"]
            for i in same_learners:
                collect[k]['learners'][i] += 1
            if same_optimizers:
                for j in same_optimizers:
                    collect[k]['optimizers'][j] += 1
            else:
                collect[k]['optimizers']['default'] += 1
    with open("collect_v4.1.p", 'wb') as handle:
        pickle.dump(collect, handle)
    return collect


def concatenate_results_per_measure():
    """Tally, per measure, learner and group, how often each optimizer is rank 1.

    Takes no arguments: 'collect', 'learners' and 'data' are not defined
    in this function and are read from the enclosing module scope.
    'collect' is updated in place and nothing is returned.

    Only the "precision" measure is processed. For each dataset the ranks
    are read with another_main from
    "stats_files/<learner>/<learner>_<measure>_<dataset>"; the part of
    each result key before the first "_" is taken as the optimizer.
    """
    perf_measures = ['precision']
    for perf_m in perf_measures:
        collect[perf_m] = {}
        for l in learners:
            collect[perf_m][l] = {}
            for k, v in data.items():
                collect[perf_m][l][k] = {"default": 0, "de": 0, "random": 0, "smac": 0, "grid": 0}
                temp = []
                for dataset in v:
                    filename = "stats_files/" + l + "/" + l + "_" + perf_m + "_" + dataset
                    if perf_m == "time":
                        filename = "stats_files/time_f1" + "_" + dataset
                    print(filename)
                    temp_results = another_main(filename)
                    for key, rank in temp_results.items():
                        if perf_m != "time":
                            if rank == 1:
                                temp.append(key.split("_")[0])
                        else:
                            # Only reachable if "time" is added to
                            # perf_measures above; the run stops here.
                            temp = temp_results
                            exit()
                for j in temp:
                    collect[perf_m][l][k][j] += 1

def concatenate_results_per_exp_types(data, perf_measures):
    """Count, per dataset, how many optimizers rank differently in train and test.

    'data' maps a group key to an iterable of dataset names. 'collect'
    and 'learners' are not parameters; they are read from the enclosing
    module scope and 'collect' is updated in place.

    For every measure, learner and dataset, ranks are read with
    another_main from "<train|test>/<learner>/<learner>_<measure>_<dataset>"
    and stored under the part of each result key before the first "_".
    collect[measure][learner][dataset] is set to the number of optimizers
    whose train rank differs from their test rank.

    'collect' is pickled to "collect_v10.p" and returned.
    """
    for perf_m in perf_measures:
        collect[perf_m] = {}
        for l in learners:
            collect[perf_m][l] = {}
            for _group, group_datasets in data.items():
                for dataset in group_datasets:
                    res = {
                        phase: {"default": 0, "de": 0, "random": 0, "grid": 0}
                        for phase in ('train', 'test')
                    }
                    for phase in ('train', 'test'):
                        path = phase + "/" + l + "/" + l + "_" + perf_m + "_" + dataset
                        for key, rank in another_main(path).items():
                            optimizer = key.partition("_")[0]
                            res[phase][optimizer] = rank
                    mismatches = sum(
                        1 for opt in res['train']
                        if res['train'][opt] != res['test'][opt]
                    )
                    collect[perf_m][l][dataset] = mismatches
    with open("collect_v10.p", 'wb') as handle:
        pickle.dump(collect, handle)
    return collect


    

In [64]:
perf_measures = ['precision', 'recall']
datasets = ['pitsA', 'pitsC', 'pitsD', 'pitsE', 'pitsF']

rq1 = ""
# For every measure/dataset pair, print a heading and then one line per row
# of dic_2[measure][dataset]: the row's first item followed by each remaining
# item with a trailing " , ".
for perf in perf_measures:
    for d in datasets:
        print(perf, d)
        for v in dic_2[perf][d]:
            cells = ["%s ," % (item,) for item in v[1:]]
            print(" ".join([str(v[0])] + cells), end=" \n")
    print()
print()


precision pitsA


TypeError: list indices must be integers or slices, not str

In [71]:
perf_measures = ['precision', 'recall']
datasets = ['pitsA', 'pitsC', 'pitsD',  'pitsE', 'pitsF']

# For every measure/dataset pair, write "copy_<measure>_<dataset>.txt" with
# one line per row of dic_copy[measure][dataset]: the row's first item, then
# each remaining item, every field followed by a single space.
for perf in perf_measures:
    for d in datasets:
        rows = []
        for v in dic_copy[perf][d]:
            rows.append(v[0] + " " + "".join(str(item) + ' ' for item in v[1:]) + "\n")
        rq = "".join(rows)
        # 'with' closes the file even when the write raises.
        with open("copy_" + perf + "_" + d + ".txt", "w") as file_write:
            file_write.write(rq)
    print()
print()





