In [32]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. lib.py) are picked up without restarting the kernel.
# `%autoreload 2` asks IPython to reload all modules before running each cell.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Let us use some useful things. See lib.py for the source code.
import collections

# The star imports stay ahead of numpy so that `np` is always bound to numpy.
from lib import *
from classifiers import *
import numpy as np

In [39]:
# Load the data.
# Paths are relative to the notebook's working directory.
microarray_file_name = '../data/leukemia_ALL_AML_matrix.txt'
labels_file_name = '../data/leukemia_ALL_AML_labels.txt'
# DataSet is provided by one of the star imports above (presumably lib.py -- verify).
data_store = DataSet(microarray_file_name, labels_file_name) # Data

# Training split; later cells iterate over it and read each item's `.label`.
train = data_store.get_train_set()

In [40]:
# Let's build a majority classifier
class MajorityClassifier(object):
    """Baseline classifier that always predicts the most frequent training label.

    Attributes:
        counts: ``collections.Counter`` mapping each label to the number of
            training items that carried it (empty until ``train`` is called).
        majority: the most frequent training label, or ``None`` while no
            training data has been seen.
    """

    def __init__(self):
        # Initialise the state up front so that print_stats() on a fresh
        # instance reports "not trained" instead of raising AttributeError.
        self.counts = collections.Counter()
        self.majority = None

    def train(self, data):
        """Count the labels in ``data`` and remember the most common one.

        Args:
            data: iterable of items exposing a ``label`` attribute.

        Calling ``train`` again discards the counts of the previous call.
        Training on empty data leaves the classifier untrained
        (``majority`` is ``None``) rather than raising ``IndexError``.
        """
        self.counts = collections.Counter(d.label for d in data)
        # most_common(1) is an empty list when nothing was counted.
        top = self.counts.most_common(1)
        self.majority = top[0][0] if top else None

    def classify(self, data):
        """Pair every item of ``data`` with the majority label.

        Args:
            data: iterable of items to label.

        Returns:
            A list of ``(item, majority_label)`` tuples, in input order.

        Raises:
            ValueError: if the classifier has not been trained on any data.
        """
        if not self.counts:
            raise ValueError('MajorityClassifier must be trained before classify()')
        return [(d, self.majority) for d in data]

    def print_stats(self):
        """Print how often each label was seen during training."""
        total = sum(self.counts.values())
        # print is called with one parenthesised argument so the code parses
        # and prints identically under both Python 2 and Python 3.
        if total == 0:
            print('This classifier hasn\'t been trained yet')
        else:
            for label, count in self.counts.items():
                percent = (count * 100.0) / total
                print('I saw label "%s" %i times (%.2f%%)' % (label, count, percent))
            

In [41]:
# Fit the majority baseline on the training split and show the label distribution.
mj = MajorityClassifier()
mj.train(train)
mj.print_stats()

I saw label "0" 29 times (61.70%)
I saw label "1" 18 times (38.30%)


In [21]:
# Pair every item returned by get_test_set() with the majority label learnt by `mj`.
labelled = mj.classify(data_store.get_test_set())

In [42]:
# evaluate_results comes from one of the star imports; the recorded output shows
# that it reports an accuracy for the (item, predicted label) pairs.
# NOTE(review): `labelled` was produced by execution In [21], i.e. before the
# classifier was last trained (In [41]) -- re-run the cells in order to confirm.
evaluate_results(labelled)

Accuracy: 72.00%


How would we increase the accuracy of this method?

How many points do we need to distribute uniformly in the interval $[0, 1]$ to have, on average, 1 point in any sub-interval of length 0.1?


In [31]:
# Monte Carlo estimate of how many of k uniform points in [0, 1) fall into
# the sub-interval [0, 0.1), averaged over n independent trials.
k = 10   # points dropped per trial
n = 500  # number of trials
# Local, seeded generator: re-running the cell reproduces the same number
# without touching the global numpy random state.
rng = np.random.RandomState(0)
# One row per trial; count per row how many of its k points land below 0.1.
# Cast to float to keep the dtype of the original np.zeros accumulator.
num_in_reg = (rng.uniform(0, 1, (n, k)) < 0.1).sum(axis=1).astype(float)
# Single parenthesised argument: valid under both Python 2 and Python 3.
print(np.mean(num_in_reg))

1.012


What about in the unit square $[0, 1]^2$, looking at a square region of side 0.1 (area $0.1^2$)?

What about in the unit hypercube $[0, 1]^{10}$, looking at a hypercube region of side 0.1 (volume $0.1^{10}$)?

What does this mean for local methods? 

In [72]:
# Monte Carlo estimate of how many of k uniform points in the unit cube
# [0, 1)^d fall into the corner region [0, 0.1)^d, averaged over n trials.
k = 1000  # points dropped per trial
n = 100   # number of trials
d = 3     # dimension of the space
# Local, seeded generator: re-running the cell reproduces the same number
# without touching the global numpy random state.
rng = np.random.RandomState(0)
# Shape (trial, point, coordinate). A point is inside the region only when
# all of its d coordinates are below 0.1.
samples = rng.uniform(0, 1, (n, k, d))
# Cast to float to keep the dtype of the original np.zeros accumulator.
num_in_reg = (samples < .1).all(axis=2).sum(axis=1).astype(float)
# Single parenthesised argument: valid under both Python 2 and Python 3.
print(np.mean(num_in_reg))

0.94


In [51]:
# Scratch inspection cell. Only the last expression of a cell is displayed,
# so the bare `num_in_reg` line on its own shows nothing.
num_in_reg
# Draw one point with d coordinates from the global numpy random state.
# NOTE(review): the recorded output has 2 components, so `d` was presumably 2
# when this cell last ran (In [51]) -- stale kernel state; re-run in order.
np.random.uniform(0,1,d)

array([ 0.21834475,  0.50865988])