## Anthony Soroka
## CS 207 HW 6

## Q1.

Add a `__setitem__` method to the Python linked list implementation from the lecture (this past Wednesday).

In [1]:
#your code here

from doctest import run_docstring_examples as dtest
import numbers
import reprlib
class LL:
    """
    Singly linked list whose nodes are two-item lists ``[value, next_node]``.

    ``_headNode`` holds the first node, or ``None`` when the list is empty.
    Items are read and written with integer indices counted from the front.
    Negative indices are not supported and raise ``IndexError``; any
    non-integer index (slices included) raises ``TypeError``.

    >>> A = LL()
    >>> A[0]
    Traceback (most recent call last):
        ...
    IndexError: trying to index an empty LL
    >>> A.insert_front(1)
    >>> A[0]
    1
    >>> A.insert_back(2)
    >>> A[1]
    2
    >>> A
    LL([1,...])
    >>> myll = LL.from_components([1,2])
    >>> myll[1]
    1
    >>> len(myll)
    2
    >>> myll[2]
    Traceback (most recent call last):
        ...
    IndexError: LL index out of range
    >>> myll[0:1]
    Traceback (most recent call last):
        ...
    TypeError: LL indices must be integers
    >>> myll[0] = 10
    >>> myll[0]
    10
    >>> myll[2] = 0
    Traceback (most recent call last):
        ...
    IndexError: LL index out of range
    >>> B = LL()
    >>> B.insert_back(3)
    >>> B[0]
    3
    """

    @classmethod
    def from_components(cls, components):
        """Build a list by inserting each component at the front.

        Because every element is pushed onto the front, the resulting list
        is in the reverse order of ``components``. An empty ``components``
        yields an empty list.
        """
        inst = cls()
        for c in components:
            inst.insert_front(c)
        return inst

    def __init__(self, head=None):
        """Create an empty list, or a one-element list holding ``head``."""
        if head is None:
            self._headNode = None
        else:
            self._headNode = [head, None]

    def _nodes(self):
        """Yield every node, front to back."""
        node = self._headNode
        while node is not None:
            yield node
            node = node[1]

    def _node_at(self, index):
        """Return the node at ``index``.

        Raises:
            TypeError: if ``index`` is not an integer.
            IndexError: if the list is empty or ``index`` is out of range.
        """
        class_name = type(self).__name__
        if not isinstance(index, numbers.Integral):
            msg = '{class_name} indices must be integers'
            raise TypeError(msg.format(class_name=class_name))
        if self._headNode is None:
            msg = 'trying to index an empty {class_name}'
            raise IndexError(msg.format(class_name=class_name))
        for position, node in enumerate(self._nodes()):
            if position == index:
                return node
        # Reached the tail without a match (this also covers negative indices).
        msg = '{class_name} index out of range'
        raise IndexError(msg.format(class_name=class_name))

    def insert_front(self, element):
        """Make ``element`` the new first item."""
        self._headNode = [element, self._headNode]

    def insert_back(self, element):
        """Append ``element`` after the current last item."""
        new_node = [element, None]
        if self._headNode is None:
            # Empty list: there is no tail to link from, so the new node
            # becomes the head.
            self._headNode = new_node
            return
        tail = self._headNode
        while tail[1] is not None:
            tail = tail[1]
        tail[1] = new_node

    def __repr__(self):
        """Return an abbreviated repr showing only the first item."""
        class_name = type(self).__name__
        if self._headNode is None:
            components = ""
        else:
            components = reprlib.repr(self._headNode[0])
        return '{}([{},...])'.format(class_name, components)

    def __len__(self):
        """Return the number of items, found by walking the whole list."""
        return sum(1 for _ in self._nodes())

    def __getitem__(self, index):
        """Return the value stored at integer position ``index``."""
        return self._node_at(index)[0]

    def __setitem__(self, index, value):
        """Replace the value stored at integer position ``index``."""
        self._node_at(index)[0] = value



In [2]:
# from_components pushes each value onto the front, so the last value (5)
# ends up at index 0, as the repr below shows.
myll=LL.from_components([1,2,32,-4,5])
myll

LL([5,...])

In [3]:
myll[0]

5

In [4]:
myll[0]=10

In [5]:
myll[0]

10

## Q2.

An online mean and standard deviation algorithm.

Below is a function to generate a potentially infinite stream of 1-D data.

In [6]:
from random import normalvariate, random
from itertools import count
def make_data(m, stop=None):
    """Generate a stream of noisy floats centred on 1.0e09.

    Each value is ``1.0e09`` plus a normal deviate with mean 0 whose standard
    deviation is itself drawn uniformly from ``[0, m)``.

    Args:
        m: upper bound on the per-sample standard deviation.
        stop: last index (inclusive) to generate, so ``stop + 1`` values are
            yielded; ``None`` gives an endless stream.
    """
    for i in count():
        # Test against None explicitly so that stop=0 still ends the stream
        # instead of being mistaken for "no limit".
        if stop is not None and i > stop:
            break
        yield 1.0e09 + normalvariate(0, m*random())
        

Here is an implementation of an online mean algorithm; see http://www.johndcook.com/blog/standard_deviation/ and the post it links to, http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/. (Convince yourselves of the formulas.)

In [7]:
def online_mean(iterator):
    """Yield the running mean after each value taken from ``iterator``.

    The mean is updated incrementally as ``mean + (value - mean) / n``, so
    no earlier values are stored.
    """
    running = 0
    for seen, sample in enumerate(iterator, start=1):
        running = running + (sample - running) / seen
        yield running

We use our generator functions to implement iterators:

In [8]:
# Materialize a short, bounded stream so the generated values can be inspected.
g = make_data(5, 10)
list(g)

[1000000001.2913613,
 999999998.541625,
 999999996.4813172,
 1000000001.2651098,
 1000000000.1076093,
 1000000005.0059614,
 999999993.9145777,
 1000000000.4092168,
 999999997.8543077,
 1000000000.3356925,
 999999999.4412125]

In [9]:
# Chain the two generators: online_mean consumes make_data lazily, and
# nothing is computed until list() drains the pipeline.
g = online_mean(make_data(5, 100))
print(type(g))
list(g)

<class 'generator'>


[1000000000.6461619,
 1000000000.6865436,
 1000000000.4372303,
 1000000000.1577805,
 999999999.6702225,
 999999998.7093304,
 999999998.6592153,
 999999998.8635647,
 999999998.9865268,
 999999998.893634,
 999999998.8449852,
 999999998.8877618,
 999999998.913712,
 999999998.9869508,
 999999999.3656963,
 999999999.5846772,
 999999999.7313927,
 999999999.7135594,
 999999999.7919178,
 999999999.885739,
 1000000000.1059027,
 999999999.8517704,
 999999999.9311228,
 999999999.8749071,
 999999999.845178,
 999999999.8017563,
 999999999.7589154,
 999999999.7940309,
 999999999.691059,
 999999999.5243303,
 999999999.5256184,
 999999999.5473416,
 999999999.5923157,
 999999999.7052277,
 999999999.7112566,
 999999999.7436287,
 999999999.7758672,
 999999999.7784277,
 999999999.7965505,
 999999999.7512287,
 999999999.7019122,
 999999999.7459227,
 999999999.7031747,
 999999999.5941386,
 999999999.6038179,
 999999999.6088938,
 999999999.6161351,
 999999999.6151828,
 999999999.6201885,
 999999999.5919363,


### 2.1

Implement the standard deviation algorithm as a generator function as

```python
def online_mean_dev(iterator):
    BLA BLA
    if n > 1:
        stddev = math.sqrt(dev_accum/(n-1))
        yield (n, value, mu, stddev)
```

In [23]:
import math

# your code here
def online_mean_dev(iterator):
    """Yield running statistics for each value taken from ``iterator``.

    Every yielded tuple is ``(n, value, mean, stddev)``: the count so far,
    the value just read, the running mean, and the running sample standard
    deviation (divisor ``n - 1``). For the first value the standard
    deviation is reported as 0.
    """
    mean = 0
    sq_dev_sum = 0
    spread = 0
    for seen, sample in enumerate(iterator, start=1):
        previous_mean = mean
        mean = previous_mean + (sample - previous_mean) / seen
        if seen > 1:
            # Accumulate the product of the deviations from the old and the
            # new mean, then normalise by n - 1.
            sq_dev_sum = sq_dev_sum + (sample - previous_mean) * (sample - mean)
            spread = math.sqrt(sq_dev_sum / (seen - 1))
        yield (seen, sample, mean, spread)

Here we generate a data stream of about 100,000 elements and run this iterator on it (imagine running this on a time series being slowly read from disk).

In [29]:
# Lazy pipeline: no data is generated until data_with_stats is iterated.
data_with_stats = online_mean_dev(make_data(5, 100000))

## Q3.

Let's do Anomaly detection. Write a routine `is_ok`:

```python
def is_ok(level, t):
```

which takes a tuple like the one yielded by your code above and returns True if the value is within `level`-$\sigma$ of the mean,

i.e., within the mean plus or minus `level` * sigma.

In [30]:
#your code here
def is_ok(level, t):
    """Return True when the value in ``t`` is within ``level`` sigmas of the mean.

    ``t`` is indexed as ``(n, value, mean, stddev)``; the band is inclusive
    at both ends.
    """
    value, center, half_width = t[1], t[2], level*t[3]
    return bool(center - half_width <= value <= center + half_width)

We use this function to create a predicate passed through to `itertools.filterfalse` which is then used to obtain an iterator on the anomalies.

In [31]:
from itertools import filterfalse
# filterfalse keeps the tuples for which pred is False, i.e. the values lying
# more than 5 sigma from the running mean.
pred = lambda t: is_ok(5, t)
anomalies = filterfalse(pred, data_with_stats)

We materialize the anomalies...

In [32]:
list(anomalies)#materialize

[(736, 999999985.3929704, 1000000000.1680317, 2.9528528810254486),
 (2360, 999999982.8804344, 1000000000.118056, 2.9539614862434367),
 (8919, 1000000015.09703, 999999999.9985555, 2.936650432039978),
 (10016, 999999985.297056, 1000000000.0011773, 2.927714005453207),
 (13635, 1000000016.3330787, 1000000000.0105134, 2.9184096730237257),
 (18051, 999999984.842913, 1000000000.0060955, 2.914297259435276),
 (21268, 1000000016.4218655, 1000000000.019413, 2.9064304694046013),
 (30894, 999999982.1473114, 1000000000.0102682, 2.8895235427478783),
 (33988, 1000000015.8762891, 1000000000.0137726, 2.890059605747303),
 (36732, 999999985.2819097, 1000000000.0068102, 2.8793577063205396),
 (37123, 999999982.6506431, 1000000000.0062026, 2.8828898517093364),
 (38762, 1000000016.0274158, 1000000000.0048505, 2.8854737055909094),
 (41321, 1000000015.14062, 1000000000.0103319, 2.8891115884661485),
 (43886, 1000000014.6411188, 1000000000.0090114, 2.88947711931464),
 (61743, 999999985.2794871, 999999999.9982488,

## To think of, but not hand in

What kinds of anomalies will this algorithm pick up? What kinds would a shorter "window" of anomaly detection, such as 100 points around the time in question, pick up? How might you create an algorithm which does window-based averaging? (Hint: the window size is small compared to the time series size.)

Finally think a bit of how you might implement all of this in a production environment..remember that data streaming in might get backed up when you handle an anomaly.

(Some inspiration might accrue if you look at the docs for `collections.deque`).