* intro to kinesis
* configure your AWS credentials
* KCL wordputter
* implementing simple processor class (echo)
* running processor with MultiLangDaemon
* implementing simple counter (no ordering)
* testing with two KCL wordputters with lag (simulating long network delay)
* implement counter with buffer

# Intro

At Sqreen, we use the AWS Kinesis service to process data from our agents in near real-time.

## Requirements

Install dependencies:
    
```
pip install aws
pip install amazon_kcl
```

## Creating stream


To connect to AWS, you must first create your credentials (you will get them from the AWS Console). Then, simply configure them using the following command:

```aws configure --profile blogpost-kinesis```

`blogpost-kinesis` is the name of the profile you will use for this tutorial. You will need to copy your public and secret access keys obtained from the AWS Management Console.

In [1]:
import boto
from boto.kinesis.exceptions import ResourceInUseException
import os
import time

# Point the AWS client at the credentials profile configured for this tutorial.
os.environ['AWS_PROFILE'] = 'blogpost-kinesis'

stream_name = 'blogpost-word-stream'
region = 'eu-west-1'
kinesis = boto.kinesis.connect_to_region(region)

# Creating a stream whose name is already taken raises ResourceInUseException;
# that case is reported and otherwise ignored so the cell can be re-run.
try:
    kinesis.create_stream(stream_name, 1)
except ResourceInUseException:
    print('stream {} already exists in region {}'.format(stream_name, region))
else:
    print('stream {} created in region {}'.format(stream_name, region))

def get_status():
    """Return the ``StreamStatus`` value currently reported for ``stream_name``."""
    description = kinesis.describe_stream(stream_name).get('StreamDescription')
    return description.get('StreamStatus')

# Block until the stream reports ACTIVE, checking once per second.
while True:
    if get_status() == 'ACTIVE':
        break
    time.sleep(1)


stream blogpost-word-stream created in region eu-west-1


# Putting data into streams

In [2]:
import datetime
import time
import threading
from boto.kinesis.exceptions import ResourceNotFoundException

class KinesisProducer(threading.Thread):
    """Background thread that writes timestamped records to a Kinesis stream.

    Every record carries the current UTC time in ISO 8601 form as its data and
    ``ip_addr`` as its partition key.  With a truthy ``sleep_interval`` the
    thread keeps emitting records, pausing that many seconds after each put;
    otherwise it emits a single record and finishes.
    """

    def __init__(self, stream_name, sleep_interval=None, ip_addr='8.8.8.8'):
        super().__init__()
        self.stream_name = stream_name
        self.sleep_interval = sleep_interval
        self.ip_addr = ip_addr

    def put_record(self):
        """Send one record stamped with the current UTC time."""
        payload = datetime.datetime.utcnow().isoformat()
        kinesis.put_record(self.stream_name, payload, self.ip_addr)

    def run_continously(self):
        """Emit records forever, sleeping ``sleep_interval`` seconds after each."""
        while True:
            self.put_record()
            time.sleep(self.sleep_interval)

    def run(self):
        """Thread entry point; reports and stops once the stream is not found."""
        try:
            if not self.sleep_interval:
                self.put_record()
            else:
                self.run_continously()
        except ResourceNotFoundException:
            print('stream {} not found. Exiting'.format(self.stream_name))

In [3]:
# Two producers with different partition keys and emission rates, so the
# consumer sees an interleaved stream of records.
producer1 = KinesisProducer(stream_name=stream_name, sleep_interval=2, ip_addr='8.8.8.8')
producer2 = KinesisProducer(stream_name=stream_name, sleep_interval=5, ip_addr='8.8.8.9')
for producer in (producer1, producer2):
    producer.start()

# Consuming from stream

In [4]:
from boto.kinesis.exceptions import ProvisionedThroughputExceededException

import os
import datetime
import time

In [5]:
# https://github.com/aws-samples/kinesis-poster-worker/blob/master/worker.py

class KinesisWorker:
    """Generic consumer that polls a single shard of a Kinesis stream.

    Subclasses override ``process_records`` to act on each non-empty batch.

    Args:
        stream_name: name of the stream to read from.
        shard_id: identifier of the shard to read; stored as a string.
        iterator_type: shard iterator type passed to ``get_shard_iterator``
            (for example ``'LATEST'``).
        worker_time: how long ``run`` keeps polling, in seconds.
        sleep_interval: pause between two successful polls, in seconds.
    """

    def __init__(self, stream_name, shard_id, iterator_type,
                 worker_time=30, sleep_interval=0.5):
        self.stream_name = stream_name
        self.shard_id = str(shard_id)
        self.iterator_type = iterator_type
        self.worker_time = worker_time
        self.sleep_interval = sleep_interval

    def process_records(self, records):
        """Handle one batch of records; the base implementation does nothing."""
        pass

    @staticmethod
    def iter_records(records):
        """Yield ``(partition_key, data)`` for every record in ``records``."""
        for record in records:
            part_key = record['PartitionKey']
            data = record['Data']
            yield part_key, data

    def run(self):
        """Poll the shard until ``worker_time`` seconds have elapsed.

        Polling also stops early when the service returns no next shard
        iterator, which means the shard cannot deliver any further records.
        """
        response = kinesis.get_shard_iterator(self.stream_name,
            self.shard_id, self.iterator_type)

        next_iterator = response['ShardIterator']

        start = datetime.datetime.now()
        finish = start + datetime.timedelta(seconds=self.worker_time)

        while finish > datetime.datetime.now():
            try:
                response = kinesis.get_records(next_iterator, limit=25)
            except ProvisionedThroughputExceededException:
                # Throttled: back off, then retry with the same iterator.
                time.sleep(1)
                continue

            records = response['Records']
            if records:
                self.process_records(records)

            next_iterator = response.get('NextShardIterator')
            if next_iterator is None:
                # No iterator to continue with (e.g. the shard was closed);
                # polling again would only fail.
                break

            time.sleep(self.sleep_interval)

In [6]:
class EchoWorker(KinesisWorker):
    """Consumer that echoes every received record to standard output."""

    def process_records(self, records):
        """Print one ``<partition key> : <data>`` line per record."""
        for key, payload in self.iter_records(records):
            print(key, ":", payload)

In [7]:
# Shard to read from, and where in it to start reading.
shard_id = 'shardId-000000000000'
iterator_type = 'LATEST'
worker = EchoWorker(stream_name=stream_name, shard_id=shard_id,
                    iterator_type=iterator_type, worker_time=10)

In [8]:
# Poll the shard and echo incoming records until the worker's worker_time
# (10 seconds here) has elapsed.
worker.run()

8.8.8.8 : 2018-07-28T21:42:42.357598
8.8.8.8 : 2018-07-28T21:42:44.517408
8.8.8.9 : 2018-07-28T21:42:45.628920
8.8.8.8 : 2018-07-28T21:42:47.407898
8.8.8.8 : 2018-07-28T21:42:49.528144


In [9]:
from collections import defaultdict, Counter
from dateutil import parser
from operator import itemgetter

class CounterWorker(KinesisWorker):
    """Consumer that counts IP occurrences in one-minute time buckets.

    Each batch passed to ``process_records`` is counted on its own and printed
    immediately; no counts are carried over from one batch to the next.
    """

    def __init__(self, stream_name, shard_id, iterator_type, worker_time):
        # Fixed 20-second pause between two polls of the shard.
        super().__init__(stream_name, shard_id, iterator_type, worker_time, 20)

    @staticmethod
    def print_counters(time_buckets):
        """Print one line per bucket, with its counts ordered by IP address."""
        for bucket_start, per_ip in time_buckets.items():
            ordered = sorted(per_ip.items(), key=itemgetter(0))
            print(bucket_start, ':', ordered)

    def process_records(self, records):
        """Count the batch's records per (minute, IP address) and print the result."""
        time_buckets = defaultdict(Counter)
        for ip_addr, timestamp_str in self.iter_records(records):
            # Truncate the record's timestamp to the start of its minute.
            minute = parser.parse(timestamp_str).replace(second=0, microsecond=0)
            time_buckets[minute][ip_addr] += 1
        self.print_counters(time_buckets)

In [10]:
# Run the per-minute counter against the same shard for two minutes.
worker = CounterWorker(stream_name=stream_name, shard_id=shard_id,
                       iterator_type=iterator_type, worker_time=120)
worker.run()

2018-07-28 21:42:00 : [('8.8.8.8', 4), ('8.8.8.9', 1)]
2018-07-28 21:43:00 : [('8.8.8.8', 6), ('8.8.8.9', 3)]
2018-07-28 21:43:00 : [('8.8.8.8', 9), ('8.8.8.9', 3)]
2018-07-28 21:43:00 : [('8.8.8.8', 10), ('8.8.8.9', 4)]
2018-07-28 21:43:00 : [('8.8.8.8', 3), ('8.8.8.9', 2)]
2018-07-28 21:44:00 : [('8.8.8.8', 6), ('8.8.8.9', 2)]
2018-07-28 21:44:00 : [('8.8.8.8', 9), ('8.8.8.9', 4)]


In [11]:
# Delete the stream at the end of the exercise to minimize AWS costs.
# Producer threads that are still running print
# "stream ... not found. Exiting" and stop when a later put raises
# ResourceNotFoundException (see KinesisProducer.run).
kinesis.delete_stream(stream_name)

stream blogpost-word-stream not found. Exiting
stream blogpost-word-stream not found. Exiting
