Lab
===
In Pairs
------------
### 1. Work through the getting started guide at http://avro.apache.org/docs/current/gettingstartedpython.html

In [239]:
import avro.io
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

### 2. Generate sample data
Consider the following data:

In [240]:
# Seven hand-written sample facts: one pedigree timestamp plus one data unit each.
data = [
    # 0: page property (page views of a URL)
    {
        "pedigree": {"true_as_of_secs": 1234567890},
        "dataunit": {
            "page_property": {
                "id": {"url": "http://mysite.com/"},
                "property": {"page_views": 1},
            },
        },
    },
    # 1: equivalence edge between a cookie and a user id
    {
        "pedigree": {"true_as_of_secs": 1234567891},
        "dataunit": {
            "equiv": {
                "id1": {"cookie": "ABCDE"},
                "id2": {"user_id": 123},
            },
        },
    },
    # 2: page-view edge
    {
        "pedigree": {"true_as_of_secs": 1234567892},
        "dataunit": {
            "page_view": {
                "person": {"cookie": "ABCDE"},
                "page": {"url": "http://mysite.com/"},
                "nonce": 1234567890987654321,
            },
        },
    },
    # 3: person property - full name
    {
        "pedigree": {"true_as_of_secs": 1234567893},
        "dataunit": {
            "person_property": {
                "id": {"cookie": "ABCDE"},
                "property": {"full_name": "Alessandro"},
            },
        },
    },
    # 4: person property - gender
    {
        "pedigree": {"true_as_of_secs": 1234567894},
        "dataunit": {
            "person_property": {
                "id": {"user_id": 123},
                "property": {"gender": "MALE"},
            },
        },
    },
    # 5: person property - location
    {
        "pedigree": {"true_as_of_secs": 1234567895},
        "dataunit": {
            "person_property": {
                "id": {"user_id": 123},
                "property": {
                    "location": {"city": "San Francisco", "state": "CA"},
                },
            },
        },
    },
    # 6: person property - age
    {
        "pedigree": {"true_as_of_secs": 1234567896},
        "dataunit": {
            "person_property": {
                "id": {"user_id": 9876543210},
                "property": {"age": 23},
            },
        },
    },
]

In [241]:
import copy, pprint, time, math, random, string

#data gen
def datagenerator(n, template=None):
    """Yield ``n`` randomised batches modelled on the sample fact list.

    Every batch is a deep copy of ``template`` whose identifiers, property
    values and pedigree timestamps have been overwritten with freshly
    generated values. The template itself is never modified. Within one
    batch the same URL, cookie and user id are reused across records so the
    facts stay mutually consistent.

    Parameters
    ----------
    n : int
        Number of batches to produce; zero or a negative value yields nothing.
    template : list of dict, optional
        Seven records laid out like the module-level ``data`` list, in this
        order: page_property, equiv, page_view, then person_property facts
        for full_name, gender, location and age. Defaults to ``data``.

    Yields
    ------
    list of dict
        One randomised copy of the template per iteration.
    """
    if template is None:
        template = data

    # Largest value an Avro "long" (signed 64-bit) can hold. The previous
    # implementation glued 19 random digits together, which can exceed this
    # bound and make the page_view record fail schema validation.
    max_avro_long = 2 ** 63 - 1

    # One OS-backed generator is enough; no need to build a new one per character.
    secure = random.SystemRandom()

    def _letters(alphabet, length):
        # Random string of `length` characters drawn from `alphabet`.
        return ''.join(secure.choice(alphabet) for _ in range(length))

    for _ in range(n):
        batch = copy.deepcopy(template)

        # Values shared by several records of the same batch.
        url = "https://" + _letters(string.ascii_lowercase, 6) + ".com"
        cookie = _letters(string.ascii_uppercase, 5)
        userid = random.randint(100, 999)
        nonce = random.randint(0, max_avro_long)
        name = _letters(string.ascii_uppercase, 1) + _letters(string.ascii_lowercase, 6)
        city = _letters(string.ascii_uppercase, 1) + _letters(string.ascii_lowercase, 9)
        state = _letters(string.ascii_uppercase, 2)
        age = random.randint(0, 99)
        gender = 'MALE' if random.getrandbits(1) == 1 else 'FEMALE'

        # Stamp every record of the batch with the current whole second.
        now = int(math.floor(time.time()))
        for record in batch:
            record['pedigree']['true_as_of_secs'] = now

        page_property = batch[0]['dataunit']['page_property']
        page_property['id']['url'] = url
        page_property['property']['page_views'] = random.randint(1, 10)

        equiv = batch[1]['dataunit']['equiv']
        equiv['id1']['cookie'] = cookie
        equiv['id2']['user_id'] = userid

        page_view = batch[2]['dataunit']['page_view']
        page_view['page']['url'] = url
        page_view['nonce'] = nonce
        page_view['person']['cookie'] = cookie

        full_name_fact = batch[3]['dataunit']['person_property']
        full_name_fact['id']['cookie'] = cookie
        full_name_fact['property']['full_name'] = name

        gender_fact = batch[4]['dataunit']['person_property']
        gender_fact['id']['user_id'] = userid
        gender_fact['property']['gender'] = gender

        location_fact = batch[5]['dataunit']['person_property']
        location_fact['id']['user_id'] = userid
        location_fact['property']['location']['city'] = city
        location_fact['property']['location']['state'] = state

        age_fact = batch[6]['dataunit']['person_property']
        age_fact['id']['user_id'] = userid
        age_fact['property']['age'] = age

        yield batch

Write a generator that yields `n` batches of sample data, and validate each generated record against the schema below:

In [258]:
%%writefile schema.avsc
[
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Pedigree",
        "fields": [{"name": "true_as_of_secs", "type": "int"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonID1",
        "fields": [{"name": "cookie", "type": "string"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonID2",
        "fields": [{"name": "user_id", "type": "long"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageID",
        "fields": [{"name": "url", "type": "string"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageProperty",
        "fields": [
            {
                "name": "id",
                "type": "PageID"
            },
            {
                "name": "property",
                "type": {
                    "type": "record",
                    "name": "PagePropertyValue",
                    "fields": [{"name": "page_views", "type": "int"}]
                }
            }
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonProperty",
        "fields": [
            {
                "name": "id",
                "type": [
                    "PersonID1",
                    "PersonID2"
                ]
            },
            {
                "name": "property",
                "type": [
                    {
                        "type": "record",
                        "name": "PersonPropertyValue1",
                        "fields": [{"name": "full_name", "type": "string"}]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue2",
                        "fields": [
                            {
                                "name": "gender", 
                                "type": {
                                    "type": "enum",
                                    "name": "GenderType",
                                    "symbols": ["MALE", "FEMALE"]
                                }
                            }
                        ]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue3",
                        "fields": [
                            {
                                "name": "location", 
                                "type": {
                                    "type": "record",
                                    "name": "Location",
                                    "fields": [
                                        {"name": "city", "type": ["string", "null"]},
                                        {"name": "state", "type": ["string", "null"]},
                                        {"name": "country", "type": [ "string","null"]}
                                    ]
                                }
                            }
                        ]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue4",
                        "fields": [{"name": "age", "type": "int"}]
                    }
                ]
            }
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "EquivEdge",
        "fields": [
            {"name": "id1", "type": ["PersonID1", "PersonID2"]},
            {"name": "id2", "type": ["PersonID1", "PersonID2"]}
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageViewEdge",
        "fields": [
            {"name": "person", "type": ["PersonID1", "PersonID2"]},
            {"name": "page", "type": "PageID"},
            {"name": "nonce", "type": "long"}
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Data",
        "fields": [
            {
                "name": "pedigree",
                "type": "Pedigree"
            },
            {
                "name": "dataunit",
                "type": [
                    {
                        "type": "record",
                        "name": "DataUnit1",
                        "fields": [{"name": "person_property", "type": "PersonProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit2",
                        "fields": [{"name": "page_property", "type": "PageProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit3",
                        "fields": [{"name": "equiv", "type": "EquivEdge"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit4",
                        "fields": [{"name": "page_view", "type": "PageViewEdge"}]
                    }
                ]
            }
        ]
    }
]

Overwriting schema.avsc


In [259]:
# Parse the schema file written by the %%writefile cell above. The context
# manager closes the file handle as soon as its contents have been read.
with open("schema.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())


def test_good_data(datum, schema=schema):
    """Return the result of validating ``datum`` against ``schema``.

    ``schema`` defaults to the schema object parsed just above; the default
    is bound when this function is defined, so rebinding the global name
    ``schema`` later does not change it.
    """
    return avro.io.validate(schema, datum)

In [260]:
import pprint, json

pp = pprint.PrettyPrinter()
newdata = datagenerator(10)

# Take the first generated batch only; the built-in next() works on
# generators in both Python 2.6+ and Python 3, unlike the .next() method.
testdata = list(next(newdata))

# To inspect or check the generated batch instead of the hand-written sample,
# use pp.pprint(testdata) or map(test_good_data, testdata).
print(list(map(test_good_data, data)))

[True, True, True, True, True, True, True]


In [246]:
# Reduced sample: two page-property facts and one page-to-page link.
smalldata = [
    {
        "pedigree": {"true_as_of_secs": 1234567890},
        "dataunit": {
            "page_property": {
                "id": {"url": "http://mysite.com/blog"},
                "property": {"page_views": 1},
            },
        },
    },
    {
        "pedigree": {"true_as_of_secs": 1234567891},
        "dataunit": {
            "page_property": {
                "id": {"url": "http://mysite.com/"},
                "property": {"page_views": 1},
            },
        },
    },
    {
        "pedigree": {"true_as_of_secs": 1234567896},
        "dataunit": {
            "page_link": {
                # Both ends of the link are records carrying a "url" field.
                "source": {"url": "http://mysite.com/blog"},
                "target": {"url": "http://mysite.com/"},
            },
        },
    },
]

In [247]:
%%writefile smallschema.avsc
[
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Pedigree",
        "fields": [{"name": "true_as_of_secs", "type": "int"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageID1",
        "fields": [{"name": "url", "type": "string"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageProperty",
        "fields": [
            {
                "name": "id",
                "type": "PageID1"
            },
            {
                "name": "property",
                "type": {
                    "type": "record",
                    "name": "PagePropertyValue",
                    "fields": [{"name": "page_views", "type": "int"}]
                }
            }
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageLinkEdge",
        "fields": [
            {"name": "source", "type": "PageID1"},
            {"name": "target", "type": "PageID1"}
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Data",
        "fields": [
            {
                "name": "pedigree",
                "type": "Pedigree"
            },
            {
                "name": "dataunit",
                "type": [
                    {
                        "type": "record",
                        "name": "DataUnit2",
                        "fields": [{"name": "page_property", "type": "PageProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit5",
                        "fields": [{"name": "page_link", "type": "PageLinkEdge"}]
                    }
                ]
            }
        ]
    }
]

Overwriting smallschema.avsc


In [248]:
# Rebind `schema` to the reduced schema written by the cell above. The
# context manager closes the file handle once its contents have been read.
with open("smallschema.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())

In [249]:
def test_good_data(datum, schema=schema):
    """Validate ``datum`` against ``schema`` and return the validator's result.

    The default for ``schema`` is captured when this definition runs, so this
    redefinition picks up whatever the global ``schema`` is bound to at that
    point.
    """
    is_valid = avro.io.validate(schema, datum)
    return is_valid

In [250]:
# Validate every record of the reduced sample with test_good_data.
print(list(map(test_good_data, smalldata)))

[True, True, True]


### 3. Extend the Schema

#### a) Allow a new person property called `age` that accepts integers  
1. Add this to your generator (these data should fail validation)
2. Adjust your schema to allow these new data.
3. Invent negative examples to make sure your schema is not too permissive.

In [7]:
# Sample fact carrying the new "age" person property.
age_example = {
    "pedigree": {"true_as_of_secs": 1234567896},
    "dataunit": {
        "person_property": {
            "id": {"user_id": 9876543210},
            "property": {"age": 23},
        },
    },
}

#### b) Allow links between pages
1. Add this to your generator (these data should fail validation)
2. Adjust your schema to allow these new data.
3. Invent negative examples to make sure your schema is not too permissive.

In [211]:
# Sample page-link fact; here both ends of the link are bare URL strings.
linked_edge_example = {
    "pedigree": {"true_as_of_secs": 1234567896},
    "dataunit": {
        "page_link": {
            "source": "http://mysite.com/blog",
            "target": "http://mysite.com/",
        },
    },
}

In [212]:
# Check the page-link example against whatever `schema` is currently bound to.
# NOTE(review): the result depends on which cell last assigned `schema`; if it
# is the smallschema.avsc one, PageLinkEdge expects "source"/"target" to be
# PageID1 records with a "url" field, while this example passes bare strings.
avro.io.validate(schema, linked_edge_example)

False

On Your Own
------------
Define a fact-based graph schema based on the system you described yesterday.
1. Use [Gliffy](https://www.gliffy.com/) to map it out.
2. Write sample data and tests to see if your sample data fits. *Also generate examples that should fail.*
3. Use [Avro](http://avro.apache.org/docs/current/index.html) to define your schema and test it