Lab
===
In Pairs
------------
### 1. Work through the getting started guide at http://avro.apache.org/docs/current/gettingstartedpython.html

In [1]:
import avro.io
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

### 2. Generate sample data
Consider the following data:

In [4]:
# Six sample facts: one page property, one equivalence edge, one page-view
# edge and three person properties (full name, gender, location).
data = [
    {
        "pedigree": {"true_as_of_secs": 1234567890},
        "dataunit": {
            "page_property": {
                "id": {"url": "http://mysite.com/"},
                "property": {"page_views": 1},
            },
        },
    },
    {
        "pedigree": {"true_as_of_secs": 1234567891},
        "dataunit": {
            "equiv": {
                "id1": {"cookie": "ABCDE"},
                "id2": {"user_id": 123},
            },
        },
    },
    {
        "pedigree": {"true_as_of_secs": 1234567892},
        "dataunit": {
            "page_view": {
                "person": {"cookie": "ABCDE"},
                "page": {"url": "http://mysite.com/"},
                "nonce": 1234567890987654321,
            },
        },
    },
    {
        "pedigree": {"true_as_of_secs": 1234567893},
        "dataunit": {
            "person_property": {
                "id": {"cookie": "ABCDE"},
                "property": {"full_name": "Alessandro"},
            },
        },
    },
    {
        "pedigree": {"true_as_of_secs": 1234567894},
        "dataunit": {
            "person_property": {
                "id": {"user_id": 123},
                "property": {"gender": "MALE"},
            },
        },
    },
    {
        "pedigree": {"true_as_of_secs": 1234567895},
        "dataunit": {
            "person_property": {
                "id": {"user_id": 123},
                "property": {
                    "location": {"city": "San Francisco", "state": "CA"},
                },
            },
        },
    },
]

Write a generator that yields `n` sample records, and validate each record against the schema below:

In [2]:
%%writefile schema.avsc
[
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Pedigree",
        "fields": [{"name": "true_as_of_secs", "type": "int"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonID1",
        "fields": [{"name": "cookie", "type": "string"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonID2",
        "fields": [{"name": "user_id", "type": "long"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageID",
        "fields": [{"name": "url", "type": "string"}]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageProperty",
        "fields": [
            {
                "name": "id",
                "type": "PageID"
            },
            {
                "name": "property",
                "type": {
                    "type": "record",
                    "name": "PagePropertyValue",
                    "fields": [{"name": "page_views", "type": "int"}]
                }
            }
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PersonProperty",
        "fields": [
            {
                "name": "id",
                "type": [
                    "PersonID1",
                    "PersonID2"
                ]
            },
            {
                "name": "property",
                "type": [
                    {
                        "type": "record",
                        "name": "PersonPropertyValue1",
                        "fields": [{"name": "full_name", "type": "string"}]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue2",
                        "fields": [
                            {
                                "name": "gender", 
                                "type": {
                                    "type": "enum",
                                    "name": "GenderType",
                                    "symbols": ["MALE", "FEMALE"]
                                }
                            }
                        ]
                    },
                    {
                        "type": "record",
                        "name": "PersonPropertyValue3",
                        "fields": [
                            {
                                "name": "location", 
                                "type": {
                                    "type": "record",
                                    "name": "Location",
                                    "fields": [
                                        {"name": "city", "type": ["string", "null"]},
                                        {"name": "state", "type": ["string", "null"]},
                                        {"name": "country", "type": [ "string","null"]}
                                    ]
                                }
                            }
                        ]
                    }
                ]
            }
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "EquivEdge",
        "fields": [
            {"name": "id1", "type": ["PersonID1", "PersonID2"]},
            {"name": "id2", "type": ["PersonID1", "PersonID2"]}
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "PageViewEdge",
        "fields": [
            {"name": "person", "type": ["PersonID1", "PersonID2"]},
            {"name": "page", "type": "PageID"},
            {"name": "nonce", "type": "long"}
        ]
    },
    {
        "namespace": "analytics.avro",
        "type": "record",
        "name": "Data",
        "fields": [
            {
                "name": "pedigree",
                "type": "Pedigree"
            },
            {
                "name": "dataunit",
                "type": [
                    {
                        "type": "record",
                        "name": "DataUnit1",
                        "fields": [{"name": "person_property", "type": "PersonProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit2",
                        "fields": [{"name": "page_property", "type": "PageProperty"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit3",
                        "fields": [{"name": "equiv", "type": "EquivEdge"}]
                    },
                    {
                        "type": "record",
                        "name": "DataUnit4",
                        "fields": [{"name": "page_view", "type": "PageViewEdge"}]
                    }
                ]
            }
        ]
    }
]

Overwriting schema.avsc


In [3]:
# Parse the Avro schema from schema.avsc. The `with` block closes the file
# handle as soon as its text has been read, instead of leaving it open until
# garbage collection.
with open("schema.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())

In [5]:
def test_good_data(datum, schema=schema):
    """Check a single record against an Avro schema.

    Parameters
    ----------
    datum : the record to check.
    schema : the schema to validate against. The default is the module-level
        `schema` object as it existed when this function was defined.

    Returns
    -------
    The result of `avro.io.validate(schema, datum)`.
    """
    is_valid = avro.io.validate(schema, datum)
    return is_valid

In [6]:
# Validate every sample record. The sample list is named `data` (there is no
# `gooddata` in this notebook). list() forces evaluation so the per-record
# results are displayed even where map() returns a lazy iterator (Python 3).
list(map(test_good_data, data))

[True, True, True, True, True, True]

### 3. Extend the Schema

#### a) Allow a new person property called `age` that accepts integers
1. Add this to your generator (these data should fail validation)
2. Adjust your schema to allow these new data.
3. Invent negative examples to make sure your schema is not too permissive.

In [7]:
# A person_property fact whose property is {"age": <int>}.
age_example = {
    "pedigree": {"true_as_of_secs": 1234567896},
    "dataunit": {
        "person_property": {
            "id": {"user_id": 9876543210},
            "property": {"age": 23},
        },
    },
}

In [8]:
# Check the age record against the current schema. None of the person
# property records declared in schema.avsc has an "age" field.
avro.io.validate(schema, age_example)

False

#### b) Allow links between pages
1. Add this to your generator (these data should fail validation)
2. Adjust your schema to allow these new data.
3. Invent negative examples to make sure your schema is not too permissive.

In [9]:
# A page_link fact joining a source URL to a target URL.
linked_edge_example = {
    "pedigree": {"true_as_of_secs": 1234567896},
    "dataunit": {
        "page_link": {
            "source": "http://mysite.com/blog",
            "target": "http://mysite.com/",
        },
    },
}

In [10]:
# Validate the page-link record defined for this section. The original cell
# re-validated `age_example`, so the page-link data was never checked.
# The schema in schema.avsc declares no "page_link" data unit.
avro.io.validate(schema, linked_edge_example)

False

On Your Own
------------
Define a fact-based graph schema based on the system you described yesterday.
1. Use [Gliffy](https://www.gliffy.com/) to map it out.
2. Write sample data and tests to see if your sample data fits. *Also generate examples that should fail.*
3. Use [Avro](http://avro.apache.org/docs/current/index.html) to define your schema and test it