In [None]:
from __future__ import print_function, division

## Challenge 1
- Open up a new ipython notebook
- Download a few MTA turnstile data files
- Open a file, read it with `csv.reader`, and build a Python dict with one key for each (C/A, UNIT, SCP, STATION) tuple. These are the first four columns. The value for each key should be a list of lists, where each inner list holds the remaining columns of one row. For example, one key-value pair should look like:


        {    ('A002','R051','02-00-00','LEXINGTON AVE'):    
             [
               ['NQR456', 'BMT', '01/03/2015', '03:00:00', 'REGULAR', '0004945474', '0001675324'],          
                 ['NQR456', 'BMT', '01/03/2015', '07:00:00', 'REGULAR', '0004945478', '0001675333'],  
                ['NQR456', 'BMT', '01/03/2015', '11:00:00', 'REGULAR', '0004945515', '0001675364'],
              ...   
         ] 
        }


In [1]:
url_template = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_%s.txt'
for date in [160903, 160910, 160917]:
    url = url_template % date
    !wget --directory-prefix=data/ {url}

--2016-09-25 12:42:11--  http://web.mta.info/developers/data/nyct/turnstile/turnstile_160903.txt
Resolving web.mta.info... 23.72.180.105, 23.72.181.201
Connecting to web.mta.info|23.72.180.105|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: 'data/turnstile_160903.txt'

turnstile_160903.tx     [     <=>            ]  24.13M  1.98MB/s    in 13s     

2016-09-25 12:42:24 (1.90 MB/s) - 'data/turnstile_160903.txt' saved [25301340]

--2016-09-25 12:42:24--  http://web.mta.info/developers/data/nyct/turnstile/turnstile_160910.txt
Resolving web.mta.info... 23.72.180.105, 23.72.181.201
Connecting to web.mta.info|23.72.180.105|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: 'data/turnstile_160910.txt'

turnstile_160910.tx     [                <=> ]  24.35M  1.11MB/s    in 18s     

2016-09-25 12:42:42 (1.37 MB/s) - 'data/turnstile_160910.txt' saved [25529149]

--2016-09-25 12:

In [32]:
import csv, glob
from collections import defaultdict
from itertools import islice

def read_csv(csv_file_name):
    """Read one turnstile file into a dict keyed by turnstile.

    Parameters
    ----------
    csv_file_name : str
        Path of a comma-separated turnstile file whose first row is a header.

    Returns
    -------
    collections.defaultdict
        Maps the tuple of a row's first four columns
        (C/A, UNIT, SCP, STATION) to a list that holds, for every row with
        that key and in file order, the list of that row's remaining columns.
    """
    turnstile_to_count_reading = defaultdict(list)
    with open(csv_file_name, 'r') as csv_file:
        mta_reader = csv.reader(csv_file)
        # skip the first row, it's just header strings; the None default means
        # an empty file yields no rows instead of raising StopIteration
        next(mta_reader, None)
        for row in mta_reader:
            # csv.reader yields [] for a blank line; without this guard such a
            # line would create a bogus () key holding an empty reading
            if not row:
                continue
            turnstile_info = tuple(row[:4])
            count_reading = row[4:]
            turnstile_to_count_reading[turnstile_info].append(count_reading)
    return turnstile_to_count_reading


# Load every downloaded weekly file. glob.glob() makes no promise about the
# order of its results, so sort the paths: the YYMMDD suffix in the file names
# makes alphabetical order chronological, and the later step that joins the
# weeks together assumes the weeks arrive in that order.

#A) List comprehension
weekly_data_dicts = [read_csv(csvfile) for csvfile in sorted(glob.glob("data/turnstile_*.txt"))]

#B) Alternatively, separating the steps on multiple lines
# weekly_data_dicts = []
# for data_file in sorted(glob.glob("data/turnstile_*.txt")):
#    print('Processing %s' % data_file)
#    weekly_data_dicts.append(read_csv(data_file))

#(Choose the approach you find more readable)

In [None]:
from pprint import pprint
from itertools import islice

# just get 2 keys from the first dict to not overwhelm the output
# (islice stops after two items instead of copying the whole dict)
sample_dict = dict(islice(weekly_data_dicts[0].items(), 0, 2))
pprint(sample_dict)

## Challenge 2
- Let's turn this into a time series.

For each key (the control area, unit, device address, and station that together identify a specific turnstile), keep a list as before, but let each item in the list hold just the point in time and the cumulative count of entries.

This basically means keeping only the date, time, and entries fields in each list. You can convert the date and time into datetime objects -- That is a python class that represents a point in time. You can combine the date and time fields into a string and use the [dateutil][1] module to convert it into a datetime object. For an example check [this StackOverflow question][2].

Your new dict should look something like
 
    {    ('A002','R051','02-00-00','LEXINGTON AVE'):    
             [
                [datetime.datetime(2013, 3, 2, 3, 0), 3788],
                [datetime.datetime(2013, 3, 2, 7, 0), 2585],
                [datetime.datetime(2013, 3, 2, 12, 0), 10653],
                [datetime.datetime(2013, 3, 2, 17, 0), 11016],
                [datetime.datetime(2013, 3, 2, 23, 0), 10666],
                [datetime.datetime(2013, 3, 3, 3, 0), 10814],
                [datetime.datetime(2013, 3, 3, 7, 0), 10229],
                ...
              ],
     ....
     }



In [None]:
from itertools import groupby
from operator import itemgetter

def count_within_normal_bounds(count):
    """Tell whether a daily entry count is acceptable.

    ``None`` (no count available) is accepted as-is; any other value must be
    at least 0 and strictly below 10000.
    """
    return count is None or 0 <= count < 10000

def convert_time_series_to_daily(high_res_time_series):
    """Collapse cumulative readings into one entry count per calendar day.

    Parameters
    ----------
    high_res_time_series : iterable of (timestamp, cumulative_count) pairs
        Readings of one turnstile. groupby() only merges neighbouring items,
        so readings of the same day must be adjacent (i.e. the series sorted
        by time).

    Returns
    -------
    list of (datetime.date, int or None) tuples
        One tuple per day. The count is the day's highest cumulative reading
        minus the previous day's highest cumulative reading. It is None for
        the first day (nothing to subtract) and for days whose difference
        fails count_within_normal_bounds(); the latter also print a warning.
    """
    # Helper visible only inside this function: the grouping key of a reading.
    def calendar_day(reading):
        moment, _cumulative = reading
        # .date() drops the time of day and returns a datetime.date
        return moment.date()

    daily_time_series = []
    previous_total = None
    for day, readings in groupby(high_res_time_series, key=calendar_day):
        # highest cumulative reading seen on this day
        todays_total = max(count for _moment, count in readings)
        # no difference can be formed until a previous day is known
        if previous_total is None:
            daily_entries = None
        else:
            daily_entries = todays_total - previous_total
        # remember today's total for tomorrow's subtraction
        previous_total = todays_total
        # An out-of-bounds difference means something is wrong in the data:
        # warn, and keep the day in the series with None as its count.
        if not count_within_normal_bounds(daily_entries):
            print('WARNING. Abnormal entry count found '
                   'on day %s: %s' % (day, daily_entries))
            daily_entries = None
        daily_time_series.append((day, daily_entries))

    return daily_time_series


def combine_multiple_weeks_into_single_high_res_timeseries(weekly_time_series):
    """Join the per-week time series of every turnstile into one long series.

    Parameters
    ----------
    weekly_time_series : iterable of dict
        One dict per week, each mapping a turnstile key to a list of
        [timestamp, cumulative_count] readings.

    Returns
    -------
    collections.defaultdict
        Maps each turnstile key to a new list holding all of its readings
        from all weeks, ordered by timestamp. The input lists are not
        modified.
    """
    combined_time_series = defaultdict(list)
    for turnstile_to_weeklong_time_series in weekly_time_series:
        for turnstile, weeklong_time_series in turnstile_to_weeklong_time_series.items():
            combined_time_series[turnstile] += weeklong_time_series
    # Do not rely on the weeks having been supplied in chronological order:
    # sort every combined series by its timestamp. The sort is stable, so
    # readings sharing a timestamp keep their relative order, and it is cheap
    # when the data is already ordered.
    for readings in combined_time_series.values():
        readings.sort(key=lambda reading: reading[0])
    return combined_time_series


def convert_turnstile_to_high_res_time_series_to_daily(turnstile_to_time_series):
    """Apply convert_time_series_to_daily() to the series of every turnstile.

    Takes a mapping from turnstile key to its high-resolution time series and
    returns a plain dict with the same keys whose values are the daily series.
    The turnstile key is printed before its series is converted.
    """
    daily_by_turnstile = {}
    for turnstile, readings in turnstile_to_time_series.items():
        print('Processing turnstile', turnstile)
        daily_by_turnstile[turnstile] = convert_time_series_to_daily(readings)
    return daily_by_turnstile


# Build the multi-week series per turnstile, then reduce each to daily counts.
# NOTE(review): `weekly_time_series` is assigned by the Challenge 2 cell that
# appears further down in this notebook, so that cell has to be run before
# this one.
turnstile_to_full_time_series = combine_multiple_weeks_into_single_high_res_timeseries(weekly_time_series)
turnstile_to_daily_time_series = convert_turnstile_to_high_res_time_series_to_daily(turnstile_to_full_time_series)


In [None]:
from datetime import datetime
from dateutil.parser import parse
from collections import defaultdict


def convert_week_data_to_time_series(week_data_dict):
    """Reduce each turnstile's raw rows to [timestamp, cumulative entries].

    Parameters
    ----------
    week_data_dict : dict
        Maps a turnstile key to a list of rows, each row being a sequence of
        seven values: line names, division, date string, time string, event,
        cumulative entries and cumulative exits.

    Returns
    -------
    collections.defaultdict
        Maps each turnstile key that has at least one row to a list of
        [datetime, int] pairs, in the order of the rows.
    """
    series_by_turnstile = defaultdict(list)
    for position, (turnstile, rows) in enumerate(week_data_dict.items()):
        # progress report: one line per 100 turnstiles
        if position % 100 == 0:
            print('Processing turnstile', turnstile)
        for row in rows:
            # only the date, the time and the entries counter are used
            _lines, _division, datestr, timestr, _event, cum_entries, _cum_exits = row
            moment = parse('%sT%s' % (datestr, timestr))
            series_by_turnstile[turnstile].append([moment, int(cum_entries)])
    return series_by_turnstile


# this takes a while: every row of every weekly dict has its date and time
# strings run through dateutil's parse()
weekly_time_series = [convert_week_data_to_time_series(item) for item in weekly_data_dicts]


In [None]:
# Checking the result
# just get 2 keys from the first dict to not overwhelm the output
sample_turnstile_to_time_series = dict(islice(weekly_time_series[0].items(), 0, 2))
pprint(sample_turnstile_to_time_series)

## Challenge 3
- These counts are cumulative every n hours. We want total daily entries. 

Keep the same keys as before, but store a single value per day: not the cumulative count, but the total number of passengers that entered through this turnstile on that day.


In [42]:
# Let's check one turnstile: a list of (date, daily entries) pairs, where the
# first date has no previous day to subtract from and therefore shows None
pprint( turnstile_to_daily_time_series[('A002', 'R051', '02-00-00', '59 ST')] )

[(datetime.date(2016, 8, 27), None),
 (datetime.date(2016, 8, 28), 677),
 (datetime.date(2016, 8, 29), 1538),
 (datetime.date(2016, 8, 30), 1539),
 (datetime.date(2016, 8, 31), 1508),
 (datetime.date(2016, 9, 1), 1607),
 (datetime.date(2016, 9, 2), 1626),
 (datetime.date(2016, 9, 3), 906),
 (datetime.date(2016, 9, 4), 666),
 (datetime.date(2016, 9, 5), 683),
 (datetime.date(2016, 9, 6), 1399),
 (datetime.date(2016, 9, 7), 1534),
 (datetime.date(2016, 9, 8), 1763),
 (datetime.date(2016, 9, 9), 1686),
 (datetime.date(2016, 9, 10), 994),
 (datetime.date(2016, 9, 11), 727),
 (datetime.date(2016, 9, 12), 1574),
 (datetime.date(2016, 9, 13), 1724),
 (datetime.date(2016, 9, 14), 1760),
 (datetime.date(2016, 9, 15), 1775),
 (datetime.date(2016, 9, 16), 1790)]
