In [89]:
from os import listdir
import pandas as pd
from collections import defaultdict
from datetime import datetime, timedelta

In [81]:
# Data source: http://web.mta.info/developers/turnstile.html
datafiles = ['../data/' + x for x in listdir('../data/')]

list_ = []
for file_ in datafiles:
    df = pd.read_csv(file_)
    list_.append(df)
frame = pd.concat(list_)

print frame.shape
frame.head(10)

# Slice limited sample
frame = frame[:10000]

(777253, 11)


In [82]:
# Build dictionary for challenge 1

device_audits = defaultdict(list)

for index, row in frame.iterrows():
    device_audits[tuple(row[0:4])].append(row[4:].tolist())
    
print len(device_audits)


234


In [83]:
# Build dictionary for challenge 2
# datetime, entries

device_audits = defaultdict(list)

for index, row in frame.iterrows():
    dstr = row['DATE'] + ' ' + row['TIME']
    dt = datetime.strptime(dstr, '%m/%d/%Y %H:%M:%S')
    en = row[u'ENTRIES']
    device_audits[tuple(row[0:4])].append([dt,en])
    
print len(device_audits)

234


## Times

Audits are usually taken every 4 hours, but there are also irregular audits in the data.  The DESC variable indicates whether a particular audit was taken regularly or not (http://web.mta.info/developers/resources/nyct/turnstile/ts_Field_Description.txt).  Samples may be taken irregularly due to planning or troubleshooting activities.

In [84]:
print "Count of audit types"
for s in set(frame["DESC"]):
    ct = frame["DESC"].tolist().count(s)
    print "%s: %d"%(s,ct)

Count of audit types
REGULAR: 9978
RECOVR AUD: 22


In [91]:
# Challenge 3

# Collapse a list of audits so that all audits on the same are combined into one
def collapse_days(l):
    day_entries = []
    dayset = set([x[0].date() for x in l])
    for day in dayset:
        entries = sum([x[1] for x in l if x[0].date() == day])
        day_entries.append([day, entries])
    return day_entries

# Test collapse_days
delta1 = timedelta(days=1)
print collapse_days([[datetime.now(), 5], [datetime.now(), 2], [datetime.now() - delta1, 2]])

device_audits_daily = defaultdict(list)
for key,value in device_audits.iteritems():
    device_audits_daily[key] = collapse_days(value)
    
print device_audits_daily.iteritems().next()

[[datetime.date(2016, 6, 26), 2], [datetime.date(2016, 6, 27), 7]]
(('A050', 'R088', '00-02-00', 'CORTLANDT ST'), [[datetime.date(2016, 5, 22), 18842563], [datetime.date(2016, 5, 23), 21985643], [datetime.date(2016, 5, 21), 18839061], [datetime.date(2016, 5, 26), 18852669], [datetime.date(2016, 5, 27), 18855632], [datetime.date(2016, 5, 24), 18847195], [datetime.date(2016, 5, 25), 18849798]])
