In [1]:
import urllib
import csv
import os
import datetime
import dateutil.parser

In [None]:
### Challenge 1: download the turnstile files and load them into a dictionary ###

In [3]:
def file_save(start, end, timedelta):
    """
    Saves the .txt files on the MTA website as .csv files in a directory called 'data'.

    One file is requested for every date from `start` (inclusive) up to `end`
    (exclusive), stepping by `timedelta`. The file name suffix is the date
    formatted as YYMMDD. The 'data' directory is created if it does not exist.
    Download errors raised by urllib are not caught and propagate to the caller.

    @params start: datetime object
    @params end: datetime object
    @params timedelta: timedelta object (must be positive)
    Raises ValueError if timedelta is zero or negative.
    """
    # A non-positive step would never reach `end` and would request the same
    # URL forever, so reject it before doing any work.
    if timedelta <= datetime.timedelta(0):
        raise ValueError('timedelta must be positive')

    # urlretrieve does not create missing directories.
    if not os.path.isdir('data'):
        os.makedirs('data')

    while start < end:
        suf = start.strftime('%y%m%d')              # Two-digit year, month and day, zero padded

        url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_' + suf + '.txt'
        urllib.urlretrieve(url, 'data/turnstile_' + suf + '.csv')

        start = start + timedelta

In [8]:
# April - June 2014
# Start on the last Saturday of March so a fresh run fetches the whole
# quarter, matching how the 2015 range below is set up.
start1 = datetime.date(2014,3,29)
end1 = datetime.date(2014,6,30)
# April - June 2015
start2 = datetime.date(2015,3,28)
end2 = datetime.date(2015,6,30)

# Iterate by 7 day increments
timedelta = datetime.timedelta(days=7)

file_save(start1, end1, timedelta)
file_save(start2, end2, timedelta)

error: [Errno 54] Connection reset by peer

In [None]:
def read_files(file_list):
    """
    Reads the .csv files in the given file list and imports the data into a 
    dictionary with the following structure:
    key: (col1, col2, col3, col4)
    value: [[col5, col6, col7, col8, col9, col10, col11],...]
    
    NOTE that the first four columns repeat in the data, thus the values 
    are lists of lists representing other data points for the same key. 

    The first row of every file is treated as a header and skipped. Empty
    files and completely blank rows are ignored.
    
    @params file_list: list of file names located in the 'data' directory
    Returns the dictionary.
    """
    mta_dict = {}
    for f in file_list:
        # The with-block closes the file even if a row fails to parse.
        with open(os.path.join('data', f)) as fn:
            reader = csv.reader(fn)
            next(reader, None)              # Skip the header row; tolerate an empty file
            for line in reader:
                if not line:                # csv.reader yields [] for a blank line
                    continue
                key = (line[0], line[1], line[2], line[3])
                mta_dict.setdefault(key, []).append(line[4:])
    return mta_dict

In [None]:
# List of .csv files in the 'data' directory. os.listdir returns names in
# arbitrary order, so sort them to make the row order inside mta_dict
# reproducible from run to run.
file_list = sorted(f for f in os.listdir('data/') if f.endswith('.csv'))
mta_dict = read_files(file_list)

In [None]:
# Sanity check: number of keys, total number of rows, and one sample entry
print('Num of keys: ' + str(len(mta_dict)))
count = 0
for key, rows in mta_dict.items():
    count += len(rows)
print('Num of values: ' + str(count))

print('Sample key: value pair ')
n_items = dict((k, mta_dict[k]) for k in list(mta_dict)[:1])
print(n_items)

In [None]:
### Challenge 2: turn each key's rows into a [datetime, entries] time series ###

In [None]:
def make_time_series(full_dict):
    """
    Keeping the keys the same, creates a new dictionary where the values are lists 
    of lists of dates and the number of entries on that day. The structure is:
    key: (col1, col2, col3, col4)
    value: [[datetime object, number of entries],...]

    Each input row is expected in the layout produced by read_files, i.e.
    [col5, ..., col11]: row[2] is used as the date, row[3] as the time and
    row[5] as the entries figure. An entries field that cannot be parsed as
    an integer is recorded as 0.
    
    @params full_dict: dictionary
    Returns the new dictionary.
    """
    time_dict = {}
    for key, rows in full_dict.items():
        for row in rows:
            # Named `timestamp` rather than `datetime` so the imported
            # datetime module is not shadowed inside this function.
            timestamp = dateutil.parser.parse(str(row[2]) + ' ' + str(row[3]))
            try:
                # An all-zero field strips down to '' and raises ValueError,
                # which the handler below maps to 0.
                entries = int(row[5].lstrip('0'))
            except ValueError:
                entries = 0
            time_dict.setdefault(key, []).append([timestamp, entries])
    return time_dict

In [None]:
# Convert the raw rows in mta_dict into [datetime, entries] pairs per key.
time_dict = make_time_series(mta_dict)

In [None]:
# Sanity check: number of keys, total number of readings, and one sample entry
print('Num of keys: ' + str(len(time_dict)))
count = 0
for key, readings in time_dict.items():
    count += len(readings)
print('Num of values: ' + str(count))

print('Sample key: value pair ')
n_items = dict((k, time_dict[k]) for k in list(time_dict)[:1])
print(n_items)

In [None]:
### Challenge 3: condense each time series to one total per calendar day ###

In [None]:
def make_daily_time_series(time_dict):
    """
    Condenses the values of the given dictionary to create a daily 
    count of the number of entries. Structure is as follows:
    key: (col1, col2, col3, col4)
    value: [[date object, number of entries],...] 

    Readings are sorted and then summed per calendar date (year, month and
    day), so readings on the same day of different months stay separate.
    The input dictionary and its lists are left untouched; new lists are
    built for the result.

    @params time_dict: dictionary of key -> [[datetime object, number of entries],...]
    Returns the new dictionary.
    """
    # NOTE(review): this adds up the raw entries values of each day. If those
    # values are cumulative counter readings, a per-day difference may be what
    # is wanted instead - confirm against the data.
    day_dict = {}
    for key, readings in time_dict.items():
        daily = []
        for stamp, entries in sorted(readings):
            day = datetime.date(stamp.year, stamp.month, stamp.day)
            # After sorting, readings of one date are adjacent, so only the
            # most recently added bucket can match.
            if daily and daily[-1][0] == day:
                daily[-1][1] += entries
            else:
                daily.append([day, entries])
        day_dict[key] = daily
    return day_dict

In [None]:
# Collapse the per-reading series in time_dict into one total per day for each key.
daily_dict = make_daily_time_series(time_dict)

In [None]:
# Sanity check: number of keys, total number of daily records, and one sample entry
print('Num of keys: ' + str(len(daily_dict)))
count = 0
for key, days in daily_dict.items():
    count += len(days)
print('Num of values: ' + str(count))

print('Sample key: value pair ')
n_items = dict((k, daily_dict[k]) for k in list(daily_dict)[:1])
print(n_items)