In [1]:
import urllib
import csv
import os
import datetime
import dateutil.parser
import copy
from collections import defaultdict
import pickle

# Challenge 1 #

In [None]:
def read_files(file_list):
    """
    Reads the .csv files in the given file list (looked up under 'data/') and
    imports the rows into a dictionary with the following structure:
    key: (col1, col2, col3, col4)
    value: [[col5, col6, ...], ...]  (one inner list per row, holding every
           column after col4)

    The first line of each file is treated as a header and skipped. Blank rows
    are ignored and an empty file contributes nothing.

    NOTE that the format of the .csv file changes and thus we have two
    implementations of reading the file.

    @params file_list: list of file names inside the 'data' directory
    Returns the dictionary.
    """
    mta_dict = {}

    for f in file_list:
        # `with` closes the handle even when a malformed row raises.
        with open(os.path.join('data', f)) as fn:
            reader = csv.reader(fn)
            # Skip the header; the default keeps an empty file from raising StopIteration.
            next(reader, None)
            for line in reader:
                if not line:
                    # csv.reader yields [] for blank lines; there is nothing to index.
                    continue
                key = (line[0], line[1], line[2], line[3])
                mta_dict.setdefault(key, []).append(line[4:])
        print('Done: ' + f)
    return mta_dict

In [2]:
def read_files_modified(file_list):
    """
    Reads the .csv files in the given file list (looked up under 'data/') and
    imports the rows into a dictionary with the following structure:
    key: (col1, col2, col3, col4 + ' ' + col5)
    value: [[col6, col7, ...], ...]  (one inner list per row, holding every
           column after col5)

    The first line of each file is treated as a header and skipped. Blank rows
    are ignored and an empty file contributes nothing.

    NOTE that the format of the .csv file changes and thus we have two
    implementations of reading the file; this one folds the fourth and fifth
    columns into the last key element.

    @params file_list: list of file names inside the 'data' directory
    Returns the dictionary.
    """
    mta_dict = {}

    for f in file_list:
        # `with` closes the handle even when a malformed row raises.
        with open(os.path.join('data', f)) as fn:
            reader = csv.reader(fn)
            # Skip the header; the default keeps an empty file from raising StopIteration.
            next(reader, None)
            for line in reader:
                if not line:
                    # csv.reader yields [] for blank lines; there is nothing to index.
                    continue
                key = (line[0], line[1], line[2], line[3] + ' ' + line[4])
                mta_dict.setdefault(key, []).append(line[5:])
        print('Done: ' + f)
    return mta_dict

In [3]:
# Turnstile .csv files found in the 'data' directory.
# NOTE(review): this comment used to claim coverage of 3/28/15 through 6/27/15, but the
# recorded run only reports turnstile_150502 ... turnstile_150530 -- confirm which is intended.
# sorted() makes the load order deterministic (os.listdir order is arbitrary) and
# endswith() avoids picking up names that merely contain '.csv' (e.g. 'x.csv.bak').
file_list = sorted(f for f in os.listdir('data/') if f.endswith('.csv'))
# mta_dict = read_files(file_list)  # legacy .csv layout, disabled
mta_dict_mod = read_files_modified(file_list)

Done: turnstile_150502.csv
Done: turnstile_150509.csv
Done: turnstile_150516.csv
Done: turnstile_150523.csv
Done: turnstile_150530.csv


In [None]:
# Sanity check some values
# Everything here reads mta_dict_mod: mta_dict is only assigned by the commented-out
# read_files call, so referring to it raised NameError on a fresh kernel.
print('Num of keys: ' + str(len(mta_dict_mod)))
count1 = sum(len(rows) for rows in mta_dict_mod.values())
print('Num of values: ' + str(count1))

print('Sample key: value pair ')
# list(...) before slicing keeps this working whether keys() is a list or a view.
n_items1 = {k: mta_dict_mod[k] for k in list(mta_dict_mod)[:2]}
print(n_items1)

# Challenge 2 #

In [None]:
def make_time_series(full_dict):
    """
    Builds a dictionary with the same keys as `full_dict` whose values are
    lists of timestamped counter readings:
    key: (col1, col2, col3, col4)
    value: [[datetime object, cumulative number of entries],...]

    For every row, index 2 and index 3 are joined with a space and parsed into
    the datetime, and index 5 (with leading zeros stripped) is converted to the
    integer count; a count that cannot be converted becomes 0.
    A progress counter is printed every 100 keys.

    @params full_dict: dictionary
    Returns the new dictionary.
    """
    series = defaultdict(list)
    for n_done, (key, records) in enumerate(full_dict.items()):
        if n_done % 100 == 0:
            print(n_done)
        for record in records:
            stamp = dateutil.parser.parse(str(record[2]) + ' ' + str(record[3]))
            try:
                entries = int(record[5].lstrip('0'))
            except ValueError:
                # e.g. an all-zero field strips down to the empty string
                entries = 0
            series[key].append([stamp, entries])
    return series

In [4]:
def make_time_series_mod(full_dict):
    """
    Keeping the keys the same, creates a new dictionary where the values are
    lists of timestamped counter readings. The structure is:
    key: same key as in full_dict
    value: [[datetime object, entries count, exits count],...]

    For every row, index 1 and index 2 are joined with a space and parsed into
    the datetime; index 4 and index 5 (with leading zeros stripped) become the
    entries and exits counts. A count that cannot be converted becomes 0, and a
    row with no index 5 gets 0 exits.
    A progress counter is printed every 100 keys.

    @params full_dict: dictionary
    Returns the new dictionary.
    """
    time_dict = defaultdict(list)
    for count, (key, value) in enumerate(full_dict.items()):
        if count % 100 == 0:
            print(count)
        for v in value:
            # Local names avoid shadowing the imported `datetime` module and the `exit` builtin.
            timestamp = dateutil.parser.parse(str(v[1]) + ' ' + str(v[2]))
            try:
                entries = int(v[4].lstrip('0'))
            except ValueError:
                # e.g. an all-zero field strips down to the empty string
                entries = 0
            try:
                exits = int(v[5].lstrip('0'))
            except (ValueError, IndexError):
                # Keep the best-effort fallback for bad or missing fields without a
                # bare `except`, which would also swallow KeyboardInterrupt/SystemExit.
                exits = 0
            time_dict[key].append([timestamp, entries, exits])
    return time_dict

In [5]:
# time_dict = make_time_series(mta_dict)  # legacy path, disabled (mta_dict is not built above)
# Build the per-key time series; make_time_series_mod prints a progress counter every 100 keys.
time_dict_mod = make_time_series_mod(mta_dict_mod)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500


In [None]:
# Save the parsed time series to disk (pickle is already imported in the first cell).
with open('mta_mod.pickle', 'wb') as pickle_file:
    pickle.dump(time_dict_mod, pickle_file)

In [None]:
# NOTE(review): in the cells shown here `time_dict` is only assigned by the commented-out
# `time_dict = make_time_series(mta_dict)` call (or by loading 'mta.pickle' further down),
# so on a fresh kernel this cell raises NameError -- confirm whether this legacy dump is still wanted.
import pickle
with open('mta.pickle','wb') as handle:
    pickle.dump(time_dict, handle)
    

In [None]:
# Quick look at the time-series dictionary: number of keys, number of rows, one sample item.
print('Num of keys: ' + str(len(time_dict_mod)))
count2 = sum(len(readings) for readings in time_dict_mod.values())
print('Num of values: ' + str(count2))

print('Sample key: value pair ')
n_items2 = dict((k, time_dict_mod[k]) for k in list(time_dict_mod)[:1])
print(n_items2)

# Challenge 3 #

In [None]:
# Restore the pickled time series from 'mta.pickle' into time_dict.
with open('mta.pickle', 'rb') as pickle_file:
    time_dict = pickle.load(pickle_file)

In [6]:
def make_diffs(time_dict):
    """
    Converts running counter readings into per-interval differences.

    For every key the rows are ordered by their timestamp (index 0) and each
    pair of consecutive rows yields one output row:
    [timestamp of the earlier row, later[1] - earlier[1], later[2] - earlier[2]]
    Rows that carry a single counter ([timestamp, counter]) yield
    [timestamp, later[1] - earlier[1]] instead of raising IndexError.
    Keys with fewer than two rows do not appear in the result.

    The input lists are not modified; a sorted copy is used.

    @params time_dict: dictionary of key -> list of [datetime, counter, ...] rows
    Returns a defaultdict(list) of the difference rows.
    """
    diff_dict = defaultdict(list)
    for key, rows in time_dict.items():
        # sorted() instead of rows.sort(): do not reorder the caller's data as a side effect.
        ordered = sorted(rows, key=lambda row: row[0])
        for earlier, later in zip(ordered, ordered[1:]):
            # Only indexes 1 and 2 are counters; the slice tolerates rows without index 2.
            deltas = [after - before for before, after in zip(earlier[1:3], later[1:3])]
            diff_dict[key].append([earlier[0]] + deltas)
    return diff_dict

In [7]:
# daily_dict = make_diffs(time_dict)  # legacy path, disabled
# Despite its name, daily_dict_mod holds one difference row per pair of consecutive
# readings (see make_diffs); per-day totals are only built later by make_daily_mod.
daily_dict_mod = make_diffs(time_dict_mod)

# Disabled: would pickle the legacy daily_dict, which is not built above.
# with open('hourly_entry_mta.pickle','wb') as handle:
#     pickle.dump(daily_dict, handle)

In [None]:
def check_zero_entries(d):
    """
    Prints how many data points in `d` have a zero at index 1.

    `d` maps each key to a list of rows. Three lines are printed: the number of
    rows whose index 1 equals 0, the total number of rows, and the former as a
    fraction of the latter (0 when `d` holds no rows at all).

    @params d: dictionary of key -> list of rows
    Returns None.
    """
    zero_count = 0
    tot_count = 0
    for rows in d.values():
        tot_count += len(rows)
        zero_count += sum(1 for row in rows if row[1] == 0)
    # True division is required here: floor division rounds every ratio below 1 down
    # to 0. The guard avoids ZeroDivisionError when there are no rows.
    fraction = float(zero_count) / tot_count if tot_count else 0.0
    print('There are %d data points with zero entries.' % zero_count)
    print('There are %d data points total' % tot_count)
    print('This represents %.9f of the data' % fraction)

# Report how many of the difference rows have a zero at index 1.
check_zero_entries(daily_dict_mod)

In [None]:
# Quick look at the difference dictionary: number of keys, number of rows, one sample item.
print('Num of keys: ' + str(len(daily_dict_mod)))
count3 = sum(len(diff_rows) for diff_rows in daily_dict_mod.values())
print('Num of values: ' + str(count3))

print('Sample key: value pair ')
n_items3 = dict((k, daily_dict_mod[k]) for k in list(daily_dict_mod)[:1])
print(n_items3)

In [None]:
# with open('hourly_entry_mta.pickle','rb') as handle:
#     daily_dict_p = pickle.load(handle)

In [8]:
def make_daily(diff_dict):
    """
    Adds up the index-1 value of every row per calendar day.

    For each key, rows are grouped by the date of their timestamp (index 0)
    and the index-1 values of each group are summed.

    @params diff_dict: dictionary of key -> list of [datetime, value, ...] rows
    Returns a defaultdict(list) mapping each key to a date-sorted list of
    (date, total) tuples.
    """
    daily = defaultdict(list)
    for station, intervals in diff_dict.items():
        totals = defaultdict(int)
        for interval in intervals:
            totals[interval[0].date()] += interval[1]
        daily[station] = sorted(totals.items())

    return daily

In [9]:
def make_daily_mod(diff_dict):
    """
    Adds up the two counters of every row per calendar day.

    For each key, rows are grouped by the date of their timestamp (index 0).
    Index 1 and index 2 of each row are summed per day, with negative values
    counted as 0. The rows of `diff_dict` themselves are not modified.

    @params diff_dict: dictionary of key -> list of [datetime, value1, value2] rows
    Returns a defaultdict(list) mapping each key to a date-sorted list of
    (date, [total1, total2]) tuples.
    """
    daily = defaultdict(list)
    for key, rows in diff_dict.items():
        by_day = {}
        for row in rows:
            day = row[0].date()
            # Clamp into locals rather than writing 0 back into the caller's rows.
            first = row[1] if row[1] >= 0 else 0
            second = row[2] if row[2] >= 0 else 0
            # Explicit list arithmetic: map() only returns a list on Python 2.
            totals = by_day.setdefault(day, [0, 0])
            totals[0] += first
            totals[1] += second
        daily[key] = sorted(by_day.items())

    return daily

In [10]:
# d = make_daily(daily_dict)  # legacy path, disabled (daily_dict is not built above)
# Collapse the difference rows into per-day [index-1 total, index-2 total] pairs.
d_mod = make_daily_mod(daily_dict_mod)

In [11]:
# Quick look at the per-day dictionary: number of keys, number of day rows, one sample item.
print('Num of keys: ' + str(len(d_mod)))
count3 = sum(len(day_rows) for day_rows in d_mod.values())
print('Num of values: ' + str(count3))

print('Sample key: value pair ')
n_items3 = dict((k, d_mod[k]) for k in list(d_mod)[:1])
print(n_items3)

Num of keys: 4566
Num of values: 159251
Sample key: value pair 
{('N500', 'R020', '00-03-04', '47-50 ST-ROCK BDFM'): [(datetime.date(2015, 4, 25), [973, 542]), (datetime.date(2015, 4, 26), [778, 448]), (datetime.date(2015, 4, 27), [3214, 1960]), (datetime.date(2015, 4, 28), [3226, 2062]), (datetime.date(2015, 4, 29), [3394, 2081]), (datetime.date(2015, 4, 30), [3456, 1939]), (datetime.date(2015, 5, 1), [3491, 1812]), (datetime.date(2015, 5, 2), [1047, 575]), (datetime.date(2015, 5, 3), [838, 480]), (datetime.date(2015, 5, 4), [3241, 1904]), (datetime.date(2015, 5, 5), [3419, 2013]), (datetime.date(2015, 5, 6), [3319, 2021]), (datetime.date(2015, 5, 7), [3436, 1964]), (datetime.date(2015, 5, 8), [3180, 1858]), (datetime.date(2015, 5, 9), [1024, 556]), (datetime.date(2015, 5, 10), [865, 384]), (datetime.date(2015, 5, 11), [3110, 1889]), (datetime.date(2015, 5, 12), [3318, 2000]), (datetime.date(2015, 5, 13), [3404, 1972]), (datetime.date(2015, 5, 14), [3374, 2072]), (datetime.date(2015, 

In [12]:
# Save the per-day totals to disk.
with open('daily_entry_final.pickle', 'wb') as pickle_file:
    pickle.dump(d_mod, pickle_file)

In [None]:
### Challenge 4 ###

In [None]:
%matplotlib inline

values = n_items3.keys()[0]
values_split = zip(*values)
dates, counts = values_split[0], values_split[1]

plt.figure(figsize=(10,3))
plt.plot(dates,counts)