## Download taxi trip data

Read the `.csv` file that lists the trip data links and download every linked file (approximately 300 GB in total).

In [1]:
import os
import requests

def download_file(url, folder):
    """Download online file and save it, unless it was already downloaded.

    The file name is the last "/"-separated part of the url. The response
    body is streamed to disk in chunks instead of being held in memory, and
    it is first written to a temporary ".part" file that is renamed once the
    download is complete. This way an interrupted download is never mistaken
    for a finished file on the next run.

    Arguments:
        url {String} -- Url to download
        folder {String} -- Folder where file will be saved (created if missing)

    Raises:
        requests.HTTPError -- If the server answers with an error status
    """

    # The last part of the url is the file name
    file_name = url.split("/")[-1]

    output_file = "{}/{}".format(folder, file_name)

    if os.path.exists(output_file):
        print("File  '{}' already exists.".format(output_file))
        return

    # Download if file does not exist
    print(f"Downloading {url}...")

    # An empty folder name cannot be created; skip it
    if folder:
        os.makedirs(folder, exist_ok=True)

    partial_file = output_file + ".part"

    with requests.get(url, allow_redirects=True, stream=True) as r:
        # Do not save an error page as if it were trip data
        r.raise_for_status()
        print(f"Saving '{output_file}'")
        with open(partial_file, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Only a fully written file gets the final name
    os.replace(partial_file, output_file)

In [None]:
import pandas as pd

# Trip links source
path_trip_list = "data/nyc/nyc_tripdata.csv"

# Local folder where the downloaded files are saved
target_path = "o:/phd/nyc_trips"

# Loading trip data csv (year, month, type[green, yellow, fhv], link)
df = pd.read_csv(path_trip_list)

# Year used when only a single year should be downloaded
select_year = 2011
filter_links = df["year"] == select_year

# True downloads every link of the csv; False keeps only `select_year`.
# (Previously the year filter was always overwritten, so it had no effect.)
download_all = True
if download_all:
    links = df["link"]
else:
    links = df.loc[filter_links, "link"]

# Download all selected links
for link in links:
    download_file(link, target_path)

Downloading https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-01.csv...


## Generate date excerpts

In [1]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from collections import defaultdict
import json
import pprint

# Select a daily time window for one day per week, starting from a given date
def get_dates(start, weeks, h1, m1, s1, h2, m2, s2):
    """Yield one time window per weekly date, grouped by calendar month.

    Beginning at `start`, one date is taken every week, `weeks` times.
    Dates are emitted month by month (January first, December last) and in
    chronological order inside each month.

    Arguments:
        start {String} -- First date, formatted as "%Y-%m-%d %H:%M:%S"
        weeks -- Number of weekly dates to generate
        h1, m1, s1 -- Hours, minutes, seconds added to a date to get the
            beginning of its time window
        h2, m2, s2 -- Hours, minutes, seconds added to a date to get the
            end of its time window

    Yields:
        tuple -- ((year, month, day), window beginning, window end)
    """
    first_day = datetime.strptime(start, "%Y-%m-%d %H:%M:%S")
    week_offsets = pd.to_timedelta(np.arange(weeks), 'W')
    weekly_days = first_day + week_offsets

    for month in range(1, 13):
        in_month = weekly_days.month == month
        for day in weekly_days[in_month]:
            day_key = (day.year, day.month, day.day)
            window_begin = day + timedelta(hours=h1, minutes=m1, seconds=s1)
            window_end = day + timedelta(hours=h2, minutes=m2, seconds=s2)
            yield (day_key, window_begin, window_end)


# First day of the weekly series and number of weekly days to generate
start = "2011-01-04 00:00:00"
n_weeks = 52

# Prefix put before each data file name (e.g. "O:/phd/nyc_trips/raw/")
path = ""

# Time windows (earliest, latest) of the selected days, keyed by the
# monthly trip data file that covers them
day_tw = defaultdict(list)
for (y, m, d), earliest, latest in get_dates(start, n_weeks, 0, 0, 0, 23, 59, 59):
    data_path = f"{path}yellow_tripdata_{y}-{m:02}.csv"
    day_tw[data_path].append((str(earliest), str(latest)))

pprint.pprint(day_tw)

# Version to paste in configuration json file.
# Printed directly: pprint would show the Python repr of the json string
# (quoted and possibly wrapped), which is not valid json.
print(json.dumps(day_tw, indent=4))

defaultdict(<class 'list'>,
            {'yellow_tripdata_2011-01.csv': [('2011-01-04 00:00:00',
                                              '2011-01-04 23:59:59'),
                                             ('2011-01-11 00:00:00',
                                              '2011-01-11 23:59:59'),
                                             ('2011-01-18 00:00:00',
                                              '2011-01-18 23:59:59'),
                                             ('2011-01-25 00:00:00',
                                              '2011-01-25 23:59:59')],
             'yellow_tripdata_2011-02.csv': [('2011-02-01 00:00:00',
                                              '2011-02-01 23:59:59'),
                                             ('2011-02-08 00:00:00',
                                              '2011-02-08 23:59:59'),
                                             ('2011-02-15 00:00:00',
                                              '2011-02-15 23:59:59')