### Data Acquisition Development Notebook
This notebook is for developing and testing the functions that download, transform, and store the datasets needed for analysis and modeling.

In [1]:
# Packages
import numpy as np
import pandas as pd

In [49]:
# Function to download and transform a single wait time dataset from TouringPlans

def fetch_wait_times(url, attraction_name):
    """Retrieves and formats an attraction wait time dataset from public .csvs made available
    by TouringPlans (https://touringplans.com/walt-disney-world/crowd-calendar#DataSets). Output
    dataset is transformed to provide pertinent time data and wait times. Missing data is not handled
    at this point, so a row with neither an actual nor a posted wait time keeps a NaN wait_time.
    
    Args:
        url : string
            The URL of the dataset (passed straight to ``pd.read_csv``, so a local
            path or file-like object also works)
            
        attraction_name : string
            Description of the attraction; written unchanged to every output row
            
    Returns:
        wait_times : DataFrame
            The prepared data frame with columns 
            
            ['attraction_name',
            'month_of_year',
            'hour_of_day',
            'year_of_calendar',
            'wait_time']

    Raises:
        ValueError
            If the .csv lacks one of the 'datetime', 'SACTMIN' or 'SPOSTMIN' columns,
            or a 'datetime' value does not match '%Y-%m-%d %H:%M:%S'
    """
    
    # Read in the csv file (only the three columns needed downstream)
    wait_times = pd.read_csv(
        url,
        usecols=['datetime','SACTMIN','SPOSTMIN'],
        dtype={'datetime':str,'SACTMIN':np.float64,'SPOSTMIN':np.float64}
    )
    
    # Parse the timestamps into a standalone Series. Writing the parsed values back with
    # .loc[:, 'datetime'] sets them in place on pandas 2.x, which leaves the column as
    # object dtype and makes the .dt accessor fail.
    timestamps = pd.to_datetime(wait_times['datetime'], format='%Y-%m-%d %H:%M:%S')
    
    # Transforms for date elements
    wait_times['month_of_year'] = timestamps.dt.month
    wait_times['hour_of_day'] = timestamps.dt.hour
    wait_times['year_of_calendar'] = timestamps.dt.year
    
    # Wait time coalesce (use the actual time if available, else the posted time)
    wait_times['wait_time'] = wait_times['SACTMIN'].combine_first(wait_times['SPOSTMIN'])
    
    # Descriptor
    wait_times['attraction_name'] = attraction_name
    
    # Output data
    wait_times = wait_times[['attraction_name','month_of_year','hour_of_day','year_of_calendar','wait_time']]
    return wait_times

In [58]:
# Download all datasets and save a local copy

def save_touring_plans_data(path, url_lookup=None):
    """Function to download all wait time datasets from public .csvs made available
    by TouringPlans (https://touringplans.com/walt-disney-world/crowd-calendar#DataSets).
    Output is saved as a single consolidated .csv file.
    
    Args:
        path : string
            The path to save the .csv file to

        url_lookup : dict, optional
            Mapping of attraction name -> dataset .csv URL. When omitted, the built-in
            list of TouringPlans datasets below is used.
            
    Returns:
        None

    Raises:
        ValueError
            If an empty ``url_lookup`` is supplied (there would be nothing to save)
    """
    
    # Dictionary of attraction names and .csv urls
    if url_lookup is None:
        url_lookup = {
            'Alien Swirling Saucers': 'https://cdn.touringplans.com/datasets/alien_saucers.csv',
            'Avatar Flight of Passage': 'https://cdn.touringplans.com/datasets/flight_of_passage.csv',
            'DINOSAUR': 'https://cdn.touringplans.com/datasets/dinosaur.csv',
            'Expedition Everest': 'https://cdn.touringplans.com/datasets/expedition_everest.csv',
            'Kilimanjaro Safaris': 'https://cdn.touringplans.com/datasets/kilimanjaro_safaris.csv',
            'Navi River Journey': 'https://cdn.touringplans.com/datasets/navi_river.csv',
            'Pirates of the Caribbean': 'https://cdn.touringplans.com/datasets/pirates_of_caribbean.csv',
            'Rock n Roller Coaster': 'https://cdn.touringplans.com/datasets/rock_n_rollercoaster.csv',
            'Seven Dwarfs Mine Train': 'https://cdn.touringplans.com/datasets/7_dwarfs_train.csv',
            'Slinky Dog Dash': 'https://cdn.touringplans.com/datasets/slinky_dog.csv',
            'Soarin': 'https://cdn.touringplans.com/datasets/soarin.csv',
            'Spaceship Earth': 'https://cdn.touringplans.com/datasets/spaceship_earth.csv',
            'Splash Mountain': 'https://cdn.touringplans.com/datasets/splash_mountain.csv',
            'Toy Story Mania': 'https://cdn.touringplans.com/datasets/toy_story_mania.csv'
        }
    
    if not url_lookup:
        raise ValueError('url_lookup must contain at least one attraction')
    
    # Fetch every dataset first, then concatenate once. Growing a frame with
    # pd.concat inside the loop re-copies all accumulated rows on each iteration.
    frames = [
        fetch_wait_times(url=url, attraction_name=attraction_name)
        for attraction_name, url in url_lookup.items()
    ]
    out_df = pd.concat(frames, ignore_index=True)
        
    # Save the data as .csv
    out_df.to_csv(path, index=False)

In [59]:
# Test saving the file: fetches every dataset by URL and writes the
# consolidated extract to the relative path below
save_touring_plans_data('../data/test_extract.csv')

In [67]:
# Read the saved extract back from disk to check that it round-trips
test_df=pd.read_csv('../data/test_extract.csv')

In [68]:
# (rows, columns) of the reloaded extract
test_df.shape

(2551775, 5)

In [69]:
# Spot-check 10 random rows for one attraction (no random_state is set,
# so the sampled rows differ from run to run)
test_df[test_df.attraction_name=='Soarin'].sample(10)

Unnamed: 0,attraction_name,month_of_year,hour_of_day,year_of_calendar,wait_time
1773078,Soarin,5,13,2017,60.0
1860087,Soarin,3,11,2019,65.0
1778860,Soarin,7,12,2017,75.0
1795398,Soarin,11,16,2017,40.0
1837451,Soarin,10,19,2018,15.0
1799973,Soarin,12,10,2017,105.0
1790474,Soarin,10,9,2017,15.0
1824482,Soarin,6,10,2018,45.0
1880747,Soarin,8,20,2019,10.0
1730274,Soarin,12,14,2015,110.0
