### Data Acquisition Development Notebook
Notebook for testing functions to download, transform, and store datasets needed for analysis and modeling

#### Wait Times

In [None]:
# Packages
import numpy as np
import pandas as pd
import holidays
from datetime import date
import calendar

In [None]:
# Function to download and transform a single wait time dataset from TouringPlans

def fetch_wait_times(url, attraction_name):
    """Retrieve and format one attraction's wait time dataset from TouringPlans.

    Reads one of the public .csvs made available by TouringPlans
    (https://touringplans.com/walt-disney-world/crowd-calendar#DataSets) and
    derives calendar fields plus a single wait time column. Missing data is
    not handled at this point.

    Args:
        url : string
            The URL of the dataset (passed straight to ``pd.read_csv``)

        attraction_name : string
            Description of the attraction; written unchanged to every row

    Returns:
        wait_times : DataFrame
            The prepared data frame with columns

            ['attraction_name',
            'date_id',
            'month_of_year',
            'hour_of_day',
            'year_of_calendar',
            'wait_time']
    """

    # Read in the csv file
    wait_times = pd.read_csv(
        url,
        usecols=['datetime', 'SACTMIN', 'SPOSTMIN'],
        dtype={'datetime': str, 'SACTMIN': np.float64, 'SPOSTMIN': np.float64},
    )

    # Parse the timestamps into their own datetime64 Series. Writing them back
    # with ``.loc[:, 'datetime'] = ...`` sets the values in place on recent
    # pandas versions, leaving the column as object dtype and breaking ``.dt``.
    timestamps = pd.to_datetime(wait_times['datetime'], format='%Y-%m-%d %H:%M:%S')

    # Transforms for date elements
    wait_times['month_of_year'] = timestamps.dt.month
    wait_times['hour_of_day'] = timestamps.dt.hour
    wait_times['year_of_calendar'] = timestamps.dt.year
    wait_times['date_id'] = timestamps.dt.date

    # Wait time coalesce (use the actual time if available, else the posted time)
    wait_times['wait_time'] = wait_times['SACTMIN'].combine_first(wait_times['SPOSTMIN'])

    # Descriptor
    wait_times['attraction_name'] = attraction_name

    # Output data
    output_columns = [
        'attraction_name',
        'date_id',
        'month_of_year',
        'hour_of_day',
        'year_of_calendar',
        'wait_time',
    ]
    return wait_times[output_columns]

In [None]:
# Download all datasets and save a local copy

def save_touring_plans_data(path):
    """Download all wait time datasets and save them as one consolidated .csv.

    The datasets are the public .csvs made available by TouringPlans
    (https://touringplans.com/walt-disney-world/crowd-calendar#DataSets).
    Each one is fetched and transformed with ``fetch_wait_times`` and the
    results are stacked in the order listed below.

    Args:
        path : string
            The path to save the .csv file to

    Returns:
        None
    """

    # Dictionary of attraction names and .csv urls
    url_lookup = {
        'Alien Swirling Saucers': 'https://cdn.touringplans.com/datasets/alien_saucers.csv',
        'Avatar Flight of Passage': 'https://cdn.touringplans.com/datasets/flight_of_passage.csv',
        'DINOSAUR': 'https://cdn.touringplans.com/datasets/dinosaur.csv',
        'Expedition Everest': 'https://cdn.touringplans.com/datasets/expedition_everest.csv',
        'Kilimanjaro Safaris': 'https://cdn.touringplans.com/datasets/kilimanjaro_safaris.csv',
        'Navi River Journey': 'https://cdn.touringplans.com/datasets/navi_river.csv',
        'Pirates of the Caribbean': 'https://cdn.touringplans.com/datasets/pirates_of_caribbean.csv',
        'Rock n Roller Coaster': 'https://cdn.touringplans.com/datasets/rock_n_rollercoaster.csv',
        'Seven Dwarfs Mine Train': 'https://cdn.touringplans.com/datasets/7_dwarfs_train.csv',
        'Slinky Dog Dash': 'https://cdn.touringplans.com/datasets/slinky_dog.csv',
        'Soarin': 'https://cdn.touringplans.com/datasets/soarin.csv',
        'Spaceship Earth': 'https://cdn.touringplans.com/datasets/spaceship_earth.csv',
        'Splash Mountain': 'https://cdn.touringplans.com/datasets/splash_mountain.csv',
        'Toy Story Mania': 'https://cdn.touringplans.com/datasets/toy_story_mania.csv',
    }

    # Fetch every dataset first, then concatenate once: concatenating inside
    # the loop re-copies the growing frame on every iteration.
    frames = [
        fetch_wait_times(url=url, attraction_name=attraction_name)
        for attraction_name, url in url_lookup.items()
    ]
    out_df = pd.concat(frames, ignore_index=True)

    # Save the data as .csv
    out_df.to_csv(path, index=False)

In [None]:
# Test saving the file: fetches every dataset and writes the consolidated .csv
save_touring_plans_data('../data/test_extract.csv')

In [None]:
# Test loading the file, keeping only the 'Soarin' rows with a fresh 0..n-1 index
test_df = (
    pd.read_csv('../data/test_extract.csv')
    .loc[lambda frame: frame['attraction_name'] == 'Soarin']
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the filtered test data
test_df.head()

#### Dates/Holidays

In [None]:
# List the 2019 holidays returned for state='CA'. The loop variable is named
# `holiday_date` so it does not overwrite the `date` class imported from datetime.
for holiday_date, name in sorted(holidays.US(state='CA',years=2019).items()):
    print(holiday_date, name)

In [None]:
# US holiday calendar, used below to look up a holiday name for each date
us_days = holidays.US()

In [None]:
# Look up each date_id in us_days and store the result as holiday_name
test_df['holiday_name'] = test_df['date_id'].map(us_days.get)

In [None]:
# Preview the first rows to check the holiday_name column
test_df.head()

#### Temperatures

In [None]:
import requests

In [None]:

# Query string for the NOAA data request, assembled from its individual parameters
pm_string = '&'.join([
    'datasetid=GHCND',
    'stationid=GHCND:USW00012815',
    'startdate=2015-01-01',
    'enddate=2015-12-31',
    'datatypeid=TMAX',
    'units=standard',
    'limit=1000',
])

In [None]:
# NOTE(review): `token` is not defined in any visible cell. It is passed as the
# request headers, so it presumably must be a dict carrying the NOAA API token — confirm.
r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data', params=pm_string,
                 headers=token, timeout=30)
# Fail here on an HTTP error rather than later when the response body is parsed
r.raise_for_status()

In [None]:
# Build a data frame from the 'results' entry of the JSON response
results = r.json()['results']
x = pd.DataFrame.from_dict(results)

In [None]:
# Preview the first rows of the API results
x.head()

In [None]:
# Load the locally saved temperature file (NOAA Orlando MCO, per the file name)
temps = pd.read_csv('../data/noaa_orlando_mco_temps.csv')

In [None]:
# Preview the first rows of the temperature data
temps.head()

#### Labor/Economic Statistics

In [None]:
# us unemployment
# Read columns A:M (a 'Year' column plus one column per month abbreviation), skipping the 11 header rows
us_unemp = pd.read_excel('../data/us_unemployment_rates.xlsx', usecols='A:M', skiprows=11)
# Reshape wide to long: every column other than 'Year' becomes a month_of_year row
us_unemp = us_unemp.melt(id_vars='Year', var_name='month_of_year', value_name='unemployment_pct')
# Convert month abbreviations ('Jan', ...) to month numbers. Assign the column directly:
# ``.loc[:, col] = ...`` sets values in place and leaves it as object dtype on recent pandas.
us_unemp['month_of_year'] = pd.to_datetime(us_unemp['month_of_year'], format='%b').dt.month
us_unemp = us_unemp.rename(columns={'Year': 'year_of_calendar'})
us_unemp.tail()

In [None]:
# fl unemployment
# Read columns A:M (a 'Year' column plus one column per month abbreviation), skipping the 10 header rows
fl_unemp = pd.read_excel('../data/fl_unemployment.xlsx', usecols='A:M', skiprows=10)
# Reshape wide to long: every column other than 'Year' becomes a month_of_year row
fl_unemp = fl_unemp.melt(id_vars='Year', var_name='month_of_year', value_name='unemployment_pct')
# Convert month abbreviations ('Jan', ...) to month numbers. Assign the column directly:
# ``.loc[:, col] = ...`` sets values in place and leaves it as object dtype on recent pandas.
fl_unemp['month_of_year'] = pd.to_datetime(fl_unemp['month_of_year'], format='%b').dt.month
fl_unemp = fl_unemp.rename(columns={'Year': 'year_of_calendar'})
fl_unemp.head()

In [None]:
# us cpi
# Read columns A:M (a 'Year' column plus one column per month abbreviation), skipping the 11 header rows
us_cpi = pd.read_excel('../data/us_cpi.xlsx', usecols='A:M', skiprows=11)
# Reshape wide to long: every column other than 'Year' becomes a month_of_year row
us_cpi = us_cpi.melt(id_vars='Year', var_name='month_of_year', value_name='cpi')
# Convert month abbreviations ('Jan', ...) to month numbers. Assign the column directly:
# ``.loc[:, col] = ...`` sets values in place and leaves it as object dtype on recent pandas.
us_cpi['month_of_year'] = pd.to_datetime(us_cpi['month_of_year'], format='%b').dt.month
us_cpi = us_cpi.rename(columns={'Year': 'year_of_calendar'})
us_cpi.head()