In [1]:
import pandas as pd
import numpy as np

# (city, state) pairs used to filter the pattern files on their
# `city` and `region` columns
cities = [
    ('Atlanta', 'GA'),
    ('New York', 'NY'),
    ('New Orleans', 'LA'),
    ('Seattle', 'WA'),
    ('Detroit', 'MI'),
]

# MM-DD stamps that appear in the 2020 weekly-patterns file names
weeks = ['03-01', '03-08', '03-15', '03-22',
         '03-29', '04-05', '04-12', '04-19']

In [21]:
# filter and concatenate weekly data
# Collect the per-city slices in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration and starts from an empty DataFrame.
weekly_subsets = []
for week in weeks:
    # the 04-05 week is read from the file carrying the '-corrected' suffix
    if week == '04-05':
        filepath = './v1/main-file/2020-04-05-weekly-patterns-corrected.csv.gz'
    else:
        filepath = './v1/main-file/2020-{}-weekly-patterns.csv.gz'.format(week)
    weekly_df = pd.read_csv(filepath)
    for city, state in cities:
        # keep rows whose city AND state both match (city names alone are ambiguous)
        weekly_subsets.append(
            weekly_df[(weekly_df.city == city) & (weekly_df.region == state)])
    # release the full weekly file before the next one is read
    del weekly_df

# pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(weekly_subsets) if weekly_subsets else pd.DataFrame()
del weekly_subsets

# save to pickle
# NOTE: the file is a gzip-compressed pickle even though its name ends in .csv.gz
df.to_pickle('./data/weekly_cities_03-01_through_04-19.csv.gz',
             compression='gzip')

### Checkpoint - load from here for filtered concatenated weekly data

In [52]:
# reload the filtered weekly frame written by the previous cell
# (a gzip-compressed pickle, despite the .csv.gz file name)
df = pd.read_pickle(
    './data/weekly_cities_03-01_through_04-19.csv.gz',
    compression='gzip',
)

In [62]:
def vis_list(x):
    """Parse a bracketed, comma-separated string into an integer array.

    Parameters
    ----------
    x : str
        Text of the form '[1,2,3]'; the first and last characters are
        discarded without being checked.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of ints (an array, not a Python list).
    """
    # strip the enclosing brackets, then let numpy split on the commas
    inner = x[1:-1]
    return np.fromstring(inner, sep=',', dtype=int)

def filter_cities(dataset, city_list=None):
    """Return the rows of `dataset` that belong to the cities of interest.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a `city` column and a `region` (state) column.
    city_list : iterable of (city, state) tuples, optional
        Pairs to keep. Defaults to the module-level `cities` list, which
        is what the function always used before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Matching rows grouped in the order of `city_list`, keeping their
        original index labels. If `city_list` is empty, an empty frame
        with the columns of `dataset` is returned.
    """
    if city_list is None:
        city_list = cities

    # Build every slice first and concatenate once; concatenating inside the
    # loop copies the accumulated rows again on each pass.
    subsets = [
        dataset[(dataset['city'] == city) & (dataset['region'] == state)]
        for city, state in city_list
    ]
    if not subsets:
        # pd.concat raises on an empty list
        return dataset.iloc[0:0]
    return pd.concat(subsets)

def days_and_dates(data, time_type):
    """Expand each row's `visits_by_day` string into one row per day.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `visits_by_day` (strings such as '[1,2,3]') and
        `date_range_start`. The frame is NOT modified; earlier versions
        added a `visits_zip` column to it as a side effect.
    time_type : {'seconds', 'YYYY-MM-DD'}
        Format of `date_range_start`: 'seconds' for a count of seconds since
        1970-01-01, 'YYYY-MM-DD' for text whose first ten characters are the
        calendar date.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column (except any `visits_zip`)
        plus:
        `days` (timedelta offset of the entry within `visits_by_day`),
        `visits` (the entry itself), `start_date` (parsed
        `date_range_start`) and `date` (`start_date + days`).

    Raises
    ------
    AssertionError
        If `time_type` is not one of the two accepted values.
    """
    assert time_type=='seconds' or time_type=='YYYY-MM-DD', \
            'time_type must be \"seconds\" or \"YYYY-MM-DD\" '

    # every column except the helper column becomes part of the row key
    other_cols = [col for col in data.columns if col != 'visits_zip']

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    parsed = data.assign(visits_zip=data['visits_by_day'].apply(vis_list))

    # Create 1 row for each day
    # Adapted from:
    # https://stackoverflow.com/questions/53860398/
    # pandas-dataframe-how-do-i-split-one-row-into-multiple-rows-by-multi-value-colum
    #
    # apply(pd.Series) spreads each array across columns 0..n-1; stack() turns
    # those column labels into a trailing, unnamed index level, which
    # reset_index() names 'level_<number of key columns>'.
    day_level = 'level_{}'.format(len(other_cols))
    long_form = parsed.set_index(other_cols)['visits_zip'] \
                      .apply(pd.Series).stack().reset_index() \
                      .rename(columns={0: 'visits', day_level: 'days'})

    if time_type == 'seconds':
        # For historical data, timestamp in UTC in seconds since January 1, 1970
        long_form['start_date'] = pd.to_datetime(long_form['date_range_start'],
                                                 unit='s')
    else:
        # For new weekly data, ISO 8601 format of YYYY-MM-DDTHH:mm:SS±hh:mm
        # (local time with offset from GMT)
        # The start time will be 12 a.m. Sunday in local time.
        # Only the leading YYYY-MM-DD is kept, so the offset is ignored.
        long_form['start_date'] = pd.to_datetime(
            long_form['date_range_start'].str[:10])

    # convert the positional day offset to a time delta
    long_form['days'] = pd.to_timedelta(long_form['days'], unit='D')

    # compute date
    long_form['date'] = long_form['start_date'] + long_form['days']

    return long_form


In [55]:
# expand the 2020 weekly frame to one row per day; its date_range_start
# values are ISO-style strings, hence time_type='YYYY-MM-DD'
df = days_and_dates(data=df, time_type='YYYY-MM-DD')

In [59]:
# persist the per-day 2020 frame as a gzip-compressed pickle
# (the file name ends in .csv.gz but the content is a pickle)
df.to_pickle(
    path='./data/daily_cities_03-01-2020_04-25-2020.csv.gz',
    compression='gzip',
)

In [4]:
# read in historical data for march 2019
# The month is delivered as three part files in one directory; build the
# paths from the part number instead of repeating the full literal.
mar19_dir = './data/Mar19-AllPatterns-PATTERNS-2019_03-2020-03-23'
mar19 = pd.concat([
    pd.read_csv('{}/patterns-part{}.csv.gz'.format(mar19_dir, part),
                compression='gzip')
    for part in (1, 2, 3)
])

In [35]:
# March 2019: keep only the cities of interest, then expand to one row per
# day (date_range_start is given in epoch seconds for the historical files)
mar19 = days_and_dates(filter_cities(mar19), time_type='seconds')

In [60]:
# persist the per-day March 2019 frame as a gzip-compressed pickle
# (the file name ends in .csv.gz but the content is a pickle)
mar19.to_pickle(
    path='./data/daily_cities_03-01-2019_03-31-2019.csv.gz',
    compression='gzip',
)

In [61]:
# read in historical data for april 2019
# The month is delivered as three part files in one directory; build the
# paths from the part number instead of repeating the full literal.
apr19_dir = './data/Apr19-AllPatterns-PATTERNS-2019_04-2020-03-23'
apr19 = pd.concat([
    pd.read_csv('{}/patterns-part{}.csv.gz'.format(apr19_dir, part),
                compression='gzip')
    for part in (1, 2, 3)
])

In [66]:
# April 2019: keep only the cities of interest, then expand to one row per
# day (date_range_start is given in epoch seconds for the historical files)
apr19 = days_and_dates(filter_cities(apr19), time_type='seconds')

In [69]:
# persist the per-day April 2019 frame as a gzip-compressed pickle
# (the file name ends in .csv.gz but the content is a pickle)
apr19.to_pickle(
    path='./data/daily_cities_04-01-2019_04-30-2019.csv.gz',
    compression='gzip',
)

### Checkpoint - load from here for filtered, processed daily data from March-April 2019 and 2020

In [3]:
# reload the three per-day frames (2020, March 2019, April 2019);
# each file is a gzip-compressed pickle despite its .csv.gz name
df, mar19, apr19 = (
    pd.read_pickle(checkpoint, compression='gzip')
    for checkpoint in (
        './data/daily_cities_03-01-2020_04-25-2020.csv.gz',
        './data/daily_cities_03-01-2019_03-31-2019.csv.gz',
        './data/daily_cities_04-01-2019_04-30-2019.csv.gz',
    )
)

In [4]:
# stack March and April 2019 row-wise into a single 2019 frame
old = pd.concat([mar19, apr19], axis=0)

In [5]:
# add an 'MM-DD' key to both years so the same calendar day can be matched
# across 2019 and 2020
for frame in (old, df):
    frame['date_yearless'] = frame['date'].dt.strftime('%m-%d')

In [12]:
# inner-join 2020 rows to 2019 rows on calendar day and place id;
# overlapping column names get '_new' (2020) and '_old' (2019) suffixes
join_keys = ['date_yearless', 'safegraph_place_id']
data = df.join(
    old.set_index(join_keys),
    on=join_keys,
    how='inner',
    lsuffix='_new',
    rsuffix='_old',
    sort=True,
)

In [13]:
# remove and rename duplicate columns after join
# A column that exists in both years is collapsed back to its bare name only
# when every joined row holds the same value on both sides.
# NOTE(review): NaN != NaN, so a pair with missing values in the same rows is
# kept as separate _new/_old columns -- confirm that this is intended.
for col in old.columns:
    new_name, old_name = col + '_new', col + '_old'
    if new_name in data.columns and old_name in data.columns:
        # vectorised .all() instead of the builtin all(), which walks the
        # comparison result one element at a time in Python
        if (data[new_name] == data[old_name]).all():
            data = data.rename(columns={new_name: col}).drop(columns=[old_name])

# rename columns with only one source frame
# Collect every rename first and apply them in one call, so the frame is
# copied once rather than once per renamed column.
single_source = {}
for col in data.columns:
    in_new, in_old = col in df.columns, col in old.columns
    if in_new and not in_old:
        single_source[col] = col + '_new'
    elif in_old and not in_new:
        single_source[col] = col + '_old'
data = data.rename(columns=single_source)

In [23]:
# persist the joined 2019/2020 frame as a gzip-compressed pickle
# (the file name ends in .csv.gz but the content is a pickle)
data.to_pickle(
    path='./data/joined_03-01_04-25.csv.gz',
    compression='gzip',
)

### Checkpoint - load from here for joined 2019/2020 dataset

In [24]:
# reload the joined 2019/2020 frame (gzip-compressed pickle, despite the name)
data = pd.read_pickle(
    './data/joined_03-01_04-25.csv.gz',
    compression='gzip',
)

In [4]:
# view earliest date
# data.date_yearless.min()

In [5]:
# view latest date
# data.date_yearless.max()

In [3]:
# view all columns
# sorted(data.columns)

In [1]:
# view data overview
# data.head(10)

In [2]:
# view rows where the location name has changed
# data.loc[data['location_name_new']!= \
#          data['location_name_old']][['location_name_new', 'location_name_old']]