In [10]:
import json
import pandas as pd
from collections import defaultdict
from pprint import pprint

In [11]:
def reformatJSON(file_name):
    """Flatten a JSON export into column-oriented data ready for a DataFrame.

    The file must contain a top-level ``'data'`` list of record dicts. Each
    record's fields are handled as follows:

    * ``str`` / ``float`` values are appended to that key's column as-is.
    * ``'relatedParks'`` is not turned into a column; its
      ``(states, parkCode)`` pairs are collected per record instead.
    * ``'tags'`` (a list of strings) is joined with ``','``.
    * ``'activities'``, ``'topics'`` and ``'amenities'`` (lists of dicts with
      a ``'name'`` entry) are joined by name with ``', '``.
    * Everything else (ints, bools, ``None``, dicts, other lists) is skipped.

    NOTE(review): a skipped or missing field appends nothing, so a column can
    end up shorter than the others if records are not uniform -- confirm the
    input always carries the same keys and types for every record.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        A ``(reworked_data, park_dict)`` tuple. ``reworked_data`` is a
        ``defaultdict(list)`` mapping column name -> list of values.
        ``park_dict`` is a ``defaultdict(list)`` mapping the record's
        0-based position in ``data['data']`` -> list of
        ``(states, parkCode)`` tuples; records with no related parks get
        no entry.
    """
    # Open file
    with open(file_name, 'r') as f:
        data = json.load(f)

    # List-valued fields whose elements are dicts carrying a 'name' entry.
    named_list_keys = ('activities', 'topics', 'amenities')

    reworked_data = defaultdict(list)
    park_dict = defaultdict(list)

    for record_idx, instance in enumerate(data['data']):
        for key, value in instance.items():
            # Check if type is easy
            if isinstance(value, (str, float)):
                reworked_data[key].append(value)

            # save parkCodes and states in dict so that things to do can be reformatted later
            elif key == 'relatedParks':
                for d in value:
                    park_dict[record_idx].append((d['states'], d['parkCode']))

            elif key == 'tags':
                reworked_data[key].append(','.join(value))

            elif isinstance(value, list) and key in named_list_keys:
                # Join all names into a comma separated string
                list_str = ', '.join(d['name'] for d in value)
                reworked_data[key].append(list_str)

    return reworked_data, park_dict

In [12]:
# Reformat the Park JSON file. Only the column data is needed here, so the
# second return value (the per-record related-parks mapping) is discarded.
reworked_park_data, _ = reformatJSON('Data/Parks.txt')
park_df = pd.DataFrame.from_dict(reworked_park_data)

# Reformat the Things to do JSON file. park_dict maps each record's position
# to its list of (states, parkCode) pairs; it is kept so that every thing to
# do can later be expanded into one row per related park.
reworked_todo_data, park_dict = reformatJSON('Data/Things_to_do.txt')
temp_todo_df = pd.DataFrame.from_dict(reworked_todo_data)

In [13]:
# Make new rows so that each thing to do is associated with a single park:
# a thing to do related to several parks yields one row per park, and one
# with no related parks yields no row.
todo_columns = list(temp_todo_df.columns)
# list.extend() returns None, so the extra columns are added separately here
# (skipping any that already exist to avoid duplicate column labels).
todo_columns += [col for col in ('parkCode', 'states') if col not in todo_columns]

# Collect plain records and build the frame once; concatenating inside the
# loop re-copies the whole frame on every iteration.
expanded_rows = []
for i in range(len(temp_todo_df)):
    base_row = temp_todo_df.iloc[i].to_dict()
    # .get() avoids inserting empty entries into park_dict (a defaultdict)
    # for things to do that have no related parks.
    for states, parkcode in park_dict.get(i, []):
        expanded_rows.append({**base_row, 'parkCode': parkcode, 'states': states})

# Passing the columns explicitly keeps 'parkCode'/'states' present even when
# no rows were produced, so the later merge on 'parkCode' still works.
todo_df = pd.DataFrame(expanded_rows, columns=todo_columns)

In [14]:
# Give the park-level columns a 'park_' prefix before merging, so they stay
# distinguishable from the thing-to-do columns of the same name.
park_column_renames = {
    'url': 'park_url',
    'fullName': 'park_name',
    'description': 'park_description',
    'latitude': 'park_latitude',
    'longitude': 'park_longitude',
    'directionsInfo': 'park_directionsInfo',
    'directionsUrl': 'park_directionsUrl',
    'weatherInfo': 'park_weatherInfo',
}
park_df.rename(park_column_renames, axis='columns', inplace=True)

In [15]:
# Attach the park details to every thing-to-do row (left join on parkCode,
# so rows without a matching park are kept with empty park columns).
park_merge_columns = [
    'parkCode',
    'park_url',
    'park_name',
    'park_description',
    'park_latitude',
    'park_longitude',
    'park_directionsInfo',
    'park_directionsUrl',
    'park_weatherInfo',
]
park_activ_df = todo_df.merge(park_df[park_merge_columns], how='left', on='parkCode')

park_activ_df.head()

Unnamed: 0,id,url,title,shortDescription,tags,latitude,longitude,geometryPoiId,location,seasonDescription,...,parkCode,states,park_url,park_name,park_description,park_latitude,park_longitude,park_directionsInfo,park_directionsUrl,park_weatherInfo
0,83FB2FC0-EC43-4D0B-A29C-9C3EC1D3F89A,https://www.nps.gov/thingstodo/hike-cooper-roa...,Hike Cooper Road Trail,Hike 10.9 miles (17.5 km) one-way on Cooper Ro...,"hike,Cooper Road Trail,Cooper Road",35.610746646455034,-83.93318327381866,166cdab3-554b-4409-b2f0-211650614ba0,,,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
1,293841B3-F720-4872-BB59-214183BAF81E,https://www.nps.gov/thingstodo/hike-smokemont-...,Hike Smokemont Nature Trail,Hike 0.62-mile (1 km) roundtrip on Smokemont N...,"hike,Smokemont,Smokemont Nature Trail",35.55791295835083,-83.31234539082527,19b20900-10da-4c03-a944-cd7482ddbffa,,,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
2,8CDBCFD3-ADA7-4F47-B9F5-83F11223BABB,https://www.nps.gov/thingstodo/hike-cove-hardw...,Hike Cove Hardwood Trail,Hike 0.75 miles (1.2 km) roundtrip on Cove Har...,"hike,chimneys picnic area,Cove Hardwood,Cove H...",35.63653312170052,-83.49219918251038,,,,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
3,A32A8984-9383-4F06-928B-1757D520FFE7,https://www.nps.gov/thingstodo/leconte-via-app...,Hike to Mount Le Conte on the Appalachian Trai...,The longest of the five routes to Mount Le Con...,"Appalachian Trail,boulevard trail,le conte,Cha...",35.6112434302,-83.4252812048,e8fc6a6b-597c-405e-8b5d-7012b36456c8,Expect crowds and full parking at Newfound Gap...,,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
4,C858429B-FCD7-430F-87CB-9AB47405F28C,https://www.nps.gov/thingstodo/charlies-bunion...,Hike to Charlies Bunion,Hike a popular section of the Appalachian Trai...,"Charlies Bunion,Appalachian Trail,hike,Mountai...",35.6112434302,-83.4252812048,e8fc6a6b-597c-405e-8b5d-7012b36456c8,Expect full parking at Newfound Gap Overlook—c...,,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...


In [16]:
# Remove identifier / bookkeeping columns that are not needed downstream
columns_to_drop = ['id', 'geometryPoiId', 'credit', 'relevanceScore']

smaller_park_activ_df = park_activ_df.drop(labels=columns_to_drop, axis='columns')

smaller_park_activ_df.head()

Unnamed: 0,url,title,shortDescription,tags,latitude,longitude,location,seasonDescription,accessibilityInformation,isReservationRequired,...,parkCode,states,park_url,park_name,park_description,park_latitude,park_longitude,park_directionsInfo,park_directionsUrl,park_weatherInfo
0,https://www.nps.gov/thingstodo/hike-cooper-roa...,Hike Cooper Road Trail,Hike 10.9 miles (17.5 km) one-way on Cooper Ro...,"hike,Cooper Road Trail,Cooper Road",35.610746646455034,-83.93318327381866,,,<p>The first mile of this trail from the Abram...,False,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
1,https://www.nps.gov/thingstodo/hike-smokemont-...,Hike Smokemont Nature Trail,Hike 0.62-mile (1 km) roundtrip on Smokemont N...,"hike,Smokemont,Smokemont Nature Trail",35.55791295835083,-83.31234539082527,,,<p>No designated accessible parking spaces. Th...,False,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
2,https://www.nps.gov/thingstodo/hike-cove-hardw...,Hike Cove Hardwood Trail,Hike 0.75 miles (1.2 km) roundtrip on Cove Har...,"hike,chimneys picnic area,Cove Hardwood,Cove H...",35.63653312170052,-83.49219918251038,,,"<p><a>Trail includes </a>some steep sections, ...",False,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
3,https://www.nps.gov/thingstodo/leconte-via-app...,Hike to Mount Le Conte on the Appalachian Trai...,The longest of the five routes to Mount Le Con...,"Appalachian Trail,boulevard trail,le conte,Cha...",35.6112434302,-83.4252812048,Expect crowds and full parking at Newfound Gap...,,<p>Trail is steep with occasional obstacles li...,False,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...
4,https://www.nps.gov/thingstodo/charlies-bunion...,Hike to Charlies Bunion,Hike a popular section of the Appalachian Trai...,"Charlies Bunion,Appalachian Trail,hike,Mountai...",35.6112434302,-83.4252812048,Expect full parking at Newfound Gap Overlook—c...,,<p>Hike has many steep sections plus roots and...,False,...,grsm,"NC,TN",https://www.nps.gov/grsm/index.htm,Great Smoky Mountains National Park,Ridge upon ridge of forest straddles the borde...,35.60116374,-83.50818326,Great Smoky Mountains National Park straddles ...,http://www.nps.gov/grsm/planyourvisit/directio...,Elevations in the park range from approximatel...


In [17]:
# Save to csv. With the default arguments to_csv also writes the DataFrame
# index as the first, unnamed column.
# NOTE(review): pass index=False if readers of this file do not expect it.
smaller_park_activ_df.to_csv('Data/Park Activities.csv')