# Web Scrapping Analysis

This notebook walks through the process of pulling, merging, and cleaning various datasets about climbing gyms around the US.

In [1]:
import pandas as pd
import numpy as np
import re
import requests
import json
from key import GKEY
import numpy as np


In [2]:
#load the scrapped data from mountainProject(mP)
df = pd.read_json('data.json', orient='index')

df

Unnamed: 0,locName,rating,img_list,gym_info
Auburn University Recreation and Well…,Auburn University Recreation and Well…,4.7,[],"[, 601 Heisman Dr, Auburn, AL 36849 ]"
Birmingham Boulders,Birmingham Boulders,4.9,[https://cdn2.apstatic.com/photos/climb/119942...,"[www.boulderingauthority.com, 136 INDUSTRIAL ..."
Campus Recreation Center,Campus Recreation Center,4.0,[],"[studentaffairs.uab.edu, University of Alabam..."
Faucett Brothers Activity Center,Faucett Brothers Activity Center,4.6,[],"[www.tcpara.org, 13040 Eugenia Faucett Drive ..."
First Avenue Rocks,First Avenue Rocks,4.8,[],"[firstaverocks.com, 2417 1st Avenue South, Bi..."
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,4.5,[],"[www.ftruckermwr.com, Bldg. 5900 on the corne..."
Gadrock,Gadrock,4.9,[],"[www.climbgadrock.com, 1403 Rainbow Drive Gad..."
High Point Climbing - Birmingham,High Point Climbing - Birmingham,4.7,[],"[highpointclimbing.com, 4766 US-280, Birmingh..."
High Point Climbing - Huntsville,High Point Climbing - Huntsville,4.7,[https://cdn2.apstatic.com/photos/climb/118309...,"[highpointclimbing.com, 1020 Nunnahsae Park D..."
University of Alabama,University of Alabama,Admi,[],"[urec.ua.edu, 401 5th Ave East, Tuscaloosa, A..."


In [3]:
#list how many picutures were on mP
df['NumberOfImages'] = df.img_list.map(len)

In [4]:
#make data more pandas friendly

def getFirstItem(gym_info: list):
    try:
        return gym_info[0]
    except:
        return ""

def getSecondItem(gym_info: list):
    try:
        return gym_info[1]
    except:
        return ""

df['gymURL'] = df['gym_info'].map(getFirstItem)
df['gym_address'] = df['gym_info'].map(getSecondItem)

del df['gym_info']

In [5]:
df["google_Place_ID"] = ""
df["locLat"] = ""
df["locLong"] = ""
df["business_status"] = ""
df["google_photoReferences"] = ""
df["googleRating"] = ""
df["numUsersRated"] = ""
df["typeList"] = ""

In [6]:
#grab non-nested values with try-except
def tryToGet(searchResults, resultKey: str, isList: bool = False):
    try:
        return searchResults[resultKey]
    except:
        if not isList:
            return ""
        else:
            return []
            

In [17]:
#specify the url endpoint to send the request to
placeTextSearchURL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

for idx, row in df.iterrows():

    #assign address to variable, and create params dict
    address = row['locName'].strip('…') + " " + row['gym_address']
    params = {
                "key": GKEY,
                "query": address
             }
    #get request to location text search api endpoint
    searchResults = requests.get(placeTextSearchURL, params=params)
    
    #conv to dict for easy extract
    searchResults = searchResults.json()
    
    
    if searchResults["results"]:
        searchResults = searchResults["results"][0]

    # extract non-nested value
    df.loc[idx,"google_Place_ID"] = tryToGet(searchResults,  "place_id")
    df.loc[idx,"business_status"] = tryToGet(searchResults,  "business_status")
    df.loc[idx,"googleRating"] = tryToGet(searchResults,"rating")
    df.at[idx,"typeList"] = tryToGet(searchResults, "types", True)
    df.loc[idx,"numUsersRated"] = tryToGet(searchResults, "user_ratings_total")
    
    #Try and update cleaner name
    try:
        df.loc[idx,"locName"] = searchResults["name"]
    except:
        pass
    
    #manual try blocks for nested values
    try:
        df.loc[idx,"locLat"] = searchResults["geometry"]["location"]["lat"]
        df.loc[idx,"locLong"] = searchResults["geometry"]["location"]["lat"]
    except:
        df.loc[idx,"locLat"] = np.nan
        df.loc[idx,"locLong"] = np.nan
        
           
    try:
        photoReferenceList = []
        for photo in searchResults['photos']:
            photoReferenceList.append(photo["photo_reference"])
        df.at[idx,"google_photoReferences"] = photoReferenceList
    except:
        df.at[idx,"google_photoReferences"] = []

In [18]:
df

Unnamed: 0,locName,rating,img_list,NumberOfImages,gymURL,gym_address,google_Place_ID,locLat,locLong,business_status,google_photoReferences,googleRating,numUsersRated,typeList
Auburn University Recreation and Well…,Auburn University Campus Recreation and Wellne...,4.7,[],0,,"601 Heisman Dr, Auburn, AL 36849",ChIJ_QyXYw7zjIgRhUWzNxmHgME,32.601239,32.601239,OPERATIONAL,[ATtYBwJVVyPqmu7dRHnf0otdQiaUpA2ws6vTTtBjN38Dy...,4.7,228.0,"[gym, health, point_of_interest, establishment]"
Birmingham Boulders,Birmingham Boulders,4.9,[https://cdn2.apstatic.com/photos/climb/119942...,2,www.boulderingauthority.com,"136 INDUSTRIAL DRIVE BIRMINGHAM, AL 35211",ChIJcWRT3-QeiYgR3v4dEd-ZWh8,33.451744,33.451744,OPERATIONAL,[ATtYBwIVf1FP-0QcJ2dbFsyn0BCdpdfw47Kf9_yDTd5Od...,4.9,73.0,"[school, gym, health, point_of_interest, estab..."
Campus Recreation Center,UAB Campus Recreation,4.0,[],0,studentaffairs.uab.edu,"University of Alabama at Birmingham, 1501 Uni...",ChIJR93YI-kbiYgRvn3bM5fi5Bo,33.500956,33.500956,OPERATIONAL,[ATtYBwLkTv6WmIG1g59iRKvz0k5cE8GhTcp2CWKFR4Lxg...,4.6,343.0,"[gym, health, point_of_interest, establishment]"
Faucett Brothers Activity Center,Faucett Brothers Activity Center,4.6,[],0,www.tcpara.org,"13040 Eugenia Faucett Drive Northport, AL 35473",ChIJV0bo9psbhogRBT112cf3WL8,33.286878,33.286878,OPERATIONAL,[ATtYBwIxgnG9a5ZB8f6qlutv7NP-niUFjq9tyn5LQOl4T...,4.6,125.0,"[gym, health, point_of_interest, establishment]"
First Avenue Rocks,First Avenue Rocks,4.8,[],0,firstaverocks.com,"2417 1st Avenue South, Birmingham, Alabama 35...",ChIJccr7oL4biYgRIKx0BFFwVmQ,33.514665,33.514665,CLOSED_PERMANENTLY,[ATtYBwKZ9LaFMJ__jSPQZEMJ0Ze68NrBrcE9Zj0eKaa5-...,4.8,31.0,"[school, gym, health, point_of_interest, store..."
Fortenberry-Colton Fitness Center,,4.5,[],0,www.ftruckermwr.com,Bldg. 5900 on the corner of Skychief Street a...,,,,,[],,,[]
Gadrock,Gadrock,4.9,[],0,www.climbgadrock.com,1403 Rainbow Drive Gadsden AL United States 3...,ChIJbe-sgFo5iogRiilt7YFURhs,33.986598,33.986598,OPERATIONAL,[ATtYBwJCAa_Le5tg3RACXzd4-Wu0IXLjprWmJLEuJSMny...,4.9,79.0,"[point_of_interest, establishment]"
High Point Climbing - Birmingham,High Point Climbing and Fitness - Birmingham,4.7,[],0,highpointclimbing.com,"4766 US-280, Birmingham, AL 35242",ChIJdaFwuksWiYgRLNOnRSAvVAw,33.420654,33.420654,OPERATIONAL,[ATtYBwK9Kbkqg9skB3S6nZGmi81zG4Ommm-gtGPPDZgxy...,4.7,136.0,"[gym, health, point_of_interest, establishment]"
High Point Climbing - Huntsville,High Point Climbing and Fitness - Huntsville,4.7,[https://cdn2.apstatic.com/photos/climb/118309...,2,highpointclimbing.com,1020 Nunnahsae Park Dr. NW (4.43 mi) Huntsvil...,ChIJf9WVenppYogR7zJB9wbuYPE,34.73785,34.73785,OPERATIONAL,[ATtYBwJ6gmOf_ZXtlB-gkEuUfeOpYV0hBe7lURcGK0dIN...,4.7,206.0,"[gym, health, point_of_interest, establishment]"
University of Alabama,Student Recreation Center,Admi,[],0,urec.ua.edu,"401 5th Ave East, Tuscaloosa, Alabama 35487",ChIJE-c-ehioiIgRUKK8o5HtCs0,33.212474,33.212474,,[],,,[premise]


In [11]:
photo_df = df[['locName', ]]

    


{
    "business_status": "OPERATIONAL",
    "formatted_address": "136 Industrial Dr, Birmingham, AL 35211, United States",
    "geometry": {
        "location": {
            "lat": 33.4517437,
            "lng": -86.8511012
        },
        "viewport": {
            "northeast": {
                "lat": 33.45309402989272,
                "lng": -86.84954267010728
            },
            "southwest": {
                "lat": 33.45039437010728,
                "lng": -86.85224232989273
            }
        }
    },
    "icon": "https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/generic_business-71.png",
    "name": "Birmingham Boulders",
    "opening_hours": {
        "open_now": true
    },
    "photos": [
        {
            "height": 640,
            "html_attributions": [
                "<a href=\"https://maps.google.com/maps/contrib/109182361649883573002\">Birmingham Boulders</a>"
            ],
            "photo_reference": "ATtYBwLQ0UZHjGGIyEijkqCV9iGVB6-PMcaqA