# Web Scrapping Analysis

This notebook walks through the process of pulling, merging, and cleaning various datasets about climbing gyms around the US.

In [1]:
import pandas as pd
import numpy as np
import re
import requests
import json
from key import GKEY
import numpy as np


In [2]:
#load the scrapped data from mountainProject(mP)
df = pd.read_json('data.json', orient='index')

df

Unnamed: 0,locName,img_list,gInfo,gym_info
Campus Recreation Center,Campus Recreation Center,[],"{'geoLoc': [26.08135, -80.23653019999999], 'ra...","[studentaffairs.uab.edu, University of Alabam..."
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,[],"{'geoLoc': [31.3242639, -85.7180348], 'rating'...","[www.ftruckermwr.com, Bldg. 5900 on the corne..."
University of South Alabama - Student…,University of South Alabama - Student…,[],"{'geoLoc': [30.6959406, -88.184236], 'rating':...","[www.southalabama.edu, 307 N University Blvd,..."
High Point Climbing - Birmingham,High Point Climbing - Birmingham,[],"{'geoLoc': [33.4206544, -86.6982553], 'rating'...","[highpointclimbing.com, 4766 US-280, Birmingh..."
Birmingham Boulders,Birmingham Boulders,[https://cdn2.apstatic.com/photos/climb/119942...,"{'geoLoc': [33.4517437, -86.8511012], 'rating'...","[www.boulderingauthority.com, 136 INDUSTRIAL ..."
Gadrock,Gadrock,[],"{'geoLoc': [33.9865983, -86.0056007], 'rating'...","[www.climbgadrock.com, 1403 Rainbow Drive Gad..."
First Avenue Rocks,First Avenue Rocks,[],"{'geoLoc': [33.5146648, -86.7975348], 'rating'...","[firstaverocks.com, 2417 1st Avenue South, Bi..."
High Point Climbing - Huntsville,High Point Climbing - Huntsville,[https://cdn2.apstatic.com/photos/climb/118309...,"{'geoLoc': [34.73785, -86.66708129999999], 'ra...","[highpointclimbing.com, 1020 Nunnahsae Park D..."
Auburn University Recreation and Well…,Auburn University Recreation and Well…,[],"{'geoLoc': [32.6012386, -85.4929846], 'rating'...","[, 601 Heisman Dr, Auburn, AL 36849 ]"
Faucett Brothers Activity Center,Faucett Brothers Activity Center,[],"{'geoLoc': [33.2868784, -87.6325483], 'rating'...","[www.tcpara.org, 13040 Eugenia Faucett Drive ..."


In [3]:
#list how many picutures were on mP
df['NumberOfImages'] = df.img_list.map(len)

In [4]:
#make data more pandas friendly

def getFirstItem(gym_info: list):
    try:
        return gym_info[0]
    except:
        return ""

def getSecondItem(gym_info: list):
    try:
        return gym_info[1]
    except:
        return ""

df['gymURL'] = df['gym_info'].map(getFirstItem)
df['gym_address'] = df['gym_info'].map(getSecondItem)

del df['gym_info']

In [5]:
df["google_Place_ID"] = ""
df["locLat"] = ""
df["locLong"] = ""
df["business_status"] = ""
df["google_photoReferences"] = ""
df["googleRating"] = ""
df["numUsersRated"] = ""
df["typeList"] = ""

In [6]:
#grab non-nested values with try-except
def tryToGet(searchResults, resultKey: str, isList: bool = False):
    try:
        return searchResults[resultKey]
    except:
        if not isList:
            return ""
        else:
            return []
            

In [57]:
#specify the url endpoint to send the request to
placeTextSearchURL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
fields = [
        "place_id",
        "business_status",
        "rating",
        "types",
        "user_ratings_total",
        "name",
        "geometry",
        'photos'
    ]

for idx, row in df.iterrows():

    #assign address to variable, and create params dict
    address = row['locName'].strip('…') + " " + row['gym_address']
    
    params = {
                "key": GKEY,
                "query": address,
                'fields':fields
             }
    #get request to location text search api endpoint
    searchResults = requests.get(placeTextSearchURL, params=params)
    
    #conv to dict for easy extract
    searchResults = searchResults.json()
    
    #if results are empty, attmept a second search with just name
    if not searchResults["results"]:
        params.update({'query': row['locName'].strip('…')})
        searchResults = requests.get(placeTextSearchURL, params=params).json()
    
    if searchResults["results"]:
        searchResults = searchResults["results"][0]

    # extract non-nested value
    df.loc[idx,"google_Place_ID"] = tryToGet(searchResults,  "place_id")
    df.loc[idx,"business_status"] = tryToGet(searchResults,  "business_status")
    df.loc[idx,"googleRating"] = tryToGet(searchResults,"rating")
    df.at[idx,"typeList"] = tryToGet(searchResults, "types", True)
    df.loc[idx,"numUsersRated"] = tryToGet(searchResults, "user_ratings_total")
    
    #Try and update cleaner name
    try:
        df.loc[idx,"locName"] = searchResults["name"]
    except:
        pass
    
    #manual try blocks for nested values
    try:
        df.loc[idx,"locLat"] = searchResults["geometry"]["location"]["lat"]
        df.loc[idx,"locLong"] = searchResults["geometry"]["location"]["lat"]
    except:
        df.loc[idx,"locLat"] = np.nan
        df.loc[idx,"locLong"] = np.nan
        
           
    try:
        photoReferenceList = []
        for photo in searchResults['photos']:
            photoReferenceList.append(photo["photo_reference"])
        df.at[idx,"google_photoReferences"] = photoReferenceList
    except:
        df.at[idx,"google_photoReferences"] = []

In [58]:
df

Unnamed: 0,locName,img_list,gInfo,NumberOfImages,gymURL,gym_address,google_Place_ID,locLat,locLong,business_status,google_photoReferences,googleRating,numUsersRated,typeList
Campus Recreation Center,UAB Campus Recreation,[],"{'geoLoc': [26.08135, -80.23653019999999], 'ra...",0,studentaffairs.uab.edu,"University of Alabama at Birmingham, 1501 Uni...",ChIJR93YI-kbiYgRvn3bM5fi5Bo,33.500956,33.500956,OPERATIONAL,[ATtYBwLGAWXpVf9ekPrO-UEtML6Im0pdWZP56CsTxMCVD...,4.6,343.0,"[gym, health, point_of_interest, establishment]"
Fortenberry-Colton Fitness Center,Fortenberry-Colton Physical Fitness Center,[],"{'geoLoc': [31.3242639, -85.7180348], 'rating'...",0,www.ftruckermwr.com,Bldg. 5900 on the corner of Skychief Street a...,ChIJgbAAIFdkkogRdo8uPDRqM7U,31.324264,31.324264,OPERATIONAL,[ATtYBwK7HKyV0WzCUmYKtJw1c2-_bEEWp8jTlbUo3FFNP...,4.5,52.0,"[gym, health, point_of_interest, establishment]"
University of South Alabama - Student…,307 N University Blvd,[],"{'geoLoc': [30.6959406, -88.184236], 'rating':...",0,www.southalabama.edu,"307 N University Blvd, Mobile, Alabama 36688-...",ChIJNf7vQamym4gRkPQKC3B5dv8,30.695525,30.695525,,[],,,[street_address]
High Point Climbing - Birmingham,High Point Climbing and Fitness - Birmingham,[],"{'geoLoc': [33.4206544, -86.6982553], 'rating'...",0,highpointclimbing.com,"4766 US-280, Birmingham, AL 35242",ChIJdaFwuksWiYgRLNOnRSAvVAw,33.420654,33.420654,OPERATIONAL,[ATtYBwLlU6UklPCxhhf4S7ebNPfvSZtQFltLXEmh4gEn6...,4.7,137.0,"[gym, health, point_of_interest, establishment]"
Birmingham Boulders,Birmingham Boulders,[https://cdn2.apstatic.com/photos/climb/119942...,"{'geoLoc': [33.4517437, -86.8511012], 'rating'...",2,www.boulderingauthority.com,"136 INDUSTRIAL DRIVE BIRMINGHAM, AL 35211",ChIJcWRT3-QeiYgR3v4dEd-ZWh8,33.451744,33.451744,OPERATIONAL,[ATtYBwLijstEM4oxMPEOY7waLZhTAciYkmsau2NrvEjDD...,4.9,74.0,"[school, gym, health, point_of_interest, estab..."
Gadrock,Gadrock,[],"{'geoLoc': [33.9865983, -86.0056007], 'rating'...",0,www.climbgadrock.com,1403 Rainbow Drive Gadsden AL United States 3...,ChIJbe-sgFo5iogRiilt7YFURhs,33.986598,33.986598,OPERATIONAL,[ATtYBwI957mmNRCNWXDSPmnhx4NlhKKQblGkMaFXm9-YU...,4.9,79.0,"[point_of_interest, establishment]"
First Avenue Rocks,First Avenue Rocks,[],"{'geoLoc': [33.5146648, -86.7975348], 'rating'...",0,firstaverocks.com,"2417 1st Avenue South, Birmingham, Alabama 35...",ChIJccr7oL4biYgRIKx0BFFwVmQ,33.514665,33.514665,CLOSED_PERMANENTLY,[ATtYBwKLcUGalyUqN6wSLYi2Qr7XNdelrT_JKifkFO6rz...,4.8,31.0,"[school, gym, health, point_of_interest, store..."
High Point Climbing - Huntsville,High Point Climbing and Fitness - Huntsville,[https://cdn2.apstatic.com/photos/climb/118309...,"{'geoLoc': [34.73785, -86.66708129999999], 'ra...",2,highpointclimbing.com,1020 Nunnahsae Park Dr. NW (4.43 mi) Huntsvil...,ChIJf9WVenppYogR7zJB9wbuYPE,34.73785,34.73785,OPERATIONAL,[ATtYBwJ-yGf-wd1I0lXAmHirfCyZCls09eM2yPOxkZOaY...,4.7,207.0,"[gym, health, point_of_interest, establishment]"
Auburn University Recreation and Well…,Auburn University Campus Recreation and Wellne...,[],"{'geoLoc': [32.6012386, -85.4929846], 'rating'...",0,,"601 Heisman Dr, Auburn, AL 36849",ChIJ_QyXYw7zjIgRhUWzNxmHgME,32.601239,32.601239,OPERATIONAL,[ATtYBwJLuB-XnR3hPlm3XrT9KczDFPHh_LfcHGSLs9Ngj...,4.7,230.0,"[gym, health, point_of_interest, establishment]"
Faucett Brothers Activity Center,Faucett Brothers Activity Center,[],"{'geoLoc': [33.2868784, -87.6325483], 'rating'...",0,www.tcpara.org,"13040 Eugenia Faucett Drive Northport, AL 35473",ChIJV0bo9psbhogRBT112cf3WL8,33.286878,33.286878,OPERATIONAL,[ATtYBwLv7pOXkwjaZmvTnsfYD9Pam76RpvKKFf2cui1Pf...,4.6,125.0,"[gym, health, point_of_interest, establishment]"


In [30]:
df.iloc[1].locName

'Fortenberry-Colton Fitness Center'

In [9]:
photo_df = df[['locName', 'img_list', 'google_photoReferences']]

photo_df


Unnamed: 0,locName,img_list,google_photoReferences
Auburn University Recreation and Well…,Auburn University Campus Recreation and Wellne...,[],[ATtYBwJ2yiriwEs8bdKyTeobOdlmsGIcFpSmgfy1_zR-x...
Birmingham Boulders,Birmingham Boulders,[https://cdn2.apstatic.com/photos/climb/119942...,[ATtYBwJE6cPB56nn4lul-Nr_QeWWVBycpObxMOzW7QY7y...
Campus Recreation Center,UAB Campus Recreation,[],[ATtYBwJ-zH4Q7w7X_cqnNKXS-aWjcmG__2U0fecxUaFV7...
Faucett Brothers Activity Center,Faucett Brothers Activity Center,[],[ATtYBwJLw0bA-sTNa0oHN1DBrG0LQVvJPpX2AYGxWX57r...
First Avenue Rocks,First Avenue Rocks,[],[ATtYBwKB97rwACz1Hb6PNmSBQF3HsExACnMXIqofS6Qoq...
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,[],[]
Gadrock,Gadrock,[],[ATtYBwKucZPe49A7x6AzaBzM6DNN1XGqYceV2rcNbs5Zh...
High Point Climbing - Birmingham,High Point Climbing and Fitness - Birmingham,[],[ATtYBwJuxtk6JGtedCEAlG9apIDE8Ccih5JaogK9bQVR1...
High Point Climbing - Huntsville,High Point Climbing and Fitness - Huntsville,[https://cdn2.apstatic.com/photos/climb/118309...,[ATtYBwK4x7d4Rhb-lySWqr9hH3gk5bOsSRJW1ZqK4Xrki...
University of Alabama,Student Recreation Center,[],[]


In [54]:
fields = [
        "place_id",
        "business_status",
        "rating",
        "types",
        "user_ratings_total",
        "name",
        "geometry",
        'photos'
    ]


param={
    'key': GKEY,
    'query':'Fortenberry-Colton Fitness Center' + ' Bldg. 5900 on the corner of Skychief Street and 5th Avenue, Fort Rucker,Alabama 36362 ',
    'fields':fields
}

results = requests.get(placeTextSearchURL, params=param)

In [55]:
rjson = results.json()
print(json.dumps(rjson, indent=4))

{
    "html_attributions": [],
    "results": [],
    "status": "ZERO_RESULTS"
}


In [56]:
if not rjson['results']:
    param.update({'query':'Fortenberry-Colton Fitness Center' })
    print(param)
    rjson = requests.get(placeTextSearchURL, params=param).json()

print(json.dumps(rjson, indent=4))

{'key': 'AIzaSyBtXRN1ZzEVH0aohANn2Xf73EiXzwxRE_s', 'query': 'Fortenberry-Colton Fitness Center', 'fields': ['place_id', 'business_status', 'rating', 'types', 'user_ratings_total', 'name', 'geometry', 'photos']}
{
    "html_attributions": [],
    "results": [
        {
            "business_status": "OPERATIONAL",
            "formatted_address": "Skychief St, Fort Rucker, AL 36362, United States",
            "geometry": {
                "location": {
                    "lat": 31.3242639,
                    "lng": -85.7180348
                },
                "viewport": {
                    "northeast": {
                        "lat": 31.32559092989271,
                        "lng": -85.71685427010728
                    },
                    "southwest": {
                        "lat": 31.32289127010727,
                        "lng": -85.71955392989271
                    }
                }
            },
            "icon": "https://maps.gstatic.com/mapfiles/place_api/ico