# Web Scraping Analysis

This notebook walks through the process of pulling, merging, and cleaning various datasets about climbing gyms around the US.

In [1]:
import pandas as pd
import numpy as np
import re
import requests
import json
from key import GKEY
import numpy as np


In [2]:
# Load the data scraped from Mountain Project (mP).
# orient='index' turns each top-level JSON key into a row label.
df = pd.read_json(path_or_buf='data.json', orient='index')

df

Unnamed: 0,locName,img_list,gInfo,gym_info
Campus Recreation Center,Campus Recreation Center,[],"{'geoLoc': [26.08135, -80.23653019999999], 'ra...","[studentaffairs.uab.edu, University of Alabam..."
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,[],"{'geoLoc': [31.3242639, -85.7180348], 'rating'...","[www.ftruckermwr.com, Bldg. 5900 on the corne..."
University of South Alabama - Student…,University of South Alabama - Student…,[],"{'geoLoc': [30.6959406, -88.184236], 'rating':...","[www.southalabama.edu, 307 N University Blvd,..."
High Point Climbing - Birmingham,High Point Climbing - Birmingham,[],"{'geoLoc': [33.4206544, -86.6982553], 'rating'...","[highpointclimbing.com, 4766 US-280, Birmingh..."
Birmingham Boulders,Birmingham Boulders,[https://cdn2.apstatic.com/photos/climb/119942...,"{'geoLoc': [33.4517437, -86.8511012], 'rating'...","[www.boulderingauthority.com, 136 INDUSTRIAL ..."
Gadrock,Gadrock,[],"{'geoLoc': [33.9865983, -86.0056007], 'rating'...","[www.climbgadrock.com, 1403 Rainbow Drive Gad..."
First Avenue Rocks,First Avenue Rocks,[],"{'geoLoc': [33.5146648, -86.7975348], 'rating'...","[firstaverocks.com, 2417 1st Avenue South, Bi..."
High Point Climbing - Huntsville,High Point Climbing - Huntsville,[https://cdn2.apstatic.com/photos/climb/118309...,"{'geoLoc': [34.73785, -86.66708129999999], 'ra...","[highpointclimbing.com, 1020 Nunnahsae Park D..."
Auburn University Recreation and Well…,Auburn University Recreation and Well…,[],"{'geoLoc': [32.6012386, -85.4929846], 'rating'...","[, 601 Heisman Dr, Auburn, AL 36849 ]"
Faucett Brothers Activity Center,Faucett Brothers Activity Center,[],"{'geoLoc': [33.2868784, -87.6325483], 'rating'...","[www.tcpara.org, 13040 Eugenia Faucett Drive ..."


In [3]:
# Count how many pictures each gym had on mP (length of its img_list).
df['NumberOfImages'] = df['img_list'].map(len)

In [4]:
#make data more pandas friendly

def getFirstItem(gym_info: list):
    """Return the first entry of ``gym_info``, or ``""`` if it has none.

    Args:
        gym_info: Sequence to read position 0 from.

    Returns:
        ``gym_info[0]`` when that lookup succeeds; otherwise an empty string
        (empty sequence, or a value that cannot be indexed such as ``None``
        or ``NaN``).
    """
    try:
        return gym_info[0]
    # Only lookup failures fall back to "". A bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit while mapping over a column.
    except (IndexError, KeyError, TypeError):
        return ""

def getSecondItem(gym_info: list):
    """Return the second entry of ``gym_info``, or ``""`` if it has none.

    Args:
        gym_info: Sequence to read position 1 from.

    Returns:
        ``gym_info[1]`` when that lookup succeeds; otherwise an empty string
        (fewer than two entries, or a value that cannot be indexed such as
        ``None`` or ``NaN``).
    """
    try:
        return gym_info[1]
    # Only lookup failures fall back to "". A bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit while mapping over a column.
    except (IndexError, KeyError, TypeError):
        return ""

# Split the gym_info list into dedicated columns (first entry -> gymURL,
# second entry -> gym_address) and then drop the original column.
# Guarded so the cell can be re-run: once 'gym_info' has been removed there
# is nothing left to split, and an unguarded re-run would raise KeyError.
if 'gym_info' in df.columns:
    df['gymURL'] = df['gym_info'].map(getFirstItem)
    df['gym_address'] = df['gym_info'].map(getSecondItem)

    del df['gym_info']

In [5]:
# Create the columns that hold the Google lookup results, each one
# starting out as an empty string.
for _placeholder_col in (
    "google_Place_ID",
    "locLat",
    "locLong",
    "business_status",
    "google_photoReferences",
    "googleRating",
    "numUsersRated",
    "typeList",
):
    df[_placeholder_col] = ""

In [6]:
#grab non-nested values with try-except
def tryToGet(searchResults, resultKey: str, isList: bool = False):
    """Look up ``resultKey`` in ``searchResults`` with an empty fallback.

    Args:
        searchResults: Mapping (or other subscriptable) to read from.
        resultKey: Key to look up.
        isList: When True the fallback is ``[]``; otherwise it is ``""``.

    Returns:
        ``searchResults[resultKey]`` when the lookup succeeds, else the
        empty fallback selected by ``isList``.
    """
    try:
        return searchResults[resultKey]
    # Only lookup failures fall back to the default. A bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit.
    except (KeyError, IndexError, TypeError):
        return [] if isList else ""
            

In [7]:
#specify the url endpoint to send the request to
placeTextSearchURL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

# Attributes of each match that this notebook is interested in. Kept for
# reference only: the list is not sent with the request below. Defined once
# here instead of being rebuilt on every loop iteration.
fields = [
    "place_id",
    "business_status",
    "rating",
    "types",
    "user_ratings_total",
    "name",
    "geometry",
    'photos'
]

# Look every gym up by name + address and copy the first match's details
# into the placeholder columns of df.
for idx, row in df.iterrows():

    # Build the free-text query; strip('…') removes the ellipsis left on
    # truncated scraped names.
    address = row['locName'].strip('…') + " " + row['gym_address']
    params = {
        "key": GKEY,
        "query": address
    }
    #get request to location text search api endpoint
    searchResults = requests.get(placeTextSearchURL, params=params)

    #conv to dict for easy extract
    searchResults = searchResults.json()

    # Keep only the first match. With no matches (or no "results" key) the
    # raw response is kept, so every lookup below falls back to its empty
    # default instead of raising.
    if searchResults.get("results"):
        searchResults = searchResults["results"][0]

    # extract non-nested value
    df.loc[idx,"google_Place_ID"] = tryToGet(searchResults,  "place_id")
    df.loc[idx,"business_status"] = tryToGet(searchResults,  "business_status")
    df.loc[idx,"googleRating"] = tryToGet(searchResults,"rating")
    # .at (not .loc) so the list is stored as a single cell value
    df.at[idx,"typeList"] = tryToGet(searchResults, "types", True)
    df.loc[idx,"numUsersRated"] = tryToGet(searchResults, "user_ratings_total")

    # Prefer the name from the match when present; otherwise keep the
    # scraped name.
    if "name" in searchResults:
        df.loc[idx,"locName"] = searchResults["name"]

    # Nested coordinates: latitude is under "lat", longitude under "lng".
    # (Previously both columns were filled from "lat".)
    try:
        location = searchResults["geometry"]["location"]
        lat, lng = location["lat"], location["lng"]
    except (KeyError, TypeError):
        lat, lng = np.nan, np.nan
    df.loc[idx,"locLat"] = lat
    df.loc[idx,"locLong"] = lng

    # Collect the photo references; any missing key leaves the gym with an
    # empty list.
    try:
        photoReferenceList = [photo["photo_reference"] for photo in searchResults['photos']]
    except (KeyError, TypeError):
        photoReferenceList = []
    df.at[idx,"google_photoReferences"] = photoReferenceList

In [19]:
# Inspect the gInfo entry of the second row (position 1).
df['gInfo'].iloc[1]

{'geoLoc': [31.3242639, -85.7180348], 'rating': 4.5}

In [9]:
# Side-by-side view of the photo sources per gym: the mP image list and the
# Google photo references.
photo_columns = ['locName', 'img_list', 'google_photoReferences']
photo_df = df.loc[:, photo_columns]

photo_df


Unnamed: 0,locName,img_list,google_photoReferences
Auburn University Recreation and Well…,Auburn University Campus Recreation and Wellne...,[],[ATtYBwJ2yiriwEs8bdKyTeobOdlmsGIcFpSmgfy1_zR-x...
Birmingham Boulders,Birmingham Boulders,[https://cdn2.apstatic.com/photos/climb/119942...,[ATtYBwJE6cPB56nn4lul-Nr_QeWWVBycpObxMOzW7QY7y...
Campus Recreation Center,UAB Campus Recreation,[],[ATtYBwJ-zH4Q7w7X_cqnNKXS-aWjcmG__2U0fecxUaFV7...
Faucett Brothers Activity Center,Faucett Brothers Activity Center,[],[ATtYBwJLw0bA-sTNa0oHN1DBrG0LQVvJPpX2AYGxWX57r...
First Avenue Rocks,First Avenue Rocks,[],[ATtYBwKB97rwACz1Hb6PNmSBQF3HsExACnMXIqofS6Qoq...
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,[],[]
Gadrock,Gadrock,[],[ATtYBwKucZPe49A7x6AzaBzM6DNN1XGqYceV2rcNbs5Zh...
High Point Climbing - Birmingham,High Point Climbing and Fitness - Birmingham,[],[ATtYBwJuxtk6JGtedCEAlG9apIDE8Ccih5JaogK9bQVR1...
High Point Climbing - Huntsville,High Point Climbing and Fitness - Huntsville,[https://cdn2.apstatic.com/photos/climb/118309...,[ATtYBwK4x7d4Rhb-lySWqr9hH3gk5bOsSRJW1ZqK4Xrki...
University of Alabama,Student Recreation Center,[],[]


{
    "business_status": "OPERATIONAL",
    "formatted_address": "136 Industrial Dr, Birmingham, AL 35211, United States",
    "geometry": {
        "location": {
            "lat": 33.4517437,
            "lng": -86.8511012
        },
        "viewport": {
            "northeast": {
                "lat": 33.45309402989272,
                "lng": -86.84954267010728
            },
            "southwest": {
                "lat": 33.45039437010728,
                "lng": -86.85224232989273
            }
        }
    },
    "icon": "https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/generic_business-71.png",
    "name": "Birmingham Boulders",
    "opening_hours": {
        "open_now": true
    },
    "photos": [
        {
            "height": 640,
            "html_attributions": [
                "<a href=\"https://maps.google.com/maps/contrib/109182361649883573002\">Birmingham Boulders</a>"
            ],
            "photo_reference": "ATtYBwLQ0UZHjGGIyEijkqCV9iGVB6-PMcaqA