# Web Scrapping Analysis

This notebook walks through the process of pulling, merging, and cleaning various datasets about climbing gyms around the US.

In [2]:
import pandas as pd
import numpy as np
import re
import requests
import json
from keys import GKEY
import numpy as np


In [3]:
#load the scrapped data from mountainProject(mP)
df = pd.read_json('data.json', orient='index')

df

Unnamed: 0,locName,state,img_list,gym_info
Birmingham Boulders,Birmingham Boulders,Alabama,[https://cdn2.apstatic.com/photos/climb/119942...,"[www.boulderingauthority.com, 136 INDUSTRIAL ..."
Campus Recreation Center,Campus Recreation Center,Ohio,[],"[www.uc.edu, University of Cincinnati, 2820 B..."
Faucett Brothers Activity Center,Faucett Brothers Activity Center,Alabama,[],"[www.tcpara.org, 13040 Eugenia Faucett Drive ..."
First Avenue Rocks,First Avenue Rocks,Alabama,[],"[firstaverocks.com, 2417 1st Avenue South, Bi..."
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,Alabama,[],"[www.ftruckermwr.com, Bldg. 5900 on the corne..."
...,...,...,...,...
Elemental Training Center,Elemental Training Center,Wyoming,[],"[lmntl.net, 205 Lincoln St, Lander, Wyoming 8..."
Fall Hall,Fall Hall,Wyoming,[],"[ F.E.Warren AFB, Cheyenne, WY ]"
Gottsche Glyphs,Gottsche Glyphs,Wyoming,[],"[gottsche.org, 148 E. Arapahoe, Thermopolis, ..."
Pinedale Aquatic Center,Pinedale Aquatic Center,Wyoming,[https://cdn2.apstatic.com/photos/climb/107748...,"[www.pinedaleaquatic.com, 535 N. Tyler Ave., ..."


In [4]:
#list how many picutures were on mP
df['NumberOfImages'] = df.img_list.map(len)

In [5]:
#make data more pandas friendly

def getFirstItem(gym_info: list):
    try:
        return gym_info[0]
    except:
        return ""

def getSecondItem(gym_info: list):
    try:
        return gym_info[1]
    except:
        return ""

df['gymURL'] = df['gym_info'].map(getFirstItem)
df['gym_address'] = df['gym_info'].map(getSecondItem)

del df['gym_info']

In [15]:
df["google_Place_ID"] = ""
df["locLat"] = ""
df["locLong"] = ""
df["business_status"] = ""
df["google_photoReferences"] = ""
df["googleRating"] = ""
df["numUsersRated"] = ""
df["typeList"] = ""

In [16]:
#grab non-nested values with try-except
def tryToGet(searchResults, resultKey: str, isList: bool = False):
    try:
        return searchResults[resultKey]
    except:
        if not isList:
            return ""
        else:
            return []
            

In [17]:
df[df['state'] == 'Delaware']

Unnamed: 0,locName,state,img_list,NumberOfImages,gymURL,gym_address,google_Place_ID,locLat,locLong,business_status,google_photoReferences,googleRating,numUsersRated,typeList
Jewish Community Center Climbing Wall,Jewish Community Center Climbing Wall,Delaware,[],0,www.siegeljcc.org,"101 Garden of Eden Road, Wilmington, Delaware...",,,,,,,,
RISE Fitness + Adventure,RISE Fitness + Adventure,Delaware,[],0,,"35770 Airport Rd, Rehoboth Beach, DE 19971",,,,,,,,
The Delaware Rock Gym,The Delaware Rock Gym,Delaware,[],0,www.derockgym.com,"520 Carson Dr., Bear, Delaware 19701",,,,,,,,
University of Delaware Climbing Wall,University of Delaware Climbing Wall,Delaware,[https://cdn2.apstatic.com/photos/climb/112034...,2,www.udel.edu,"Carpenter Center, North College Street, Newar...",,,,,,,,


Index(['locName', 'state', 'img_list', 'NumberOfImages', 'gymURL',
       'gym_address', 'google_Place_ID', 'locLat', 'locLong',
       'business_status', 'google_photoReferences', 'googleRating',
       'numUsersRated', 'typeList'],
      dtype='object')

In [34]:
df.img_list.apply("".join).str.len().max()

1152

In [57]:
#specify the url endpoint to send the request to
placeTextSearchURL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
fields = [
        "place_id",
        "business_status",
        "rating",
        "types",
        "user_ratings_total",
        "name",
        "geometry",
        'photos'
    ]

for idx, row in df.iterrows():

    #assign address to variable, and create params dict
    address = row['locName'].strip('…') + " " + row['gym_address']
    
    params = {
                "key": GKEY,
                "query": address,
                'fields':fields
             }
    #get request to location text search api endpoint
    searchResults = requests.get(placeTextSearchURL, params=params)
    
    #conv to dict for easy extract
    searchResults = searchResults.json()
    
    #if results are empty, attmept a second search with just name
    if not searchResults["results"]:
        params.update({'query': row['locName'].strip('…')})
        searchResults = requests.get(placeTextSearchURL, params=params).json()
    
    if searchResults["results"]:
        searchResults = searchResults["results"][0]

    # extract non-nested value
    df.loc[idx,"google_Place_ID"] = tryToGet(searchResults,  "place_id")
    df.loc[idx,"business_status"] = tryToGet(searchResults,  "business_status")
    df.loc[idx,"googleRating"] = tryToGet(searchResults,"rating")
    df.at[idx,"typeList"] = tryToGet(searchResults, "types", True)
    df.loc[idx,"numUsersRated"] = tryToGet(searchResults, "user_ratings_total")
    
    #Try and update cleaner name
    try:
        df.loc[idx,"locName"] = searchResults["name"]
    except:
        pass
    
    #manual try blocks for nested values
    try:
        df.loc[idx,"locLat"] = searchResults["geometry"]["location"]["lat"]
        df.loc[idx,"locLong"] = searchResults["geometry"]["location"]["lat"]
    except:
        df.loc[idx,"locLat"] = np.nan
        df.loc[idx,"locLong"] = np.nan
        
    #
    try:
        photoReferenceList = []
        for photo in searchResults['photos']:
            photoReferenceList.append(photo["photo_reference"])
        df.at[idx,"google_photoReferences"] = photoReferenceList
    except:
        df.at[idx,"google_photoReferences"] = []

In [58]:
df

Unnamed: 0,locName,img_list,gInfo,NumberOfImages,gymURL,gym_address,google_Place_ID,locLat,locLong,business_status,google_photoReferences,googleRating,numUsersRated,typeList
Campus Recreation Center,UAB Campus Recreation,[],"{'geoLoc': [26.08135, -80.23653019999999], 'ra...",0,studentaffairs.uab.edu,"University of Alabama at Birmingham, 1501 Uni...",ChIJR93YI-kbiYgRvn3bM5fi5Bo,33.500956,33.500956,OPERATIONAL,[ATtYBwLGAWXpVf9ekPrO-UEtML6Im0pdWZP56CsTxMCVD...,4.6,343.0,"[gym, health, point_of_interest, establishment]"
Fortenberry-Colton Fitness Center,Fortenberry-Colton Physical Fitness Center,[],"{'geoLoc': [31.3242639, -85.7180348], 'rating'...",0,www.ftruckermwr.com,Bldg. 5900 on the corner of Skychief Street a...,ChIJgbAAIFdkkogRdo8uPDRqM7U,31.324264,31.324264,OPERATIONAL,[ATtYBwK7HKyV0WzCUmYKtJw1c2-_bEEWp8jTlbUo3FFNP...,4.5,52.0,"[gym, health, point_of_interest, establishment]"
University of South Alabama - Student…,307 N University Blvd,[],"{'geoLoc': [30.6959406, -88.184236], 'rating':...",0,www.southalabama.edu,"307 N University Blvd, Mobile, Alabama 36688-...",ChIJNf7vQamym4gRkPQKC3B5dv8,30.695525,30.695525,,[],,,[street_address]
High Point Climbing - Birmingham,High Point Climbing and Fitness - Birmingham,[],"{'geoLoc': [33.4206544, -86.6982553], 'rating'...",0,highpointclimbing.com,"4766 US-280, Birmingham, AL 35242",ChIJdaFwuksWiYgRLNOnRSAvVAw,33.420654,33.420654,OPERATIONAL,[ATtYBwLlU6UklPCxhhf4S7ebNPfvSZtQFltLXEmh4gEn6...,4.7,137.0,"[gym, health, point_of_interest, establishment]"
Birmingham Boulders,Birmingham Boulders,[https://cdn2.apstatic.com/photos/climb/119942...,"{'geoLoc': [33.4517437, -86.8511012], 'rating'...",2,www.boulderingauthority.com,"136 INDUSTRIAL DRIVE BIRMINGHAM, AL 35211",ChIJcWRT3-QeiYgR3v4dEd-ZWh8,33.451744,33.451744,OPERATIONAL,[ATtYBwLijstEM4oxMPEOY7waLZhTAciYkmsau2NrvEjDD...,4.9,74.0,"[school, gym, health, point_of_interest, estab..."
Gadrock,Gadrock,[],"{'geoLoc': [33.9865983, -86.0056007], 'rating'...",0,www.climbgadrock.com,1403 Rainbow Drive Gadsden AL United States 3...,ChIJbe-sgFo5iogRiilt7YFURhs,33.986598,33.986598,OPERATIONAL,[ATtYBwI957mmNRCNWXDSPmnhx4NlhKKQblGkMaFXm9-YU...,4.9,79.0,"[point_of_interest, establishment]"
First Avenue Rocks,First Avenue Rocks,[],"{'geoLoc': [33.5146648, -86.7975348], 'rating'...",0,firstaverocks.com,"2417 1st Avenue South, Birmingham, Alabama 35...",ChIJccr7oL4biYgRIKx0BFFwVmQ,33.514665,33.514665,CLOSED_PERMANENTLY,[ATtYBwKLcUGalyUqN6wSLYi2Qr7XNdelrT_JKifkFO6rz...,4.8,31.0,"[school, gym, health, point_of_interest, store..."
High Point Climbing - Huntsville,High Point Climbing and Fitness - Huntsville,[https://cdn2.apstatic.com/photos/climb/118309...,"{'geoLoc': [34.73785, -86.66708129999999], 'ra...",2,highpointclimbing.com,1020 Nunnahsae Park Dr. NW (4.43 mi) Huntsvil...,ChIJf9WVenppYogR7zJB9wbuYPE,34.73785,34.73785,OPERATIONAL,[ATtYBwJ-yGf-wd1I0lXAmHirfCyZCls09eM2yPOxkZOaY...,4.7,207.0,"[gym, health, point_of_interest, establishment]"
Auburn University Recreation and Well…,Auburn University Campus Recreation and Wellne...,[],"{'geoLoc': [32.6012386, -85.4929846], 'rating'...",0,,"601 Heisman Dr, Auburn, AL 36849",ChIJ_QyXYw7zjIgRhUWzNxmHgME,32.601239,32.601239,OPERATIONAL,[ATtYBwJLuB-XnR3hPlm3XrT9KczDFPHh_LfcHGSLs9Ngj...,4.7,230.0,"[gym, health, point_of_interest, establishment]"
Faucett Brothers Activity Center,Faucett Brothers Activity Center,[],"{'geoLoc': [33.2868784, -87.6325483], 'rating'...",0,www.tcpara.org,"13040 Eugenia Faucett Drive Northport, AL 35473",ChIJV0bo9psbhogRBT112cf3WL8,33.286878,33.286878,OPERATIONAL,[ATtYBwLv7pOXkwjaZmvTnsfYD9Pam76RpvKKFf2cui1Pf...,4.6,125.0,"[gym, health, point_of_interest, establishment]"


In [78]:
#create table for imgs
photo_df = df[['locName', 'img_list', 'google_photoReferences']]

#remove rows without pictures
nonEmptyMask = (photo_df.img_list.astype(bool) | photo_df.google_photoReferences.astype(bool))
photo_df = photo_df[nonEmptyMask]


Unnamed: 0,locName,img_list,google_photoReferences
Campus Recreation Center,UAB Campus Recreation,[],[ATtYBwLGAWXpVf9ekPrO-UEtML6Im0pdWZP56CsTxMCVD...
Fortenberry-Colton Fitness Center,Fortenberry-Colton Physical Fitness Center,[],[ATtYBwK7HKyV0WzCUmYKtJw1c2-_bEEWp8jTlbUo3FFNP...
High Point Climbing - Birmingham,High Point Climbing and Fitness - Birmingham,[],[ATtYBwLlU6UklPCxhhf4S7ebNPfvSZtQFltLXEmh4gEn6...
Birmingham Boulders,Birmingham Boulders,[https://cdn2.apstatic.com/photos/climb/119942...,[ATtYBwLijstEM4oxMPEOY7waLZhTAciYkmsau2NrvEjDD...
Gadrock,Gadrock,[],[ATtYBwI957mmNRCNWXDSPmnhx4NlhKKQblGkMaFXm9-YU...
First Avenue Rocks,First Avenue Rocks,[],[ATtYBwKLcUGalyUqN6wSLYi2Qr7XNdelrT_JKifkFO6rz...
High Point Climbing - Huntsville,High Point Climbing and Fitness - Huntsville,[https://cdn2.apstatic.com/photos/climb/118309...,[ATtYBwJ-yGf-wd1I0lXAmHirfCyZCls09eM2yPOxkZOaY...
Auburn University Recreation and Well…,Auburn University Campus Recreation and Wellne...,[],[ATtYBwJLuB-XnR3hPlm3XrT9KczDFPHh_LfcHGSLs9Ngj...
Faucett Brothers Activity Center,Faucett Brothers Activity Center,[],[ATtYBwLv7pOXkwjaZmvTnsfYD9Pam76RpvKKFf2cui1Pf...
YMCA Camp Cosby Alpine Tower,YMCA Camp Cosby,[],[ATtYBwIR-76vz-UK_23lKUcukRUCiQQjEEKOKn6Jnzm7d...


In [80]:
#create table with geographical information
geo_df = df[['locName', 'locLat', 'locLong', 'gym_address']]
geo_df

Unnamed: 0,locName,locLat,locLong,gym_address
Campus Recreation Center,UAB Campus Recreation,33.500956,33.500956,"University of Alabama at Birmingham, 1501 Uni..."
Fortenberry-Colton Fitness Center,Fortenberry-Colton Physical Fitness Center,31.324264,31.324264,Bldg. 5900 on the corner of Skychief Street a...
University of South Alabama - Student…,307 N University Blvd,30.695525,30.695525,"307 N University Blvd, Mobile, Alabama 36688-..."
High Point Climbing - Birmingham,High Point Climbing and Fitness - Birmingham,33.420654,33.420654,"4766 US-280, Birmingham, AL 35242"
Birmingham Boulders,Birmingham Boulders,33.451744,33.451744,"136 INDUSTRIAL DRIVE BIRMINGHAM, AL 35211"
Gadrock,Gadrock,33.986598,33.986598,1403 Rainbow Drive Gadsden AL United States 3...
First Avenue Rocks,First Avenue Rocks,33.514665,33.514665,"2417 1st Avenue South, Birmingham, Alabama 35..."
High Point Climbing - Huntsville,High Point Climbing and Fitness - Huntsville,34.73785,34.73785,1020 Nunnahsae Park Dr. NW (4.43 mi) Huntsvil...
Auburn University Recreation and Well…,Auburn University Campus Recreation and Wellne...,32.601239,32.601239,"601 Heisman Dr, Auburn, AL 36849"
Faucett Brothers Activity Center,Faucett Brothers Activity Center,33.286878,33.286878,"13040 Eugenia Faucett Drive Northport, AL 35473"


In [36]:
#create table with business info
business_df = df[['locName', 'google_Place_ID', "business_status", "typeList", "googleRating","numUsersRated"]]

#add columns to add info
business_df['phoneNumber'] = ""
business_df['adr_address'] = ""


business_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  business_df['phoneNumber'] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  business_df['adr_address'] = ""


Unnamed: 0,locName,google_Place_ID,business_status,typeList,googleRating,numUsersRated,phoneNumber,adr_address
Birmingham Boulders,Birmingham Boulders,,,,,,,
Campus Recreation Center,Campus Recreation Center,,,,,,,
Faucett Brothers Activity Center,Faucett Brothers Activity Center,,,,,,,
First Avenue Rocks,First Avenue Rocks,,,,,,,
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,,,,,,,
...,...,...,...,...,...,...,...,...
Elemental Training Center,Elemental Training Center,,,,,,,
Fall Hall,Fall Hall,,,,,,,
Gottsche Glyphs,Gottsche Glyphs,,,,,,,
Pinedale Aquatic Center,Pinedale Aquatic Center,,,,,,,


In [39]:
df['typeList']

Birmingham Boulders                   
Campus Recreation Center              
Faucett Brothers Activity Center      
First Avenue Rocks                    
Fortenberry-Colton Fitness Center     
                                    ..
Elemental Training Center             
Fall Hall                             
Gottsche Glyphs                       
Pinedale Aquatic Center               
The Rock                              
Name: typeList, Length: 938, dtype: object

In [None]:
define_review_df = {
    'google_Place_ID': [],
    'review_text' : [],
    'user_rating' : [],
    ''
}

review_df = pd.df(define_review_df)

In [54]:
fields = [
        "place_id",
        "business_status",
        "rating",
        "types",
        "user_ratings_total",
        "name",
        "geometry",
        'photos'
    ]


param={
    'key': GKEY,
    'query':'Fortenberry-Colton Fitness Center' + ' Bldg. 5900 on the corner of Skychief Street and 5th Avenue, Fort Rucker,Alabama 36362 ',
    'fields':fields
}

results = requests.get(placeTextSearchURL, params=param)

In [55]:
rjson = results.json()
print(json.dumps(rjson, indent=4))

{
    "html_attributions": [],
    "results": [],
    "status": "ZERO_RESULTS"
}
