# Web Scrapping Analysis

This notebook walks through the process of pulling, merging, and cleaning various datasets about climbing gyms around the US.

In [18]:
import pandas as pd
import re
import requests
import json
from key import gkey


In [2]:
df = pd.read_json('data.json', orient='index')

df

Unnamed: 0,locName,rating,img_list,gym_info
Auburn University Recreation and Well…,Auburn University Recreation and Well…,4.7,[],"[, 601 Heisman Dr, Auburn, AL 36849 ]"
Birmingham Boulders,Birmingham Boulders,4.9,[https://cdn2.apstatic.com/photos/climb/119942...,"[www.boulderingauthority.com, 136 INDUSTRIAL ..."
Campus Recreation Center,Campus Recreation Center,4.0,[],"[studentaffairs.uab.edu, University of Alabam..."
Faucett Brothers Activity Center,Faucett Brothers Activity Center,4.6,[],"[www.tcpara.org, 13040 Eugenia Faucett Drive ..."
First Avenue Rocks,First Avenue Rocks,4.8,[],"[firstaverocks.com, 2417 1st Avenue South, Bi..."
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,4.5,[],"[www.ftruckermwr.com, Bldg. 5900 on the corne..."
Gadrock,Gadrock,4.9,[],"[www.climbgadrock.com, 1403 Rainbow Drive Gad..."
High Point Climbing - Birmingham,High Point Climbing - Birmingham,4.7,[],"[highpointclimbing.com, 4766 US-280, Birmingh..."
High Point Climbing - Huntsville,High Point Climbing - Huntsville,4.7,[https://cdn2.apstatic.com/photos/climb/118309...,"[highpointclimbing.com, 1020 Nunnahsae Park D..."
University of Alabama,University of Alabama,Admi,[],"[urec.ua.edu, 401 5th Ave East, Tuscaloosa, A..."


In [4]:
df['NumberOfImages'] = df.img_list.map(len)

In [5]:
regex = r'[A-Za-z0-9]+[A-Za-z0-9\-\.][A-Za-z0-9]+\.\w{1,4}'
def extractSiteURLFromGymInfo(gym_info: list):

    for item in gym_info:
#         print(item)
        gymSiteURL = re.search(regex, item)
        if gymSiteURL is not None:
            return gymSiteURL[0]

    return None

In [6]:
df['gymURL'] = df.gym_info.map(extractSiteURLFromGymInfo)

In [7]:
df

Unnamed: 0,locName,rating,img_list,gym_info,NumberOfImages,gymURL
Auburn University Recreation and Well…,Auburn University Recreation and Well…,4.7,[],"[, 601 Heisman Dr, Auburn, AL 36849 ]",0,
Birmingham Boulders,Birmingham Boulders,4.9,[https://cdn2.apstatic.com/photos/climb/119942...,"[www.boulderingauthority.com, 136 INDUSTRIAL ...",2,www.boulderingauthority.com
Campus Recreation Center,Campus Recreation Center,4.0,[],"[studentaffairs.uab.edu, University of Alabam...",0,studentaffairs.uab.edu
Faucett Brothers Activity Center,Faucett Brothers Activity Center,4.6,[],"[www.tcpara.org, 13040 Eugenia Faucett Drive ...",0,www.tcpara.org
First Avenue Rocks,First Avenue Rocks,4.8,[],"[firstaverocks.com, 2417 1st Avenue South, Bi...",0,firstaverocks.com
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,4.5,[],"[www.ftruckermwr.com, Bldg. 5900 on the corne...",0,www.ftruckermwr.com
Gadrock,Gadrock,4.9,[],"[www.climbgadrock.com, 1403 Rainbow Drive Gad...",0,www.climbgadrock.com
High Point Climbing - Birmingham,High Point Climbing - Birmingham,4.7,[],"[highpointclimbing.com, 4766 US-280, Birmingh...",0,highpointclimbing.com
High Point Climbing - Huntsville,High Point Climbing - Huntsville,4.7,[https://cdn2.apstatic.com/photos/climb/118309...,"[highpointclimbing.com, 1020 Nunnahsae Park D...",2,highpointclimbing.com
University of Alabama,University of Alabama,Admi,[],"[urec.ua.edu, 401 5th Ave East, Tuscaloosa, A...",0,urec.ua.edu


In [8]:
df['gym_info']

Auburn University Recreation and Well…               [,  601 Heisman Dr, Auburn, AL 36849 ]
Birmingham Boulders                       [www.boulderingauthority.com,  136 INDUSTRIAL ...
Campus Recreation Center                  [studentaffairs.uab.edu,  University of Alabam...
Faucett Brothers Activity Center          [www.tcpara.org,  13040 Eugenia Faucett Drive ...
First Avenue Rocks                        [firstaverocks.com,  2417 1st Avenue South, Bi...
Fortenberry-Colton Fitness Center         [www.ftruckermwr.com,  Bldg. 5900 on the corne...
Gadrock                                   [www.climbgadrock.com,  1403 Rainbow Drive Gad...
High Point Climbing - Birmingham          [highpointclimbing.com,  4766 US-280, Birmingh...
High Point Climbing - Huntsville          [highpointclimbing.com,  1020 Nunnahsae Park D...
University of Alabama                     [urec.ua.edu,  401 5th Ave East, Tuscaloosa, A...
University of South Alabama - Student…    [www.southalabama.edu,  307 N Universi

In [14]:
def getSecondItem(gym_info: list):
    try:
        return gym_info[1]
    except:
        return ""

df['gym_address'] = df['gym_info'].map(getSecondItem)

In [16]:
df["Google_Place_ID"] = ""

In [17]:
#specify the url endpoint to send the request to
placeTextSearchURL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

for idx, row in df.iterrows():

    #assign address to variable, and create params dict
    address = row['gym_address']
    params = {
                "key": gkey,
                "query": address
             }
    
    #get request to location text search api endpoint
    searchResults = requests.get(placeTextSearchURL, params=params)
    
    searchResults = searchResults.json()
    
    
    

Unnamed: 0,locName,rating,img_list,gym_info,NumberOfImages,gymURL,gym_address,Google_Place_ID
Auburn University Recreation and Well…,Auburn University Recreation and Well…,4.7,[],"[, 601 Heisman Dr, Auburn, AL 36849 ]",0,,"601 Heisman Dr, Auburn, AL 36849",
Birmingham Boulders,Birmingham Boulders,4.9,[https://cdn2.apstatic.com/photos/climb/119942...,"[www.boulderingauthority.com, 136 INDUSTRIAL ...",2,www.boulderingauthority.com,"136 INDUSTRIAL DRIVE BIRMINGHAM, AL 35211",
Campus Recreation Center,Campus Recreation Center,4.0,[],"[studentaffairs.uab.edu, University of Alabam...",0,studentaffairs.uab.edu,"University of Alabama at Birmingham, 1501 Uni...",
Faucett Brothers Activity Center,Faucett Brothers Activity Center,4.6,[],"[www.tcpara.org, 13040 Eugenia Faucett Drive ...",0,www.tcpara.org,"13040 Eugenia Faucett Drive Northport, AL 35473",
First Avenue Rocks,First Avenue Rocks,4.8,[],"[firstaverocks.com, 2417 1st Avenue South, Bi...",0,firstaverocks.com,"2417 1st Avenue South, Birmingham, Alabama 35...",
Fortenberry-Colton Fitness Center,Fortenberry-Colton Fitness Center,4.5,[],"[www.ftruckermwr.com, Bldg. 5900 on the corne...",0,www.ftruckermwr.com,Bldg. 5900 on the corner of Skychief Street a...,
Gadrock,Gadrock,4.9,[],"[www.climbgadrock.com, 1403 Rainbow Drive Gad...",0,www.climbgadrock.com,1403 Rainbow Drive Gadsden AL United States 3...,
High Point Climbing - Birmingham,High Point Climbing - Birmingham,4.7,[],"[highpointclimbing.com, 4766 US-280, Birmingh...",0,highpointclimbing.com,"4766 US-280, Birmingham, AL 35242",
High Point Climbing - Huntsville,High Point Climbing - Huntsville,4.7,[https://cdn2.apstatic.com/photos/climb/118309...,"[highpointclimbing.com, 1020 Nunnahsae Park D...",2,highpointclimbing.com,1020 Nunnahsae Park Dr. NW (4.43 mi) Huntsvil...,
University of Alabama,University of Alabama,Admi,[],"[urec.ua.edu, 401 5th Ave East, Tuscaloosa, A...",0,urec.ua.edu,"401 5th Ave East, Tuscaloosa, Alabama 35487",


In [28]:
#specify the url endpoint to send the request to
placeTextSearchURL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'


#assign address w/ gym name to variable, and create params dict
address =  'Birmingham Boulders' + '136 INDUSTRIAL DRIVE BIRMINGHAM, AL 35211'
params = {
            "key": gkey,
            "query": address
         }

#get request to location text search api endpoint
searchResults = requests.get(placeTextSearchURL, params=params)

searchResults = searchResults.json()


In [29]:
print(json.dumps(searchResults["results"][0], indent=4))

{
    "business_status": "OPERATIONAL",
    "formatted_address": "136 Industrial Dr, Birmingham, AL 35211, United States",
    "geometry": {
        "location": {
            "lat": 33.4517437,
            "lng": -86.8511012
        },
        "viewport": {
            "northeast": {
                "lat": 33.45309402989272,
                "lng": -86.84954267010728
            },
            "southwest": {
                "lat": 33.45039437010728,
                "lng": -86.85224232989273
            }
        }
    },
    "icon": "https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/generic_business-71.png",
    "name": "Birmingham Boulders",
    "opening_hours": {
        "open_now": true
    },
    "photos": [
        {
            "height": 640,
            "html_attributions": [
                "<a href=\"https://maps.google.com/maps/contrib/109182361649883573002\">Birmingham Boulders</a>"
            ],
            "photo_reference": "ATtYBwKuHPPh3oSbu6p1Q5qqrFWAVfji5SJEs

In [27]:
searchResults["results"][0]

{'formatted_address': '136 Industrial Dr, Birmingham, AL 35211, USA',
 'geometry': {'location': {'lat': 33.4516826, 'lng': -86.85114120000001},
  'viewport': {'northeast': {'lat': 33.45303297989272,
    'lng': -86.84956257010728},
   'southwest': {'lat': 33.45033332010728, 'lng': -86.85226222989273}}},
 'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/geocode-71.png',
 'name': '136 Industrial Dr',
 'photos': [{'height': 4032,
   'html_attributions': ['<a href="https://maps.google.com/maps/contrib/100455621706538937054">Mark Bolding</a>'],
   'photo_reference': 'ATtYBwI7n0nfDSSPl0nCPzyWRXxZbm342U0yDcThi-KqletJ9HJWCuxv7dbFHEIyBUw33iPQmM9YAcgRmroA-xs9r0vqza4ih44I8xVaDulorGhC8xUULr25PV7v6CQe1W9RkZ1wtps-Q_pI9QLTM4T_NQOUtJzrRjkfguBpWy9u5Tw5UrWL',
   'width': 3024}],
 'place_id': 'ChIJadtS3-QeiYgREAucuAhuCAM',
 'reference': 'ChIJadtS3-QeiYgREAucuAhuCAM',
 'types': ['premise']}