# Demo of scraping a website
## Let's create some Beautiful Soup
We will store our results in a CSV file.

# Checking robots.txt

In [1]:
# Fetch the site's robots.txt to see which paths crawlers are asked to stay away from.
!curl https://www.guitarcenter.com/robots.txt

User-agent: *

Disallow: */ajax/*
Disallow: */includes/*
Disallow: /account/
Disallow: /beacons/
Disallow: /browse/
Disallow: /catridges/
Disallow: /cko/
Disallow: /common/
Disallow: /crt/
Disallow: /emailtemplate/
Disallow: /international/
Disallow: /mobile/
Disallow: /pdp/
Disallow: /storeLocation/
Disallow: /templates/
Disallow: /pdp/inlcudes/
Disallow: /pdp/quickView/
Disallow: /pdp/reviews/
Disallow: /proweb/
Disallow: /trackingpixels/
Disallow: /sweepstakes/
Disallow: /resources/
Disallow: /errors/
Disallow: /nammPage/
Disallow: /marketing/
Disallow: /customBrands/
Disallow: /dev/
Disallow: /mf/
Disallow: /search?sb=r*
Disallow: /agent/dynaTraceMonitor
Disallow: /pages/rsvp.gc*

Sitemap: https://www.guitarcenter.com/gc-sitemap.xml

# We can also use Python's `urllib.robotparser`

In [2]:
# Check programmatically whether robots.txt permits scraping a particular URL.
# NOTE(review): urllib.robotparser appears to do plain prefix matching, so wildcard
# rules such as "/search?sb=r*" may not be interpreted as patterns - confirm before
# relying on this result for other URLs.

import urllib.robotparser
rp = urllib.robotparser.RobotFileParser("https://www.guitarcenter.com/robots.txt")
rp.read()  # downloads and parses the robots.txt file
rp.can_fetch(
    useragent="*",
    url="https://www.guitarcenter.com/search?typeAheadSuggestion=true&typeAheadRedirect=true&isTypeAheadSearch=true&Ns=r&tAt=term&tNtt=MPC&tAv=mpc&Ntt=mpc",
)

True

In [3]:
from bs4 import BeautifulSoup as bsoup
import requests

In [4]:
# Download the search-results page for "mpc".
# An explicit timeout is passed because requests otherwise waits indefinitely
# when the server accepts the connection but never answers.
response = requests.get(
    'https://www.guitarcenter.com/search?typeAheadSuggestion=true&typeAheadRedirect=true&isTypeAheadSearch=true&Ns=r&tAt=term&tNtt=MPC&tAv=mpc&Ntt=mpc',
    timeout=30,
)

In [5]:
# HTTP status of the request above; 200 means the page was returned successfully.
response.status_code

200

In [6]:
# Peek at the start of the raw HTML only; echoing the entire document
# bloats the notebook and buries the cells that follow.
response.text[:1000]

'<!DOCTYPE html>\n<html xmlns:fb="https://www.facebook.com/2008/fbml" xml:lang="en" lang="en">\n<head>\n<!-- Head - Dynamic Meta Data -->\n<title>"mpc" Results | Guitar Center</title>\n<meta name="description" content="Your mpc result provided by Guitar Center site.,templateDimensionStrategy\n1700003,,Search H1 Template,,Your Search Results for &quot;mpc"/>\n<!-- Head - Configurable Meta Data -->\n<meta name="pageTemplate" content="search-results" />\n<!-- Head - Manual Head Content -->\n<meta name="robots" content="noindex,follow" />\n <!-- create monetateQ window object -->\n<script>\n    window.monetateQ = window.monetateQ || [];\n    //Capture page Type\n    window.monetateQ.push([\n        "setPageType",\n        "searchPlp"\n    ]);\n\n    //add page type to event driven dataLayer\n    window.dataLayer = window.dataLayer || [];\n    window.dataLayer.push({\n        \'event\': \'pageData\',\n        \'pageType\': \'searchPlp\'\n    });\n    window.monetateQ.push([\n        "trackD

In [8]:
# Parse the downloaded HTML into a navigable tree with the lxml parser
# (Python's built-in 'html.parser' is also an option here).
soup = bsoup(markup=response.text, features='lxml')

In [10]:
# Show only the beginning of the pretty-printed document; the full page is
# far too long to be useful as cell output.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en" xml:lang="en" xmlns:fb="https://www.facebook.com/2008/fbml">
 <head>
  <!-- Head - Dynamic Meta Data -->
  <title>
   "mpc" Results | Guitar Center
  </title>
  <meta content='Your mpc result provided by Guitar Center site.,templateDimensionStrategy
1700003,,Search H1 Template,,Your Search Results for "mpc' name="description"/>
  <!-- Head - Configurable Meta Data -->
  <meta content="search-results" name="pageTemplate"/>
  <!-- Head - Manual Head Content -->
  <meta content="noindex,follow" name="robots"/>
  <!-- create monetateQ window object -->
  <script>
   window.monetateQ = window.monetateQ || [];
    //Capture page Type
    window.monetateQ.push([
        "setPageType",
        "searchPlp"
    ]);

    //add page type to event driven dataLayer
    window.dataLayer = window.dataLayer || [];
    window.dataLayer.push({
        'event': 'pageData',
        'pageType': 'searchPlp'
    });
    window.monetateQ.push([
        "trackData"
    ]);
  </sc

In [18]:
# Right-click the page and choose "Inspect" (selections highlight on hover),
# or simply view the page source, which is sometimes easier.
mpc_containers = soup.find_all('div', class_='productDetails')

# A bs4 ResultSet supports len(), just like a list.
print(type(mpc_containers))
print(len(mpc_containers))

# A ResultSet is also iterable. Printing every element is not needed for the
# scrape itself, but it is often useful to visually inspect the data you are
# dealing with when scraping.
for container in mpc_containers:
    print(container, '\n')

<class 'bs4.element.ResultSet'>
34
<div class="productDetails">
<div class="productTitle">
<strong>
<a href="/riffs/product-demos/keyboards--midi/yamaha-modx-plus-synthesizer">A Montage of Sound | Yamaha Introduces MODX+ Synthesizer</a>
</strong>
</div>
</div> 

<div class="productDetails">
<div class="productTitle">
<strong>
<a href="/riffs/product-demos/amplifiers--effects/line-6-catalyst-guitar-amplifier">Line 6 Catalyst | A Model Amplifier</a>
</strong>
</div>
</div> 

<div class="productDetails">
<div class="productTitle">
<strong>
<a href="/riffs/product-demos/amplifiers--effects/positive-grid-spark-mini">Ignite Your Playing | Positive Grid Spark MINI</a>
</strong>
</div>
</div> 

<div class="productDetails">
<div class="productTitle">
<strong>
<a href="/riffs/product-demos/live-sound/mackie-thump-powered-speakers">Mackie Thump Reimagined | Next Generation Loudspeakers Revealed</a>
</strong>
</div>
</div> 

<div class="productDetails">
<div class="productTitle">
<!-- For keyword 

In [20]:
# Iterate over the bs4 ResultSet (it behaves much like a list).
# The ResultSet itself doesn't have find and find_all methods, but its elements do!
# Nothing is stored yet: the isolated elements are only printed for demonstration.
for product in mpc_containers:
    anchors = product.find_all('a', href=True)  # <a> tags that carry an href
    print(anchors, end='\n\n\n')  # same spacing as print(x) followed by print('\n')

[<a href="/riffs/product-demos/keyboards--midi/yamaha-modx-plus-synthesizer">A Montage of Sound | Yamaha Introduces MODX+ Synthesizer</a>]


[<a href="/riffs/product-demos/amplifiers--effects/line-6-catalyst-guitar-amplifier">Line 6 Catalyst | A Model Amplifier</a>]


[<a href="/riffs/product-demos/amplifiers--effects/positive-grid-spark-mini">Ignite Your Playing | Positive Grid Spark MINI</a>]


[<a href="/riffs/product-demos/live-sound/mackie-thump-powered-speakers">Mackie Thump Reimagined | Next Generation Loudspeakers Revealed</a>]


[<a href="/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc?rNtt=mpc&amp;index=1">
Akai Professional MPC One Standalone Music Production Center</a>, <a class="monthly-payments-details-info-icon" href="/Special-Financing-Offers.gc" target="_blank"><span class="screen-reader-only">Special Financing Offers (Open in new window)</span></a>, <a href="/Open-Box/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc">Open Box:</a>]


[<

In [21]:
# Still only printing, to get a feel for what is being pulled out of the soup:
# this time just the href value of every link. Some are useful to us, others are not.
for product in mpc_containers:
    for anchor in product.find_all('a', href=True):  # <a> tags that carry an href
        print(anchor['href'])

/riffs/product-demos/keyboards--midi/yamaha-modx-plus-synthesizer
/riffs/product-demos/amplifiers--effects/line-6-catalyst-guitar-amplifier
/riffs/product-demos/amplifiers--effects/positive-grid-spark-mini
/riffs/product-demos/live-sound/mackie-thump-powered-speakers
/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc?rNtt=mpc&index=1
/Special-Financing-Offers.gc
/Open-Box/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc
/Akai-Professional/MPC-Studio-Music-Production-Controller.gc?rNtt=mpc&index=2
/Open-Box/Akai-Professional/MPC-Studio-Music-Production-Controller.gc
/Akai-Professional/MPC-Live-2-Controller.gc?rNtt=mpc&index=3
/Special-Financing-Offers.gc
/Open-Box/Akai-Professional/MPC-Live-II-Controller.gc
/Blemished/Akai-Professional/MPC-Live-II-Controller.gc
/Akai-Professional/MPC-X.gc?rNtt=mpc&index=4
/Special-Financing-Offers.gc
/Open-Box/Akai-Professional/MPC-X.gc
/Akai-Professional/MPC-One-Gold-Standalone-Music-Production-Center.gc?rNtt=mpc&index=5
/S

In [22]:
# Collect, in page order, the href of every <a> tag found inside each product
# container. The ResultSet has no find/find_all of its own, but each element does.
product_links = [
    anchor['href']
    for product in mpc_containers
    for anchor in product.find_all('a', href=True)
]

## Let's examine the links we've collected
Suppose we're only interested in Akai products

In [23]:
# One collected link per line.
for href in product_links:
    print(href)

/riffs/product-demos/keyboards--midi/yamaha-modx-plus-synthesizer
/riffs/product-demos/amplifiers--effects/line-6-catalyst-guitar-amplifier
/riffs/product-demos/amplifiers--effects/positive-grid-spark-mini
/riffs/product-demos/live-sound/mackie-thump-powered-speakers
/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc?rNtt=mpc&index=1
/Special-Financing-Offers.gc
/Open-Box/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc
/Akai-Professional/MPC-Studio-Music-Production-Controller.gc?rNtt=mpc&index=2
/Open-Box/Akai-Professional/MPC-Studio-Music-Production-Controller.gc
/Akai-Professional/MPC-Live-2-Controller.gc?rNtt=mpc&index=3
/Special-Financing-Offers.gc
/Open-Box/Akai-Professional/MPC-Live-II-Controller.gc
/Blemished/Akai-Professional/MPC-Live-II-Controller.gc
/Akai-Professional/MPC-X.gc?rNtt=mpc&index=4
/Special-Financing-Offers.gc
/Open-Box/Akai-Professional/MPC-X.gc
/Akai-Professional/MPC-One-Gold-Standalone-Music-Production-Center.gc?rNtt=mpc&index=5
/S

In [24]:
# Keep only the links whose path mentions both 'Akai' and 'MPC'.
links = [
    href
    for href in product_links
    if all(token in href for token in ('Akai', 'MPC'))
]

In [25]:
# One Akai MPC link per line.
for href in links:
    print(href)

/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc?rNtt=mpc&index=1
/Open-Box/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc
/Akai-Professional/MPC-Studio-Music-Production-Controller.gc?rNtt=mpc&index=2
/Open-Box/Akai-Professional/MPC-Studio-Music-Production-Controller.gc
/Akai-Professional/MPC-Live-2-Controller.gc?rNtt=mpc&index=3
/Open-Box/Akai-Professional/MPC-Live-II-Controller.gc
/Blemished/Akai-Professional/MPC-Live-II-Controller.gc
/Akai-Professional/MPC-X.gc?rNtt=mpc&index=4
/Open-Box/Akai-Professional/MPC-X.gc
/Akai-Professional/MPC-One-Gold-Standalone-Music-Production-Center.gc?rNtt=mpc&index=5
/Akai-Professional/MPC-Key-61-Production-Synthesizer.gc?rNtt=mpc&index=6
/Akai-Professional/MPC-Live-II-Controller-Gold.gc?rNtt=mpc&index=7
/Used/Akai-Professional/MPC-STUDIO-BLACK-Production-Controller-118112860.gc?rNtt=mpc&index=9
/Used/Akai-Professional/MPC-Renaissance-Production-Controller-118293204.gc?rNtt=mpc&index=10
/Used/Akai-Professional/MPC-Ren

In [44]:
# we could further filter our list of links with a python regex or otherwise

import re

# Negative lookahead anchored at the start of the string: the pattern matches
# only when the link does NOT begin with '/Open-Box'.
r = re.compile('^(?!/Open-Box)')
newlinks = [link for link in links if r.match(link)]
for link in newlinks:
    print(link)

/Akai-Professional/MPC-One-Standalone-Music-Production-Center.gc?rNtt=mpc&index=1
/Akai-Professional/MPC-Studio-Music-Production-Controller.gc?rNtt=mpc&index=2
/Akai-Professional/MPC-Live-2-Controller.gc?rNtt=mpc&index=3
/Blemished/Akai-Professional/MPC-Live-II-Controller.gc
/Akai-Professional/MPC-X.gc?rNtt=mpc&index=4
/Akai-Professional/MPC-One-Gold-Standalone-Music-Production-Center.gc?rNtt=mpc&index=5
/Akai-Professional/MPC-Key-61-Production-Synthesizer.gc?rNtt=mpc&index=6
/Akai-Professional/MPC-Live-II-Controller-Gold.gc?rNtt=mpc&index=7
/Used/Akai-Professional/MPC-STUDIO-BLACK-Production-Controller-118112860.gc?rNtt=mpc&index=9
/Used/Akai-Professional/MPC-Renaissance-Production-Controller-118293204.gc?rNtt=mpc&index=10
/Used/Akai-Professional/MPC-Renaissance-Production-Controller-117999217.gc?rNtt=mpc&index=11
/Used/Akai-Professional/MPC-ONE-Production-Controller-118218828.gc?rNtt=mpc&index=12
/Used/Akai-Professional/MPC-Studio-Slimline-Production-Controller-118068778.gc?rNtt=mpc&

## Now we could iterate over these links and read the source code from each of them
## That approach can be useful, but let's look at an alternative.

In [79]:
# Pull the product name and price straight out of each search-result container.
# (Right-click the page and choose "Inspect" to discover these class names.)
mpc_containers = soup.find_all('div', {'class': 'productDetails'})

price_list = []
product_name = []

for container in mpc_containers:
    # find() returns the first match or None. Taking exactly one price and one
    # title per container keeps price_list and product_name aligned row-for-row;
    # appending every match from find_all() lets the two lists drift apart as
    # soon as a container holds a different number of prices and titles.
    price_tag = container.find('span', {'class': 'productPrice'})
    title_tag = container.find('div', {'class': 'productTitle'})
    if price_tag is None or title_tag is None:
        continue  # e.g. article/demo results, which carry a title but no price

    price_tokens = price_tag.text.split()
    if not price_tokens:
        continue  # price element present but empty; nothing to record

    print(price_tag.text)
    # The amount is the last whitespace-separated token
    # ("Your Price $899.00", "From Price $52.99"); drop the dollar sign.
    price_list.append(price_tokens[-1].strip('$'))
    product_name.append(title_tag.text.strip())

print(price_list)
print(product_name)

Your Price $899.00
Your Price $269.00
Your Price $1,299.00
Your Price $2,299.00
Your Price $899.00
Your Price $1,899.00
Your Price $1,299.00
Your Price $4,289.99
Your Price $239.99
Your Price $649.99
Your Price $559.99
Your Price $639.99
Your Price $249.99
Your Price $1,099.99
Your Price $89.99
Your Price $999.99
Your Price $2,999.99
Your Price $36.99
Your Price $89.99
Your Price $139.99
Your Price $84.99
Your Price $515.99
Your Price $49.00
Your Price $82.99
From 
From Price $52.99
From 
From Price $51.99
Your Price $5.49
Your Price $674.99
Your Price $989.99
Your Price $239.99
['899.00', '269.00', '1,299.00', '2,299.00', '899.00', '1,899.00', '1,299.00', '4,289.99', '239.99', '649.99', '559.99', '639.99', '249.99', '1,099.99', '89.99', '999.99', '2,999.99', '36.99', '89.99', '139.99', '84.99', '515.99', '49.00', '82.99', '52.99', '51.99', '5.49', '674.99', '989.99', '239.99']
['Akai Professional MPC One Standalone Music Production Center', 'Akai Professional MPC Studio Music Producti

In [85]:
# Bundle the two parallel lists under the column names used for the CSV.
prod_data = dict(name=product_name, price=price_list)

In [86]:
import csv

# Write a header row followed by one (name, price) row per product.
# newline='' lets the csv module control line endings; the explicit UTF-8
# encoding keeps product names with non-ASCII characters from depending on
# the platform's default encoding.
with open('prod_data.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f, delimiter=',')
    csvwriter.writerow(('name', 'price'))
    # zip pairs the i-th name with the i-th price.
    csvwriter.writerows(zip(prod_data['name'], prod_data['price']))

In [95]:
# Display the CSV file that was just written.
!cat prod_data.csv

In [91]:
# we can clean up the errant quotes with sed if we like.
! sed 's/"//g' prod_data.csv > prod_data.csv