# Capstone Project - Scraping shoe names

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
from time import time, sleep
from random import randint
from IPython.core.display import clear_output
from warnings import warn

In [2]:
# Use the requests library to get the html from the home page.
# A timeout is passed because requests waits indefinitely by default, which
# would hang the kernel if the server stops responding.
res = requests.get('https://runrepeat.com/ranking/rankings-of-running-shoes?gender=men&page=', timeout=30)

# Throw a warning for non-200 status codes (same convention as the scraping loop)
if res.status_code != 200:
    warn('Home page request; Status code: {}'.format(res.status_code))

# Parse the HTML from our URL into the BeautifulSoup parse tree format
soup = bs(res.content, 'lxml')

In [3]:
# Scrape the name and slug of every shoe on the paginated men's running shoe
# ranking. Each shoe becomes a {'name': ..., 'slug': ...} dict in shoe_list.

# Start with empty list
shoe_list = []

# Number of listing pages to walk through (pages are numbered from 1)
LAST_PAGE = 69

# Listing URL without the page number, which is appended per request
LISTING_URL = 'https://runrepeat.com/ranking/rankings-of-running-shoes?gender=men&page='

# Seconds to wait for the server before giving up on a request;
# without it requests would wait forever on a stalled connection
REQUEST_TIMEOUT = 30

# Preparing the monitoring of the loop
start_time = time()
req = 0

# Loop through all web pages of shoe listings
for n in range(1, LAST_PAGE + 1):

    # Use the requests library to get the html from the web page
    res = requests.get(f'{LISTING_URL}{n}', timeout=REQUEST_TIMEOUT)

    # Pause the loop for a random 1-6 seconds between requests
    sleep(randint(1, 6))

    # Monitor the requests
    req += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(req, req/elapsed_time))
    # wait=True defers the clearing until the next output arrives,
    # so the progress line is replaced in place instead of piling up
    clear_output(wait = True)

    # Throw a warning for non-200 status codes and skip the page:
    # an error page does not contain the listing markup parsed below
    if res.status_code != 200:
        warn('Request: {}; Status code: {}'.format(req, res.status_code))
        continue

    # Parse the HTML from our URL into the BeautifulSoup parse tree format
    soup = bs(res.content, 'lxml')

    # Isolate the div that has all the shoe names and slugs
    shoes_section = soup.find('div', {'class': 'col-sm-9 shoes_container'})

    # find() returns None when nothing matches; skip the page with a warning
    # rather than crashing the whole scrape on .find_all
    if shoes_section is None:
        warn('Request: {}; Shoe listing container not found on page {}'.format(req, n))
        continue

    # Loop through each shoe name and slug in the div
    for name in shoes_section.find_all('div', {'class': 'product-name hidden-sm hidden-xs'}):

        # Start with an empty dictionary
        shoe_name = {}

        # Add name
        shoe_name['name'] = name.find('span', {'itemprop': 'name'}).text

        # Add slug from the href (the last path segment of the link)
        shoe_name['slug'] = name.find('a', {'target': '_self'})['href'].split('/')[-1]

        # Add shoe to our list of shoes
        shoe_list.append(shoe_name)

Request:69; Frequency: 0.16473979600741154 requests/s


In [4]:
# Check scraped data: show only the first few records instead of dumping
# the whole list into the notebook output
shoe_list[:15]

[{'name': 'Nike Air Zoom Pegasus 35', 'slug': 'nike-air-zoom-pegasus-35'},
 {'name': 'Brooks Ghost 11', 'slug': 'brooks-ghost-11'},
 {'name': 'Asics Gel Kayano 25', 'slug': 'asics-gel-kayano-25'},
 {'name': 'Brooks Adrenaline GTS 19', 'slug': 'brooks-adrenaline-gts-19'},
 {'name': 'Adidas Ultra Boost', 'slug': 'adidas-ultra-boost'},
 {'name': 'On Cloud', 'slug': 'on-cloud'},
 {'name': 'Asics Gel Venture 6', 'slug': 'asics-gel-venture-6'},
 {'name': 'Hoka One One Bondi 6', 'slug': 'hoka-one-one-bondi-6'},
 {'name': 'Nike Downshifter 7', 'slug': 'nike-downshifter-7'},
 {'name': 'Asics Gel Excite 6', 'slug': 'asics-gel-excite-6'},
 {'name': 'Merrell Trail Glove 4', 'slug': 'merrell-trail-glove-4'},
 {'name': 'Nike Air Zoom Pegasus 36', 'slug': 'nike-air-zoom-pegasus-36'},
 {'name': 'Nike Air Zoom Winflo 6', 'slug': 'nike-air-zoom-winflo-6'},
 {'name': 'Brooks Adrenaline GTS 18', 'slug': 'brooks-adrenaline-gts-18'},
 {'name': 'Xero Shoes Prio', 'slug': 'xero-shoes-prio'},
 {'name': 'New Ba

In [5]:
# Check correct number of shoes scraped:
# the length of shoe_list is the number of name/slug records collected by the loop
len(shoe_list)

2066

In [6]:
# Build a DataFrame from the scraped records. The columns are named explicitly
# so the frame (and the CSV written from it) keeps the name/slug header and
# column order even if shoe_list turned out empty.
df = pd.DataFrame(shoe_list, columns=['name', 'slug'])

# Preview the first rows only rather than displaying the full frame
df.head(10)

Unnamed: 0,name,slug
0,Nike Air Zoom Pegasus 35,nike-air-zoom-pegasus-35
1,Brooks Ghost 11,brooks-ghost-11
2,Asics Gel Kayano 25,asics-gel-kayano-25
3,Brooks Adrenaline GTS 19,brooks-adrenaline-gts-19
4,Adidas Ultra Boost,adidas-ultra-boost
5,On Cloud,on-cloud
6,Asics Gel Venture 6,asics-gel-venture-6
7,Hoka One One Bondi 6,hoka-one-one-bondi-6
8,Nike Downshifter 7,nike-downshifter-7
9,Asics Gel Excite 6,asics-gel-excite-6


In [7]:
# Export to csv
# index=False leaves the DataFrame's row index out of the file,
# so the CSV contains only the data columns
df.to_csv('shoe_list.csv', index=False)