# Capstone Project - Scraping shoe features

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import datetime
from time import time, sleep
from random import randint
from IPython.core.display import clear_output
from warnings import warn

### Reading in previously scraped shoe names and slugs

In [2]:
shoe_list = pd.read_csv("shoe_list.csv")

In [3]:
print(shoe_list.shape)
shoe_list.head(3)

(2066, 2)


Unnamed: 0,name,slug
0,Nike Air Zoom Pegasus 35,nike-air-zoom-pegasus-35
1,Brooks Ghost 11,brooks-ghost-11
2,Asics Gel Kayano 25,asics-gel-kayano-25


### Preparation for scraping shoe features

Data of 2066 running shoes will be scraped from the RunRepeat shoe review website.

In [4]:
# Start with an empty list
shoes = []

# Specify the order of columns for creating DataFrame later
columns = [
    'name',
    'brand',
    'core_score',
    'user_review_count',
    'expert_review_count',
    'user_rating',
    'expert_rating',
    'user_5star',
    'user_4star',
    'user_3star',
    'user_2star',
    'user_1star',
    'discontinued',
    'terrain',
    'arch_support',
    'use',
    'waterproof',
    'water_repellent',
    'maximalist',
    'minimalist',
    'triathlon',
    'price',
    'weight',
    'toe_drop',
    'heel_height',
    'forefoot_height',
    'width',
    'good_summary',
    'bad_summary',
    'summary',
    'info',
    'size_fit',
    'outsole',
    'midsole',
    'upper']

In [18]:
# Define function to scrape data of up to 500 shoes at a time
def scrape_shoes(start_id, end_id):

    # Preparing the monitoring of the loop
    time_started = datetime.datetime.now()
    start_time = time()
    req = 0

    # Loop through each shoe in the given range
    for s in range(start_id, end_id):

        # Use requests library to get the content from each shoe page
        res = requests.get('https://runrepeat.com/{}'.format(shoe_list.iloc[s,1]))

        # Space out each request so as not to overwhelm the server
        sleep(randint(2,5))

        # Monitor the requests
        req += 1
        elapsed_time = round((time() - start_time),3)
        freq = round((req/elapsed_time),3)
        print('Request: {}; Frequency: {} requests/s; Index: {}, {}; Elapsed time: {}s'.format(req, freq, len(shoes)+1, shoe_list.iloc[s,0], elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if res.status_code != 200:
            warn('Request: {}; Status code: {}'.format(req, res.status_code))

        # Break the loop if the number of requests is greater than expected
        if req > 500:
            warn('Number of requests was greater than expected.')
            break

        # Create soup object from shoe html
        shoe_soup = bs(res.content, 'lxml')

        # Start with an empty shoe dictionary
        shoe = {}

        # Shoe name
        shoe['name'] = shoe_list.iloc[s,0]

        # Shoe brand
        try:
            shoe['brand'] = " ".join(shoe_soup.find_all('span', {'class': 'rank-text'})[1].text.split()[2:])
        except Exception as e:
            shoe['brand'] = ""

        # Core score given by website
        try:
            shoe['core_score'] = shoe_soup.find('span', {'class': 'overall_score_number'}).text
        except Exception as e:
            shoe['core_score'] = ""

        rating_container = shoe_soup.find('div', {'class': 'rating-container w100'})

        # Number of user reviews
        try:
            shoe['user_review_count'] = rating_container.find('a', {'href': 'javascript:void(0)'}).text.split(" ")[0]
        except Exception as e:
            shoe['user_review_count'] = ""

        # Number of expert reviews
        try:
            shoe['expert_review_count'] = rating_container.find('a', {'href': '#expert_reviews'}).text.split(" ")[0]
        except Exception as e:
            shoe['expert_review_count'] = ""

        # User rating
        try:
            shoe['user_rating'] = rating_container.find_all('span')[0].text.split("/")[0]
        except Exception as e:
            shoe['user_rating'] = ""

        # Expert rating
        try:
            shoe['expert_rating'] = rating_container.find_all('span')[1].text.split("/")[0]
        except Exception as e:
            shoe['expert_rating'] = ""

        # 5-star user rating (percentage of ratings given 5-star)
        try:
            shoe['user_5star'] = shoe_soup.find('div', {'id': 'rcs_item_5'}).find('div', {'class': 'rcs_percentage'}).text
        except Exception as e:
            shoe['user_5star'] = ""

        # 4-star user rating (percentage of ratings given 4-star)
        try:
            shoe['user_4star'] = shoe_soup.find('div', {'id': 'rcs_item_4'}).find('div', {'class': 'rcs_percentage'}).text
        except Exception as e:
            shoe['user_4star'] = ""

        # 3-star user rating (percentage of ratings given 3-star)
        try:
            shoe['user_3star'] = shoe_soup.find('div', {'id': 'rcs_item_3'}).find('div', {'class': 'rcs_percentage'}).text
        except Exception as e:
            shoe['user_3star'] = ""

        # 2-star user rating (percentage of ratings given 2-star)
        try:
            shoe['user_2star'] = shoe_soup.find('div', {'id': 'rcs_item_2'}).find('div', {'class': 'rcs_percentage'}).text
        except Exception as e:
            shoe['user_2star'] = ""

        # 1-star user rating (percentage of ratings given 1-star)
        try:
            shoe['user_1star'] = shoe_soup.find('div', {'id': 'rcs_item_1'}).find('div', {'class': 'rcs_percentage'}).text
        except Exception as e:
            shoe['user_1star'] = ""

        fact_list = shoe_soup.find('ul', {'class': 'fact-list'})

        # Check if shoe is discontinued
        try:
            itemFound = fact_list.find('span', {'style': 'color: red;'}).text
            shoe['discontinued'] = "Y"
        except Exception as e:
            shoe['discontinued'] = "N"

        # Terrain
        try:
            shoe['terrain'] = fact_list.find('li', {'class': 'fact-item fact-item_terrain'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['terrain'] = ""

        # Arch support
        try:
            shoe['arch_support'] = fact_list.find('li', {'class': 'fact-item fact-item_arch-support'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['arch_support'] = ""

        # Use
        try:
            shoe['use'] = fact_list.find('li', {'class': 'fact-item fact-item_use'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['use'] = ""

        # Waterproof
        try:
            shoe['waterproof'] = fact_list.find('li', {'class': 'fact-item fact-item_waterproof'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['waterproof'] = ""
            
        # Water repellent
        try:
            shoe['water_repellent'] = fact_list.find('li', {'class': 'fact-item fact-item_water-repellent'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['water_repellent'] = ""
            
        # Maximalist
        try:
            shoe['maximalist'] = fact_list.find('li', {'class': 'fact-item fact-item_maximalist'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['maximalist'] = ""
            
        # Minimalist
        try:
            shoe['minimalist'] = fact_list.find('li', {'class': 'fact-item fact-item_minimalist'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['minimalist'] = ""
            
        # Triathlon
        try:
            shoe['triathlon'] = fact_list.find('li', {'class': 'fact-item fact-item_triathlon'}).find('div', {'class': 'fact-value text-left'}).text
        except Exception as e:
            shoe['triathlon'] = ""
        
        # Price
        try:
            shoe['price'] = fact_list.find('li', {'class': 'fact-item fact-item_price'}).find('div', {'class': 'fact-value text-left'}).text.replace("$","")
        except Exception as e:
            shoe['price'] = ""

        # Weight
        try:
            shoe['weight'] = ""
            weight_list = fact_list.find('li', {'class': 'fact-item fact-item_weight'}).find('div', {'class': 'fact-value text-left'}).find_all('div')
            for i in range(len(weight_list)):
                shoe['weight'] += weight_list[i].text + ";"
        except Exception as e:
            shoe['weight'] = ""    
            
#         # Weight for men's
#         try:
#             shoe['weight_m'] = fact_list.find('li', {'class': 'fact-item fact-item_weight'}).find('div', {'class': 'fact-value text-left'}).find_all('div')[0].text
#         except Exception as e:
#             shoe['weight_m'] = ""
            
#         # Weight for women's
#         try:
#             shoe['weight_w'] = fact_list.find('li', {'class': 'fact-item fact-item_weight'}).find('div', {'class': 'fact-value text-left'}).find_all('div')[1].text
#         except Exception as e:
#             shoe['weight_w'] = ""

        # Heel to Toe drop
        try:
            shoe['toe_drop'] = ""
            toe_drop_list = fact_list.find('li', {'class': 'fact-item fact-item_heel-to-toe-drop'}).find('div', {'class': 'fact-value text-left'}).find_all('div')
            for i in range(len(toe_drop_list)):
                shoe['toe_drop'] += toe_drop_list[i].text + ";"
        except Exception as e:
            shoe['toe_drop'] = ""

        # Heel height
        try:
            shoe['heel_height'] = ""
            heel_height_list = fact_list.find('li', {'class': 'fact-item fact-item_heel-height'}).find('div', {'class': 'fact-value text-left'}).find_all('div')
            for i in range(len(heel_height_list)):
                shoe['heel_height'] += heel_height_list[i].text + ";"
        except Exception as e:
            shoe['heel_height'] = ""

        # Forefoot height
        try:
            shoe['forefoot_height'] = ""
            forefoot_height_list = fact_list.find('li', {'class': 'fact-item fact-item_forefoot-height'}).find('div', {'class': 'fact-value text-left'}).find_all('div')
            for i in range(len(forefoot_height_list)):
                shoe['forefoot_height'] += forefoot_height_list[i].text + ";"
        except Exception as e:
            shoe['forefoot_height'] = ""

        # Width
        try:
            shoe['width'] = ""
            width_list = fact_list.find('li', {'class': 'fact-item fact-item_width'}).find('div', {'class': 'fact-value text-left'}).find_all('div')
            for i in range(len(width_list)):
                shoe['width'] += width_list[i].text + ";"
        except Exception as e:
            shoe['witdth'] = ""

        # review_section = shoe_soup.find('section', {'id': 'bottom_line_section'})

        # product_section = shoe_soup.find('section', {'id': 'rr_text'})

        # Summary of good reviews
        try:
            shoe['good_summary'] = ""
            good_list = shoe_soup.find('div', {'id': 'the_good'}).find_all('li')
            for i in range(len(good_list)):
                shoe['good_summary'] += good_list[i].text + ";"
        except Exception as e:
            shoe['good_summary'] = ""

        # Summary of bad reviews
        try:
            shoe['bad_summary'] = ""
            bad_list = shoe_soup.find('div', {'id': 'the_bad'}).find_all('li')
            for i in range(len(bad_list)):
                shoe['bad_summary'] += bad_list[i].text + ";"
        except Exception as e:
            shoe['bad_summary'] = ""

        # Overall summary
        try:
            shoe['summary'] = ""
            summary_list = shoe_soup.find('div', {'itemprop': 'description'}).find_all('p')
            for i in range(len(summary_list)):
                shoe['summary'] += summary_list[i].text + ";"
        except Exception as e:
            shoe['summary'] = ""

        # Product information
        try:
            shoe['info'] = ""
            info_list = shoe_soup.find('div', {'class': 'product-updates'}).find_all('li')
            for i in range(len(info_list)):
                shoe['info'] += info_list[i].text + ";"
        except Exception as e:
            shoe['info'] = ""

        # Size and fit
        try:
            shoe['size_fit'] = ""
            size_list = shoe_soup.find('div', {'class': 'size-and-fit'}).find_all('p')
            for i in range(len(size_list)):
                shoe['size_fit'] += size_list[i].text + ";"
        except Exception as e:
            shoe['size_fit'] = ""

        # Outsole
        try:
            shoe['outsole'] = ""
            outsole_list = shoe_soup.find('div', {'class': 'outsole'}).find_all('p')
            for i in range(len(outsole_list)):
                shoe['outsole'] += outsole_list[i].text + ";"
        except Exception as e:
            shoe['outsole'] = ""

        # Midsole
        try:
            shoe['midsole'] = ""
            midsole_list = shoe_soup.find('div', {'class': 'midsole'}).find_all('p')
            for i in range(len(midsole_list)):
                shoe['midsole'] += midsole_list[i].text + ";"
        except Exception as e:
            shoe['midsole'] = ""

        # Upper
        try:
            shoe['upper'] = ""
            upper_list = shoe_soup.find('div', {'class': 'upper'}).find_all('p')
            for i in range(len(upper_list)):
                shoe['upper'] += upper_list[i].text + ";"
        except Exception as e:
            shoe['upper'] = ""

        # Add the shoe dictionary to our list of shoes
        shoes.append(shoe)
        
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(shoes, columns = columns)
    
    # Save to csv
    df.to_csv('shoe_data.csv', index=False)

### Calling function to periodically scrape and save data to csv

`time.sleep()` method is additionally used to space out each calling of the scraping function, so as not to overwhelm the server.

Errors encountered will likely be due to sending too many requests from the same ip address in a short period of time.

In [6]:
scrape_shoes(0,100)

Request: 100; Frequency: 0.155 requests/s; Index: 100, New Balance Fresh Foam Vongo v3; Elapsed time: 643.648s


In [7]:
sleep(180)
scrape_shoes(100,200)

Request: 100; Frequency: 0.158 requests/s; Index: 200, Nike Revolution 3; Elapsed time: 631.42s


In [8]:
sleep(180)
scrape_shoes(200,300)

Request: 100; Frequency: 0.165 requests/s; Index: 300, Nike Air Zoom Vomero 12; Elapsed time: 605.257s


In [9]:
scrape_shoes(300,400)

Request: 100; Frequency: 0.16 requests/s; Index: 400, Nike Air Zoom Pegasus 32; Elapsed time: 623.671s


In [10]:
sleep(180)
scrape_shoes(400,500)

Request: 100; Frequency: 0.158 requests/s; Index: 500, Adidas Terrex Skychaser GTX; Elapsed time: 634.343s


In [11]:
sleep(180)
scrape_shoes(500,600)

Request: 100; Frequency: 0.158 requests/s; Index: 600, Asics Gel Surveyor 5; Elapsed time: 634.168s


In [12]:
sleep(180)
scrape_shoes(600,700)

Request: 100; Frequency: 0.155 requests/s; Index: 700, Nike Air Zoom Winflo 2; Elapsed time: 647.085s


In [13]:
sleep(180)
scrape_shoes(700,800)

Request: 100; Frequency: 0.154 requests/s; Index: 800, Skechers GOrun MaxRoad 3 Ultra; Elapsed time: 650.021s


In [14]:
sleep(180)
scrape_shoes(800,900)

Request: 100; Frequency: 0.14 requests/s; Index: 900, Under Armour Scorpio 2; Elapsed time: 714.574s


In [15]:
sleep(180)
scrape_shoes(900,1000)

Request: 100; Frequency: 0.037 requests/s; Index: 1000, Asics Gel Quantum 360 Shift; Elapsed time: 2728.317s


In [17]:
scrape_shoes(1000,1100)

Request: 100; Frequency: 0.134 requests/s; Index: 1100, Reebok Print Run Prime Ultraknit; Elapsed time: 747.989s


In [19]:
sleep(150)
scrape_shoes(1100,1200)

Request: 100; Frequency: 0.166 requests/s; Index: 1200, Adidas Energy Bounce; Elapsed time: 601.964s


In [20]:
sleep(150)
scrape_shoes(1200,1300)

Request: 100; Frequency: 0.172 requests/s; Index: 1300, Asics Gel Quantum 360 Shift MX; Elapsed time: 582.16s


In [21]:
sleep(150)
scrape_shoes(1300,1400)

Request: 100; Frequency: 0.175 requests/s; Index: 1400, Adidas Duramo 6; Elapsed time: 569.961s


In [22]:
sleep(150)
scrape_shoes(1400,1500)

Request: 100; Frequency: 0.178 requests/s; Index: 1500, Vibram FiveFingers Spyridon MR Elite; Elapsed time: 562.891s


In [23]:
sleep(150)
scrape_shoes(1500,1600)

Request: 100; Frequency: 0.181 requests/s; Index: 1600, Altra Instinct 3.0; Elapsed time: 550.995s


In [24]:
sleep(150)
scrape_shoes(1600,1700)

Request: 100; Frequency: 0.177 requests/s; Index: 1700, Adidas Pure Boost R; Elapsed time: 565.17s


In [25]:
sleep(150)
scrape_shoes(1700,1800)

Request: 100; Frequency: 0.174 requests/s; Index: 1800, Reebok RealFlex Run Tempo 2.0; Elapsed time: 575.788s


In [26]:
sleep(150)
scrape_shoes(1800,1900)

Request: 100; Frequency: 0.174 requests/s; Index: 1900, Under Armour Charged Assert 8; Elapsed time: 573.884s


In [27]:
sleep(150)
scrape_shoes(1900,2000)

Request: 100; Frequency: 0.176 requests/s; Index: 2000, Puma Hybrid Rocket Desert ; Elapsed time: 569.032s


In [28]:
sleep(150)
scrape_shoes(2000,2067)

IndexError: single positional indexer is out-of-bounds

In [29]:
# Check total number of shoes.
len(shoes)

2066

### Final check

In [31]:
# Final look at DataFrame of scraped shoe data
df = pd.DataFrame(shoes, columns = columns)
print(f"Scraped data for total of {df.shape[0]} shoes, comprising {df.shape[1]} features.")
df.sample(5)

Scraped data for total of 2066 shoes, comprising 35 features.


Unnamed: 0,name,brand,core_score,user_review_count,expert_review_count,user_rating,expert_rating,user_5star,user_4star,user_3star,...,forefoot_height,width,good_summary,bad_summary,summary,info,size_fit,outsole,midsole,upper
1856,Reebok Carthage 3.0,Reebok,79,8,,4.6,,75%,13%,13%,...,,Men: Standard;Women: Standard;,The upper unit of this running shoe makes use ...,Some runners thought that the Carthage 3.0 loo...,The Reebok Carthage 3.0 is a prime choice for ...,The Reebok Carthage 3.0 is a go-to shoe for an...,The Reebok Carthage 3.0 has a standard running...,Carbon rubber a durable compound that’s used i...,The DMXRide is an underfoot cushioning system ...,"The upper unit uses an Open Weave Mesh, which ..."
813,Adidas Response Boost 2,Adidas,91,184,1.0,4.5,83.0,76%,13%,4%,...,Men: 20mm;Women: 20mm;,Men: Standard;Women: Standard;,The word ‘comfortable’ was used many times by ...,There were reviewers who noted that it was one...,The 2nd version of the Response Boost from Adi...,Adidas makes several changes in the 2nd iterat...,Adidas patterned the fit and sizing of the Res...,The outsole configuration shows rubber treads ...,A firm foam and a plastic shank in the midfoot...,The Techfit upper encapsulates the use of brea...
1386,Saucony Linchpin,Saucony,80,60,,4.0,,52%,18%,13%,...,Men: 18mm;Women: 18mm;,Men: Standard;Women: Standard;,A few reviewers felt that the Saucony Linchpin...,Various users felt that Saucony skimped on the...,The Saucony Linchpin was meant to be an all-ar...,,The Saucony Linchpin comes in standard shoe le...,The outsole of the Saucony Linchpin was crafte...,The midsole unit features Saucony’s PowerGrid ...,The upper of the shoe is made with a breathabl...
1020,Nike Air Max Sequent,Nike,86,1154,,4.3,,67%,15%,6%,...,,Men: Standard;Women: Standard;,The Nike Sequent is amazingly comfortable for ...,"Based on several remarks, the Air Max Sequent ...",Nike adds another Air Max entry in the form of...,Nike delivers a version of the Air Max that ma...,The fit of the Nike Sequent is designed for th...,Nike keeps it basic in the outsole as it uses ...,The midsole features soft cushioning with a to...,The totally seamless upper is an exercise in s...
969,Nike Air Zoom All Out Flyknit,Nike,86,89,7.0,4.3,76.0,63%,19%,9%,...,Men: 21mm;Women: 21mm;,"Men: Narrow, Standard;Women: Standard, Wide;",The Flyknit material used for the upper feels ...,There were those who felt the rear section of ...,Natural motion and flexibility are afforded by...,Natural foot motion is a design philosophy tha...,The Nike Air Zoom All Out Flyknit has a standa...,The BRS 1000 is an outsole layer that’s made f...,The Cushlon is a lightweight and responsive fo...,The Flyknit is a comfortable fabric in the upp...


In [32]:
# Final save to csv
df.to_csv('shoe_data.csv', index=False)