In [1]:
# UT-TOR-DATA-PT-01-2020-U-C Week 12
# ETL Project
# Scraping https://www.auto8000.ca/inventory/
# (c) Boris Smirnov

In [2]:
# Dependencies
import datetime as dt
import json
import os
import re
import time

from bs4 import BeautifulSoup
from splinter import Browser

In [3]:
# Global constants
# Primary values: the dealer identifier and the site root
dealer_name = 'auto8000'
base_url = 'https://www.auto8000.ca'

# Derived values: output files (one per dealer) and the inventory listing page
dealer_json = f'data/{dealer_name}.json'
dealer_cars_json = f'data/{dealer_name}_cars.json'
inventory_url = f'{base_url}/inventory/'

In [4]:
# Dealer's description
# A single record describing the dealership; it is printed for inspection
# and stored as one JSON document on one line of dealer_json.
dealer_dct = {
    'id': dealer_name,
    'name': 'Auto 8000 Wholesaler & Services Inc.',
    'street': '260 Dundas St. West',
    'city': 'Mississauga',
    'zip': 'L5B1J2',
    'province': 'ON',
    'phone': '(905) 566-7888',
    'url': base_url,
    'latitude': '43.574655',
    'longitude': '-79.622358',
}

print(json.dumps(dealer_dct, indent=4))

# Create the output folder if needed, otherwise open() raises FileNotFoundError
output_dir = os.path.dirname(dealer_json)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# write(), not writelines(): the payload is a single string, not a list of lines
with open(dealer_json, 'w', encoding='utf-8') as fp:
    fp.write(json.dumps(dealer_dct) + "\n")

{
    "id": "auto8000",
    "name": "Auto 8000 Wholesaler & Services Inc.",
    "street": "260 Dundas St. West",
    "city": "Mississauga",
    "zip": "L5B1J2",
    "province": "ON",
    "phone": "(905) 566-7888",
    "url": "https://www.auto8000.ca",
    "latitude": "43.574655",
    "longitude": "-79.622358"
}


In [5]:
# Open the inventory page in a separate, visible (non-headless) automated Chrome window
chrome_options = {
    'executable_path': 'chromedriver.exe',
    'headless': False,
}
browser = Browser('chrome', **chrome_options)
browser.visit(inventory_url)

In [6]:
# Working around Infinite Scroll
# The strategy:
#    scroll down with JavaScript to the bottom of the page,
#    wait so that Infinite Scroll has time to load the next batch of items,
#    count the items,
#    repeat until the number of items on the page stops growing
# https://towardsdatascience.com/elevate-your-webscraping-with-splinter-a926eee7f7d9
scrollJS = "window.scroll({top: document.body.scrollHeight, left: 0, behavior: 'smooth'});"
scroll_pause = 1         # seconds to wait after every scroll before counting items
max_stalled_scrolls = 3  # stop after this many consecutive scrolls that add nothing

inventory_nodes = []
inventory_count = 0
stalled_scrolls = 0

while stalled_scrolls < max_stalled_scrolls:
    browser.execute_script(scrollJS)
    # Pause AFTER scrolling: parsing right away could miss a batch that is
    # still loading and end the loop with an incomplete list
    time.sleep(scroll_pause)

    soup = BeautifulSoup(browser.html, 'lxml')
    inventory_nodes = soup.find_all('li', class_='inventory-item')
    new_inv_count = len(inventory_nodes)

    if new_inv_count > inventory_count:
        print(f"Number of cars on the inventory page: {new_inv_count} (was {inventory_count})")
        inventory_count = new_inv_count
        stalled_scrolls = 0
    else:
        # Nothing new this round; allow a few retries in case loading is slow
        stalled_scrolls += 1

# At this point we should have the complete list of inventory cars in inventory_nodes
# Next step: scraping URLs of car pages

Number of cars on the inventory page: 6 (was 0)
Number of cars on the inventory page: 12 (was 6)
Number of cars on the inventory page: 18 (was 12)
Number of cars on the inventory page: 24 (was 18)
Number of cars on the inventory page: 30 (was 24)
Number of cars on the inventory page: 36 (was 30)
Number of cars on the inventory page: 42 (was 36)
Number of cars on the inventory page: 44 (was 42)


In [7]:
# Build the list of URLs of the cars' pages.
# The second piece of each item's onclick attribute, when split on single quotes,
# is the path that gets appended to base_url.
vehicle_urls = [
    base_url + item['onclick'].split("'")[1]
    for item in inventory_nodes
]

print(len(vehicle_urls))

44


In [8]:
def parse_car_infopage(url, delay=3, verbose=True):
    """Open a car's web page in the browser and extract its data.

    The page is loaded with the module-level `browser`, then parsed with
    BeautifulSoup. Sections that are missing from the page are skipped
    instead of raising, so one malformed page does not abort a whole run.

    Parameters
    ----------
    url : str
        Address of the car's page.
    delay : int or float, optional
        Seconds to sleep before the page is requested (default 3).
    verbose : bool, optional
        If True (default), print the result as indented JSON.

    Returns
    -------
    dict
        Always holds 'timestamp', 'dealer', 'url' and 'features' (a list of
        strings, possibly empty). 'year' is taken from the page header when
        present. Entries of the specification list are stored under the key
        names given by the translation table below.
        If the page carries no pricing information the dict has the key
        'invalid' instead of 'price'.
    """

    # Open the page
    time.sleep(delay)
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'lxml')

    # Basic info
    res = {
        'timestamp': dt.datetime.now().isoformat(),
        'dealer': dealer_name,
        'url': url
    }

    # Year is the first word of the header
    header_node = soup.find('div', class_='header_title')
    year_node = header_node.h1 if header_node is not None else None
    if year_node is not None:
        res['year'] = year_node.text.split(' ', 1)[0]

    # Price: digits of the part before the first '.'
    offer_node = soup.find('div', class_='offer_price')
    price_node = offer_node.strong if offer_node is not None else None
    price = ''
    if price_node is not None:
        price = re.sub('[^0-9]', '', price_node.text.split('.')[0])
    if len(price) == 0:  # This dealer advertises a car without price information
        res['invalid'] = 'invalid'
    else:
        res['price'] = price

    # Translation table from their names to our key names
    translation_table = {
        'Stock #:': 'stock_id',
        'Make:': 'make',
        'Model:': 'model',
        'Trim:': 'submodel',
        'Odometer:': 'mileage',  # Note: special treatment
        'Drivetrain:': 'drivetrain',  # Note: special treatment
        'Body type:': 'body_style',
        'Engine:': 'cylinders',  # Note: special treatment
        'Transmission:': 'transmission',
        'Doors:': 'doors',
        'Passengers:': 'passengers',
        'Exterior color:': 'ext_color',  # can be Unknown
        'Interior color:': 'int_color',  # can be Unknown

        # These aren't in the specs
        'Year:': 'year',
        'Engine Size:': 'displacement',
        'VIN Number:': 'vin'
    }

    drivetrain_vocabulary = {
        'Rear Wheel Drive': 'RWD',
        'Front Wheel Drive': 'FWD',
        'All Wheel Drive': 'AWD',
    }

    # Specification list: <li><span>label</span><strong>value</strong></li>
    specs_container = soup.find('div', class_="offer_specification")
    specs_nodes = specs_container.find_all('li') if specs_container is not None else []
    for li in specs_nodes:
        # Skip items without a label or with a label we do not collect
        if li.span is None or li.span.text not in translation_table:
            continue
        # Skip items that have a known label but no value element
        if li.strong is None:
            continue

        key = translation_table[li.span.text]
        value = li.strong.text

        # Ad hoc treatment
        if key in ['mileage', 'cylinders']:
            value = re.sub('[^0-9]', '', value)  # Keep digits only
        elif key in ['ext_color', 'int_color'] and value == 'Unknown':
            continue  # An unknown color is not stored at all
        elif key == 'drivetrain':
            # Abbreviate known drivetrain names, keep anything else as is
            value = drivetrain_vocabulary.get(value, value)

        res[key] = value

    # Features and options
    overview_node = soup.find('div', id='t_overview')
    features_list = overview_node.ul if overview_node is not None else None
    if features_list is not None:
        res['features'] = [li.text for li in features_list.find_all('li')]
    else:
        res['features'] = []

    # Printing result (for debugging purposes)
    if verbose:
        print(json.dumps(res, indent=4))

    # Done
    return res

In [9]:
# Scraping cars
# Visit every car page and keep only the records that are not flagged 'invalid'
vehicle_data = []

try:
    for url in vehicle_urls:
        result = parse_car_infopage(url)
        if 'invalid' not in result:
            vehicle_data.append(result)
finally:
    # Close the automated browser even if a page fails to parse,
    # so that no browser/driver process is left running
    browser.quit()

# Done
print(len(vehicle_data))

# One JSON document per line; write() because each payload is a single string
with open(dealer_cars_json, 'w', encoding='utf-8') as fp:
    for v in vehicle_data:
        fp.write(json.dumps(v) + "\n")

{
    "timestamp": "2020-04-25T22:42:07.915185",
    "dealer": "auto8000",
    "url": "https://www.auto8000.ca/inventory/vehicle/2593/2000-honda-accord",
    "year": "2000",
    "invalid": "invalid",
    "stock_id": "HO001",
    "make": "Honda",
    "model": "Accord",
    "submodel": "EX-V6",
    "mileage": "130000",
    "drivetrain": "FWD",
    "body_style": "Coupe",
    "cylinders": "6",
    "transmission": "Automatic",
    "doors": "4",
    "passengers": "5",
    "features": [
        "Air Conditioning",
        "AM/FM/CD"
    ]
}
{
    "timestamp": "2020-04-25T22:42:11.516725",
    "dealer": "auto8000",
    "url": "https://www.auto8000.ca/inventory/vehicle/2591/2015-audi-q5",
    "year": "2015",
    "price": "20990",
    "stock_id": "AU101",
    "make": "Audi",
    "model": "Q5",
    "submodel": "2.0T Premium Plus",
    "mileage": "140001",
    "drivetrain": "AWD",
    "body_style": "SUV",
    "cylinders": "4",
    "transmission": "Automatic",
    "doors": "4",
    "passengers": "5