In [1]:
# UT-TOR-DATA-PT-01-2020-U-C Week 12
# ETL Project
# Scraping http://www.leopardmotors.ca/inventory/
# (c) Boris Smirnov

In [2]:
# Dependencies
import datetime as dt
import json
import os
import re
import time

from bs4 import BeautifulSoup
from splinter import Browser

In [3]:
# Global constants: dealer identifier, site addresses and output file paths
dealer_name = 'leopardmotors'

# Site addresses
base_url = 'http://www.leopardmotors.ca'
inventory_url = f'{base_url}/inventory/'

# Output files (one JSON document per line)
dealer_json = f'data/{dealer_name}.json'
dealer_cars_json = f'data/{dealer_name}_cars.json'

In [4]:
# Dealer description
dealer_dct = {
    'id': dealer_name,
    'name': 'Leopard Motors Inc',
    'street': '379 Dundas Street East',
    'city': 'Mississauga',
    'zip': 'L5A 1X4',
    'province': 'ON',
    'phone': '1-888-646-3865',
    'url': base_url,
    'latitude': 43.588799,
    'longitude': -79.607378,
}

print(json.dumps(dealer_dct, indent=4))

# Create the output folder on a fresh checkout; open() would fail otherwise
os.makedirs(os.path.dirname(dealer_json) or '.', exist_ok=True)

# The file holds a single JSON document followed by a newline
with open(dealer_json, 'w', encoding='utf-8') as fp:
    fp.write(json.dumps(dealer_dct) + "\n")

{
    "id": "leopardmotors",
    "name": "Leopard Motors Inc",
    "street": "379 Dundas Street East",
    "city": "Mississauga",
    "zip": "L5A 1X4",
    "province": "ON",
    "phone": "1-888-646-3865",
    "url": "http://www.leopardmotors.ca",
    "latitude": 43.588799,
    "longitude": -79.607378
}


In [5]:
# Start a visible (non-headless) Chrome window driven by chromedriver.exe
# and point it at the dealer's inventory listing
browser = Browser(
    'chrome',
    executable_path='chromedriver.exe',
    headless=False,
)
browser.visit(inventory_url)

In [6]:
# Build a list of URLs to the cars' pages by walking the paginated inventory
vehicle_urls = []
page_count = 1

while True:
    print(f"Page {page_count}:", end=" ")

    # The page looks dynamically populated, so give the DOM tree
    # some time to fill in before reading it
    time.sleep(5)

    # Get the page from the browser
    soup = BeautifulSoup(browser.html, 'lxml')

    # Collect the links to the cars listed on this page
    # (anchors without an href are ignored instead of raising KeyError)
    inventory_nodes = soup.find_all('a', class_='inventory')
    vehicle_urls.extend(
        node['href'] for node in inventory_nodes if node.has_attr('href')
    )

    print(f"{len(inventory_nodes)} cars")

    # Stop when the "next page" arrow is missing or marked as disabled;
    # find() returns None when nothing matches, so guard before indexing
    next_page = soup.find('a', class_='right-arrow')
    if next_page is None or 'disabled' in next_page.get('class', []):
        break

    # Stop as well when the clickable arrow is not present in the live page
    next_page_lnk = browser.find_by_css('.fa-angle-right')
    if next_page_lnk.is_empty():
        print("Next-page link not found, stopping")
        break
    next_page_lnk[0].click()

    page_count += 1

Page 1: 10 cars
Page 2: 10 cars
Page 3: 10 cars
Page 4: 10 cars
Page 5: 10 cars
Page 6: 10 cars
Page 7: 10 cars
Page 8: 10 cars
Page 9: 10 cars
Page 10: 10 cars
Page 11: 10 cars
Page 12: 10 cars
Page 13: 10 cars
Page 14: 10 cars
Page 15: 10 cars
Page 16: 4 cars


In [7]:
#for url in vehicle_urls:
#    print(url)

In [8]:
# Keys whose raw text is converted to a plain integer
_INT_FIELDS = ('year', 'mileage', 'doors', 'passengers')


def _coerce_car_value(key, raw):
    """Convert a raw table value to the type stored under ``key``.

    Returns the converted value, the unchanged string for keys without a
    numeric conversion, or None when a numeric conversion fails.
    """
    try:
        if key == 'cylinders':
            # Special treatment: keep only the digits of the engine cell
            return int(re.sub('[^0-9]', '', raw))
        if key in _INT_FIELDS:
            # Tolerate thousands separators such as "111,281"
            return int(raw.replace(',', ''))
        if key == 'displacement':
            return float(raw)
    except ValueError:
        return None
    return raw


def parse_car_infopage(soup, url, verbose=True):
    """Extract car data from a parsed vehicle page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of the vehicle page.
    url : str
        Address of the page; stored in the result unchanged.
    verbose : bool, optional
        When true (the default), a successfully parsed record is printed
        as indented JSON for debugging purposes.

    Returns
    -------
    dict
        Always contains 'timestamp', 'dealer' and 'url'. If the price
        cannot be read, the only additional key is 'invalid'. Otherwise
        the dictionary also contains 'price', every core parameter that
        was found and could be converted, and the 'features' list.
    """
    # Translation table from their names to our key names
    translation_table = {
        'Make: ': 'make',
        'Model: ': 'model',
        'Year: ': 'year',
        'Body Style: ': 'body_style',
        'Mileage: ': 'mileage',
        'Transmission: ': 'transmission',
        'Engine: ': 'cylinders', # Note: special treatment
        'Engine Size:': 'displacement', # in description only as free text
        'Drivetrain: ': 'drivetrain',
        'Exterior Color: ': 'ext_color',
        'Interior Color: ': 'int_color',
        'Doors: ': 'doors',
        'Passengers:': 'passengers', # no data
        'Stock Number: ': 'stock_id',
        'VIN Number: ': 'vin'
    }

    # Basic info
    res = {
        'timestamp': dt.datetime.now().isoformat(),
        'dealer': dealer_name,
        'url': url
    }

    # Price: a page without a readable price is flagged and not parsed further
    price_node = soup.find('span', attrs={'itemprop': "price"})
    try:
        res['price'] = int(price_node['content'])
    except (TypeError, KeyError, ValueError):
        # TypeError: node not found; KeyError: no 'content'; ValueError: not a number
        res['invalid'] = 'invalid'
        return res

    # Reading core parameters (label in the first cell, value in the second)
    details_node = soup.find('div', class_="car-info")
    rows = details_node.find_all('tr') if details_node is not None else []
    for tr in rows:
        cells = tr.contents
        if len(cells) < 2:
            continue
        key = translation_table.get(cells[0].text)
        if key is None:
            continue
        value = _coerce_car_value(key, cells[1].text)
        if value is not None:
            res[key] = value

    # The free text description is intentionally not stored:
    # about 70% of its contents is useless junk

    # Features and options: comma separated list kept in a data attribute
    features_node = soup.find('div', id='features')
    features_ul = features_node.ul if features_node is not None else None
    features_str = features_ul.get('data-list', '') if features_ul is not None else ''
    res['features'] = [f.strip() for f in features_str.split(',') if f.strip()]

    # Printing result (for debugging purposes)
    if verbose:
        print(json.dumps(res, indent=4))

    # Done
    return res

In [9]:
# Scraping cars: visit every collected URL and parse the page
vehicle_data = []

try:
    for url in vehicle_urls:
        # Pause before each page load
        time.sleep(5)
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'lxml')
        try:
            vehicle_data.append(parse_car_infopage(soup, url))
        except (AttributeError, IndexError, KeyError, TypeError, ValueError) as err:
            # One malformed page must not discard everything scraped so far
            print(f"Skipping {url}: {err!r}")
finally:
    # Close the automated browser even if the scraping is interrupted
    browser.quit()

# Done
print(len(vehicle_data))

# Create the output folder on a fresh checkout; open() would fail otherwise
os.makedirs(os.path.dirname(dealer_cars_json) or '.', exist_ok=True)

# One JSON document per line
with open(dealer_cars_json, 'w', encoding='utf-8') as fp:
    for v in vehicle_data:
        fp.write(json.dumps(v) + "\n")

{
    "timestamp": "2020-04-30T17:37:27.586400",
    "dealer": "leopardmotors",
    "url": "http://www.leopardmotors.ca/listings/2007-lexus-es-350/",
    "price": 8988,
    "year": 2007,
    "make": "Lexus",
    "model": "ES 350",
    "body_style": "Sedan",
    "mileage": 111281,
    "transmission": "Automatic",
    "drivetrain": "FWD",
    "cylinders": 6,
    "ext_color": "Grey",
    "int_color": "Light Grey",
    "doors": 4,
    "stock_id": "LZF01",
    "vin": "jthbj46g772041448",
    "features": [
        "Air Conditioning",
        "All Equipped",
        "Alloy Wheels",
        "AM/FM Stereo",
        "Anti-Lock Brakes (ABS)",
        "Auxiliary 12v Outlet",
        "Bluetooth",
        "CD Player",
        "Child-Safety Locks",
        "Climate Control",
        "Cruise Control",
        "Cup Holder",
        "Daytime Running Lights",
        "Digital Clock",
        "Dual Airbag",
        "Dual Climate Control",
        "Front Wheel Drive",
        "Heated Seats",
        "Keyle