In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import json
import time
import datetime as dt
import re

In [3]:
# Dealers description
dealer_dct = {
    'id': 'simpleautos',
    'name': 'Simple Auto',
    'street': '2829 Derry Rd E.',
    'city': 'Mississauga',
    'zip': 'L4T1A5',
    'province': 'ON',
    'phone': '905-965-7800',
    'url': 'https://www.simpleautos.ca',
    'latitude': '43.701760',
    'longitude': '-79.645490'
}

print(json.dumps(dealer_dct, indent=4))

with open('data/simpleautos.json', 'w', encoding='utf-8') as fp:
    fp.writelines(json.dumps(dealer_dct) + "\n")

{
    "id": "simpleautos",
    "name": "Simple Auto",
    "street": "2829 Derry Rd E.",
    "city": "Mississauga",
    "zip": "L4T1A5",
    "province": "ON",
    "phone": "905-965-7800",
    "url": "https://www.simpleautos.ca",
    "latitude": "43.701760",
    "longitude": "-79.645490"
}


In [4]:
base_url ='https://www.simpleautos.ca/'
inventory_url = 'https://www.simpleautos.ca/used-cars'

In [5]:
def parse_inventory(query_args):
    global base_url, inventory_url
    res = []

    time.sleep(1.1)
    response = requests.get(inventory_url, query_args)
    
    soup = BeautifulSoup(response.text, 'lxml')
    vehicles = soup.find_all('div', class_='vehicle search-result-item vehicleList')
    if vehicles is None:
        return res, False
    
    for v in vehicles:
        # check if it isn't sold
        sold = v.find('div', class_='SoldVehicle')
        if sold is not None:
            return res, False # all the rest of the inventory is a list of sold vehicles, no need to go further

        # get vehicle description page url
        url = base_url + v.div.contents[1].div.div.a['href']
        
        # Printing URL (for debugginh purposes)
        print(url)
        res.append(url)
    
    return res, True

In [6]:
# Making a list of inventory URLs
vehicle_urls = []
cpage = 1

urls, has_more = parse_inventory({'ppage': 100, 'cpage': cpage})

while has_more:
    vehicle_urls.extend(urls)
    cpage += 1
    urls, has_more = parse_inventory({'cpage': cpage})
    
vehicle_urls.extend(urls)
print(len(vehicle_urls))

https://www.simpleautos.ca//cars/used/2013-Nissan-Altima-Sedan-114169
https://www.simpleautos.ca//cars/used/2019-Hyundai-Elantra-Sedan-114162
https://www.simpleautos.ca//cars/used/2018-Honda-Civic-Sedan-Sedan-114159
https://www.simpleautos.ca//cars/used/2016-Volkswagen-Passat-Sedan-114156
https://www.simpleautos.ca//cars/used/2018-Toyota-Corolla-Sedan-114153
https://www.simpleautos.ca//cars/used/2017-Honda-Accord-Sedan-Sedan-111421
https://www.simpleautos.ca//cars/used/2017-Honda-Accord-Sedan-Sedan-110440
https://www.simpleautos.ca//cars/used/2010-Ford-F-150-Truck-Extended-Cab-110187
https://www.simpleautos.ca//cars/used/2016-Volkswagen-Jetta-Sedan-Sedan-106068
https://www.simpleautos.ca//cars/used/2017-Hyundai-Sonata-Sedan-106070
https://www.simpleautos.ca//cars/used/2017-Toyota-Camry-Sedan-106069
https://www.simpleautos.ca//cars/used/2017-Honda-Civic-Sedan-Sedan-106118
https://www.simpleautos.ca//cars/used/2008-Ford-Super-Duty-F-250-Truck-Extended-Cab-106056
https://www.simpleautos.c

KeyboardInterrupt: 

In [8]:
# Function parses vehicle description page and returns a dictionary with its parameters
def parse_car_infopage(url):
    # Translation table from their names to our key names
    translation_table = {
        'Make:': 'make',
        'Model:': 'model',
        'Year:': 'year',
        'Body Style:': 'body_style',
        'Odometer:': 'mileage', # Note: special treatment
        'Transmission:': 'transmission',
        'Engine:': 'cylinders', # Note: special treatment
        'Engine Size:': 'displacement',
        'Driveline:': 'drivetrain',
        'Exterior Color:': 'ext_color', 
        'Interior Color:': 'int_color',
        'Doors:': 'doors', # Note: special treatment
        'Passengers:': 'passengers',
        'Stock Number:': 'stock_id',
        'VIN:': 'vin'
    }
    
    # Fetching the page
    time.sleep(1.1)
    response = requests.get(url)

    # Parsing
    soup = BeautifulSoup(response.text, 'lxml')
    price_node = soup.find('span', class_='PriceValue')
    details_node = soup.find('div', class_='VehicleInfoDetails')
    descr_node = soup.find('div', class_='seller_comments')
    features_node = soup.find('ul', class_='VehicleOptions')

    # Basic info
    res = {
        'timestamp': dt.datetime.now().isoformat(),
        'dealer': 'simpleautos',
        'url': url
    }

    # Price
    price = re.sub('[^0-9]', '', price_node.contents[0]) # Cleaning junk, leaving only numbers
    res['price'] = price

    # Reading core parameters
    spans = details_node.find_all('span')

    for s in spans:
        if s.text in translation_table:
            key = translation_table[s.text]
            value = s.next_sibling.text

            # Ad Hoc treatment
            if key in ['cylinders', 'doors']:
                value = value.split()[0]
            elif key == 'mileage':
                value = re.sub('[^0-9]', '', value) # Remove all non numbers using regexp substitution

            res[key] = value
    
    # Free text description        
    res['description'] = descr_node.find('p').text

    # Features and options
    res['features'] = []
    items = features_node.find_all('li')
    for i in items:
        res['features'].append(i.text)
    
    # Printing result (for debugging purposes)
    print(json.dumps(res, indent=4))
    
    # Done
    return res

In [9]:
# Now scraping vehicle data, one by one
vehicle_data = []

for url in vehicle_urls:
    vehicle_data.append(parse_car_infopage(url))
    
print(len(vehicle_data))

with open('data/simpleautos_cars.json', 'w', encoding='utf-8') as fp:
    for v in vehicle_data:
        fp.writelines(json.dumps(v) + "\n")

{
    "timestamp": "2020-04-25T13:15:08.387764",
    "dealer": "simpleautos",
    "url": "https://www.simpleautos.ca//cars/used/2013-Nissan-Altima-Sedan-114169",
    "price": "8499",
    "make": "Nissan",
    "model": "Altima",
    "year": "2013",
    "body_style": "Sedan",
    "mileage": "159208",
    "transmission": "CVT",
    "cylinders": "4",
    "displacement": "2.5",
    "drivetrain": "FWD",
    "ext_color": "White",
    "int_color": "Beige",
    "doors": "4",
    "passengers": "5",
    "stock_id": "291",
    "vin": "1N4AL3AP0DN418291",
    "description": "CERTIFIED",
    "features": [
        "ABS Brakes",
        "Air Conditioning",
        "Alloy Wheels",
        "AM/FM Stereo",
        "Backup Camera",
        "Bluetooth",
        "CD Player",
        "Child-Safety Locks",
        "Cruise Control",
        "Cup Holder",
        "Daytime Running Lights",
        "Driver Side Airbag",
        "Entertainment System",
        "Fog Lights",
        "Folding Rear Seat",
        "He

{
    "timestamp": "2020-04-25T13:15:19.485489",
    "dealer": "simpleautos",
    "url": "https://www.simpleautos.ca//cars/used/2017-Honda-Accord-Sedan-Sedan-110440",
    "price": "22899",
    "make": "Honda",
    "model": "Accord Sedan",
    "year": "2017",
    "body_style": "Sedan",
    "mileage": "51020",
    "transmission": "CVT",
    "cylinders": "4",
    "displacement": "2.4",
    "drivetrain": "FWD",
    "ext_color": "Silver",
    "int_color": "Black",
    "doors": "4",
    "passengers": "5",
    "stock_id": "17ACCORDSILVER",
    "vin": "1HGCR2F58HA801722",
    "description": "Clean Ontario Local Vehicle",
    "features": [
        "ABS Brakes",
        "Air Conditioning",
        "Alloy Wheels",
        "AM/FM Stereo",
        "Automatic Headlight",
        "Backup Camera",
        "Bluetooth",
        "CD Player",
        "Center Arm Rest",
        "Child-Safety Locks",
        "Climate Control",
        "Cloth Interior",
        "Cruise Control",
        "Cup Holder",
       

{
    "timestamp": "2020-04-25T13:15:30.377010",
    "dealer": "simpleautos",
    "url": "https://www.simpleautos.ca//cars/used/2008-Ford-Super-Duty-F-250-Truck-Extended-Cab-106056",
    "price": "8799",
    "make": "Ford",
    "model": "Super Duty F-250",
    "year": "2008",
    "body_style": "Truck Extended Cab",
    "mileage": "166000",
    "transmission": "Automatic",
    "cylinders": "8",
    "displacement": "5.4",
    "drivetrain": "4X4",
    "ext_color": "Red",
    "int_color": "Beige",
    "doors": "2",
    "passengers": "6",
    "stock_id": "105",
    "vin": "1FTSX21518EE15515",
    "description": "SOLD \"AS IS\"",
    "features": [
        "Air Conditioning",
        "Alloy Wheels",
        "AM/FM Stereo",
        "CD Player",
        "Center Arm Rest",
        "Cup Holder",
        "Daytime Running Lights",
        "Driver Side Airbag",
        "Fog Lights",
        "Heated Exterior Mirrors",
        "Keyless Entry",
        "Leather Seats",
        "Memory Seat",
        "P

{
    "timestamp": "2020-04-25T13:15:43.635334",
    "dealer": "simpleautos",
    "url": "https://www.simpleautos.ca//cars/used/2017-Volkswagen-Passat-Sedan-106061",
    "price": "15999",
    "make": "Volkswagen",
    "model": "Passat",
    "year": "2017",
    "body_style": "Sedan",
    "mileage": "74610",
    "transmission": "6 Speed Automatic",
    "cylinders": "4",
    "displacement": "1.8",
    "drivetrain": "FWD",
    "ext_color": "Silver",
    "int_color": "-",
    "doors": "4",
    "passengers": "-",
    "stock_id": "737",
    "vin": "1VWAT7A37HC083737",
    "description": "CERTIFIED",
    "features": [
        "ABS Brakes",
        "Air Conditioning",
        "Alloy Wheels",
        "AM/FM Stereo",
        "Backup Camera",
        "Bluetooth",
        "CD Player",
        "Cruise Control",
        "Cup Holder",
        "Digital Clock",
        "Driver Side Airbag",
        "Entertainment System",
        "Heated Exterior Mirrors",
        "Keyless Entry",
        "Leather Steer

{
    "timestamp": "2020-04-25T13:15:57.133423",
    "dealer": "simpleautos",
    "url": "https://www.simpleautos.ca//cars/used/2014-Dodge-Grand-Caravan-Van-Minivan-106121",
    "price": "12899",
    "make": "Dodge",
    "model": "Grand Caravan",
    "year": "2014",
    "body_style": "Van-Minivan",
    "mileage": "109450",
    "transmission": "6 Speed Automatic",
    "cylinders": "6",
    "displacement": "3.6",
    "drivetrain": "FWD",
    "ext_color": "Black",
    "int_color": "Black",
    "doors": "4",
    "passengers": "-",
    "stock_id": "362",
    "vin": "2C4RDGBG9ER378362",
    "description": "CERTIFIED",
    "features": [
        "ABS Brakes",
        "AM/FM Stereo",
        "CD Player",
        "Cruise Control",
        "Cup Holder",
        "Digital Clock",
        "Driver Side Airbag",
        "Entertainment System",
        "Heated Exterior Mirrors",
        "Passenger Airbag",
        "Power Locks",
        "Power Mirrors",
        "Power-Assist Disc Brakes",
        "Rear

{
    "timestamp": "2020-04-25T13:16:10.101284",
    "dealer": "simpleautos",
    "url": "https://www.simpleautos.ca//cars/used/2016-Toyota-Camry-Sedan-106076",
    "price": "16299",
    "make": "Toyota",
    "model": "Camry",
    "year": "2016",
    "body_style": "Sedan",
    "mileage": "89847",
    "transmission": "6 Speed Automatic",
    "cylinders": "4",
    "displacement": "2.5",
    "drivetrain": "FWD",
    "ext_color": "White",
    "int_color": "Black",
    "doors": "4",
    "passengers": "-",
    "stock_id": "804",
    "vin": "4T1BF1FK8GU217302",
    "description": "CERTIFIED",
    "features": [
        "ABS Brakes",
        "Air Conditioning",
        "Alloy Wheels",
        "AM/FM Stereo",
        "Backup Camera",
        "Bluetooth",
        "CD Player",
        "Cruise Control",
        "Cup Holder",
        "Digital Clock",
        "Driver Side Airbag",
        "Entertainment System",
        "Heated Exterior Mirrors",
        "Keyless Entry",
        "Leather Steering Whe

{
    "timestamp": "2020-04-25T13:16:22.859031",
    "dealer": "simpleautos",
    "url": "https://www.simpleautos.ca//cars/used/2014-Lexus-IS-250-Sedan-106101",
    "price": "20999",
    "make": "Lexus",
    "model": "IS 250",
    "year": "2014",
    "body_style": "Sedan",
    "mileage": "104884",
    "transmission": "6 Speed Automatic",
    "cylinders": "6",
    "displacement": "2.5",
    "drivetrain": "AWD",
    "ext_color": "Black",
    "int_color": "Black",
    "doors": "4",
    "passengers": "-",
    "stock_id": "500",
    "vin": "jthcf1d20e5001851",
    "description": "CERTIFIED",
    "features": [
        "ABS Brakes",
        "Air Conditioning",
        "Alloy Wheels",
        "AM/FM Stereo",
        "Auto Dimming Mirrors",
        "Backup Camera",
        "Bluetooth",
        "CD Player",
        "Child-Safety Locks",
        "Climate Control",
        "Cruise Control",
        "Cup Holder",
        "Daytime Running Lights",
        "Digital Clock",
        "Driver Side Airbag

AttributeError: 'NoneType' object has no attribute 'contents'