In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import json
import time
import datetime as dt
import re

In [2]:
# Dealers description
dealer_dct = {
    'id': 'tabangimotors',
    'name': 'Tabangi Motors',
    'street': '5926 Shawson Dr',
    'city': 'Mississauga',
    'zip': 'L4W 3W5',
    'province': 'ON',
    'phone': '905-670-3738',
    'url': 'https://www.tabangimotors.com',
    'latitude': 43.646760,
    'longitude': -79.653690
}

print(json.dumps(dealer_dct, indent=4))

with open('data/tabangimotors.json', 'w', encoding='utf-8') as fp:
    fp.writelines(json.dumps(dealer_dct) + "\n")

{
    "id": "tabangimotors",
    "name": "Tabangi Motors",
    "street": "5926 Shawson Dr",
    "city": "Mississauga",
    "zip": "L4W 3W5",
    "province": "ON",
    "phone": "905-670-3738",
    "url": "https://www.tabangimotors.com",
    "latitude": 43.64676,
    "longitude": -79.65369
}


In [3]:
base_url ='https://www.tabangimotors.com'
inventory_url = 'https://www.tabangimotors.com/used-cars'

In [4]:
# Function uses inventory url (defined above), takes url parameters (such as a page number) as a dict,
# parses the inventory page and returns a list of URLs to car description pages
# Parsing stops when first sold vehicle found or when no search results found on a page
# The function returns a list or URLs and a boolean: True if there might be more data, False when done
def parse_inventory(query_args):
    global base_url, inventory_url
    res = []

    time.sleep(1)
    response = requests.get(inventory_url, query_args)
    
    soup = BeautifulSoup(response.text, 'lxml')
    vehicles = soup.find_all('div', class_='vehicle search-result-item vehicleList')
    if not vehicles: # empty list
        return res, False
    
    for v in vehicles:
        # check if it isn't sold
        sold = v.find('div', class_='SoldVehicle')
        if sold is not None:
            return res, False # all the rest of the inventory is a list of sold vehicles, no need to go further

        # get vehicle description page url
        url = base_url + v.div.contents[1].div.div.a['href']
        
        # Printing URL (for debugginh purposes)
        print(url)
        res.append(url)
    
    return res, True

In [5]:
# Making a list of inventory URLs
vehicle_urls = []
query_args = {'ppage': 100, 'cpage': 1}

urls, has_more = parse_inventory(query_args)

while has_more:
    vehicle_urls.extend(urls)
    query_args['cpage'] += 1
    urls, has_more = parse_inventory(query_args)
    
vehicle_urls.extend(urls)
print(len(vehicle_urls))

https://www.tabangimotors.com/cars/used/2015-Honda-Civic-Sedan-94639
https://www.tabangimotors.com/cars/used/2016-Volkswagen-Jetta-Sedan-103540
https://www.tabangimotors.com/cars/used/2017-Ford-Edge-SUV-112015
https://www.tabangimotors.com/cars/used/2018-Subaru-Outback-SUV-89377
https://www.tabangimotors.com/cars/used/2013-Nissan-Rogue-SUV-113669
https://www.tabangimotors.com/cars/used/2013-Ford-Escape-SUV-113677
https://www.tabangimotors.com/cars/used/2017-Honda-Accord-Sedan-110070
https://www.tabangimotors.com/cars/used/2015-Nissan-Rogue-SUV-102822
https://www.tabangimotors.com/cars/used/2018-Ford-Escape-SUV-109248
https://www.tabangimotors.com/cars/used/2017-Mercedes-Benz-C-Class-Sedan-105611
https://www.tabangimotors.com/cars/used/2015-Mercedes-Benz-CLA-Class-Sedan-105664
https://www.tabangimotors.com/cars/used/2016-Mercedes-Benz-CLA-Class-Sedan-101035
https://www.tabangimotors.com/cars/used/2017-Nissan-Rogue-SUV-109253
https://www.tabangimotors.com/cars/used/2016-Honda-Odyssey-Van

In [6]:
# Function parses vehicle description page and returns a dictionary with its parameters
def parse_car_infopage(url):
    # Translation table from their names to our key names
    translation_table = {
        'Make:': 'make',
        'Model:': 'model',
        'Year:': 'year',
        'Body Style:': 'body_style',
        'Odometer:': 'mileage', # Note: special treatment
        'Transmission:': 'transmission',
        'Engine:': 'cylinders', # Note: special treatment
        'Engine Size:': 'displacement',
        'Driveline:': 'drivetrain',
        'Exterior Color:': 'ext_color', 
        'Interior Color:': 'int_color',
        'Doors:': 'doors', # Note: special treatment
        'Passengers:': 'passengers',
        'Stock Number:': 'stock_id',
        'VIN:': 'vin'
    }
    
    # Fetching the page
    time.sleep(.3)
    response = requests.get(url)

    # Parsing
    soup = BeautifulSoup(response.text, 'lxml')
    price_node = soup.find('span', class_='PriceValue')
    details_node = soup.find('div', class_='VehicleInfoDetails')
    features_node = soup.find('ul', class_='VehicleOptions')

    # Basic info
    res = {
        'timestamp': dt.datetime.now().isoformat(),
        'dealer': 'simpleautos',
        'url': url
    }

    if (price_node is None) or (details_node is None) or (features_node is None):
        res["invalid"] = "invalid"
        return res

    # Price
    try:
        price = re.sub('[^0-9]', '', price_node.contents[0]) # Cleaning junk, leaving only numbers
        res['price'] = int(price)
    except:
        res["invalid"] = "invalid"
        return res

    # Reading core parameters
    spans = details_node.find_all('span')

    for s in spans:
        if s.text in translation_table:
            key = translation_table[s.text]
            value = s.next_sibling.text

            try:
                # Ad Hoc treatment and type conversion
                if key in ['cylinders', 'doors']:
                    value = int(value.split()[0])
                elif key == 'mileage':
                    value = int(re.sub('[^0-9]', '', value)) # Remove all non numbers using regexp substitution
                elif key in ['year', 'passengers']:
                    value = int(value)
                elif key in ['displacement']:
                    value = float(value)
            except:
                key = ''

            if key:
                res[key] = value
    
    # Features and options
    res['features'] = []
    items = features_node.find_all('li')
    for i in items:
        res['features'].append(i.text)
    
    # Printing result (for debugging purposes)
    print(json.dumps(res, indent=4))
    
    # Done
    return res

In [7]:
# Now scraping vehicle data, one by one
vehicle_data = []

for url in vehicle_urls:
    res = parse_car_infopage(url)
    if 'invalid' not in res:
        vehicle_data.append(res)
    
print(len(vehicle_data))

with open('data/tabangimotors_cars.json', 'w', encoding='utf-8') as fp:
    for v in vehicle_data:
        fp.writelines(json.dumps(v) + "\n")

{
    "timestamp": "2020-04-30T17:00:09.521258",
    "dealer": "simpleautos",
    "url": "https://www.tabangimotors.com/cars/used/2015-Honda-Civic-Sedan-94639",
    "price": 8995,
    "make": "Honda",
    "model": "Civic",
    "year": 2015,
    "body_style": "Sedan",
    "mileage": 133160,
    "transmission": "Automatic",
    "cylinders": 4,
    "displacement": 1.8,
    "drivetrain": "FWD",
    "ext_color": "Black",
    "int_color": "Beige",
    "doors": 4,
    "stock_id": "8283",
    "vin": "2HGFB2F44FH011289",
    "features": [
        "ABS Brakes",
        "Air Conditioning",
        "AM/FM Stereo",
        "Backup Camera",
        "Bluetooth",
        "CD Player",
        "Climate Control",
        "Cruise Control",
        "Cup Holder",
        "Digital Clock",
        "Driver Side Airbag",
        "Electronic  Stability Control",
        "Fog Lights",
        "Heated Exterior Mirrors",
        "Heated Seats",
        "Keyless Entry",
        "Multi-Zone A/C",
        "Passenger A