In [2]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import json
import time
import datetime as dt
import re

In [3]:
# Dealers description
dealer_dct = {
    'id': 'khushiauto',
    'name': 'Khushi Auto',
    'street': '2783 Derry Rd E.',
    'city': 'Mississauga',
    'zip': 'L4T1A3',
    'province': 'ON',
    'phone': '905-460-9624',
    'url': 'https://www.khushiauto.ca',
    'latitude': '43.700880',
    'longitude': '-79.646490',
}

print(json.dumps(dealer_dct, indent=4))

with open('data/khushiauto.json', 'w', encoding='utf-8') as fp:
    fp.writelines(json.dumps(dealer_dct) + "\n")

{
    "id": "khushiauto",
    "name": "Khushi Auto",
    "street": "2783 Derry Rd E.",
    "city": "Mississauga",
    "zip": "L4T1A3",
    "province": "ON",
    "phone": "905-460-9624",
    "url": "https://www.khushiauto.ca",
    "latitude": "43.700880",
    "longitude": "-79.646490"
}


In [4]:
base_url ='https://www.khushiauto.ca/'
inventory_url = 'https://www.khushiauto.ca/used-cars'

In [5]:
# Function uses inventory url (defined above), takes url parameters (such as a page number) as a dict,
# parses the inventory page and returns a list of URLs to car description pages
# Parsing stops when first sold vehicle found or when no search results found on a page
# The function returns a list or URLs and a boolean: True if there might be more data, False when done
def parse_inventory(query_args):
    global base_url, inventory_url
    res = []

    time.sleep(1.1)
    response = requests.get(inventory_url, query_args)
    
    soup = BeautifulSoup(response.text, 'lxml')
    vehicles = soup.find_all('div', class_='vehicle search-result-item vehicleList')
    if vehicles is None:
        return res, False
    
    for v in vehicles:
        # check if it isn't sold
        sold = v.find('div', class_='SoldVehicle')
        if sold is not None:
            return res, False # all the rest of the inventory is a list of sold vehicles, no need to go further

        # get vehicle description page url
        url = base_url + v.div.contents[1].div.div.a['href']
        
        # Printing URL (for debugginh purposes)
        print(url)
        res.append(url)
    
    return res, True

In [6]:
# Making a list of inventory URLs
vehicle_urls = []
cpage = 1

urls, has_more = parse_inventory({'ppage': 100, 'cpage': cpage})

while has_more:
    vehicle_urls.extend(urls)
    cpage += 1
    urls, has_more = parse_inventory({'cpage': cpage})
    
vehicle_urls.extend(urls)
print(len(vehicle_urls))

https://www.khushiauto.ca//cars/used/2016-Nissan-Altima-Sedan-114128
https://www.khushiauto.ca//cars/used/2017-Hyundai-Elantra-Sedan-114118
https://www.khushiauto.ca//cars/used/2012-Mercedes-Benz-CL-Class-Coupe-113364
https://www.khushiauto.ca//cars/used/2015-Acura-ILX-Sedan-113362
https://www.khushiauto.ca//cars/used/2009-Honda-Accord-Sedan-Sedan-111398
https://www.khushiauto.ca//cars/used/2010-Nissan-Altima-Coupe-111392
https://www.khushiauto.ca//cars/used/2008-Nissan-Rogue-SUV-75061
https://www.khushiauto.ca//cars/used/2014-Audi-Q5-SUV-97354
https://www.khushiauto.ca//cars/used/2014-BMW-3-Series-Sedan-75072
https://www.khushiauto.ca//cars/used/2012-Mercedes-Benz-C-Class-Coupe-101233
https://www.khushiauto.ca//cars/used/2014-BMW-3-Series-Sedan-108795
https://www.khushiauto.ca//cars/used/2014-Chevrolet-Cruze-Sedan-103052
https://www.khushiauto.ca//cars/used/2014-Nissan-Pathfinder-SUV-104208
https://www.khushiauto.ca//cars/used/2011-Land-Rover-Range-Rover-SUV-100807
https://www.khushia

In [11]:
# Function parses vehicle description page and returns a dictionary with its parameters
def parse_car_infopage(url):
    # Translation table from their names to our key names
    translation_table = {
        'Make:': 'make',
        'Model:': 'model',
        'Year:': 'year',
        'Body Style:': 'body_style',
        'Odometer:': 'mileage', # Note: special treatment
        'Transmission:': 'transmission',
        'Engine:': 'cylinders', # Note: special treatment
        'Engine Size:': 'displacement',
        'Driveline:': 'drivetrain',
        'Exterior Color:': 'ext_color', 
        'Interior Color:': 'int_color',
        'Doors:': 'doors', # Note: special treatment
        'Passengers:': 'passengers',
        'Stock Number:': 'stock_id',
        'VIN:': 'vin'
    }
    
    # Fetching the page
    time.sleep(1.1)
    response = requests.get(url)

    # Parsing
    soup = BeautifulSoup(response.text, 'lxml')
    price_node = soup.find('span', class_='PriceValue')
    details_node = soup.find('div', class_='VehicleInfoDetails')
    descr_node = soup.find('div', class_='seller_comments')
    features_node = soup.find('ul', class_='VehicleOptions')

    # Basic info
    res = {
        'timestamp': dt.datetime.now().isoformat(),
        'dealer': 'khushiauto',
        'url': url
    }

    # Price
    price = re.sub('[^0-9]', '', price_node.contents[0]) # Cleaning junk, leaving only numbers
    res['price'] = price

    # Reading core parameters
    spans = details_node.find_all('span')

    for s in spans:
        if s.text in translation_table:
            key = translation_table[s.text]
            value = s.next_sibling.text

            # Ad Hoc treatment
            if key in ['cylinders', 'doors']:
                value = value.split()[0]
            elif key == 'mileage':
                value = re.sub('[^0-9]', '', value) # Remove all non numbers using regexp substitution

            res[key] = value
    
    # Free text description        
    res['description'] = descr_node.find('p').text

    # Features and options
    res['features'] = []
    items = features_node.find_all('li')
    for i in items:
        res['features'].append(i.text)
    
    # Printing result (for debugging purposes)
    print(json.dumps(res, indent=4))
    
    # Done
    return res

In [12]:
# Now scraping vehicle data, one by one
vehicle_data = []

for url in vehicle_urls:
    vehicle_data.append(parse_car_infopage(url))
    
#print(len(vehicle_data))
for v in vehicle_data:
    print(v)
with open('data/khushiauto_cars.json', 'w', encoding='utf-8') as fp:
    for v in vehicle_data:
        fp.writelines(json.dumps(v) + "\n")

{
    "timestamp": "2020-04-27T07:59:06.395527",
    "dealer": "khushiauto",
    "url": "https://www.khushiauto.ca//cars/used/2016-Nissan-Altima-Sedan-114128",
    "price": "9488",
    "make": "Nissan",
    "model": "Altima",
    "year": "2016",
    "body_style": "Sedan",
    "mileage": "136236",
    "transmission": "Automatic",
    "cylinders": "4",
    "displacement": "2.5",
    "drivetrain": "FWD",
    "ext_color": "Silver",
    "int_color": "Beige",
    "doors": "4",
    "passengers": "-",
    "stock_id": "2624",
    "vin": "1N4AL3AP5GN359677",
    "description": "2016 Nissan Altima 2.5S Accident Free vehicle and in extra clean condition all around. This vehicle is equipped with a Back Up Camera, Power Windows and Locks, Cruise Control, Bluetooth, Key-less Entry, Push Start, Power Driver Seat, and much more. WE HAVE THE LOWEST PRICE IN THE GTA! No hassles, no haggles, just our best price first. Price includes, Certification, 1 year engine and transmission warranty, Carproof history

{
    "timestamp": "2020-04-27T07:59:16.602812",
    "dealer": "khushiauto",
    "url": "https://www.khushiauto.ca//cars/used/2010-Nissan-Altima-Coupe-111392",
    "price": "5999",
    "make": "Nissan",
    "model": "Altima",
    "year": "2010",
    "body_style": "Coupe",
    "mileage": "150300",
    "transmission": "CVT",
    "cylinders": "4",
    "displacement": "2.5",
    "drivetrain": "FWD",
    "ext_color": "Silver",
    "int_color": "Black",
    "doors": "2",
    "passengers": "-",
    "stock_id": "2604",
    "vin": "1N4AL2EP7AC186932",
    "description": "2010 Altima 2.5S Coupe equipped with Power Sunroof, Heated Seats, Power Seats, Power Windows, Cruise Control, Bluetooth, Key-less Entry, and much more. Canadian vehicle and in excellent condition all around.\u00a0 LOWEST PRICE IN THE GTA No hassles, no haggles, just our best price first. Price includes: Certification, 1 Year Engine and Transmission Warranty, Carfax Report and Detailing.\u00a0",
    "features": [
        "ABS Br

{
    "timestamp": "2020-04-27T07:59:26.716341",
    "dealer": "khushiauto",
    "url": "https://www.khushiauto.ca//cars/used/2014-BMW-3-Series-Sedan-108795",
    "price": "14500",
    "make": "BMW",
    "model": "3 Series",
    "year": "2014",
    "body_style": "Sedan",
    "mileage": "102000",
    "transmission": "Automatic",
    "cylinders": "4",
    "displacement": "2.0",
    "drivetrain": "AWD",
    "ext_color": "White",
    "int_color": "Beige",
    "doors": "4",
    "passengers": "5",
    "stock_id": "2622",
    "vin": "WBA3B3G53ENR83208",
    "description": "2014 328i xDrive Carproof verified. Equipped with\u00a0Power Sunroof, Navigation, Back up Cam, Power Folding Mirrors, Parking Sensors, Leather Interior, Bluetooth Audio/Phone\u00a0and much more! Very clean condition from inside and out. No hassles, no haggles, just our best price first. Price includes Safety Certification, 1 Year Warranty, Full Detail, and CarProof History Report.",
    "features": [
        "ABS Brakes",
 

{
    "timestamp": "2020-04-27T07:59:34.910286",
    "dealer": "khushiauto",
    "url": "https://www.khushiauto.ca//cars/used/2015-Honda-Civic-Sedan-Sedan-101920",
    "price": "11850",
    "make": "Honda",
    "model": "Civic Sedan",
    "year": "2015",
    "body_style": "Sedan",
    "mileage": "87100",
    "transmission": "Automatic",
    "cylinders": "4",
    "displacement": "1.8",
    "drivetrain": "FWD",
    "ext_color": "Black",
    "int_color": "Black",
    "doors": "4",
    "passengers": "-",
    "stock_id": "2605",
    "vin": "2HGFB2F54FH046956",
    "description": "Extra clean vehicle carfax verified clean title. 2015 Civic Sedan equipped with Power Windows, Back Up Camera, Remote Start, Push Start, Sunroof, Bluetooth, Cruise Control, Lane Departure Camera, and much more. The vehicle is in absolute pristine condition all around, interior is also very well kept. No hassles, no haggles, just our best price first. Price includes, Certification, Carfax Report, 1 Year Powertrain W

{
    "timestamp": "2020-04-27T07:59:44.810149",
    "dealer": "khushiauto",
    "url": "https://www.khushiauto.ca//cars/used/2010-Dodge-Journey-SUV-75044",
    "price": "4999",
    "make": "Dodge",
    "model": "Journey",
    "year": "2010",
    "body_style": "SUV",
    "mileage": "151444",
    "transmission": "6 Speed Automatic",
    "cylinders": "6",
    "displacement": "3.5",
    "drivetrain": "FWD",
    "ext_color": "Gray",
    "int_color": "Gray",
    "doors": "4",
    "passengers": "5",
    "stock_id": "2509",
    "vin": "3D4PG5FV0AT123363",
    "description": "2010\u00a0Dodge Journey 5 Passenger\u00a0Equipped With Power Windows, Power Locks, Air Conditioning, Cruise Control,\u00a0and Much More! This is a Canadian\u00a0Vehicle With a Clean Title. Take Advantage of our Low Priceed SUV,\u00a0It Won't Last Long!! WE FINANCE INTERNATINAL STUDENTS - ZERO DOWN INSTANT APPROVALS!\u00a0",
    "features": [
        "3rd Row Seating",
        "ABS Brakes",
        "Air Conditioning",
    

{
    "timestamp": "2020-04-27T07:59:56.670360",
    "dealer": "khushiauto",
    "url": "https://www.khushiauto.ca//cars/used/2012-Volkswagen-Passat-Sedan-75041",
    "price": "10999",
    "make": "Volkswagen",
    "model": "Passat",
    "year": "2012",
    "body_style": "Sedan",
    "mileage": "87000",
    "transmission": "Automatic",
    "cylinders": "4",
    "displacement": "2.0",
    "drivetrain": "FWD",
    "ext_color": "Silver",
    "int_color": "Black",
    "doors": "4",
    "passengers": "-",
    "stock_id": "2062",
    "vin": "1VWBN7A30CC007428",
    "description": "ABSOLUTE MINT CONDITION, PRE-INSPECTED, BEAUTIFUL 2012 VOLKSWAGEN PASSAT TDI. THIS VEHICLES EQUIPPED WITH LEATHER, NAVIGATION, FENDER SOUND SYSTEM, VOICE ACTIVATION, POWER SUNROOF , AND MUCH MORE. No hassles, no haggles, just our best price first. Price includes Safety Certification, 1 Year Warranty, Full Detail, and CarProof History Report.  LOWEST PRICE IN ALL OF GTA. No hassles, no haggles, just our best price f