In [None]:
!pip3 install selenium
!pip3 install webdriver-manager

In [4]:
# Import libraries
import calendar
import time
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
# Define base url
base_url = "https://www.cazoo.co.uk"

In [None]:
# Create Pandas Dataframe
vehicle_df = pd.DataFrame(columns=['Make', 'Model', 'Price (£)', 'Milage', 'Registration Year', 'Registration Month', 'Gearbox', 'Fuel Type', 'N of Owners'])

In [None]:
# Get response code
response = requests.get(base_url)
print(response.status_code)

In [None]:
# Create page counter
page_no = 1

In [None]:
# Loop for pages
for page in range(10):

    # Define url
    listings_url = f"https://www.cazoo.co.uk/cars/?page={page_no}"

    # Get page url and parse it
    response = requests.get(listings_url)
    soup = BeautifulSoup(response.text, features="html.parser")
    
    # Find listings
    listings = soup.find_all(class_='vehicle-cardstyles__InfoWrap-sc-1bxv5iu-2 laxUtI')

    # Listing counter
    listing_count = 1

    # Print status
    print(f"Scraping from page {page_no}...")

    # Get a tags (urls) for each listing
    for listing in listings:

        # Print listing number
        print(f"listing {listing_count}...")

        # Find 'a' tags containing the URLs to each listing
        a_tags = listing.find_all('a')

        # In each url scrape the data
        for car_url in a_tags:

            # Parse html
            response = requests.get(base_url + car_url['href'])
            soup = BeautifulSoup(response.content, features='html.parser')

            # Get make and model
            try:
                make_model = soup.find('h1', class_='sc-yrk414-0 Abyeg').text.strip()
                make, model = make_model.split(' ', 1)
            except:
                make, model = "n/a"

            # Get price
            try:
                driver = webdriver.Chrome()
                driver.get(base_url + car_url['href'])
                time.sleep(1)
                price = driver.find_element(By.CSS_SELECTOR, "div.md\:px-l:nth-child(3) > p:nth-child(1) > small:nth-child(1) > small:nth-child(2) > strong:nth-child(1)").text
                price = int(price[1:].replace(',', ''))
            except:
                price = "n/a"

            # Get mileage
            try:
                mileage = int(soup.find_all('li', class_="sc-1qzxxja-0 sc-17oqk9n-0 cMysQa etVBLA")[0].text.strip().replace(' miles','').replace(',',''))
            except:
                mileage = "n/a"

            # Get registration year
            try:
                reg_year = soup.find_all('li', class_="sc-1qzxxja-0 sc-17oqk9n-0 cMysQa etVBLA")[1].text.strip()
                reg_year = int(reg_year[-4:])
            except:
                reg_year = "n/a"

            # Get registration month
            try:
                reg_month = soup.find_all('li', class_="sc-1qzxxja-0 sc-17oqk9n-0 cMysQa etVBLA")[1].text.strip()
                reg_month = reg_month[:3]
                reg_month = int(list(calendar.month_abbr).index(reg_month))
            except:
                reg_month = "n/a"

            # Get gearbox
            try:
                gearbox = soup.find_all('li', class_="sc-1qzxxja-0 sc-17oqk9n-0 cMysQa etVBLA")[2].text.strip()
            except:
                gearbox = "n/a"

            # Get fuel type
            try:
                fuel_type = soup.find_all('li', class_="sc-1qzxxja-0 sc-17oqk9n-0 cMysQa etVBLA")[3].text.strip()
            except:
                fuel_type = "n/a"

            # Get number of owners
            try:
                number_of_owners = soup.select("dl.divide-y:nth-child(2) > div:nth-child(3) > dd:nth-child(2)")[0]
                number_of_owners = int(number_of_owners.text)
            except:
                number_of_owners = "n/a"

            # Create row
            row = pd.Series({'Make': make,
                             'Model': model,
                             'Price (£)': price,
                             'Milage': mileage,
                             'Registration Year': reg_year,
                             'Registration Month': reg_month,
                             'Gearbox': gearbox,
                             'Fuel Type': fuel_type,
                             'N of Owners': number_of_owners
                            })

            # Add row to pandas data frame
            vehicle_df.loc[len(vehicle_df)] = row
        
        # There are 47 listings per page. Break inner loop when 47 is reached
        listing_count += 1
        if listing_count > 47:
            break

    # Go to next page
    page_no += 1

In [None]:
# Save to csv
vehicle_df.to_csv("cazoo_used_cars_dataset.csv")

In [15]:
# Read dataset
df = pd.read_csv("cazoo_used_cars_dataset.csv")
df

Unnamed: 0.1,Unnamed: 0,Make,Model,Price (£),Milage,Registration Year,Registration Month,Gearbox,Fuel Type,N of Owners
0,0,Ford,Ka,6350.0,56530,2016,4,Manual,Petrol,3
1,1,Citroen,C4,6750.0,50727,2015,9,Manual,Petrol,1
2,2,Suzuki,Celerio,6800.0,23190,2017,3,Manual,Petrol,3
3,3,Dacia,Sandero,,10837,2016,5,Manual,Petrol,1
4,4,Suzuki,Celerio,7000.0,23778,2016,3,Manual,Petrol,1
...,...,...,...,...,...,...,...,...,...,...
465,465,Vauxhall,Mokka X,10450.0,64188,2017,9,Manual,Diesel,1
466,466,Smart,fortwo,10450.0,16862,2018,10,Manual,Petrol,1
467,467,Renault,Captur,10450.0,51912,2019,9,Manual,Petrol,1
468,468,Volkswagen,up!,10450.0,6526,2018,9,Manual,Petrol,1


In [16]:
# Drop rows with 'n/a'
df.dropna()

Unnamed: 0.1,Unnamed: 0,Make,Model,Price (£),Milage,Registration Year,Registration Month,Gearbox,Fuel Type,N of Owners
0,0,Ford,Ka,6350.0,56530,2016,4,Manual,Petrol,3
1,1,Citroen,C4,6750.0,50727,2015,9,Manual,Petrol,1
2,2,Suzuki,Celerio,6800.0,23190,2017,3,Manual,Petrol,3
4,4,Suzuki,Celerio,7000.0,23778,2016,3,Manual,Petrol,1
5,5,Fiat,500,7000.0,41216,2015,9,Manual,Petrol,3
...,...,...,...,...,...,...,...,...,...,...
465,465,Vauxhall,Mokka X,10450.0,64188,2017,9,Manual,Diesel,1
466,466,Smart,fortwo,10450.0,16862,2018,10,Manual,Petrol,1
467,467,Renault,Captur,10450.0,51912,2019,9,Manual,Petrol,1
468,468,Volkswagen,up!,10450.0,6526,2018,9,Manual,Petrol,1


In [12]:
# Top 10 models by average price
model_averages = df.groupby('Model')['Price (£)'].mean()
top_10_models_by_average_price = model_averages.sort_values(ascending=False).head(10)
top_10_models_by_average_price

Model
Yaris            10450.000000
Jetta            10300.000000
Tipo             10250.000000
Mokka X          10208.333333
Octavia          10150.000000
C3               10087.500000
Sportage         10075.000000
Vitara           10050.000000
Logan MCV        10025.000000
Zafira Tourer    10000.000000
Name: Price (£), dtype: float64

In [13]:
# Top 10 brands by average price
brand_averages = df.groupby('Make')['Price (£)'].mean()
top_10_brands_by_average_price = brand_averages.sort_values(ascending=False).head(10)
top_10_brands_by_average_price

Make
Mini          9800.000000
Jeep          9750.000000
Nissan        9745.238095
Peugeot       9707.575758
Renault       9642.857143
Volkswagen    9617.647059
Vauxhall      9543.442623
Honda         9533.333333
Hyundai       9496.666667
DS            9437.500000
Name: Price (£), dtype: float64

In [18]:
# Number of cars for each fuel type
fuel_count = df['Fuel Type'].value_counts()
fuel_count

Petrol    384
Diesel     86
Name: Fuel Type, dtype: int64

In [14]:
# Average price by fuel type
fuel_type_average = df.groupby('Fuel Type')['Price (£)'].mean()
fuel_type_average

Fuel Type
Diesel    9561.046512
Petrol    9379.634465
Name: Price (£), dtype: float64

In [23]:
# Count by registration year
registration_year_count = df['Registration Year'].value_counts().sort_index(ascending=False)
registration_year_count

2020      6
2019     76
2018    120
2017     93
2016    139
2015     34
2014      2
Name: Registration Year, dtype: int64