In [21]:
import re
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time, os

In [22]:
def scrape_initial_details(soup, car):
   """Copy the headline advert fields from a parsed advert page into ``car``.

   Args:
      soup: Parsed advert page; must support ``find(attrs=...)`` and
         ``find(id=...)`` (a BeautifulSoup object in this notebook).
      car: Dict that is updated in place and also returned.

   Returns:
      The same ``car`` dict with these keys set: ``name``, ``year`` (kept as
      a string), ``mileage`` (int), ``price`` (int), ``advert_subtitle``,
      ``body_type``, ``engine_size`` (float), ``gearbox``, ``fuel_type`` and
      ``doors`` (int).

   Raises:
      AttributeError, IndexError or ValueError when an expected element is
      absent or a numeric field cannot be parsed. Keys assigned before the
      failure remain set on ``car``.
   """
   title_tag = soup.find(attrs={"data-testid": "advert-title"})
   car['name'] = title_tag.text

   # The paragraph right after the title begins with the year token.
   car['year'] = title_tag.find_next_sibling('p').text.split()[0]

   # Mileage text lives in the <span> following the parent of the #ks-mileage
   # element; keep the leading number and drop thousands separators.
   mileage_text = soup.find(id='ks-mileage').parent.find_next_sibling('span').text
   car['mileage'] = int(mileage_text.split()[0].replace(',', ''))

   price_text = soup.find(attrs={"data-testid": "advert-price"}).text
   car['price'] = int(price_text.replace(',', '').replace('£', ''))

   car['advert_subtitle'] = soup.find(attrs={"data-testid": "advert-subtitle"}).text

   spec_items = soup.find(attrs={"aria-label": "Key Specifications"}).findChildren("li")
   key_specs = [item.text.strip() for item in spec_items]

   # Key specifications are read purely by position in the list.
   car['body_type'] = key_specs[0]
   car['engine_size'] = float(key_specs[1].replace("L", ""))
   car['gearbox'] = key_specs[2]
   car['fuel_type'] = key_specs[3]
   car['doors'] = int(key_specs[4].split()[0])

   return car

In [23]:
def scrape_performance(soup, car):
   """Copy the performance figures from the tech-spec markup into ``car``.

   Args:
      soup: Parsed page containing an element with
         ``data-gui="performance-expander"``.
      car: Dict that is updated in place and also returned.

   Returns:
      ``car`` with whichever of ``0-6Xmph`` (float), ``top_speed`` (int),
      ``cylinders`` (int) and ``power`` (int) were present. Rows with any
      other label are ignored.
   """
   expander = soup.find(attrs={"data-gui": "performance-expander"})
   # The <li> rows are taken from the third descendant of the expander.
   spec_rows = expander.findChildren()[2].find_all('li')

   # Labels matched exactly -> (destination key, text-to-number parser).
   exact_label_parsers = {
      "Top Speed": ("top_speed", lambda text: int(text.split()[0])),
      "Cylinders": ("cylinders", lambda text: int(text)),
      "Engine power": ("power", lambda text: int(text.split()[0])),
   }

   for row in spec_rows:
      label = row.contents[0].text
      reading = row.contents[1].text

      # Any label containing "0-6" is stored under the single key "0-6Xmph";
      # this substring test takes precedence over the exact matches.
      if "0-6" in label:
         car["0-6Xmph"] = float(reading.split()[0])
      elif label in exact_label_parsers:
         field, parse = exact_label_parsers[label]
         car[field] = parse(reading)

   return car

In [54]:
def scrape_pdp(chromedriver, url, car, car_id):
   """Open one advert page in Chrome and scrape its details into ``car``.

   Args:
      chromedriver: Path to the chromedriver executable, passed to
         ``webdriver.Chrome``.
      url: Absolute URL of the advert page.
      car: Dict to fill in; passed through the scrape helpers and returned.
      car_id: Advert id, used only in messages and in the failure tuple.

   Returns:
      ``(car, None)`` when both scrape stages succeed, otherwise
      ``(car, (car_id, stage))`` where ``stage`` is ``"intial_details"`` or
      ``"performance spec"``.

   Raises:
      Whatever Selenium raises while loading the page, accepting the consent
      dialog or opening the tech-spec view. The browser is always closed
      before the exception propagates.
   """
   driver = webdriver.Chrome(chromedriver)
   # try/finally guarantees the browser is shut down on every exit path,
   # including failures in the consent/click steps that are not caught here.
   try:
      driver.implicitly_wait(10)

      driver.get(url)
      driver.maximize_window()
      # NOTE(review): no parser is named, so BeautifulSoup chooses one itself.
      soup = bs(driver.page_source)

      try:
         car = scrape_initial_details(soup, car)
      except Exception:
         print(f"Error scraping initial details. Car id: {car_id}")
         # The misspelt tag is kept as-is so it matches previously recorded results.
         return car, (car_id, "intial_details")

      # Accept the consent dialog, which is rendered inside its own iframe.
      iframe = driver.find_element_by_xpath("//iframe[@title='SP Consent Message']")
      driver.switch_to.frame(iframe)
      driver.find_element_by_css_selector("*[title='Accept All']").click()

      # Back on the main document: scroll down and open the tech-spec view.
      driver.switch_to.default_content()
      driver.execute_script("window.scrollTo(0, 1080)")
      driver.find_element_by_css_selector("*[data-testid='tech-spec-link']").click()

      # Fixed pause before re-reading the page source
      # (presumably to let the tech-spec content render; confirm).
      time.sleep(1)
      soup_spec_modal = bs(driver.page_source)
      try:
         car = scrape_performance(soup_spec_modal, car)
      except Exception:
         print(f"Error scraping performance. Car id: {car_id}")
         return car, (car_id, "performance spec")

      return car, None
   finally:
      driver.quit()

In [60]:
def scrape_paths(chromedriver, pdp_paths, cars):
   """Scrape every advert path that is not already present in ``cars``.

   Args:
      chromedriver: Path to the chromedriver executable, forwarded to
         ``scrape_pdp``.
      pdp_paths: Iterable of site-relative advert paths. The car id is the
         run of digits immediately before the ``?`` of the query string.
      cars: Dict mapping car id -> scraped details. Updated in place with
         each fully scraped car and also returned.

   Returns:
      ``(cars, incomplete_scrapes, already_scraped)`` where
      ``incomplete_scrapes`` is a list of ``(car_id, stage)`` tuples for
      adverts that could not be fully scraped, and ``already_scraped`` lists
      the ids skipped because they were already keys of ``cars``. A path from
      which no id can be extracted is reported as ``(path, "parsing id")``.
   """
   incomplete_scrapes = []
   already_scraped = []

   pattern = r'(\d+)\?'

   for path in pdp_paths:
      match = re.search(pattern, path)
      if match is None:
         # Without this guard a single malformed path would raise
         # AttributeError and abort the whole batch.
         incomplete_scrapes.append((path, "parsing id"))
         continue
      car_id = match.group(1)

      if car_id in cars:
         already_scraped.append(car_id)
         continue

      car = {'id': car_id}
      url = "https://www.autotrader.co.uk" + path
      try:
         car, incomplete_scrape = scrape_pdp(chromedriver, url, car, car_id)
      except Exception:
         # Best-effort: record the failure and move on to the next advert.
         # KeyboardInterrupt is deliberately not caught so a run can be stopped.
         incomplete_scrapes.append((car_id, "navigating"))
         continue

      if incomplete_scrape:
         incomplete_scrapes.append(incomplete_scrape)
      else:
         cars[car_id] = car

   return cars, incomplete_scrapes, already_scraped

In [64]:
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a different install; otherwise the original hard-coded
# location is used, so existing setups keep working unchanged.
chromedriver = os.environ.get(
   "CHROMEDRIVER_PATH",
   "/Users/william.bell/opt/anaconda3/envs/metis/lib/python3.8/site-packages/chromedriver_binary/chromedriver",
)
os.environ["webdriver.chrome.driver"] = chromedriver

# Two sample advert paths (site-relative, including the search query string).
# NOTE(review): pdp_paths is not what the timed scrape cell passes to
# scrape_paths; that cell uses pdp_urls instead.
path1 = '/car-details/202105202850513?model=911&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&make=Porsche&radius=1500&year-to=2021&include-delivery-option=on&postcode=bh205lg&sort=price-desc&advertising-location=at_cars&page=1'
path2 = '/car-details/202110048110259?year-to=2021&advertising-location=at_cars&model=911&make=Porsche&include-delivery-option=on&postcode=bh205lg&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&radius=1501&sort=relevance&page=1'
pdp_paths = [path1, path2]

# Results accumulator: car id -> dict of scraped fields (filled by scrape_paths).
# Re-running this cell discards anything scraped so far in this kernel.
cars = {}

# Restore pdp_urls from IPython's %store database, then display it.
# Presumably it was saved with `%store pdp_urls` in another notebook or
# session; verify, since this cell fails if nothing has been stored.
%store -r pdp_urls
pdp_urls

['/car-details/202110218731995?include-delivery-option=on&model=911&postcode=bh205lg&sort=relevance&radius=1501&advertising-location=at_cars&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&make=Porsche&page=1',
 '/car-details/202108276699111?include-delivery-option=on&model=911&postcode=bh205lg&sort=relevance&radius=1501&advertising-location=at_cars&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&make=Porsche&page=1',
 '/car-details/202109187510335?include-delivery-option=on&model=911&postcode=bh205lg&sort=relevance&radius=1501&advertising-location=at_cars&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&make=Porsche&page=1',
 '/car-details/202109026919221?include-delivery-option=on&model=911&postcode=bh205lg&sort=relevance&radius=1501&advertising-location=at_cars&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&make=Porsche&page=1',
 '/car-details/202109227667959?include-delivery-option=on&model=911&postcode=bh205lg&sort=relevance&radius=1501&adve

In [76]:


# Time the whole scrape. perf_counter() is a monotonic, high-resolution clock
# intended for measuring elapsed intervals; unlike time.time() it is not
# affected by system clock adjustments made during the run.
start = time.perf_counter()
cars, incomplete_scrapes, already_scraped = scrape_paths(chromedriver, pdp_urls, cars)
end = time.perf_counter()
print(f"Time taken: {end - start}" )

user agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36
Error scraping initial details. Car id: 202109026919221
user agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36
Error scraping initial details. Car id: 202108065928317
Time taken: 43.530699014663696


In [73]:
# Inspect the (car_id, failed_stage) tuples returned by scrape_paths.
incomplete_scrapes

[('202109026919221', 'intial_details'), ('202108065928317', 'intial_details')]

In [74]:
# Number of ids scrape_paths skipped because they were already keys of `cars`.
len(already_scraped)

9

In [75]:
# Display the accumulated results: car id -> dict of scraped fields.
cars

{'202110218731995': {'id': '202110218731995',
  'name': 'Porsche 911',
  'year': '1998',
  'mileage': 104000,
  'price': 12995,
  'advert_subtitle': '3.4 996 Carrera 2 Tiptronic S 2dr',
  'body_type': 'Coupe',
  'engine_size': 3.4,
  'gearbox': 'Automatic',
  'fuel_type': 'Petrol',
  'doors': 2,
  '0-6Xmph': 5.2,
  'top_speed': 175,
  'cylinders': 6,
  'power': 300},
 '202108276699111': {'id': '202108276699111',
  'name': 'Porsche 911',
  'year': '2001',
  'mileage': 96000,
  'price': 16995,
  'advert_subtitle': '3.6 CARRERA 2 TIPTRONIC S 2d 316 BHP',
  'body_type': 'Convertible',
  'engine_size': 3.6,
  'gearbox': 'Automatic',
  'fuel_type': 'Petrol',
  'doors': 2,
  '0-6Xmph': 5.7,
  'top_speed': 174,
  'cylinders': 6,
  'power': 315},
 '202109187510335': {'id': '202109187510335',
  'name': 'Porsche 911',
  'year': '2003',
  'mileage': 83000,
  'price': 18750,
  'advert_subtitle': '3.6 996 Carrera 4 Tiptronic S AWD 2dr',
  'body_type': 'Coupe',
  'engine_size': 3.6,
  'gearbox': 'Aut

In [44]:
import pickle

# Store data
# Writes the current `cars` dict to cars.pickle in the working directory,
# overwriting any existing file of that name. HIGHEST_PROTOCOL selects the
# newest pickle protocol this interpreter supports, so older Python versions
# may not be able to read the file back.
with open('cars.pickle', 'wb') as handle:
   pickle.dump(cars, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
# Load data
# Reads back the file written by the "Store data" step as a round-trip check.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load pickle files that you created yourself or otherwise trust.
with open('cars.pickle', 'rb') as handle:
   unserialized_data = pickle.load(handle)

# Bare last expression so the notebook renders the loaded dict.
unserialized_data

{'202105202850513': {'id': '202105202850513',
  'name': 'Porsche 911',
  'year': '2011',
  'mileage': 6699,
  'price': 459950,
  'advert_subtitle': '4.0 997 GT3 RS 2dr',
  'body_type': 'Coupe',
  'engine_size': 4.0,
  'gearbox': 'Manual',
  'fuel_type': 'Petrol',
  'doors': 2,
  '0-6Xmph': 3.9,
  'top_speed': 193,
  'cylinders': 6,
  'power': 500},
 '202110048110259': {'id': '202110048110259',
  'name': 'Porsche 911',
  'year': '2021',
  'mileage': 4000,
  'price': 117500,
  'advert_subtitle': '3.0 CARRERA PDK 2d 380 BHP Very Low Mileage, Presented in Immaculate Condit',
  'body_type': 'Convertible',
  'engine_size': 3.0,
  'gearbox': 'Automatic',
  'fuel_type': 'Petrol',
  'doors': 2,
  '0-6Xmph': 4.4,
  'top_speed': 181,
  'cylinders': 6,
  'power': 380}}