In [15]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time, os
import pickle

In [19]:
cars = {}

In [20]:
def scrape_initial_details(soup, car):
   advert_title = soup.find(attrs={"data-testid": "advert-title"})
   car['name'] = advert_title.text

   reg_age = advert_title.find_next_sibling('p').text
   car['year'] = reg_age.split()[0]

   miles_string = soup.find(id='ks-mileage').parent.find_next_sibling('span').text
   amount_only = miles_string.split()[0]
   amount_no_comma = amount_only.replace(',', '')
   car['mileage'] = int(amount_no_comma)

   price_string = soup.find(attrs={"data-testid": "advert-price"}).text
   price_digits_only = price_string.replace(',', '').replace('£', '')
   car['price'] = int(price_digits_only)

   car['advert_subtitle'] = soup.find(attrs={"data-testid": "advert-subtitle"}).text

   key_specs = []
   children = soup.find(attrs={"aria-label": "Key Specifications"}).findChildren("li")
   for child in children:
      key_specs.append(child.text.strip())

   car['body_type'] = key_specs[0]
   car['engine_size'] = float(key_specs[1].replace("L", ""))
   car['gearbox'] = key_specs[2]
   car['fuel_type'] = key_specs[3]
   car['doors'] = int(key_specs[4].split()[0])

   return car

In [21]:
def scrape_performance(soup, car):
   performance_expander = soup.find(attrs={"data-gui": "performance-expander"})
   lines = performance_expander.findChildren()[2].find_all('li')

   for line in lines:
      key = line.contents[0].text
      value = line.contents[1].text
      if "0-6" in key:
         car["0-6Xmph"] = float(value.split()[0])
      elif key == "Top Speed":
         car["top_speed"] = int(value.split()[0])
      elif key == "Cylinders":
         car["cylinders"] = int(value)
      elif key == "Engine power":
         car["power"] = int(value.split()[0])

   return car

In [45]:
def scrape_pdp(driver, url, car, car_id):
   driver.get(url)
   driver.maximize_window()
   time.sleep(0.6)
   soup = bs(driver.page_source)

   try:
      car = scrape_initial_details(soup, car)
   except Exception as e:
      print(f"Error scraping initial details. Car id: {car_id}")
      return car, (car_id, "intial_details", e)

   try:
      driver.execute_script("window.scrollTo(0, 1080)")
      driver.find_element_by_css_selector("*[data-testid='tech-spec-link']").click()
   except:
      iframe = driver.find_element_by_xpath("//iframe[@title='SP Consent Message']")
      driver.switch_to.frame(iframe)
      driver.find_element_by_css_selector("*[title='Accept All']").click()

      driver.switch_to.default_content()
      driver.execute_script("window.scrollTo(0, 1080)")
      driver.find_element_by_css_selector("*[data-testid='tech-spec-link']").click()

   time.sleep(0.5)
   soup_spec_modal = bs(driver.page_source)
   try:
      car = scrape_performance(soup_spec_modal, car)
   except Exception as e:
      print(f"Error scraping performance. Car id: {car_id}")
      return car, (car_id, "performance spec", e)

   return car, None

In [23]:
def scrape_paths(driver, car_ids, cars):
   incomplete_scrapes = []
   already_scraped = []

   for car_id in car_ids.keys():
      if car_id not in cars:
         car = {}
         car['id'] = car_id

         url = "https://www.autotrader.co.uk/car-details/" + car_id
         try:
            car, incomplete_scrape = scrape_pdp(driver, url, car, car_id)
            if incomplete_scrape:
               incomplete_scrapes.append(incomplete_scrape)
            else:
               cars[car_id] = car
         except Exception as e:
            incomplete_scrapes.append((car_id, "navigating", e))
      else:
         already_scraped.append(car_id)
           
   driver.quit()
   return cars, incomplete_scrapes, already_scraped

In [38]:
# Load data
with open('car_ids.pickle', 'rb') as handle:
   car_ids = pickle.load(handle)

car_ids

{'202109207563077': None,
 '202109227674550': None,
 '202108136210396': None,
 '202106043455421': None,
 '202105082360787': None,
 '202102239402326': None,
 '202110258855609': None,
 '202107215302341': None,
 '202110138442617': None,
 '202108306788653': None,
 '202110128395470': None,
 '202110188623506': None,
 '202108186392335': None,
 '202011256463759': None,
 '202108276699111': None,
 '202108065933948': None,
 '202106103693119': None,
 '202105293249710': None,
 '202105092394078': None,
 '202106294400981': None,
 '202105032133215': None,
 '202106304436928': None,
 '202109307982338': None,
 '202108065928307': None,
 '202110088266125': None,
 '202108266653969': None,
 '202109227667959': None,
 '202108116107113': None,
 '202109157400086': None,
 '202110218731995': None,
 '202105122515940': None,
 '202109016858651': None,
 '202110038063357': None,
 '202109177467070': None,
 '202107145034939': None,
 '202110238822508': None,
 '202110048110259': None,
 '202110138439423': None,
 '2021100180

In [39]:
len(car_ids)

1742

In [46]:
chromedriver = "/Users/william.bell/opt/anaconda3/envs/metis/lib/python3.8/site-packages/chromedriver_binary/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(3)

In [47]:
start = time.time()
cars, incomplete_scrapes, already_scraped = scrape_paths(driver, car_ids, cars)
end = time.time()
print(f"Time taken: {end - start}" )

Error scraping initial details. Car id: 202108065928307
Error scraping initial details. Car id: 202108065928317
Error scraping initial details. Car id: 202109026919221
Error scraping initial details. Car id: 202107275528014
Error scraping initial details. Car id: 202006109985445
Error scraping initial details. Car id: 202105102436593
Error scraping initial details. Car id: 202108276703564
Error scraping initial details. Car id: 202108045833776
Error scraping initial details. Car id: 202106103707766
Error scraping initial details. Car id: 202110208690942
Error scraping initial details. Car id: 202109267824672
Error scraping initial details. Car id: 202110208695852
Error scraping performance. Car id: 202110058154612
Error scraping initial details. Car id: 202109077102396
Error scraping initial details. Car id: 202109177481131
Error scraping performance. Car id: 202109107220100
Error scraping initial details. Car id: 202109147356226
Error scraping initial details. Car id: 201905228233441


In [48]:
incomplete_scrapes

[('202108065933948',
  'navigating',
  selenium.common.exceptions.NoSuchElementException('no such element: Unable to locate element: {"method":"xpath","selector":"//iframe[@title=\'SP Consent Message\']"}\n  (Session info: chrome=95.0.4638.54)',
                                                    None,
                                                    None)),
 ('202108065928307',
  'intial_details',
  ValueError("could not convert string to float: 'Manual'")),
 ('202110218751352',
  'navigating',
  selenium.common.exceptions.NoSuchElementException('no such element: Unable to locate element: {"method":"xpath","selector":"//iframe[@title=\'SP Consent Message\']"}\n  (Session info: chrome=95.0.4638.54)',
                                                    None,
                                                    None)),
 ('202107155054619',
  'navigating',
  selenium.common.exceptions.NoSuchElementException('no such element: Unable to locate element: {"method":"xpath","selector":"//ifra

In [52]:
len(incomplete_scrapes)

133

In [49]:
len(already_scraped)

3

In [50]:
len(cars)

1609

In [None]:
cars

In [51]:
# Store data
with open('more_cars.pickle', 'wb') as handle:
   pickle.dump(cars, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [53]:
# Load data
with open('cars.pickle', 'rb') as handle:
   unserialized_data = pickle.load(handle)

unserialized_data

{'202109207563077': {'id': '202109207563077',
  'name': 'Porsche 911',
  'year': '2021',
  'mileage': 1821,
  'price': 116900,
  'advert_subtitle': '2dr PDK 3.0',
  'body_type': 'Convertible',
  'engine_size': 3.0,
  'gearbox': 'Automatic',
  'fuel_type': 'Petrol',
  'doors': 2,
  '0-6Xmph': 4.4,
  'top_speed': 179,
  'cylinders': 6,
  'power': 380},
 '202109227674550': {'id': '202109227674550',
  'name': 'Porsche 911',
  'year': '1999',
  'mileage': 101000,
  'price': 15995,
  'advert_subtitle': '3.4 996 Carrera 4 Tiptronic S AWD 2dr',
  'body_type': 'Coupe',
  'engine_size': 3.4,
  'gearbox': 'Automatic',
  'fuel_type': 'Petrol',
  'doors': 2,
  '0-6Xmph': 5.2,
  'top_speed': 175,
  'cylinders': 6,
  'power': 300},
 '202108136210396': {'id': '202108136210396',
  'name': 'Porsche 911',
  'year': '2000',
  'mileage': 104825,
  'price': 15995,
  'advert_subtitle': 'CARRERA 4 TIP S 3.4 2dr',
  'body_type': 'Coupe',
  'engine_size': 3.4,
  'gearbox': 'Automatic',
  'fuel_type': 'Petrol',
