In [1]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.19.0-py3-none-any.whl (10.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.25.0-py3-none-any.whl (467 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.2/467.2 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?

In [2]:
import selenium
from selenium.webdriver.chrome.options import Options
from PIL import Image
from selenium.webdriver.common.by import By
import csv
import time
from bs4 import BeautifulSoup
import requests
import re
from concurrent.futures import ThreadPoolExecutor

In [37]:
column_names = ["title", "year_brand_model", "mileage", "buyer_rating", "num_reviews", "percent_recommend", "review_breakdown", "basic_description_keys", "basic_description_values", "history_description_keys", "history_description_values", "price"]

with open("/content/drive/MyDrive/scraped_used_cars_data.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(column_names)

In [3]:
def get_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = selenium.webdriver.Chrome(options=options)
    return driver

### Test webdriver

In [15]:
url = "https://www.cars.com/"

test_driver = get_driver()
test_driver.get(url)
print("Website title: ", test_driver.title)

test_driver.save_screenshot("website_page.png")
screenshot = Image.open("website_page.png")
screenshot.show()

Website title:  New Cars, Used Cars, Car Dealers, Prices & Reviews | Cars.com


### Function for getting the url for each page

In [4]:
def get_item_urls_on_page(page_url, driver):
    driver.get(page_url)
    time.sleep(1)

    links_on_page = driver.execute_script("""
        let data_list = document.querySelectorAll(".vehicle-card-link")
        let links = []
            for (let i = 0; i < data_list.length; i++){
                let link = data_list[i].href
                links.push(link)
            }
        return links
    """)

    return links_on_page

In [5]:
def get_url_of_page(page_number):
    page = str(page_number)
    left_url_part = "https://www.cars.com/shopping/results/?city_name=New+York%2C+NY&city_slug[]=new_york-ny&page="
    right_url_part = "&page_size=100&stock_type=used&zip=11221"
    search_url = left_url_part + page + right_url_part
    return search_url

In [6]:
get_url_of_page(3)

'https://www.cars.com/shopping/results/?city_name=New+York%2C+NY&city_slug[]=new_york-ny&page=3&page_size=100&stock_type=used&zip=11221'

In [7]:
def get_urls_by_page(page_num, driver):
    # get the page url
    page_url = get_url_of_page(page_num)

    # get all data links on the page
    item_urls = get_item_urls_on_page(page_url, driver)

    return item_urls

In [8]:
main_driver = get_driver()
item_urls = get_urls_by_page(1, main_driver)
print("Number of urls:", len(item_urls))
print(item_urls)

Number of urls: 98
['https://www.cars.com/vehicledetail/24634e62-6c36-4289-8dfe-030c2dc65bb2/?attribution_type=isa', 'https://www.cars.com/vehicledetail/4363a3e9-5cc1-4f91-9a3e-10fe3a589f32/', 'https://www.cars.com/vehicledetail/33ddfdc6-9c5b-4bb6-8332-b12028474673/', 'https://www.cars.com/vehicledetail/d91e8a5e-0ade-408d-b638-43a46373f6db/', 'https://www.cars.com/vehicledetail/4dc3b61c-b202-40ea-baed-1d997244b20c/', 'https://www.cars.com/vehicledetail/4395ced9-0953-4f43-8a61-84a7e7d1f797/', 'https://www.cars.com/vehicledetail/75ae9203-f8a3-4064-a2f7-c2793c2dfc67/', 'https://www.cars.com/vehicledetail/d605bb8c-0f34-4927-948c-48cc73c59ea9/', 'https://www.cars.com/vehicledetail/14e55309-a608-44a1-a864-b93680aa5e66/', 'https://www.cars.com/vehicledetail/abb7b84b-ff77-496c-b60c-33c80b474b52/', 'https://www.cars.com/vehicledetail/300085c3-f82f-4255-a4a5-0ca83d06127a/', 'https://www.cars.com/vehicledetail/f29b1a97-2c90-4e71-b6fb-03b46802d98a/', 'https://www.cars.com/vehicledetail/ebe9b135-16

In [9]:
def get_relevant_data(page_html):

    title = page_html.css.select(".listing-title")
    title = title[0].text if len(title) > 0 else ""

    year_brand_model = page_html.css.select(".consumer-reviews-subheading")
    year_brand_model = year_brand_model[0].text if len(year_brand_model) > 0 else ""
    year_brand_model = re.sub("See[^>]+the|\n", " ", year_brand_model).strip()

    mileage = page_html.css.select(".listing-mileage")
    mileage = mileage[0].text if len(mileage) > 0 else ""

    price = page_html.css.select(".primary-price")
    price = price[0].text if len(price) > 0 else ""

    buyer_rating = page_html.css.select(".sds-rating--big .sds-rating__count")
    buyer_rating = buyer_rating[0].text if len(buyer_rating) > 0 else ""

    num_reviews = page_html.css.select(".sds-rating--big .sds-rating__link")
    num_reviews = num_reviews[0].text if len(num_reviews) > 0 else ""

    percent_recommend = page_html.css.select(".reviews-recommended")
    percent_recommend = percent_recommend[0].text if len(percent_recommend) > 0 else ""

    review_breakdown = page_html.css.select(".review-breakdown--list")
    review_breakdown = review_breakdown[0].text if len(review_breakdown) > 0 else ""
    review_breakdown = re.sub("\n", " ", review_breakdown).strip()

    description = page_html.css.select(".fancy-description-list")
    if len(description) > 0:
        basic_description_keys = description[0].find_all("dt")
        basic_description_keys = [html_tag.text for html_tag in basic_description_keys]

        basic_description_values = description[0].find_all("dd")
        basic_description_values = [re.sub("\n", " ", html_tag.text).strip() for html_tag in basic_description_values]
        basic_description_values[3] = basic_description_values[3][:5]
    else:
        basic_description_keys = ""
        basic_description_values = ""

    if len(description) > 2:
        history_description_keys = description[2].find_all("dt")
        history_description_keys = [html_tag.text for html_tag in history_description_keys]

        history_description_values = description[2].find_all("dd")
        history_description_values = [html_tag.text for html_tag in history_description_values]
    else:
        history_description_keys = ""
        history_description_values = ""

    all_features = [title, year_brand_model, mileage, buyer_rating, num_reviews, percent_recommend, review_breakdown, basic_description_keys, basic_description_values, history_description_keys, history_description_values, price]

    return all_features

In [10]:
def extract_data_from_url(item_url):
    response = requests.get(item_url)
    page_html = BeautifulSoup(response.content, "html.parser")
    data = get_relevant_data(page_html)

    with open("/content/drive/MyDrive/scraped_used_cars_data.csv", "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(data)

In [11]:
def get_data_on_page(page_num):
    all_data_on_page = []
    main_driver = get_driver()
    item_urls = get_urls_by_page(page_num, main_driver)

    with ThreadPoolExecutor(max_workers=len(item_urls)) as executor:
        executor.map(extract_data_from_url, item_urls)

In [12]:
def get_all_data(starting_page, ending_page):

    for page_num in range(starting_page, ending_page + 1):
        get_data_on_page(page_num)

        print("data points written on page:", page_num)

In [52]:
get_all_data(1, 1)

data points written on page: 1


In [53]:
get_all_data(2, 10)

data points written on page: 2
data points written on page: 3
data points written on page: 4
data points written on page: 5
data points written on page: 6
data points written on page: 7
data points written on page: 8
data points written on page: 9
data points written on page: 10


In [54]:
get_all_data(11, 100)

data points written on page: 11
data points written on page: 12
data points written on page: 13
data points written on page: 14
data points written on page: 15
data points written on page: 16
data points written on page: 17
data points written on page: 18
data points written on page: 19
data points written on page: 20
data points written on page: 21
data points written on page: 22
data points written on page: 23
data points written on page: 24
data points written on page: 25
data points written on page: 26
data points written on page: 27
data points written on page: 28
data points written on page: 29
data points written on page: 30
data points written on page: 31
data points written on page: 32
data points written on page: 33
data points written on page: 34
data points written on page: 35
data points written on page: 36
data points written on page: 37
data points written on page: 38
data points written on page: 39
data points written on page: 40
data points written on page: 41
data poi

In [13]:
get_all_data(101, 110)

data points written on page: 101
data points written on page: 102
data points written on page: 103
data points written on page: 104
data points written on page: 105
data points written on page: 106
data points written on page: 107
data points written on page: 108
data points written on page: 109
data points written on page: 110


In [15]:
get_all_data(111, 120)

data points written on page: 111
data points written on page: 112
data points written on page: 113
data points written on page: 114
data points written on page: 115
data points written on page: 116
data points written on page: 117
data points written on page: 118
data points written on page: 119
data points written on page: 120


In [16]:
get_all_data(121, 130)

data points written on page: 121
data points written on page: 122
data points written on page: 123
data points written on page: 124
data points written on page: 125
data points written on page: 126
data points written on page: 127
data points written on page: 128
data points written on page: 129
data points written on page: 130


In [17]:
get_all_data(131, 140)

data points written on page: 131
data points written on page: 132
data points written on page: 133
data points written on page: 134
data points written on page: 135
data points written on page: 136
data points written on page: 137
data points written on page: 138
data points written on page: 139
data points written on page: 140


In [18]:
get_all_data(141, 150)

data points written on page: 141
data points written on page: 142
data points written on page: 143
data points written on page: 144
data points written on page: 145
data points written on page: 146
data points written on page: 147
data points written on page: 148
data points written on page: 149
data points written on page: 150


In [None]:
get_all_data(151, 200)

In [14]:
import pandas as pd

pd.read_csv("/content/drive/MyDrive/scraped_used_cars_data.csv")

Unnamed: 0,title,year_brand_model,mileage,buyer_rating,num_reviews,percent_recommend,review_breakdown,basic_description_keys,basic_description_values,history_description_keys,history_description_values,price
0,1997 Chevrolet S-10 LS Extended Cab,1997 Chevrolet S-10.,"195,000 mi.",4.9,(8 reviews),100% of drivers recommend this car,Comfort 4.4 Interior 3.8 Performance 4.7 ...,"['Exterior color', 'Interior color', 'Drivetra...","['Red', 'Gray', 'Rear-wheel Drive', 'Gasol', '...",,,"$3,999"
1,2021 Ford Expedition Max Limited,,"61,277 mi.",,,,,"['Exterior color', 'Interior color', 'Drivetra...","['Gray', 'Ebony', 'Four-wheel Drive', '16–21',...","['Accidents or damage', '1-owner vehicle', 'Pe...","['At least 1 accident or damage reported', 'Ye...","$37,450"
2,2021 Lamborghini Huracan EVO Base,2021 Lamborghini Huracan EVO.,"8,970 mi.",5.0,(1 review),100% of drivers recommend this car,Comfort 4.0 Interior 5.0 Performance 5.0 ...,"['Exterior color', 'Interior color', 'Drivetra...","['Arancio Borealis Pearl', 'Nero Ade', 'Rear-w...","['Accidents or damage', 'Clean title', '1-owne...","['None reported', 'Yes', 'Yes', 'Yes']","$275,995"
3,2014 Subaru XV Crosstrek Hybrid 2.0i Hybrid,2014 Subaru XV Crosstrek Hybrid.,"117,260 mi.",4.4,(11 reviews),81% of drivers recommend this car,Comfort 4.3 Interior 4.3 Performance 4.1 ...,"['Exterior color', 'Interior color', 'Drivetra...","['Plasma Green Pearl', 'Black', 'All-wheel Dri...","['Accidents or damage', 'Clean title', '1-owne...","['At least 1 accident or damage reported', 'Ye...","$9,499"
4,2015 Scion tC Release Series 9.0,2015 Scion tC.,"135,098 mi.",4.5,(23 reviews),91% of drivers recommend this car,Comfort 4.3 Interior 4.3 Performance 4.4 ...,"['Exterior color', 'Interior color', 'Drivetra...","['Magma', 'Dark Charcoal', 'Front-wheel Drive'...","['Accidents or damage', 'Clean title', '1-owne...","['At least 1 accident or damage reported', 'Ye...","$8,995"
...,...,...,...,...,...,...,...,...,...,...,...,...
10649,2023 Mercedes-Benz CLA 250 Base 4MATIC,,"6,602 mi.",,,,,"['Exterior color', 'Interior color', 'Drivetra...","['Mountain Grey Metallic', 'Black', 'All-wheel...","['Accidents or damage', '1-owner vehicle', 'Pe...","['None reported', 'Yes', 'Yes']","$39,295"
10650,2021 Hyundai Palisade Limited,2021 Hyundai Palisade.,"34,628 mi.",4.7,(95 reviews),91% of drivers recommend this car,Comfort 4.9 Interior 4.9 Performance 4.8 ...,"['Exterior color', 'Interior color', 'Drivetra...","['Becketts Black', 'Beige', 'All-wheel Drive',...","['Accidents or damage', '1-owner vehicle', 'Pe...","['At least 1 accident or damage reported', 'Ye...","$34,960"
10651,2008 Mercury Milan V6 Premier,2008 Mercury Milan.,"121,212 mi.",4.6,(30 reviews),96% of drivers recommend this car,Comfort 4.8 Interior 4.6 Performance 4.5 ...,"['Exterior color', 'Interior color', 'Drivetra...","['Dune Pearl Clearcoat Metallic', 'Camel', 'Fr...","['Accidents or damage', '1-owner vehicle', 'Pe...","['None reported', 'Yes', 'Yes']","$5,900"
10652,2023 BMW X1 xDrive28i,2023 BMW X1.,"2,827 mi.",2.8,(5 reviews),40% of drivers recommend this car,Comfort 4.2 Interior 3.8 Performance 3.6 ...,"['Exterior color', 'Interior color', 'Drivetra...","['Orange Metallic', 'Black', 'All-wheel Drive'...","['Accidents or damage', '1-owner vehicle', 'Pe...","['None reported', 'Yes', 'No', 'At least 1 ope...","$41,897"
