# Import Libraries

In [61]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from login_info import cr_username
from login_info import cr_password
import time

# Obtain Data

Create Selenium webdriver

In [62]:
driver_path = 'chromedriver/chromedriver'
options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=driver_path)

Log into Consumer Reports website

In [63]:
driver.get('https://www.consumerreports.org/cro/modal-login/index.htm')
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[1]/input').send_keys(cr_username)
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[2]/input').send_keys(cr_password)
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[6]/input').click()

Scrape Consume Reports vehicle classees

In [64]:
driver.get('https://www.consumerreports.org/cars/types/used/convertibles')
classes_html = driver.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div').get_attribute('innerHTML')
vehicle_class_soup = BeautifulSoup(classes_html, 'html.parser')
vehicle_classes=['Convertibles']
for element in vehicle_class_soup.find_all('li'):
    vehicle_classes.append(element.text.replace('\n', ''))
vehicle_classes

['Convertibles',
 'Hybrids/EVs',
 'Luxury Cars',
 'Minivans',
 'Pickup Trucks',
 'Sedans',
 'Small Cars',
 'Sports Cars',
 'SUVs',
 'Wagons']

Create list of vehicle class urls

In [65]:
vehicle_class_urls=[]
for vehicle_class in vehicle_classes:
    formatted_vehicle_class = vehicle_class.replace('/', '-').replace(' ', '-').lower()
    vehicle_class_urls.append(f'https://www.consumerreports.org/cars/types/used/{formatted_vehicle_class}')
vehicle_class_urls

['https://www.consumerreports.org/cars/types/used/convertibles',
 'https://www.consumerreports.org/cars/types/used/hybrids-evs',
 'https://www.consumerreports.org/cars/types/used/luxury-cars',
 'https://www.consumerreports.org/cars/types/used/minivans',
 'https://www.consumerreports.org/cars/types/used/pickup-trucks',
 'https://www.consumerreports.org/cars/types/used/sedans',
 'https://www.consumerreports.org/cars/types/used/small-cars',
 'https://www.consumerreports.org/cars/types/used/sports-cars',
 'https://www.consumerreports.org/cars/types/used/suvs',
 'https://www.consumerreports.org/cars/types/used/wagons']

Get html for each vehicle class url with list of vehicles for that class

In [66]:
soups = []
for url in vehicle_class_urls:
    driver.get(url)
    time.sleep(1)
    table_html = driver.find_element_by_xpath('/html/body/div[4]/div/div[2]/div/div[2]/table/tbody').get_attribute('innerHTML')
    soups.append(BeautifulSoup(table_html,'html.parser'))

Create list of all Consumer Reports rated used vehicles from vehicle class urls html

In [67]:
cars_list = []
for soup in soups:
    for item in soup.find_all('tr'):
        if len(item.find_all('td', {'rowspan': '4'}))>0:
            model = item.find('td').find('div').find('div').find('div').find('a').find('span').text
            brand = item.find('td').find('div').find('div').find('div').find('a').text.replace(model, '')
            cars_list.append(brand+':'+model)

cars_list = list(set(cars_list))
cars_list[0:10]

['Mercedes-Benz:B-Class Electric Drive',
 'Volvo:XC70',
 'Scion:tC',
 'Volkswagen:Atlas',
 'Toyota:Prius V',
 'Kia:Forte',
 'Jeep:Grand Cherokee L',
 'Mercury:Villager',
 'Chevrolet:TrailBlazer',
 'Toyota:Camry Solara']

Create list of urls to scrape from cars list

In [68]:
reviews_urls_list = []
for car in cars_list:
    car = car.replace(' ', '-').lower()
    car_brand = car.split(':')[0]
    car_model = car.split(':')[1]
    reviews_urls_list.append(f'https://www.consumerreports.org/cars/{car_brand}/{car_model}/2019/overview/')
reviews_urls_list[0:10]

['https://www.consumerreports.org/cars/mercedes-benz/b-class-electric-drive/2019/overview/',
 'https://www.consumerreports.org/cars/volvo/xc70/2019/overview/',
 'https://www.consumerreports.org/cars/scion/tc/2019/overview/',
 'https://www.consumerreports.org/cars/volkswagen/atlas/2019/overview/',
 'https://www.consumerreports.org/cars/toyota/prius-v/2019/overview/',
 'https://www.consumerreports.org/cars/kia/forte/2019/overview/',
 'https://www.consumerreports.org/cars/jeep/grand-cherokee-l/2019/overview/',
 'https://www.consumerreports.org/cars/mercury/villager/2019/overview/',
 'https://www.consumerreports.org/cars/chevrolet/trailblazer/2019/overview/',
 'https://www.consumerreports.org/cars/toyota/camry-solara/2019/overview/']

Write code to retrieve datapoints from overview review pages

In [108]:
driver.get(reviews_urls_list[3])

In [109]:
data = []
data.append([driver.find_element_by_xpath('/html/body/div[6]/div/div/div/div/div/div[1]/div[1]/a/div[1]').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div/div/div/div/div[1]/div[1]/a/div[2]/div/div/div[2]/span[1]').text.strip()])
for index in [2, 7]:
    data.append([driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/div[2]/div/div[{index}]/div[1]/span').text,
           driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/div[2]/div/div[{index}]/div[2]/div/div/div/div/div[2]/span/span').text])
for index in [2, 4, 6, 8, 10, 12, ]:
    data.append([driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div/div[{index}]/div[1]/span').text,
           driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div/div[{index}]/div[2]/div/div/div/div/div[2]/span/span').text])
data.append([driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[3]/div[2]/div/div[2]/div[1]/span').text,
        driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[3]/div[2]/div/div[2]/div[2]/div/div/div/div/div[2]/span/span').text])
data

[['Reliability', '2'],
 ['Acceleration', '3'],
 ['Braking', '4'],
 ['Ride', '4'],
 ['Noise', '4'],
 ['Front Seat Comfort', '4'],
 ['Rear Seat Comfort', '5'],
 ['Interior Fit and Finish', '4'],
 ['Trunk/Cargo Area', '4'],
 ['Fuel Economy', '2']]

Write code to retrieve datapoints from ratings/specs review pages

In [124]:
driver.get(reviews_urls_list[3].replace('overview/', 'ratings-specs/?pagestop'))
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/h3/div[2]').click()
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[7]/h3/div[2]').click()

In [127]:
data_2 = []
data_2.append([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[3]/div[1]/div').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[3]/div[2]/div/div/div[2]/span[1]').text.strip()])
data_2.append([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[7]/div[1]/div').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[7]/div[2]/div/div/div[2]/span[1]').text.strip()])
data_2.append([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[3]/div[1]/div').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[4]/div[2]/div/div/div[2]/span[1]').text.strip()])
data_2.append([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[6]/div[1]/div').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[6]/div[2]/div/div/div[2]/span[1]').text.strip()])
data_2.append([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[10]/div[1]/div').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[10]/div[2]/div/div/div[2]/span[1]').text.strip()])
data_2.append([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[7]/div/div/div[7]/div[1]/div').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[7]/div/div/div[7]/div[2]/div').text.strip()])

data_2

[['Routine Handling', '4'],
 ['Headlights', '3'],
 ['Driving Position', '4'],
 ['Third Seat Comfort', '3'],
 ['Usability', '4'],
 ['Max. Load', '1215 lb.']]