# Import Libraries

In [13]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from login_info import cr_username
from login_info import cr_password
import time

# Obtain Data

Create Selenium webdriver

In [14]:
driver_path = 'chromedriver/chromedriver'
options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=driver_path)

Log into Consumer Reports website

In [15]:
driver.get('https://www.consumerreports.org/cro/modal-login/index.htm')
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[1]/input').send_keys(cr_username)
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[2]/input').send_keys(cr_password)
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[6]/input').click()

Scrape Consume Reports vehicle classees

In [16]:
driver.get('https://www.consumerreports.org/cars/types/used/convertibles')
classes_html = driver.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div').get_attribute('innerHTML')
vehicle_class_soup = BeautifulSoup(classes_html, 'html.parser')
vehicle_classes=['Convertibles']
for element in vehicle_class_soup.find_all('li'):
    vehicle_classes.append(element.text.replace('\n', ''))
vehicle_classes

['Convertibles',
 'Hybrids/EVs',
 'Luxury Cars',
 'Minivans',
 'Pickup Trucks',
 'Sedans',
 'Small Cars',
 'Sports Cars',
 'SUVs',
 'Wagons']

Create list of vehicle class urls

In [17]:
vehicle_class_urls=[]
for vehicle_class in vehicle_classes:
    formatted_vehicle_class = vehicle_class.replace('/', '-').replace(' ', '-').lower()
    vehicle_class_urls.append(f'https://www.consumerreports.org/cars/types/used/{formatted_vehicle_class}')
vehicle_class_urls

['https://www.consumerreports.org/cars/types/used/convertibles',
 'https://www.consumerreports.org/cars/types/used/hybrids-evs',
 'https://www.consumerreports.org/cars/types/used/luxury-cars',
 'https://www.consumerreports.org/cars/types/used/minivans',
 'https://www.consumerreports.org/cars/types/used/pickup-trucks',
 'https://www.consumerreports.org/cars/types/used/sedans',
 'https://www.consumerreports.org/cars/types/used/small-cars',
 'https://www.consumerreports.org/cars/types/used/sports-cars',
 'https://www.consumerreports.org/cars/types/used/suvs',
 'https://www.consumerreports.org/cars/types/used/wagons']

Get html for each vehicle class url with list of vehicles for that class

In [18]:
soups = []
for url in vehicle_class_urls:
    driver.get(url)
    time.sleep(1)
    table_html = driver.find_element_by_xpath('/html/body/div[4]/div/div[2]/div/div[2]/table/tbody').get_attribute('innerHTML')
    soups.append(BeautifulSoup(table_html,'html.parser'))

Create list of all Consumer Reports rated used vehicles from vehicle class urls html

In [19]:
cars_list = []
for soup in soups:
    for item in soup.find_all('tr'):
        if len(item.find_all('td', {'rowspan': '4'}))>0:
            model = item.find('td').find('div').find('div').find('div').find('a').find('span').text
            brand = item.find('td').find('div').find('div').find('div').find('a').text.replace(model, '')
            cars_list.append(brand+':'+model)

cars_list = list(set(cars_list))
cars_list[0:10]

['Chevrolet:Silverado 1500',
 'Land Rover:Range Rover Velar',
 'Toyota:Prius C',
 'Mercedes-Benz:B-Class Electric Drive',
 'Chrysler:Pacifica',
 'Audi:A7',
 'Porsche:Panamera',
 'Cadillac:CT4',
 'Honda:Insight',
 'Ram:1500']

Create list of urls to scrape from cars list

In [20]:
reviews_urls_list = []
for car in cars_list:
    car = car.replace(' ', '-').lower()
    car_brand = car.split(':')[0]
    car_model = car.split(':')[1]
    reviews_urls_list.append(f'https://www.consumerreports.org/cars/{car_brand}/{car_model}/2019/overview/')
reviews_urls_list[0:10]

['https://www.consumerreports.org/cars/chevrolet/silverado-1500/2019/overview/',
 'https://www.consumerreports.org/cars/land-rover/range-rover-velar/2019/overview/',
 'https://www.consumerreports.org/cars/toyota/prius-c/2019/overview/',
 'https://www.consumerreports.org/cars/mercedes-benz/b-class-electric-drive/2019/overview/',
 'https://www.consumerreports.org/cars/chrysler/pacifica/2019/overview/',
 'https://www.consumerreports.org/cars/audi/a7/2019/overview/',
 'https://www.consumerreports.org/cars/porsche/panamera/2019/overview/',
 'https://www.consumerreports.org/cars/cadillac/ct4/2019/overview/',
 'https://www.consumerreports.org/cars/honda/insight/2019/overview/',
 'https://www.consumerreports.org/cars/ram/1500/2019/overview/']

Get html for each car model's review

In [25]:
reviews_soups = []
for review_url in reviews_urls_list:
    driver.get(review_url)
    soup_html = driver.page_source
    reviews_soups.append(BeautifulSoup(soup_html,'html.parser'))