# Import Libraries

In [24]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from login_info import cr_username
from login_info import cr_password
import time

# Obtain Data

Create Selenium webdriver

In [25]:
driver_path = 'chromedriver/chromedriver'
options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=driver_path)

Log into Consumer Reports website

In [26]:
driver.get('https://www.consumerreports.org/cro/modal-login/index.htm')
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[1]/input').send_keys(cr_username)
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[2]/input').send_keys(cr_password)
driver.find_element_by_xpath('/html/body/div/div/div/div/div[1]/form/div[6]/input').click()

Scrape Consume Reports vehicle classees

In [27]:
driver.get('https://www.consumerreports.org/cars/types/used/convertibles')
classes_html = driver.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/div').get_attribute('innerHTML')
vehicle_class_soup = BeautifulSoup(classes_html, 'html.parser')
vehicle_classes=['Convertibles']
for element in vehicle_class_soup.find_all('li'):
    vehicle_classes.append(element.text.replace('\n', ''))
vehicle_classes

['Convertibles',
 'Hybrids/EVs',
 'Luxury Cars',
 'Minivans',
 'Pickup Trucks',
 'Sedans',
 'Small Cars',
 'Sports Cars',
 'SUVs',
 'Wagons']

Create list of vehicle class urls

In [28]:
vehicle_class_urls=[]
for vehicle_class in vehicle_classes:
    formatted_vehicle_class = vehicle_class.replace('/', '-').replace(' ', '-').lower()
    vehicle_class_urls.append(f'https://www.consumerreports.org/cars/types/used/{formatted_vehicle_class}')
vehicle_class_urls

['https://www.consumerreports.org/cars/types/used/convertibles',
 'https://www.consumerreports.org/cars/types/used/hybrids-evs',
 'https://www.consumerreports.org/cars/types/used/luxury-cars',
 'https://www.consumerreports.org/cars/types/used/minivans',
 'https://www.consumerreports.org/cars/types/used/pickup-trucks',
 'https://www.consumerreports.org/cars/types/used/sedans',
 'https://www.consumerreports.org/cars/types/used/small-cars',
 'https://www.consumerreports.org/cars/types/used/sports-cars',
 'https://www.consumerreports.org/cars/types/used/suvs',
 'https://www.consumerreports.org/cars/types/used/wagons']

Get html for each vehicle class url with list of vehicles for that class

In [29]:
soups = []
for url in vehicle_class_urls:
    driver.get(url)
    time.sleep(1)
    table_html = driver.find_element_by_xpath('/html/body/div[4]/div/div[2]/div/div[2]/table/tbody').get_attribute('innerHTML')
    soups.append(BeautifulSoup(table_html,'html.parser'))

Create list of all Consumer Reports rated used vehicles from vehicle class urls html

In [30]:
cars_list = []
for soup in soups:
    for item in soup.find_all('tr'):
        if len(item.find_all('td', {'rowspan': '4'}))>0:
            model = item.find('td').find('div').find('div').find('div').find('a').find('span').text
            brand = item.find('td').find('div').find('div').find('div').find('a').text.replace(model, '')
            cars_list.append(brand+':'+model)

cars_list = list(set(cars_list))
cars_list[0:10]

['Infiniti:Q50',
 'Toyota:Tundra',
 'Ford:F-250',
 'Dodge:Durango',
 'Toyota:RAV4',
 'BMW:X5',
 'Honda:CR-Z',
 'Ram:2500',
 'Toyota:4Runner',
 'Chevrolet:Avalanche']

Create list of urls to scrape from cars list

In [31]:
reviews_urls_list = []
for car in cars_list:
    car = car.replace(' ', '-').lower()
    car_brand = car.split(':')[0]
    car_model = car.split(':')[1]
    reviews_urls_list.append(f'https://www.consumerreports.org/cars/{car_brand}/{car_model}/2019/overview/')
reviews_urls_list[0:10]

['https://www.consumerreports.org/cars/infiniti/q50/2019/overview/',
 'https://www.consumerreports.org/cars/toyota/tundra/2019/overview/',
 'https://www.consumerreports.org/cars/ford/f-250/2019/overview/',
 'https://www.consumerreports.org/cars/dodge/durango/2019/overview/',
 'https://www.consumerreports.org/cars/toyota/rav4/2019/overview/',
 'https://www.consumerreports.org/cars/bmw/x5/2019/overview/',
 'https://www.consumerreports.org/cars/honda/cr-z/2019/overview/',
 'https://www.consumerreports.org/cars/ram/2500/2019/overview/',
 'https://www.consumerreports.org/cars/toyota/4runner/2019/overview/',
 'https://www.consumerreports.org/cars/chevrolet/avalanche/2019/overview/']

Scrape data from review urls

In [32]:
missing_urls=[]
review_data = []
for review_url in reviews_urls_list[0:20]:
    driver.get(review_url)
    time.sleep(10)
    if driver.find_element_by_xpath('/html').text[0:14] == 'Buying Product':
        missing_urls.append(review_url)
        print('Bad: ', review_url)
    else:
        model_data = []
        model_data.append(review_url.split('/')[-5]+'-'+review_url.split('/')[-4])
        model_data.extend([driver.find_element_by_xpath('/html/body/div[6]/div/div/div/div/div/div[1]/div[1]/a/div[1]').text.title(),
            driver.find_element_by_xpath('/html/body/div[6]/div/div/div/div/div/div[1]/div[1]/a/div[2]/div/div/div[2]').text.strip()])
        for index in [2, 7]:
            model_data.extend([driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/div[2]/div/div[{index}]/div[1]').text,
               driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/div[2]/div/div[{index}]/div[2]/div/div/div/div/div[2]/span').text])
        for index in [2, 4, 6, 8, 10, 12, ]:
            model_data.extend([driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div/div[{index}]/div[1]').text,
               driver.find_element_by_xpath(f'/html/body/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div/div[{index}]/div[2]/div/div/div/div/div[2]/span').text])
        model_data.extend([driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[3]/div[2]/div/div[2]/div[1]').text,
            driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[3]/div[2]/div/div[2]/div[2]/div/div/div/div/div[2]/span').text])
        driver.get(review_url.replace('overview/', 'ratings-specs/?pagestop'))
        time.sleep(5)
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/h3/div[2]').click()
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[7]/h3/div[2]').click()
        model_data.extend([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[3]/div[1]/div').text.title(),
            driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[3]/div[2]/div/div/div[2]').text.strip()])
        model_data.extend([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[7]/div[1]/div').text.title(),
            driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[2]/div/div/div[7]/div[2]/div/div/div[2]').text.strip()])
        model_data.extend([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[3]/div[1]/div').text.title(),
            driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[4]/div[2]/div/div/div[2]').text.strip()])
        model_data.extend([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[6]/div[1]/div').text.title(),
            driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[6]/div[2]/div/div/div[2]').text.strip()])
        model_data.extend([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[10]/div[1]/div').text.title(),
            driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[3]/div/div/div[10]/div[2]/div/div/div[2]').text.strip()])
        model_data.extend([driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[7]/div/div/div[7]/div[1]/div').text.title(),
        driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div[7]/div/div/div[7]/div[2]/div').text.strip()])
        review_data.append(model_data)    
        print('Good: ', review_url)

Good:  https://www.consumerreports.org/cars/infiniti/q50/2019/overview/
Good:  https://www.consumerreports.org/cars/toyota/tundra/2019/overview/
Good:  https://www.consumerreports.org/cars/ford/f-250/2019/overview/
Good:  https://www.consumerreports.org/cars/dodge/durango/2019/overview/
Good:  https://www.consumerreports.org/cars/toyota/rav4/2019/overview/
Good:  https://www.consumerreports.org/cars/bmw/x5/2019/overview/
Bad:  https://www.consumerreports.org/cars/honda/cr-z/2019/overview/
Good:  https://www.consumerreports.org/cars/ram/2500/2019/overview/
Good:  https://www.consumerreports.org/cars/toyota/4runner/2019/overview/
Bad:  https://www.consumerreports.org/cars/chevrolet/avalanche/2019/overview/
Bad:  https://www.consumerreports.org/cars/cadillac/xlr/2019/overview/
Good:  https://www.consumerreports.org/cars/honda/accord/2019/overview/
Bad:  https://www.consumerreports.org/cars/scion/tc/2019/overview/
Good:  https://www.consumerreports.org/cars/infiniti/q70/2019/overview/
Good

View test scrape results

In [36]:
pd.DataFrame(review_data).iloc[:, 0:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,infiniti-q50,Reliability,5 / 5,Acceleration,5 / 5,Braking,5 / 5,Ride,4 / 5,Noise,4 / 5,Front Seat Comfort,4 / 5,Rear Seat Comfort,3 / 5
1,toyota-tundra,Reliability,4 / 5,Acceleration,4 / 5,Braking,1 / 5,Ride,2 / 5,Noise,4 / 5,Front Seat Comfort,4 / 5,Rear Seat Comfort,3 / 5
2,ford-f-250,Reliability,3 / 5,Acceleration,4 / 5,Braking,1 / 5,Ride,2 / 5,Noise,3 / 5,Front Seat Comfort,4 / 5,Rear Seat Comfort,5 / 5
3,dodge-durango,Reliability,,Acceleration,4 / 5,Braking,4 / 5,Ride,4 / 5,Noise,5 / 5,Front Seat Comfort,5 / 5,Rear Seat Comfort,4 / 5
4,toyota-rav4,Reliability,2 / 5,Acceleration,4 / 5,Braking,4 / 5,Ride,3 / 5,Noise,2 / 5,Front Seat Comfort,4 / 5,Rear Seat Comfort,4 / 5
5,bmw-x5,Reliability,3 / 5,Acceleration,5 / 5,Braking,5 / 5,Ride,4 / 5,Noise,5 / 5,Front Seat Comfort,5 / 5,Rear Seat Comfort,4 / 5
6,ram-2500,Reliability,3 / 5,Acceleration,,Braking,,Ride,,Noise,,Front Seat Comfort,,Rear Seat Comfort,
7,toyota-4runner,Reliability,4 / 5,Acceleration,4 / 5,Braking,4 / 5,Ride,2 / 5,Noise,4 / 5,Front Seat Comfort,4 / 5,Rear Seat Comfort,4 / 5
8,honda-accord,Reliability,4 / 5,Acceleration,4 / 5,Braking,4 / 5,Ride,4 / 5,Noise,4 / 5,Front Seat Comfort,4 / 5,Rear Seat Comfort,4 / 5
9,infiniti-q70,Reliability,,Acceleration,5 / 5,Braking,5 / 5,Ride,4 / 5,Noise,4 / 5,Front Seat Comfort,4 / 5,Rear Seat Comfort,5 / 5


In [37]:
pd.DataFrame(review_data).iloc[:, 15:]

Unnamed: 0,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
0,Interior Fit and Finish,5 / 5,Trunk/Cargo Area,2 / 5,Fuel Economy,3 / 5,Routine Handling,4 / 5,Headlights,3 / 5,Driving Position,4 / 5,Third Seat Comfort,,Usability,3 / 5,Max. Load,900 lb.
1,Interior Fit and Finish,3 / 5,Trunk/Cargo Area,5 / 5,Fuel Economy,1 / 5,Routine Handling,3 / 5,Headlights,4 / 5,Driving Position,4 / 5,Third Seat Comfort,,Usability,4 / 5,Max. Load,1395 lb.
2,Interior Fit and Finish,4 / 5,Trunk/Cargo Area,5 / 5,Fuel Economy,1 / 5,Routine Handling,1 / 5,Headlights,3 / 5,Driving Position,4 / 5,Third Seat Comfort,,Usability,4 / 5,Max. Load,2175 lb.
3,Interior Fit and Finish,4 / 5,Trunk/Cargo Area,4 / 5,Fuel Economy,2 / 5,Routine Handling,3 / 5,Headlights,3 / 5,Driving Position,5 / 5,Third Seat Comfort,3 / 5,Usability,5 / 5,Max. Load,1200 lb.
4,Interior Fit and Finish,3 / 5,Trunk/Cargo Area,2 / 5,Fuel Economy,4 / 5,Routine Handling,4 / 5,Headlights,3 / 5,Driving Position,4 / 5,Third Seat Comfort,,Usability,4 / 5,Max. Load,900 lb.
5,Interior Fit and Finish,5 / 5,Trunk/Cargo Area,3 / 5,Fuel Economy,3 / 5,Routine Handling,4 / 5,Headlights,2 / 5,Driving Position,5 / 5,Third Seat Comfort,,Usability,3 / 5,Max. Load,950 lb.
6,Interior Fit and Finish,,Trunk/Cargo Area,,Fuel Economy,,Routine Handling,,Headlights,,Driving Position,,Third Seat Comfort,,Usability,,Max. Load,1855 lb.
7,Interior Fit and Finish,3 / 5,Trunk/Cargo Area,4 / 5,Fuel Economy,2 / 5,Routine Handling,2 / 5,Headlights,4 / 5,Driving Position,4 / 5,Third Seat Comfort,,Usability,4 / 5,Max. Load,1155 lb.
8,Interior Fit and Finish,4 / 5,Trunk/Cargo Area,4 / 5,Fuel Economy,5 / 5,Routine Handling,4 / 5,Headlights,4 / 5,Driving Position,4 / 5,Third Seat Comfort,,Usability,4 / 5,Max. Load,850 lb.
9,Interior Fit and Finish,5 / 5,Trunk/Cargo Area,3 / 5,Fuel Economy,3 / 5,Routine Handling,4 / 5,Headlights,2 / 5,Driving Position,4 / 5,Third Seat Comfort,,Usability,4 / 5,Max. Load,860 lb.
