# Browser Automation

In [1]:
import glob
import pandas as pd
from lxml import html
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

In [2]:
pathCSV = "mmy.csv"
recallDf = pd.read_csv(pathCSV)
recallDf['Found'] = False
recallDf['numVehs'] = 0
recallDf['numRecalls'] = 0
print(recallDf)


      Make       Model  Year  Found  numVehs  numRecalls
0    Acura         RDX  2020  False        0           0
1     Audi          A3  2006  False        0           0
2     Audi          A6  2020  False        0           0
3     Audi          Q5  2018  False        0           0
4      BMW  128i Coupe  2009  False        0           0
..     ...         ...   ...    ...      ...         ...
234  Volvo         S40  2005  False        0           0
235  Volvo         S60  2006  False        0           0
236  Volvo         S60  2011  False        0           0
237  Volvo        XC60  2010  False        0           0
238  Volvo        XC90  2007  False        0           0

[239 rows x 6 columns]


In [3]:
def singleVehSaver(idx, url, folderName="singleVehHTMLs", mod_numVeh=True):
    recall_name = "_".join(url.split("#")[0].split("/")[4:])
    print(f"Accessing website: {url}")
    browser.get(url)
    timeout = 10
    try:
        page_present = EC.presence_of_element_located((By.ID, 'recalls'))
        WebDriverWait(browser, timeout).until(page_present)
        print ("Webpage is loaded")                                
    except TimeoutException:
        print ("Timed out waiting for page to load.")
        return 0    
    if mod_numVeh:
        recallDf.at[idx, 'numVehs'] = 1 
    htmlstring = browser.page_source
    with open(f'./{folderName}/{idx}_{recall_name}.html', "w") as file:
        file.write(htmlstring)
    return 1


In [4]:
def singleVehParser(htmlPath, recallDf):
    with open(htmlPath, "r") as file:
        tree = html.fromstring(file.read())
    idx = int(htmlPath.split("/")[-1].split("_")[0])
    recallstrList = tree.xpath('//*[@id="recalls"]//text()')
    if len(recallstrList)>0:
        recallDf.at[idx, 'Found'] = True
        recallDf.at[idx, 'numRecalls'] = int(recallstrList[1][0])
        


In [5]:
def multiVehSaver(idx, recall, recallDf):
    recall_name = f"{idx}_{recall.Year}_{recall.Make}_{recall.Model}]"
    url = f"https://www.nhtsa.gov/vehicle/{recall.Year}/{recall.Make}/{recall.Model}"
    print(f"Accessing website: {url}")
    browser.get(url)
    timeout = 15
    try:
        page_present = EC.presence_of_element_located((By.ID, 'block-nhtsa-content'))
        WebDriverWait(browser, timeout).until(page_present)
                                                       
    except TimeoutException:
        print ("Timed out waiting for page to load.")
        
    htmlstring = browser.page_source
    
    nameStr = browser.find_element("xpath", '//*[@id="block-nhtsa-content"]/div[4]').text
    safetyLoc = [i for i,x in enumerate(nameStr.split("\n")) if x=="OVERALL SAFETY RATING"]
    recallDf.at[idx, 'numVehs'] = len(safetyLoc)

    print(f"There are {len(safetyLoc)} types of vehicles")
    for s_idx in safetyLoc:
        name1 = nameStr.split("\n")[s_idx - 1].split(" ")[0] + "/" + " ".join(nameStr.split("\n")[s_idx - 1].split(" ")[1:])
        name2 = " ".join(nameStr.split("\n")[s_idx - 1].split(" ")[:-1]) + "/" + nameStr.split("\n")[s_idx - 1].split(" ")[-1] 
        print(name1)
        print(name2)
        url_new1 = f"{url}/{name1}#recalls"
        url_new2 = f"{url}/{name2}#recalls"
        singleVehSaver(idx, url_new1, folderName="multiVehHTMLs", mod_numVeh=False)
        singleVehSaver(idx, url_new2, folderName="multiVehHTMLs", mod_numVeh=False)


In [6]:
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
browser=webdriver.Chrome(options=chrome_options)
for idx, recall in recallDf.head(3).iterrows():
    url = f"https://www.nhtsa.gov/vehicle/{recall.Year}/{recall.Make}/{recall.Model}#recalls"
    singleVehSaver(idx, url)


Accessing website: https://www.nhtsa.gov/vehicle/2020/Acura/RDX#recalls
Timed out waiting for page to load.
Accessing website: https://www.nhtsa.gov/vehicle/2006/Audi/A3#recalls
Webpage is loaded
Accessing website: https://www.nhtsa.gov/vehicle/2020/Audi/A6#recalls
Timed out waiting for page to load.


In [7]:
htmlFiles = sorted(glob.glob('./singleVehHTMLs/*.html'))
for name in htmlFiles:
    singleVehParser(name, recallDf)
print(recallDf.loc[recallDf["Found"]])

      Make     Model  Year  Found  numVehs  numRecalls
1     Audi        A3  2006   True        1           4
9      BMW        X3  2004   True        0           2
10     BMW        i3  2016   True        0           2
13   Buick  Envision  2017   True        0           4
14   Buick  LaCrosse  2010   True        0           2
..     ...       ...   ...    ...      ...         ...
234  Volvo       S40  2005   True        0           3
235  Volvo       S60  2006   True        0           4
236  Volvo       S60  2011   True        0           6
237  Volvo      XC60  2010   True        0           7
238  Volvo      XC90  2007   True        0           2

[107 rows x 6 columns]


In [None]:
otherDf = recallDf.loc[recallDf["Found"]==False]
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)

for idx, recall in otherDf.head(20).iterrows():
    browser=webdriver.Chrome(options=chrome_options)
    multiVehSaver(idx, recall, recallDf)
    browser.close()

Accessing website: https://www.nhtsa.gov/vehicle/2020/Acura/RDX
There are 2 types of vehicles
SUV/FWD
SUV/FWD
Accessing website: https://www.nhtsa.gov/vehicle/2020/Acura/RDX/SUV/FWD#recalls
Webpage is loaded
Accessing website: https://www.nhtsa.gov/vehicle/2020/Acura/RDX/SUV/FWD#recalls
Webpage is loaded
SUV/AWD
SUV/AWD
Accessing website: https://www.nhtsa.gov/vehicle/2020/Acura/RDX/SUV/AWD#recalls
Webpage is loaded
Accessing website: https://www.nhtsa.gov/vehicle/2020/Acura/RDX/SUV/AWD#recalls
Webpage is loaded
Accessing website: https://www.nhtsa.gov/vehicle/2020/Audi/A6
There are 2 types of vehicles
4/DR FWD
4 DR/FWD
Accessing website: https://www.nhtsa.gov/vehicle/2020/Audi/A6/4/DR FWD#recalls
Timed out waiting for page to load.
Accessing website: https://www.nhtsa.gov/vehicle/2020/Audi/A6/4 DR/FWD#recalls
Webpage is loaded
4/DR AWD
4 DR/AWD
Accessing website: https://www.nhtsa.gov/vehicle/2020/Audi/A6/4/DR AWD#recalls
Timed out waiting for page to load.
Accessing website: https://

In [None]:
htmlFiles = sorted(glob.glob('./multiVehHTMLs/*.html'))
for name in htmlFiles:
    singleVehParser(name, recallDf)

In [None]:
print(recallDf.loc[recallDf["Found"]])