# Vehicle Data Scraping

Source: https://pythoninoffice.com/get-table-data-from-web-page-using-python-pandas/
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.notna.html#pandas.DataFrame.notna  
        https://www.fueleconomy.gov/feg/PowerSearch

In [1]:
import pandas as pd
import numpy as np
import re
import bs4 as bs
import requests

fueleconomy.gov has tabulated data for all kinds of automobiles. Let's use the `read_html` function in `pandas` to scrape the website for this data. Notice these URLs are quite long. This is because I have several filters from the powersearch feature on the website activated to exclude luxury vehicle brands and focus on vehicle types that are in the UIUC fleet.

In [382]:
diesel_url = 'https://www.fueleconomy.gov/feg/PowerSearch.do?action=PowerSearch&year1=2018&year2=2021&cbmkAcura=Acura&cbmkBYD=BYD&cbmkChevrolet=Chevrolet&cbmkChrysler=Chrysler&cbmkDodge=Dodge&cbmkFiat=Fiat&cbmkFord=Ford&cbmkGMC=GMC&cbmkHonda=Honda&cbmkHyundai=Hyundai&cbmkJeep=Jeep&cbmkKia=Kia&cbmkMazda=Mazda&cbmkMitsubishi=Mitsubishi&cbmkNissan=Nissan&cbmkRam=Ram&cbmkSubaru=Subaru&cbmkToyota=Toyota&cbmkVolkswagen=Volkswagen&cbmkVolvo=Volvo&cbmcfamilySedans=Family+Sedans&cbmclargeSedans=Large+Sedans&cbmcpickupTrucks=Pickup+Trucks&cbmcsportUtilityVehicles=Sport+Utility+Vehicles&cbmcminivans=Minivans&cbmcvans=Vans&minmsrpsel=0&maxmsrpsel=0&city=0&highway=0&combined=0&cbftdiesel=Diesel&YearSel=2018-2021&MakeSel=Acura%3B+BYD%3B+Chevrolet%3B+Chrysler%3B+Dodge%3B+Fiat%3B+Ford%3B+GMC%3B+Honda%3B+Hyundai%3B+Jeep%3B+Kia%3B+Mazda%3B+Mitsubishi%3B+Nissan%3B+Ram%3B+Subaru%3B+Toyota%3B+Volkswagen%3B+Volvo&MarClassSel=Family+Sedans%2C+Large+Sedans%2C+Pickup+Trucks%2C+Sport+Utility+Vehicles%2C+Minivans%2C+Vans&FuelTypeSel=Diesel&VehTypeSel=&TranySel=&DriveTypeSel=&CylindersSel=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=200'
electric_url = 'https://www.fueleconomy.gov/feg/PowerSearch.do?action=PowerSearch&year1=2018&year2=2021&cbmkAcura=Acura&cbmkBYD=BYD&cbmkChevrolet=Chevrolet&cbmkChrysler=Chrysler&cbmkDodge=Dodge&cbmkFiat=Fiat&cbmkFord=Ford&cbmkGMC=GMC&cbmkHonda=Honda&cbmkHyundai=Hyundai&cbmkJeep=Jeep&cbmkKia=Kia&cbmkMazda=Mazda&cbmkMitsubishi=Mitsubishi&cbmkNissan=Nissan&cbmkRam=Ram&cbmkSubaru=Subaru&cbmkToyota=Toyota&cbmkVolkswagen=Volkswagen&cbmkVolvo=Volvo&cbmcfamilySedans=Family+Sedans&cbmclargeSedans=Large+Sedans&cbmcpickupTrucks=Pickup+Trucks&cbmcsportUtilityVehicles=Sport+Utility+Vehicles&cbmcminivans=Minivans&cbmcvans=Vans&minmsrpsel=0&maxmsrpsel=0&city=0&highway=0&combined=0&cbvtelectric=Electric&YearSel=2018-2021&MakeSel=Acura%3B+BYD%3B+Chevrolet%3B+Chrysler%3B+Dodge%3B+Fiat%3B+Ford%3B+GMC%3B+Honda%3B+Hyundai%3B+Jeep%3B+Kia%3B+Mazda%3B+Mitsubishi%3B+Nissan%3B+Ram%3B+Subaru%3B+Toyota%3B+Volkswagen%3B+Volvo&MarClassSel=Family+Sedans%2C+Large+Sedans%2C+Pickup+Trucks%2C+Sport+Utility+Vehicles%2C+Minivans%2C+Vans&FuelTypeSel=&VehTypeSel=Electric&TranySel=&DriveTypeSel=&CylindersSel=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=200'
e85_url = 'https://www.fueleconomy.gov/feg/PowerSearch.do?action=PowerSearch&year1=2018&year2=2021&cbmkAcura=Acura&cbmkBYD=BYD&cbmkChevrolet=Chevrolet&cbmkChrysler=Chrysler&cbmkDodge=Dodge&cbmkFiat=Fiat&cbmkFord=Ford&cbmkGMC=GMC&cbmkHonda=Honda&cbmkHyundai=Hyundai&cbmkJeep=Jeep&cbmkKia=Kia&cbmkMazda=Mazda&cbmkMitsubishi=Mitsubishi&cbmkNissan=Nissan&cbmkRam=Ram&cbmkSubaru=Subaru&cbmkToyota=Toyota&cbmkVolkswagen=Volkswagen&cbmkVolvo=Volvo&cbmcfamilySedans=Family+Sedans&cbmclargeSedans=Large+Sedans&cbmcpickupTrucks=Pickup+Trucks&cbmcsportUtilityVehicles=Sport+Utility+Vehicles&cbmcminivans=Minivans&cbmcvans=Vans&minmsrpsel=0&maxmsrpsel=0&city=0&highway=0&combined=0&cbfte85=E85&YearSel=2018-2021&MakeSel=Acura%3B+BYD%3B+Chevrolet%3B+Chrysler%3B+Dodge%3B+Fiat%3B+Ford%3B+GMC%3B+Honda%3B+Hyundai%3B+Jeep%3B+Kia%3B+Mazda%3B+Mitsubishi%3B+Nissan%3B+Ram%3B+Subaru%3B+Toyota%3B+Volkswagen%3B+Volvo&MarClassSel=Family+Sedans%2C+Large+Sedans%2C+Pickup+Trucks%2C+Sport+Utility+Vehicles%2C+Minivans%2C+Vans&FuelTypeSel=E85&VehTypeSel=&TranySel=&DriveTypeSel=&CylindersSel=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=200'
#gasoline_url = 'https://www.fueleconomy.gov/feg/PowerSearch.do?action=PowerSearch&year1=2018&year2=2021&cbmkAcura=Acura&cbmkBYD=BYD&cbmkChevrolet=Chevrolet&cbmkChrysler=Chrysler&cbmkDodge=Dodge&cbmkFiat=Fiat&cbmkFord=Ford&cbmkGMC=GMC&cbmkHonda=Honda&cbmkHyundai=Hyundai&cbmkJeep=Jeep&cbmkKia=Kia&cbmkMazda=Mazda&cbmkMitsubishi=Mitsubishi&cbmkNissan=Nissan&cbmkRam=Ram&cbmkSubaru=Subaru&cbmkToyota=Toyota&cbmkVolkswagen=Volkswagen&cbmkVolvo=Volvo&cbmcfamilySedans=Family+Sedans&cbmclargeSedans=Large+Sedans&cbmcpickupTrucks=Pickup+Trucks&cbmcsportUtilityVehicles=Sport+Utility+Vehicles&cbmcminivans=Minivans&cbmcvans=Vans&minmsrpsel=0&maxmsrpsel=0&city=0&highway=0&combined=0&cbvtgasoline=Gasoline&YearSel=2018-2021&MakeSel=Acura%3B+BYD%3B+Chevrolet%3B+Chrysler%3B+Dodge%3B+Fiat%3B+Ford%3B+GMC%3B+Honda%3B+Hyundai%3B+Jeep%3B+Kia%3B+Mazda%3B+Mitsubishi%3B+Nissan%3B+Ram%3B+Subaru%3B+Toyota%3B+Volkswagen%3B+Volvo&MarClassSel=Family+Sedans%2C+Large+Sedans%2C+Pickup+Trucks%2C+Sport+Utility+Vehicles%2C+Minivans%2C+Vans&FuelTypeSel=&VehTypeSel=Gasoline&TranySel=&DriveTypeSel=&CylindersSel=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=1453'
gasoline_url = 'https://www.fueleconomy.gov/feg/PowerSearch.do?action=noform&year1=2018&year2=2021&cbmkAcura=Acura&cbmkBYD=BYD&cbmkChevrolet=Chevrolet&cbmkChrysler=Chrysler&cbmkDodge=Dodge&cbmkFiat=Fiat&cbmkFord=Ford&cbmkGMC=GMC&cbmkHonda=Honda&cbmkHyundai=Hyundai&cbmkJeep=Jeep&cbmkKia=Kia&cbmkMazda=Mazda&cbmkMitsubishi=Mitsubishi&cbmkNissan=Nissan&cbmkRam=Ram&cbmkSubaru=Subaru&cbmkToyota=Toyota&cbmkVolkswagen=Volkswagen&cbmkVolvo=Volvo&cbmcfamilySedans=Family+Sedans&cbmclargeSedans=Large+Sedans&cbmcpickupTrucks=Pickup+Trucks&cbmcsportUtilityVehicles=Sport+Utility+Vehicles&cbmcminivans=Minivans&cbmcvans=Vans&minmsrpsel=0&maxmsrpsel=0&city=0&hwy=0&comb=0&cbvtgasoline=Gasoline&YearSel=2018-2021&make=Acura%3B+BYD%3B+Chevrolet%3B+Chrysler%3B+Dodge%3B+Fiat%3B+Ford%3B+GMC%3B+Honda%3B+Hyundai%3B+Jeep%3B+Kia%3B+Mazda%3B+Mitsubishi%3B+Nissan%3B+Ram%3B+Subaru%3B+Toyota%3B+Volkswagen%3B+Volvo&mclass=Family+Sedans%2C+Large+Sedans%2C+Pickup+Trucks%2C+Sport+Utility+Vehicles%2C+Minivans%2C+Vans&vfuel=&vtype=Gasoline&trany=&drive=&cyl=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=1453&pageno=1&tabView=0&tabView=0'

In [7]:
diesel = pd.read_html(diesel_url)[0]
electric = pd.read_html(electric_url)[0]
e85 = pd.read_html(e85_url)[0]
gasoline = pd.read_html(gasoline_url)[0]

The web-scraped data has all kinds of formatting issues in pandas, so we will need to clean it up a bit. I wrote the following functions to do exactly this.

In [8]:
def convert_to_string (arr):
    """
    This function converts object arrays to string arrays
    
    Parameters:
    -----------
    arr : array of objects
    
    Returns:
    --------
    str_arr : float
            Array of strings
            
    """
    str_arr = np.array([])
    for a in arr:
        str_arr = np.append(str_arr, str(a))
    return str_arr


def split_dash (s):
    """
    This function splits a string along any dashes it might have.
    
    Parameters:
    -----------
    s : string
    
    Returns:
    --------
    s_arr : strings
            Array of strings
    """
    return re.split('[–]', s)


def split_slash (s):
    """
    This function splits a string along any forward slashes it might have.
    
    Parameters:
    -----------
    s : string
    
    Returns:
    --------
    s_arr : strings
            Array of strings
    """
    return re.split('[/]', s)

def split_gal (s):
    """
    This function splits a string along 'gal' substrings
    
    Parameters:
    -----------
    s : string
    
    Returns:
    --------
    s_arr : strings
            Array of strings
    """
    return re.split('[gal]', s)

In [404]:
def get_avg_msrp (df):
    """
    This function takes the data table fron fueleconomy.gov
    and returns the average of the MSRP values listed there.
    
    Parameters:
    -----------
    df : pandas dataframe
        THe return value of pd.read_html()
    
    Returns:
    --------
    avg : float
        Average of the MSRP data
        
    """
    
    # Get rid of NaN MSRP values
    df = df[df['MSRP'].notna()]
    
    # Pick out MSRP entries that have a dollar sign
    df = df[df['MSRP'].str.contains('\$')]
    
    # Convert these to string form
    msrp_string = convert_to_string(df['MSRP'].values)
    
    # Many of the MSRPs are a given as a range between a lower and upper limit
    # of values. We want to take the average of this range.
    msrp_string = map(split_dash, msrp_string)
    msrp_num = np.array([])
    for s in msrp_string:
        if (len(s) == 2):
            s[0] = float(s[0][1:].replace(',',''))
            s[1] = float(s[1][1:].replace(',',''))
            s = (s[0] + s[1]) / 2
        else:
            s = float(s[0][1:].replace (',',''))
            
        msrp_num = np.append(msrp_num, s)
            
    return np.average(msrp_num)
    

In [10]:
def get_avg_mileage (df):
    """
    This function takes the data table fron fueleconomy.gov
    and returns the average of the mileage (gal/100mi)
    listed there.
    
    Parameters:
    -----------
    df : pandas dataframe
        The return value of pd.read_html()
    
    Returns:
    --------
    avg : float
        Average of the mileage data
        
    """
    
    # Get rid of NaN mileage values
    df = df[df['DriverMPG'].notna()]
    
    # Pick out mileage entries in the gal/100 mi formal
    df = df[df['DriverMPG'].str.contains('/100')]
    
    # Convert these to string form
    mileage_string = convert_to_string(df['DriverMPG'].values)
    
    mileage_num = np.array([])
    for s in mileage_string:
        mileage_num = np.append(mileage_num, float(re.split('[ gal]', s)[0]))
            
    return np.average(mileage_num)

In [406]:
def get_vehicle_params (url):
    """
    This function takes the link from a fueleconomy.gov
    powersearch and returns an average of various parameters
    about the search. Currently, the function only returns
    average MSRP and average fuel tank size.
    
    Parameters:
    -----------
    url : url string
        A url from a fueleconomy.gov powersearch
        
    Returns:
    --------
    avg_msrp : float
    
    avg_ft_size: float
    """
    
    # a helper function we pass to the find_all procedure in BeautifulSoup
    def testfunc (href):
        return href and re.compile("action=sbs").search(href)
    
    slug = 'https://www.fueleconomy.gov/feg/'
    
    # where we will get all the juicy goodness
    search_page = bs.BeautifulSoup(requests.get(url).text, 'html.parser')
    
    ## Vehicle parameters
    # MSRP
    ms = search_page.find_all('td', class_='msrp')
    # Link to detailed vehicle page
    #inks =  #find all the entires in the table that have links to the car page
    
    tanks = np.array([])
    msrps = np.array([])
    i = 0
    for l in search_page.table.find_all(href=testfunc):
        #Check if there is an MSRP value
        msrp = re.split('[\W]+[\W]+', ms[i].text)
        if (len(msrp) > 2):
            # Parse and average the MSRP value if given a range
            if (len(msrp) == 3):
                msrp = float(msrp[1].replace(',',''))
            else:
                s = float(msrp[1].replace(',',''))
                s += float(msrp[2].replace(',','')) 
                msrp = s/2

            tank_size = 0
        
            #putting this on one line to save space in memory
            gal = bs.BeautifulSoup(requests.get(slug + l['href']).text, 'html.parser').find('th', string = re.compile('Tank Size')).next_sibling.string
            #gal = car_page

            #Get the tank size and parse if given a range
            gal_string = re.split('[-]', re.split(' ', gal)[0])
            
            #Average the tank value if given a range
            if (len(gal_string) == 2):
                tank_size = (float(gal_string[0]) + float(gal_string[1])) / 2
            elif (gal_string[0] != ''):
                tank_size = float(gal_string[0])
            #print(tank_size)
            if(tank_size != 0):
                tanks = np.append(tanks, tank_size)
                msrps = np.append(msrps, msrp)
        i += 1
    return np.average(tanks), np.average(msrps)

With our functions defined and ready to go, we just need to call them with the relevant vehicle data:

In [396]:
diesel_params = get_vehicle_params(diesel_url)
e85_params = get_vehicle_params(e85_url)
gasoline_params = get_vehicle_params(gasoline_url)

# MSRP

In [405]:
diesel_msrp = 1e-6 * diesel_params[1]#get_avg_msrp(diesel)
electric_msrp = 1e-6 * get_avg_msrp(electric)
e85_msrp = 1e-6 * e85_params[1]#get_avg_msrp(e85)
gasoline_msrp = 1e-6 * gasoline_params[1]#get_avg_msrp(gasoline)

print(f'Diesel: {diesel_msrp} M$')
print(f'Electric: {electric_msrp} M$')
print(f'E85: {e85_msrp} M$')
print(f'Gasoline: {gasoline_msrp} M$')

Diesel: 0.03759027272727273 M$
Electric: 0.04111 M$
E85: 0.03795976744186046 M$
Gasoline: 0.027040851612903222 M$


# Mileage

In [407]:
diesel_mileage = get_avg_mileage(diesel)
e85_mileage = get_avg_mileage(e85) #gallons per 100 miles
gasoline_mileage = get_avg_mileage(e85) #gallons per 100 miles

print(f'Diesel: {diesel_mileage}gal/100mi')
print(f'E85: {e85_mileage}gal/100mi')
print(f'Gasoline: {gasoline_mileage}gal/100mi')

Diesel: 4.1329268292682935gal/100mi
E85: 6.300387596899225gal/100mi
Gasoline: 6.300387596899225gal/100mi


# Tank Size

In [398]:
diesel_tank_size = diesel_params[0]
e85_tank_size = e85_params[0]
gasoline_tank_size = gasoline_params[0]

print(f'Diesel: {diesel_tank_size} gallons')
print(f'E85: {e85_tank_size} gallons')
print(f'Gasoline: {gasoline_tank_size} gallons')

Diesel: 21.46181818181818 gallons
E85: 25.537209302325586 gallons
Gasoline: 14.965161290322582 gallons


# CostInvest

In [400]:
IMPGSL = 0.00222 #M$/kgal
IMPDSL = 0.00248 #M$/kgal
IMPE85 = 0.00199 #M$/kgal
lifetime_mileage = 179954 #[1]

diesel_cost = diesel_msrp / diesel_tank_size #(diesel_mileage*lifetime_mileage/100)
e85_cost = e85_msrp / e85_tank_size #(e85_mileage*lifetime_mileage/100)
gasoline_cost = gasoline_msrp / gasoline_tank_size #(gasoline_mileage*lifetime_mileage/100)

print(f'Diesel :{diesel_cost} M$/gal')
print(f'E85: {e85_cost} M$/gal')
print(f'Gasoline: {gasoline_cost} M$/gal')

Diesel :0.0017514952558454764 M$/gal
E85: 0.0014864493215554135 M$/gal
Gasoline: 0.0018069201586480425 M$/gal


# CostFixed

In [403]:
#assume fixed costs are 5% of MSRP annually
diesel_fixed = 0.05 * diesel_msrp
electric_fixed = 0.05 * electric_msrp
e85_fixed = 0.05 * e85_msrp
gasoline_fixed = 0.05 * gasoline_msrp

print(f'Diesel: {diesel_fixed} M$')
print(f'Electric: {electric_fixed} M$')
print(f'E85: {e85_fixed} M$')
print(f'Gasoline: {gasoline_fixed} M$')

Diesel: 0.0018795136363636365 M$
Electric: 0.0020555 M$
E85: 0.0018979883720930233 M$
Gasoline: 0.0013520425806451612 M$


## References

1. US DOT. Vehicle Survivability and Travel Mileage Schedules. Technical Report. National Center for Statistics and Analysis. 2006. [link](https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/809952)

In [376]:
f = requests.get(gasoline_url).text
slug = 'https://www.fueleconomy.gov/feg/'
search_page = bs.BeautifulSoup(f, 'html.parser')
msrps = search_page.find_all('td', class_='msrp')
links = search_page.table.find_all(href=testfunc)
i = 2
link = requests.get(slug + links[i]['href']).text
msrp = msrps[i]
car_page = bs.BeautifulSoup(link, 'html.parser')
gal = car_page.find('th', string = re.compile('Tank Size')).next_sibling.string
gal_string = re.split('[-]', re.split(' ', gal)[0])
if (len(gal_string) == 2):
    tank_size = (float(gal_string[0]) + float(gal_string[1])) / 2
elif (gal_string[0] != ''):
    tank_size = float(gal_string[0])
g

False

In [379]:
re.split('[\W]+[\W]+', msrp.text)

['', '23,645', '']