# Vehicle Data Scraping

Sources:
- https://pythoninoffice.com/get-table-data-from-web-page-using-python-pandas/
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.notna.html#pandas.DataFrame.notna
- https://www.fueleconomy.gov/feg/PowerSearch

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def convert_to_string (arr):
    """
    Convert every element of an iterable to its ``str()`` representation.

    Parameters:
    -----------
    arr : iterable of objects
        Values to convert, e.g. the ``.values`` of an object-dtype column.

    Returns:
    --------
    str_arr : numpy.ndarray
            One-dimensional array of strings, one entry per element of
            ``arr`` and in the same order. An empty input yields an empty
            string array.

    """
    # Build the whole list first and convert once: np.append returns a new
    # copy of the array on every call, which made the old loop quadratic.
    return np.array([str(a) for a in arr], dtype=str)

def split_dash (s):
    """
    Split a string at every en dash character.

    Parameters:
    -----------
    s : str
        Text to split, e.g. a price range written as "low–high".

    Returns:
    --------
    pieces : list of str
        The substrings between en dashes; a string without an en dash
        comes back as a one-element list.

    """
    en_dash = re.compile('[–]')
    pieces = en_dash.split(s)
    return pieces

In [6]:
# Scrape fueleconomy.gov Power Search result pages. Every query string below
# asks for model years 2018-2021 (year1/year2) and sets rowLimit=200.
# pd.read_html returns a list of the tables found on the page; [0] keeps the
# first one.
# NOTE(review): these calls fetch from the live site when the cell runs, so
# the tables (and anything computed from them) presumably change over time.

# Diesel vehicles (VehTypeSel=Diesel), restricted to the makes and market
# classes spelled out in the query string.
diesel = pd.read_html('https://www.fueleconomy.gov/feg/PowerSearch.do?action=PowerSearch&year1=2018&year2=2021&cbmkAcura=Acura&cbmkBuick=Buick&cbmkChevrolet=Chevrolet&cbmkChrysler=Chrysler&cbmkDodge=Dodge&cbmkFord=Ford&cbmkGMC=GMC&cbmkHonda=Honda&cbmkHyundai=Hyundai&cbmkJeep=Jeep&cbmkKia=Kia&cbmkLexus=Lexus&cbmkMazda=Mazda&cbmkNissan=Nissan&cbmkRam=Ram&cbmkSubaru=Subaru&cbmkToyota=Toyota&cbmkVolkswagen=Volkswagen&cbmkVolvo=Volvo&cbmcfamilySedans=Family+Sedans&cbmclargeSedans=Large+Sedans&cbmcpickupTrucks=Pickup+Trucks&cbmcsportUtilityVehicles=Sport+Utility+Vehicles&cbmcminivans=Minivans&cbmcvans=Vans&minmsrpsel=0&maxmsrpsel=0&city=0&highway=0&combined=0&cbvtdiesel=Diesel&YearSel=2018-2021&MakeSel=Acura%3B+Buick%3B+Chevrolet%3B+Chrysler%3B+Dodge%3B+Ford%3B+GMC%3B+Honda%3B+Hyundai%3B+Jeep%3B+Kia%3B+Lexus%3B+Mazda%3B+Nissan%3B+Ram%3B+Subaru%3B+Toyota%3B+Volkswagen%3B+Volvo&MarClassSel=Family+Sedans%2C+Large+Sedans%2C+Pickup+Trucks%2C+Sport+Utility+Vehicles%2C+Minivans%2C+Vans&FuelTypeSel=&VehTypeSel=Diesel&TranySel=&DriveTypeSel=&CylindersSel=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=200')[0]
# Electric vehicles (vtype=Electric); this query leaves make and class empty.
electric = pd.read_html('https://www.fueleconomy.gov/feg/PowerSearch.do?action=noform&year1=2018&year2=2021&minmsrpsel=0&maxmsrpsel=0&city=0&hwy=0&comb=0&cbvtelectric=Electric&YearSel=2018-2021&make=&mclass=&vfuel=&vtype=Electric&trany=&drive=&cyl=&MpgSel=000&sortBy=&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=200&pageno=1&tabView=0#')[0]
# E85 vehicles (VehTypeSel=E85), with the same make and market-class
# parameters as the diesel query.
e85 = pd.read_html('https://www.fueleconomy.gov/feg/PowerSearch.do?action=PowerSearch&year1=2018&year2=2021&cbmkAcura=Acura&cbmkBuick=Buick&cbmkChevrolet=Chevrolet&cbmkChrysler=Chrysler&cbmkDodge=Dodge&cbmkFord=Ford&cbmkGMC=GMC&cbmkHonda=Honda&cbmkHyundai=Hyundai&cbmkJeep=Jeep&cbmkKia=Kia&cbmkLexus=Lexus&cbmkMazda=Mazda&cbmkNissan=Nissan&cbmkRam=Ram&cbmkSubaru=Subaru&cbmkToyota=Toyota&cbmkVolkswagen=Volkswagen&cbmkVolvo=Volvo&cbmcfamilySedans=Family+Sedans&cbmclargeSedans=Large+Sedans&cbmcpickupTrucks=Pickup+Trucks&cbmcsportUtilityVehicles=Sport+Utility+Vehicles&cbmcminivans=Minivans&cbmcvans=Vans&minmsrpsel=0&maxmsrpsel=0&city=0&highway=0&combined=0&cbvte85=E85&YearSel=2018-2021&MakeSel=Acura%3B+Buick%3B+Chevrolet%3B+Chrysler%3B+Dodge%3B+Ford%3B+GMC%3B+Honda%3B+Hyundai%3B+Jeep%3B+Kia%3B+Lexus%3B+Mazda%3B+Nissan%3B+Ram%3B+Subaru%3B+Toyota%3B+Volkswagen%3B+Volvo&MarClassSel=Family+Sedans%2C+Large+Sedans%2C+Pickup+Trucks%2C+Sport+Utility+Vehicles%2C+Minivans%2C+Vans&FuelTypeSel=&VehTypeSel=E85&TranySel=&DriveTypeSel=&CylindersSel=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact=&rowLimit=200')[0]

In [7]:
def get_avg_msrp (df):
    """
    Return the average of the MSRP values in a fueleconomy.gov data table.

    Each usable MSRP cell is either a single price ("$37,990") or a range
    written with an en dash ("$37,990–$45,000"). A range contributes the
    midpoint of its two ends; a single price contributes itself.

    Parameters:
    -----------
    df : pandas dataframe
        The return value of pd.read_html(); must have an 'MSRP' column.

    Returns:
    --------
    avg : float
        Average of the MSRP data, or NaN when no cell holds a dollar amount.

    Raises:
    -------
    ValueError
        If a cell containing '$' cannot be parsed as a number after the
        first character and the thousands separators are removed.

    """

    # Get rid of NaN MSRP values
    msrp = df['MSRP']
    msrp = msrp[msrp.notna()]

    # The data table is a bit wonky, so pick out the monetary cells.
    # Casting to str first keeps the mask purely boolean even if the column
    # holds non-string objects, and regex=False matches a literal '$'
    # (in a regex, '$' would be the end-of-string anchor).
    is_price = msrp.astype(str).str.contains('$', regex=False)

    # Convert these to string
    msrp_string = convert_to_string(msrp[is_price].values)

    # Many of the MSRPs are a range. We want to take the average of
    # this range.
    msrp_num = []
    for parts in map(split_dash, msrp_string):
        # [1:] drops the leading '$'; commas are thousands separators.
        low = float(parts[0][1:].replace(',', ''))
        if len(parts) == 2:
            high = float(parts[1][1:].replace(',', ''))
            msrp_num.append((low + high) / 2)
        else:
            msrp_num.append(low)

    # Nothing to average: return NaN directly instead of letting numpy
    # emit a "mean of empty slice" warning on the way to the same result.
    if not msrp_num:
        return np.float64('nan')

    return np.average(msrp_num)
    

In [14]:
# Average MSRP over the diesel search results.
get_avg_msrp(diesel)

37713.125

In [15]:
# Average MSRP over the electric search results.
get_avg_msrp(electric)

63840.41139240506

In [16]:
# Average MSRP over the E85 search results.
get_avg_msrp(e85)

43864.098837209305

In [17]:
# Show the raw electric table to inspect its layout.
electric

Unnamed: 0,Vehicle,EPA Fuel Economy,DriverMPG,Unnamed: 3,AnnualFuelCost,MSRP,EnergyImpactScore,GreenhouseGasEmissions(tailpipe)
0,2020 Tesla Model 3 Standard Range Plus Automat...,2020 Tesla Model 3 Standard Range Plus Automat...,2020 Tesla Model 3 Standard Range Plus Automat...,2020 Tesla Model 3 Standard Range Plus Automat...,2020 Tesla Model 3 Standard Range Plus Automat...,2020 Tesla Model 3 Standard Range Plus Automat...,2020 Tesla Model 3 Standard Range Plus Automat...,2020 Tesla Model 3 Standard Range Plus Automat...
1,,141 MPGe 148 132 combined city/hwy city ...,,,$450,"$37,990",0.1 barrels/yr,0 grams/mile
2,,141,MPGe,MPGe,,,,
3,,141,148,132,,,,
4,,combined city/hwy,city,hwy,,,,
...,...,...,...,...,...,...,...,...
733,,68 MPGe 67 68 combined city/hwy city hwy...,,,$950,"$185,000",0.3 barrels/yr,0 grams/mile
734,,68,MPGe,MPGe,,,,
735,,68,67,68,,,,
736,,combined city/hwy,city,hwy,,,,
