Process for scraping used car listings for an analysis of value over time.

In [None]:
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.set_option('display.max_rows', 500)

In [None]:
# function for scraping listings from website #1
def web_scrape_1(URL_base_1, timeout=30):
    """Scrape every results page of website #1 into a dict of raw listings.

    Pages are requested as ``URL_base_1 + str(page_num)`` starting at page 1,
    with a two-second pause before each request. Scraping stops at the first
    page whose parsed document is empty.

    Args:
        URL_base_1: Listing URL up to (but not including) the page number.
        timeout: Seconds to wait for each HTTP response; without a timeout
            a stalled connection would block the scrape forever.

    Returns:
        dict mapping a listing counter (1, 2, ...) to a dict holding
        ``'model'`` and, when found in the listing, ``'price'``, ``'specs'``
        and ``'descrip'`` (all raw text). A listing without a model heading
        is reported and skipped, so its counter value is absent from the
        result.
    """
    listing_dic = {}  # dictionary for storing scraped listings
    entries_completed = 0  # entry counter
    page_num = 0  # listings page counter

    # (output key, tag name, CSS class) of the fields a listing may lack
    optional_fields = (
        ('price', 'h4', "price"),
        ('specs', 'ul', "specs"),
        ('descrip', 'div', "description"),
    )

    while True:
        time.sleep(2)   # prevents server issues

        page_num += 1
        URL = URL_base_1 + str(page_num)

        page = requests.get(URL, timeout=timeout)
        soup = BeautifulSoup(page.content, 'html.parser')

        # stops function at last page
        if len(soup) == 0:
            print('DONE: page', page_num, 'does not exist :-(')
            break

        # parsing each listing
        for listing in soup.find_all('div', class_="box tight listing premium"):
            entries_completed += 1

            model = listing.find('h3', class_="model")
            if model is None:
                # skip only this listing instead of abandoning the whole page
                print("ERROR: IN #", entries_completed)
                continue

            internal_dic = {'model': model.text}
            for field, tag, css_class in optional_fields:
                node = listing.find(tag, class_=css_class)
                if node is None:
                    print("no", field, "found for #", entries_completed)
                else:
                    internal_dic[field] = node.text

            listing_dic[entries_completed] = internal_dic

        print("COMPLETED: page #", page_num)

    print("COMPLETED: WEB SCRAPE 1")
    return listing_dic

In [None]:
# parse miles from messy 'specs' output
def parse_miles(listing_dic):
    """Add a cleaned ``'mileage'`` string to every listing that mentions miles.

    The mileage is taken from the second-to-last line of ``'specs'`` when that
    line mentions "miles"; otherwise from the text just before the first
    "miles" in ``'descrip'``. Listings without ``'specs'``, or with no mention
    of miles in either place, are left untouched.

    Values with three or fewer digits, or more than six, get a
    ``'-suspicious'`` suffix and are printed for manual review.

    Args:
        listing_dic: dict of listing dicts as produced by the scraper.

    Returns:
        The same ``listing_dic`` object, modified in place.
    """
    for key, entry in listing_dic.items():
        raw = _raw_miles_text(entry)
        if raw is None:
            continue

        cleaned = _clean_miles(raw)
        if cleaned.endswith('-suspicious'):
            print(key, ':', cleaned)

        entry['mileage'] = cleaned

    print('COMPLETED: MILES PARSED')
    return listing_dic


# words that surround the number in mileage text and carry no information
_MILES_FILLER_WORDS = ('original', 'less than', 'actual', 'only', 'documented')


def _raw_miles_text(entry):
    """Return the text fragment of a listing that states its mileage, or None."""
    specs = entry.get('specs')
    if not isinstance(specs, str):
        return None

    spec_lines = specs.split('\n')
    if len(spec_lines) >= 2 and 'miles' in spec_lines[-2].lower():
        return spec_lines[-2]

    descrip = entry.get('descrip')
    if not isinstance(descrip, str):
        return None

    descrip = descrip.lower()
    pos = descrip.find('miles')
    if pos == -1:
        return None

    # keep the 16 characters before "miles"; clamp at 0 so a mention near the
    # start of the description is not turned into a from-the-end slice
    return descrip[max(pos - 16, 0):pos + 5]


def _clean_miles(raw):
    """Reduce a raw mileage fragment to its digits, flagging implausible lengths."""
    text = raw.lower()
    for word in _MILES_FILLER_WORDS:
        text = text.replace(word, '')

    # expand a thousands suffix ("45k" -> "45000") without touching a "k"
    # that is part of an ordinary word
    text = re.sub(r'(\d)\s*k\b', r'\g<1>000', text)

    digits = ''.join(ch for ch in text if ch in '0123456789')

    # flag suspiciously low (<= 3 digits) or high (> 6 digits) mileage
    if len(digits) <= 3 or len(digits) > 6:
        digits += '-suspicious'
    return digits

In [None]:
def parse_color(listing_dic):
    """Add ``'ext_color'`` to each listing from the fourth line of its specs.

    The fourth line of ``'specs'`` is taken as the exterior color unless it is
    empty or contains a digit; a digit means an engine description was
    mislabeled as the color. Listings without ``'specs'`` or with fewer than
    four spec lines are left untouched.

    Args:
        listing_dic: dict of listing dicts as produced by the scraper.

    Returns:
        The same ``listing_dic`` object, modified in place.
    """
    digits = set('0123456789')

    for entry in listing_dic.values():
        specs = entry.get('specs')
        if not isinstance(specs, str):
            continue

        spec_lines = specs.split('\n')
        if len(spec_lines) < 4:
            continue

        color = spec_lines[3]

        # skip empty lines and listings where engine is mislabeled as color
        if color and not any(ch in digits for ch in color):
            entry['ext_color'] = color

    print('COMPLETED: COLORS PARSED')
    return listing_dic

In [None]:
def create_at_df(listing_dic):
    """Turn the website #1 listing dict into a DataFrame, one row per listing.

    ``listing_dic`` maps listing number -> field dict; the frame is built with
    the listing numbers as columns and then flipped so they become the index.
    """
    fields_by_listing = pd.DataFrame.from_dict(listing_dic)
    listings_as_rows = fields_by_listing.T

    print('COMPLETED: DF AT CREATED')
    return listings_as_rows

In [None]:
#cleaning specific to this web source
def basic_cleaning_at(at):
    """Clean the website #1 DataFrame and derive year, model name and trim.

    Steps:
      * strip the text ``'New'`` from ``model``, then split it into ``year``
        (first word) and ``model_name`` (remaining words);
      * remove ``$``/commas/whitespace from ``price`` and turn the
        ``'For Auction'`` / ``'Call for Price'`` placeholders into NaN;
      * build ``trim`` as the lower-cased ``model_name`` without digits;
      * remove newlines from ``descrip``;
      * drop the ``model`` and ``specs`` columns.

    ``price``, ``descrip`` and ``specs`` are optional: the scraper omits a
    field it could not find, so a column is absent when no listing had it.

    Args:
        at: DataFrame with one row per listing and at least a ``model``
            column. The new/cleaned columns are also written to this frame.

    Returns:
        The cleaned DataFrame without ``model`` and ``specs``. An empty frame
        is returned unchanged.
    """
    if at.empty:
        # nothing was scraped; there are no columns to clean
        print('COMPLETED: AT BASIC CLEANING')
        return at

    #Cleaning model col
    # NOTE(review): this removes 'New' anywhere in the text, including inside
    # model names such as "Newport" -- confirm that is intended.
    at['model'] = at['model'].str.replace('New', '', regex=False)

    #creating year and model_name columns
    # .str indexing yields NaN for empty/missing models instead of raising
    model_words = at['model'].str.split()
    at['year'] = model_words.str[0]
    at['model_name'] = model_words.str[1:].str.join(' ')

    #simplifying price
    if 'price' in at.columns:
        # regex=False: '$' must be a literal character, not an end-of-string anchor
        price = at['price'].str.replace('$', '', regex=False)
        price = price.str.replace(',', '', regex=False).str.strip()
        at['price'] = price.replace(['For Auction', 'Call for Price'], np.nan)

    #creating trim
    trim = at['model_name'].str.lower()
    trim = trim.str.replace('[0-9]', '', regex=True)
    at['trim'] = trim.str.strip()

    #clean descrip
    if 'descrip' in at.columns:
        at['descrip'] = at['descrip'].str.replace('\n', '', regex=False)

    #drop unneeded cols
    at = at.drop(columns=['model', 'specs'], errors='ignore')

    print('COMPLETED: AT BASIC CLEANING')
    return at

In [None]:
def web_scrape_2(URL_base_2, max_pages=101, timeout=30):
    """Scrape the results pages of website #2 into a dict of raw listings.

    Pages are requested as
    ``URL_base_2 + str(page_num) + '&searchRadius=5000&sort[]=best_match'``
    starting at page 1, with a two-second pause after each page. Scraping
    stops after ``max_pages`` pages or at the first page containing the
    ``span.d-none.d-md-block`` element, which is treated as the marker of a
    page that does not exist.

    Args:
        URL_base_2: Search URL up to (but not including) the page number.
        max_pages: Upper bound on the number of pages requested.
        timeout: Seconds to wait for each HTTP response; without a timeout
            a stalled connection would block the scrape forever.

    Returns:
        dict mapping a listing counter (1, 2, ...) to a dict holding
        ``'model_year'`` and ``'make_model'`` and, when found in the listing,
        ``'trim'``, ``'price_label'``, ``'price'``, ``'mileage'``,
        ``'location'`` and ``'color'`` (all raw text). A listing without a
        year or make/model is reported and skipped, so its counter value is
        absent from the result.
    """
    listing_dic = {}  # dictionary for storing scraped listings
    entries_completed = 0  # entry counter
    page_num = 0  # listings page counter

    # (output key, tag name, CSS class) of the fields a listing may lack
    optional_fields = (
        ('trim', 'div', "font-size-1 text-truncate"),
        ('price_label', 'span', "graph-icon-title margin-left-1 vehicle-card-price-rating-label text-truncate font-weight-bold"),
        ('price', 'h4', 'heading-3 margin-y-1 font-weight-bold'),
        ('mileage', 'div', 'd-flex w-100 justify-content-between'),
        ('location', 'div', 'vehicle-card-location font-size-1 margin-top-1'),
        ('color', 'div', 'vehicle-card-location font-size-1 margin-top-1 text-truncate'),
    )

    while page_num < max_pages:
        page_num += 1

        URL = URL_base_2 + str(page_num) + '&searchRadius=5000&sort[]=best_match'

        page = requests.get(URL, timeout=timeout)
        soup = BeautifulSoup(page.content, 'html.parser')

        #finding last page
        if soup.find('span', class_="d-none d-md-block") is not None:
            print('DONE: page', page_num, 'does not exist :-(')
            break

        main = soup.find(class_="row row-2 margin-bottom-3")
        if main is None:
            # results container missing: report it and move on to the next page
            print("ERROR: IN #", entries_completed)
        else:
            for listing in main.find_all('div', class_="card-content vehicle-card-body"):
                entries_completed += 1

                year = listing.find('span', class_="vehicle-card-year font-size-1")
                make = listing.find('span', class_="vehicle-header-make-model text-truncate")
                if year is None or make is None:
                    # skip only this listing instead of abandoning the whole page
                    print("ERROR: IN #", entries_completed)
                    continue

                internal_dic = {'model_year': year.text, 'make_model': make.text}
                for field, tag, css_class in optional_fields:
                    node = listing.find(tag, class_=css_class)
                    if node is None:
                        print("no", field, "found for #", entries_completed)
                    else:
                        internal_dic[field] = node.text

                listing_dic[entries_completed] = internal_dic

        print("COMPLETED: page #", page_num)
        time.sleep(2)  # prevents server issues

    print("COMPLETED: WEB SCRAPE 2")
    return listing_dic

In [None]:
def create_tc_df(listing_dic_tc):
    """Turn the website #2 listing dict into a DataFrame, one row per listing.

    ``listing_dic_tc`` maps listing number -> field dict; the frame is built
    with the listing numbers as columns and then flipped so they become the
    index.
    """
    fields_by_listing = pd.DataFrame.from_dict(listing_dic_tc)
    listings_as_rows = fields_by_listing.T

    print('COMPLETED: DF TC CREATED')
    return listings_as_rows

In [None]:
def basic_cleaning_tc(tc):
    """Clean the website #2 DataFrame and split location and color.

    Steps:
      * remove ``$``/commas/whitespace from ``price``;
      * remove promotional text, the word ``miles``, commas and whitespace
        from ``mileage``;
      * take the part of ``location`` after its first ``-`` and split it on
        commas into ``city`` (first piece) and ``state`` (remaining pieces
        joined together);
      * split ``color`` on commas into ``ext_color`` / ``int_color`` with the
        words ``exterior`` / ``interior`` removed;
      * drop the ``location`` and ``color`` columns.

    Every input column is optional: the scraper omits a field it could not
    find, so a column is absent when no listing had it. Rows with a missing
    or malformed value get NaN in the derived columns.

    Args:
        tc: DataFrame with one row per listing. The new/cleaned columns are
            also written to this frame.

    Returns:
        The cleaned DataFrame without ``location`` and ``color``.
    """
    #cleaning price
    if 'price' in tc.columns:
        # regex=False: '$' must be a literal character, not an end-of-string anchor
        price = tc['price'].str.replace('$', '', regex=False)
        tc['price'] = price.str.replace(',', '', regex=False).str.strip()

    #cleaning mileage
    if 'mileage' in tc.columns:
        mileage = tc['mileage']
        for junk in ('Discount Available', 'Upfront Price Available', 'miles', ','):
            mileage = mileage.str.replace(junk, '', regex=False)
        tc['mileage'] = mileage.str.strip()

    #cleaning and separating location
    if 'location' in tc.columns:
        # split on the first '-' only so hyphenated city names stay intact;
        # astype(object) keeps .str usable even when every value is NaN
        place = tc['location'].str.split('-', n=1).str[1].astype(object)
        city_state = place.str.split(',')
        tc['city'] = city_state.str[0]
        tc['state'] = city_state.str[1:].str.join('')

    ### making ext_color and int_color ###
    if 'color' in tc.columns:
        colors = tc['color'].str.split(',')
        ext_color = colors.str[0].astype(object)
        int_color = colors.str[1].astype(object)  # NaN when only one color is listed
        tc['ext_color'] = ext_color.str.replace('exterior', '', regex=False)
        tc['int_color'] = int_color.str.replace('interior', '', regex=False)

    #drop redundant cols
    tc = tc.drop(columns=['location', 'color'], errors='ignore')

    print("COMPLETED: TC BASIC CLEANING")
    return tc

In [None]:
#run all functions
def scrape(URL_base_1, URL_base_2, save_path_1, save_path_2):
    """Scrape both websites for one car and write the results to CSV.

    Website #1 ("at") is scraped, parsed and cleaned, then saved to
    ``save_path_1``. Website #2 ("tc") is saved to ``save_path_2`` twice:
    first as the raw frame, then overwritten with the cleaned frame.
    """
    # website #1: scrape -> parse miles -> parse color -> frame -> clean
    at_listings = parse_color(parse_miles(web_scrape_1(URL_base_1)))
    at_frame = basic_cleaning_at(create_at_df(at_listings))

    # website #2: scrape -> frame (cleaning happens after the raw save)
    tc_raw = create_tc_df(web_scrape_2(URL_base_2))

    # save as csv
    at_frame.to_csv(save_path_1)
    tc_raw.to_csv(save_path_2)

    # the cleaned tc frame replaces the raw file written just above
    basic_cleaning_tc(tc_raw).to_csv(save_path_2)

    print("COMPLETED: AT AND TC SAVED AS CSV")

In [None]:
# Load the table of cars to scrape. The driver loop reads its 'Car',
# 'AT URL' and 'TC URL' columns.
models = pd.read_csv('/Users/avacheevers/Documents/Cars/both.csv')
# to_dict() with no arguments yields {column name: {row index: value}},
# which is why the loop indexes model_dic[column][key].
model_dic = models.to_dict()

In [None]:
# Scrape both websites for every car in the table; each car gets its own
# pair of CSV files named after it.
for key, car_name in model_dic['Car'].items():
    at_url = model_dic['AT URL'][key]
    tc_url = model_dic['TC URL'][key]
    csv_stem = '/Users/avacheevers/Documents/Cars/1_' + car_name

    scrape(at_url, tc_url, csv_stem + '_at.csv', csv_stem + '_tc.csv')
    print(key, "completed part 1")