In [1]:
import requests, time
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
def fetch_page_and_convert_to_bs(url, delay=5, timeout=30):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    delay : int or float, optional
        Seconds to sleep after the download and before returning, so that
        consecutive calls do not hit the server in rapid succession.
        Defaults to 5.
    timeout : int or float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection, which
        would freeze the whole scrape. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.

    Notes
    -----
    The HTTP status code is not checked: the body of an error page is parsed
    and returned like any other page.
    """

    response = requests.get(url, timeout=timeout)
    soup = bs(response.text, 'html.parser')

    # Pause before handing the page back so that callers looping over many
    # URLs are throttled automatically

    time.sleep(delay)

    return soup

def get_vehicle_urls(max_pages=2, fetch_page=None):
    """Collect the listing URLs from eBay's "Cars & Trucks" result pages.

    Result pages are requested one after another, starting at page 1, until
    either ``max_pages`` pages have been collected or a page contains no
    listing links (``<a class="s-item__link">``).

    Parameters
    ----------
    max_pages : int, optional
        Maximum number of result pages to collect. Defaults to 2.
    fetch_page : callable, optional
        Function that takes a URL and returns a BeautifulSoup-like object
        offering ``find_all``. Defaults to ``fetch_page_and_convert_to_bs``;
        pass a stub to exercise this function without network access.

    Returns
    -------
    list of str
        The ``href`` of every listing link found, in page order. Empty when
        the first page has no listings or ``max_pages`` is less than 1.
    """

    if fetch_page is None:
        fetch_page = fetch_page_and_convert_to_bs

    # Storing the base url the data will be fetched from; the page number is
    # appended to it for every request

    base_url = 'https://www.ebay.com/b/Cars-Trucks/6001?UF_context=finderType%3AVEHICLE_FINDER&rt=nc&_from=R40&_pgn='

    vehicle_urls = []

    for page_number in range(1, max_pages + 1):

        soup = fetch_page(f'{base_url}{page_number}')

        # Find all the vehicles that exist on the page. find_all returns an
        # empty list when nothing matches, so no None check is needed.

        listing_links = soup.find_all('a', class_ = 's-item__link')

        # A page without listings means the results are exhausted

        if not listing_links:
            break

        for link in listing_links:

            # Anchors without an href cannot be visited later, so skip them

            href = link.get('href')

            if href:
                vehicle_urls.append(href)

    return vehicle_urls

In [3]:
# Crawl the category result pages and collect the listing URLs. This performs
# real HTTP requests, and every request is followed by a pause, so the cell
# takes a while to run.
urls = get_vehicle_urls()
print(urls)

['https://www.ebay.com/itm/1968-Chevrolet-Camaro/223573084084?hash=item340dfe87b4:g:VMkAAOSwcN5c7B5M', 'https://www.ebay.com/itm/2018-Ford-Mustang-Cobra-Jet/383118299712?hash=item5933a15240:g:8VYAAOSwp1BdYEGK', 'https://www.ebay.com/itm/2007-Ford-Mustang-Shelby-GT500/293176318200?hash=item4442ab98f8:g:63sAAOSwpPpdRbQY', 'https://www.ebay.com/itm/1970-Chevrolet-Corvette/303245303215?hash=item469ad41daf:g:~8IAAOSwlPhdSH~r', 'https://www.ebay.com/itm/1955-Chevrolet-Other-Pickups-1st-Series-Pickup-RESTORED/254319460633?hash=item3b369f0119:g:d7kAAOSwyWddRa-Y', 'https://www.ebay.com/itm/1958-Chevrolet-Other-Pickups-Truck/283565552156?hash=item4205d2f21c:g:rw4AAOSwdNBdQNtC', 'https://www.ebay.com/itm/2019-Ford-F-150-Raptor/123872707374?hash=item1cd763832e:g:NCwAAOSwTa1c5AfJ', 'https://www.ebay.com/itm/1998-Ford-Mustang/293179867562?hash=item4442e1c1aa:g:AGUAAOSwR11dSd0h', 'https://www.ebay.com/itm/2019-Ford-Mustang-GT/133147944903?hash=item1f003c67c7:g:NrkAAOSwTUZdUzJ4', 'https://www.ebay.com

In [4]:
# `vehicles` accumulates one dict per scraped listing. It is (re)created here,
# so re-running this cell starts from an empty list instead of appending
# duplicates.

counter = 0  # NOTE(review): not read anywhere in the visible cells; kept in case another cell uses it
vehicles = []


# Every record ends up with exactly these keys, in this order. Keys a listing
# does not provide are set to None; scraped attributes not listed here are dropped.

key_names = ['condition', 'drive_type', 'engine', 'exterior_color', 'fuel_type',
             'interior_color', 'make', 'mileage', 'model', 'price',
             'seller_name', 'source_id', 'transmission', 'url', 'vin', 'year']


# Cleaned-up attribute labels that are stored under a different key name

LABEL_RENAMES = {
    'vin_(vehicle_identification_number)': 'vin',
    'for_sale_by': 'seller_name',
}


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag was not found."""

    return None if tag is None else tag.text


def _read_condition(soup):
    """Return the listing's condition text, or None when the expected markup is missing."""

    try:
        return soup.find('div', class_ = 'nonActPanel').div.find_all('div')[1].text

    except (AttributeError, IndexError):
        # AttributeError: one of the enclosing elements is missing;
        # IndexError: the panel holds fewer than two nested divs
        return None


def _read_item_specifics(soup):
    """Return the listing's attribute table as a {label: value} dict.

    Labels are stripped, lower-cased, have ':' removed and spaces replaced by
    '_', and are renamed according to LABEL_RENAMES. The 'model' label is
    skipped because the model is read from its own element by the caller.
    An empty dict is returned when the attribute table cannot be located.
    """

    item_attr_div = soup.find('div', class_ = 'itemAttr')

    if item_attr_div is None:
        return {}

    # The element with the id "itmSellerDesc" is an additional table that only
    # sometimes exists; when it does, the attribute table is the second table
    # inside the div instead of the first.

    tables = item_attr_div.find_all('table')
    table_index = 1 if soup.find('table', {"id":'itmSellerDesc'}) is not None else 0

    if len(tables) <= table_index:
        return {}

    table = tables[table_index]

    labels = []

    for cell in table.find_all('td', class_ = 'attrLabels'):

        label = cell.text.strip() \
            .replace(':','') \
            .replace(' ', '_') \
            .lower()

        if label == 'model':
            continue

        labels.append(LABEL_RENAMES.get(label, label))

    values = [span.text for span in table.find_all('span')]

    # Labels and values are paired purely by position; zip stops at the shorter
    # list, so a surplus label is left unset rather than raising IndexError.
    # NOTE(review): this presumes the skipped 'model' row contributes no <span>
    # to `values` - otherwise every later pair is shifted by one. Verify
    # against the live markup.

    return dict(zip(labels, values))


for url in urls:

    soup = fetch_page_and_convert_to_bs(url)

    # Start from the attribute table, then overwrite with the values that are
    # read from dedicated elements of the page

    vehicle = _read_item_specifics(soup)

    vehicle['price'] = _tag_text(soup.find('span', {'itemprop' : 'price'}))
    vehicle['url'] = url
    vehicle['condition'] = _read_condition(soup)
    vehicle['model'] = _tag_text(soup.find('h2',  {'itemprop' : 'model'}))
    vehicle['source_id'] = 0  # NOTE(review): presumably identifies eBay as the data source - confirm

    # Keep only the required keys, filling the ones that are missing with None

    vehicle = {key: vehicle.get(key) for key in key_names}

    vehicles.append(vehicle)

In [5]:
# Show the raw list of scraped vehicle dicts as plain text
print(vehicles)

[{'condition': 'Used', 'drive_type': 'RWD', 'engine': '327', 'exterior_color': 'Red', 'fuel_type': 'Gasoline', 'interior_color': 'Black', 'make': 'Chevrolet', 'mileage': '67225', 'model': 'Camaro', 'price': 'US $22,999.00', 'seller_name': 'Dealer', 'source_id': 0, 'transmission': 'Manual', 'url': 'https://www.ebay.com/itm/1968-Chevrolet-Camaro/223573084084?hash=item340dfe87b4:g:VMkAAOSwcN5c7B5M', 'vin': '124378N312077', 'year': '1968'}, {'condition': 'New', 'drive_type': None, 'engine': '5.2L supercharge', 'exterior_color': None, 'fuel_type': None, 'interior_color': None, 'make': 'Ford', 'mileage': '2', 'model': 'Mustang', 'price': 'US $179,000.00', 'seller_name': None, 'source_id': 0, 'transmission': None, 'url': 'https://www.ebay.com/itm/2018-Ford-Mustang-Cobra-Jet/383118299712?hash=item5933a15240:g:8VYAAOSwp1BdYEGK', 'vin': '1FA6P8JZ0K5550988', 'year': '2018'}, {'condition': 'Used', 'drive_type': None, 'engine': '5.4L 32-VALVE SUPERCHARGED V8 ENGINE', 'exterior_color': 'White', 'fue

In [9]:
# Build a table with one row per scraped vehicle; the dict keys become the columns
df = pd.DataFrame(vehicles)

# Preview the first rows of the table
df.head()

Unnamed: 0,condition,drive_type,engine,exterior_color,fuel_type,interior_color,make,mileage,model,price,seller_name,source_id,transmission,url,vin,year
0,Used,RWD,327,Red,Gasoline,Black,Chevrolet,67225,Camaro,"US $22,999.00",Dealer,0,Manual,https://www.ebay.com/itm/1968-Chevrolet-Camaro...,124378N312077,1968
1,New,,5.2L supercharge,,,,Ford,2,Mustang,"US $179,000.00",,0,,https://www.ebay.com/itm/2018-Ford-Mustang-Cob...,1FA6P8JZ0K5550988,2018
2,Used,,5.4L 32-VALVE SUPERCHARGED V8 ENGINE,White,Gasoline,Black,Ford,24996,Mustang,"US $28,991.00",Dealer,0,Manual,https://www.ebay.com/itm/2007-Ford-Mustang-She...,1ZVHT88S675315135,2007
3,Used,RWD,454,Gray,Gasoline,Black,Chevrolet,999999,Corvette,"US $7,975.00",Private Seller,0,Automatic,https://www.ebay.com/itm/1970-Chevrolet-Corvet...,194370S406512,1970
4,Used,,Inline 6,Red,Gasoline,Black,Chevrolet,0,Other Pickups,"US $27,500.00",Dealer,0,Manual,https://www.ebay.com/itm/1955-Chevrolet-Other-...,H55S008410,1955
