In [1]:
import requests
from bs4 import BeautifulSoup
import pprint
import pandas as pd

In [2]:
def get_page_count(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    return soup.find("ul",{"data-test":"searchResultsPagination"}).get_text().split()[-1]

In [3]:
def get_single_element(listing, el_type, data_test):
    return listing.find(el_type, {"data-test":data_test}).get_text().strip()

In [4]:
def parse_json(url):
    with requests.get(url) as res:
        if res.status_code == 200:
            info = []

            soup = BeautifulSoup(res.content, "html.parser")
            list_rows = soup.find_all("div", { "class": "linkable card card-shadow vehicle-card" })
            for row in list_rows:
                listing = row.find("div", {"class":"card-content order-3 vehicle-card-body",
                                                           "data-test":"cardContent" 
                          })
                
                car = dict()
                year_make_model = [i.get_text() for i in listing.find("div",{"data-test": "vehicleCardYearMakeModel"
                           }).find_all("span")]
                
                if 'Sponsored' in year_make_model:
                    year_make_model.remove('Sponsored')
                make_model_list = year_make_model[1].split()
                
              
                car['year'] = year_make_model[0]
                car['make'] = make_model_list[0]
                car['model'] = make_model_list[1]
                car["trim"] =  get_single_element(listing,"div", "vehicleCardTrim")
                print(get_single_element(listing,"div", "vehicleCardTrim"))
                
                
                price = get_single_element(listing,"div", "vehicleCardPricingBlockPrice")
                
                
                
                car["price"] = int(get_single_element(listing,"div", "vehicleCardPricingBlockPrice").replace("$","").replace(",",""))
                car["mileage"] = int(get_single_element(listing,"div","vehicleMileage").split()[0].replace(",",""))


                dist_location = [i.strip() for i in get_single_element(listing,"div", "vehicleCardLocation").split("-")]
                
                dist = dist_location[0].split()[0]
                if "," in dist:
                    dist = dist.replace(",","")
                car["distance"] = int(dist)
                car["location"] = dist_location[1]

                colors = {v.split()[1]: v.split()[0] for v in get_single_element(listing,"div", "vehicleCardColors").split(",")}
                car['interior_color'] = colors['interior']
                car['exterior_color'] = colors['exterior']


                condition = get_single_element(listing,"div", "vehicleCardCondition").split(",")

                if "No" not in condition[0]:
                    car['num_accidents'] = condition[0].split()[0]
                else:
                    car['num_accidents'] = 0

                car['num_owners'] = condition[1].strip().split()[0]
                car['usage'] = condition[2].strip()
                

                info.append(car)
    return info

In [5]:
url="https://www.truecar.com/used-cars-for-sale/listings/bmw/m5/year-2019-max/location-athens-ga/?rentalHistory=false&searchRadius=5000&sort[]=price_asc&titleHistory[]=hide-lemon&titleHistory[]=hide-frame-damage&titleHistory[]=hide-theft-recovery&titleHistory[]=hide-salvage"

In [6]:
max_pages = int(get_page_count(url))
urls = []
for i in range(1,max_pages+1):
    temp = url.split("?")
    urls.append(temp[0]+"?page="+str(i)+temp[1])
pprint.pprint(urls)

['https://www.truecar.com/used-cars-for-sale/listings/bmw/m5/year-2019-max/location-athens-ga/?page=1rentalHistory=false&searchRadius=5000&sort[]=price_asc&titleHistory[]=hide-lemon&titleHistory[]=hide-frame-damage&titleHistory[]=hide-theft-recovery&titleHistory[]=hide-salvage',
 'https://www.truecar.com/used-cars-for-sale/listings/bmw/m5/year-2019-max/location-athens-ga/?page=2rentalHistory=false&searchRadius=5000&sort[]=price_asc&titleHistory[]=hide-lemon&titleHistory[]=hide-frame-damage&titleHistory[]=hide-theft-recovery&titleHistory[]=hide-salvage',
 'https://www.truecar.com/used-cars-for-sale/listings/bmw/m5/year-2019-max/location-athens-ga/?page=3rentalHistory=false&searchRadius=5000&sort[]=price_asc&titleHistory[]=hide-lemon&titleHistory[]=hide-frame-damage&titleHistory[]=hide-theft-recovery&titleHistory[]=hide-salvage',
 'https://www.truecar.com/used-cars-for-sale/listings/bmw/m5/year-2019-max/location-athens-ga/?page=4rentalHistory=false&searchRadius=5000&sort[]=price_asc&titl

In [7]:
pages = []
for i in urls:
    pages.append(parse_json(i))

In [8]:
df = pd.DataFrame([cars for page in pages for cars in page])
df.head()

Unnamed: 0,year,make,model,trim,price,mileage,distance,location,interior_color,exterior_color,num_accidents,num_owners,usage
0,2019,BMW,M5,Sedan,63995,41551,710,"Westbury, NY",Brown,White,0,2,Personal use
1,2019,BMW,M5,Sedan,76498,39328,541,"West Palm Beach, FL",Black,White,0,1,Personal use
2,2019,BMW,M5,Sedan,78998,23870,816,"Wichita, KS",Black,Gray,0,1,Personal use
3,2019,BMW,M5,Sedan,58999,72390,2105,"Turlock, CA",Black,White,0,2,Personal use
4,2019,BMW,M5,Competition,61490,66256,67,"Marietta, GA",Black,Gray,1,1,Personal use


In [10]:
df.dtypes

year              object
make              object
model             object
trim              object
price              int64
mileage            int64
distance           int64
location          object
interior_color    object
exterior_color    object
num_accidents     object
num_owners        object
usage             object
dtype: object

In [11]:
df.num_accidents = df.num_accidents.astype('int64')
df.num_owners = df.num_owners.astype('int64')

In [12]:
df.dtypes

year              object
make              object
model             object
trim              object
price              int64
mileage            int64
distance           int64
location          object
interior_color    object
exterior_color    object
num_accidents      int64
num_owners         int64
usage             object
dtype: object

In [13]:
df.trim.value_counts()

Sedan          105
Competition     60
Name: trim, dtype: int64