In [1]:
import pprint
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_page_count(url, timeout=30):
    """Return the number of result pages advertised by a search page.

    The page at ``url`` is downloaded and the last whitespace-separated token
    of the ``<ul data-test="searchResultsPagination">`` element is returned.

    Args:
        url: Search-results URL to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot block the notebook forever.

    Returns:
        The page count as a string (callers convert it with ``int``). ``"1"``
        is returned when the page has no pagination element or the element
        has no text -- presumably that is how a single-page result is
        rendered; TODO confirm against the live site.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "html.parser")
    pagination = soup.find("ul", {"data-test": "searchResultsPagination"})
    if pagination is None:
        return "1"
    tokens = pagination.get_text().split()
    return tokens[-1] if tokens else "1"

In [3]:
def get_single_element(listing, el_type, data_test):
    """Return the stripped text of the first matching child, or ``None``.

    Looks up the first ``el_type`` element under ``listing`` whose
    ``data-test`` attribute equals ``data_test``.
    """
    node = listing.find(el_type, {"data-test": data_test})
    return None if node is None else node.get_text().strip()

In [4]:
def _parse_listing(row):
    """Build one car record from a ``usedListing`` card.

    Args:
        row: Parsed ``div[data-test="usedListing"]`` element.

    Returns:
        A dict with the keys ``year, make, model, trim, price, mileage,
        location_state, location_city, interior_color, exterior_color,
        num_accidents, num_owners, usage``; or ``None`` when the card has no
        price or its location does not split into exactly two tokens.

    Raises:
        AttributeError, IndexError, KeyError, ValueError: An expected element
            is missing or its text does not have the expected shape.
    """
    listing = row.find("div", {"class": "card-content order-3 vehicle-card-body",
                               "data-test": "cardContent"})

    title = listing.find("div", {"data-test": "vehicleCardYearMakeModel"})
    year_make_model = [span.get_text() for span in title.find_all("span")]
    # Promoted cards carry an extra "Sponsored" span ahead of the title parts.
    if 'Sponsored' in year_make_model:
        year_make_model.remove('Sponsored')
    make_model_list = year_make_model[1].split()

    car = dict()
    car['year'] = year_make_model[0]
    car['make'] = make_model_list[0]
    # NOTE(review): only the first word after the make is kept, so a
    # multi-word model name is truncated -- confirm this is intended.
    car['model'] = make_model_list[1]

    trim = get_single_element(listing, "div", "vehicleCardTrim")
    # A trim of just "Sedan" is recorded as the base trim.
    car['trim'] = 'Base' if trim == 'Sedan' else trim

    price = get_single_element(listing, "div", "vehicleCardPricingBlockPrice")
    if price is None:
        return None
    car["price"] = int(price.replace("$", "").replace(",", ""))

    mileage = get_single_element(listing, "div", "vehicleMileage")
    car["mileage"] = int(mileage.split()[0].replace(",", ""))

    # The text after the first "-" is read as "<city>, <state>"; the part in
    # front of it is ignored.
    location = get_single_element(listing, "div", "vehicleCardLocation")
    city_state = location.split("-")[1].strip().split()
    # NOTE(review): this also drops listings whose city name has more than
    # one word -- kept as in the original, confirm it is wanted.
    if len(city_state) != 2:
        return None
    car["location_state"] = city_state[1].strip()
    car["location_city"] = city_state[0].replace(",", "").strip()

    # Each comma-separated part is read as "<color> <interior|exterior>".
    colors_text = get_single_element(listing, "div", "vehicleCardColors")
    colors = {part.split()[1]: part.split()[0] for part in colors_text.split(",")}
    car['interior_color'] = colors['interior']
    car['exterior_color'] = colors['exterior']

    # Condition text is read as "<accidents>, <owners>, <usage>".
    condition = get_single_element(listing, "div", "vehicleCardCondition").split(",")
    if "No" not in condition[0]:
        car['num_accidents'] = condition[0].split()[0]
    else:
        car['num_accidents'] = 0
    car['num_owners'] = condition[1].strip().split()[0]
    car['usage'] = condition[2].strip()

    return car


def parse_json(url, timeout=30):
    """Download one search-results page and return its car records.

    Args:
        url: Search-results page URL.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        A list of car dicts (see ``_parse_listing``). The list is empty when
        the response status is not 200. Cards that cannot be parsed are
        skipped so that one malformed card does not discard the whole page.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    # Bound up front so every return path has a list to hand back.
    info = []

    with requests.get(url, timeout=timeout) as res:
        # Surface blocked or failed pages to the caller with a clear message.
        res.raise_for_status()
        if res.status_code != 200:
            return info
        soup = BeautifulSoup(res.content, "html.parser")

    for row in soup.find_all("div", {"data-test": "usedListing"}):
        try:
            car = _parse_listing(row)
        except (AttributeError, IndexError, KeyError, ValueError):
            # Missing or oddly formatted field: drop this card only.
            continue
        if car is not None:
            info.append(car)

    return info

In [5]:
# Crawl plan: for every brand slug, the year-range slugs that are substituted
# into the search URL.
brands_date_ranges = [
    dict(
        brand='bmw',
        date_ranges=('year-min-2016', 'year-2017-2019', 'year-2020-2022', 'year-2023-max'),
    ),
    dict(
        brand='mercedes-benz',
        date_ranges=('year-min-2016', 'year-2017-2019', 'year-2020-2023'),
    ),
    dict(
        brand='audi',
        date_ranges=('year-min-2019', 'year-2020-2023'),
    ),
]
                        

In [6]:


# Search URL template. ``{brand}`` and ``{date_range}`` are filled in with
# ``str.format``; the query string is everything after the "?".
url = (
    "https://www.truecar.com/used-cars-for-sale/listings/"
    "{brand}/{date_range}/price-above-1/location-athens-ga/"
    "?rentalHistory=false"
    "&searchRadius=5000"
    "&sort[]=price_asc"
    "&titleHistory[]=hide-lemon"
    "&titleHistory[]=hide-frame-damage"
    "&titleHistory[]=hide-theft-recovery"
    "&titleHistory[]=hide-salvage"
)




In [7]:
def generate_links(base_url, brands_date_ranges):
    """Expand the URL template into one URL per result page.

    For every brand / date-range pair the template is formatted, the page
    count is read with ``get_page_count``, and pages 2..N are derived by
    inserting ``page=<n>`` as the first query parameter.

    Args:
        base_url: Template containing ``{brand}`` and ``{date_range}``.
        brands_date_ranges: Iterable of dicts with ``'brand'`` and
            ``'date_ranges'`` keys.

    Returns:
        A flat list of page URLs, in brand / date-range / page order.
    """
    collected = []
    for entry in brands_date_ranges:
        brand = entry['brand']
        for date_range in entry['date_ranges']:
            first_page = base_url.format(brand=brand, date_range=date_range)
            page_total = int(get_page_count(first_page))

            # Path and query are the same for every page of this search.
            parts = first_page.split("?")
            page_urls = [first_page] + [
                parts[0] + "?page=" + str(page_no) + "&" + parts[1]
                for page_no in range(2, page_total + 1)
            ]
            print(f"For Brand: {brand} and Date Range {date_range} there where {len(page_urls)} found!")
            collected.extend(page_urls)
    return collected

In [8]:
# Expand the URL template into every page URL to scrape. This performs one
# HTTP request per brand / date-range pair to read its page count.
links = generate_links(base_url=url, brands_date_ranges=brands_date_ranges)
print(f"Generated {len(links)} URLS")

For Brand: bmw and Date Range year-min-2016 there where 273 found!
For Brand: bmw and Date Range year-2017-2019 there where 208 found!
For Brand: bmw and Date Range year-2020-2022 there where 187 found!
For Brand: bmw and Date Range year-2023-max there where 17 found!
For Brand: mercedes-benz and Date Range year-min-2016 there where 267 found!
For Brand: mercedes-benz and Date Range year-2017-2019 there where 231 found!
For Brand: mercedes-benz and Date Range year-2020-2023 there where 296 found!
For Brand: audi and Date Range year-min-2019 there where 298 found!
For Brand: audi and Date Range year-2020-2023 there where 132 found!
Generated 1909 URLS


In [9]:
def collect_data(urls, max_workers=None):
    """Scrape every URL on a thread pool and return all car records.

    Args:
        urls: Iterable of search-results page URLs passed to ``parse_json``.
        max_workers: Size of the thread pool; ``None`` keeps the
            ``ThreadPoolExecutor`` default.

    Returns:
        A flat list of car dicts, in the order the URLs were submitted. Pages
        whose scrape raised are reported on stdout together with their URL
        (so they can be retried) and contribute no records.
    """
    all_cars = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Keep each URL next to its future so a failure can name its page.
        submitted = [(url, executor.submit(parse_json, url)) for url in urls]
        num_processed = 0
        # Results are consumed in submission order to keep the output order
        # deterministic. No sleep is needed here: every request has already
        # been queued, so pausing between results would not throttle them.
        for url, future in submitted:
            try:
                cars = future.result()
                all_cars.extend(cars)
                num_processed += len(cars)
                print(f"Num Completed: {num_processed}", flush=True)
            except Exception as e:
                print(f"Error processing future for {url}: {e}")
    return all_cars
# Scrape every generated page URL; `data` is the flat list of car dicts.
data = collect_data(urls=links)

Num Completed: 25
Num Completed: 52
Num Completed: 68
Num Completed: 93
Num Completed: 114
Num Completed: 138
Num Completed: 160
Num Completed: 186
Num Completed: 207
Num Completed: 228
Num Completed: 249
Num Completed: 272
Num Completed: 293
Num Completed: 311
Num Completed: 335
Num Completed: 356
Num Completed: 378
Num Completed: 397
Num Completed: 417
Num Completed: 444
Num Completed: 466
Num Completed: 489
Num Completed: 509
Num Completed: 530
Num Completed: 550
Num Completed: 571
Num Completed: 593
Num Completed: 611
Num Completed: 635
Num Completed: 663
Num Completed: 685
Num Completed: 709
Num Completed: 729
Num Completed: 752
Num Completed: 774
Num Completed: 795
Num Completed: 811
Num Completed: 829
Num Completed: 854
Num Completed: 881
Num Completed: 903
Num Completed: 919
Num Completed: 944
Num Completed: 965
Num Completed: 991
Num Completed: 1012
Num Completed: 1033
Num Completed: 1051
Num Completed: 1076
Num Completed: 1099
Num Completed: 1124
Num Completed: 1147
Num Compl

In [10]:
# Number of car records collected.
len(data)

34977

In [11]:
# One row per scraped car; columns come from the keys of the car dicts.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,year,make,model,trim,price,mileage,location_state,location_city,interior_color,exterior_color,num_accidents,num_owners,usage
0,2011,BMW,5,528i Sedan,8897,136512,FL,Hollywood,Black,Black,1,4,Personal use
1,2006,BMW,3,325xi Sedan AWD,1999,202121,VA,Woodbridge,Unknown,White,0,2,Personal use
2,2006,BMW,5,525i Sedan,2900,178740,AZ,Phoenix,Unknown,Unknown,2,8,Fleet use
3,2000,BMW,5,528i Sedan Automatic,2950,149176,VA,Woodbridge,Unknown,Green,1,2,Personal use
4,2004,BMW,5,530i Sedan,2995,190067,TN,Murfreesboro,Black,Black,1,5,Personal use


In [12]:
# Inspect the column dtypes before casting.
df.dtypes

year              object
make              object
model             object
trim              object
price              int64
mileage            int64
location_state    object
location_city     object
interior_color    object
exterior_color    object
num_accidents     object
num_owners        object
usage             object
dtype: object

In [13]:
# Cast the scraped count/year columns to integers. A plain integer cast is
# used for `year` rather than a datetime round-trip: pd.to_datetime treats
# integer input as nanoseconds since the epoch, so re-running the cell after
# the first conversion would turn every year into 1970.
df = df.astype({"num_accidents": "int64", "num_owners": "int64", "year": "int64"})

# Keep the target available as its own Series and move it to the last column.
target = df["price"]
df = df[[col for col in df.columns if col != "price"] + ["price"]]

In [14]:
# Frequency of each distinct trim string.
df.trim.value_counts()

Premium Plus                              794
GLC 300 SUV 4MATIC                        684
xDrive28i AWD                             640
C 300 Sedan RWD                           634
xDrive35i AWD                             620
                                         ... 
2005 Cabriolet 1.8T CVT                     1
4.2L quattro Automatic                      1
Premium Plus Coupe 2.0L quattro Manual      1
Sedan Automatic                             1
Sedan Manual                                1
Name: trim, Length: 1367, dtype: int64

In [19]:
# Persist the cleaned frame as CSV. The output directory is created first so
# a missing ./Data folder cannot make the write fail after the long scrape.
# The file name is unchanged so existing readers keep working.
output_path = Path("./Data/Used_Cars_German_Brands")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)