In [None]:
import requests
import pandas as pd
import numpy as np
import time
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests.exceptions import Timeout
from IPython.display import clear_output

In [None]:
# Fetch the landing page of the website this notebook scrapes.
# A timeout is passed explicitly: without one, requests waits indefinitely
# and a stalled server would hang the notebook on this cell.
html = requests.get("https://www.ccarprice.com/au/", timeout=10)

# Report whether the server answered with HTTP 200; the response body
# (html.content) is what gets parsed afterwards.
if html.status_code == 200:
    print("Connection established successfully!")
else:
    print(f"Connection failed with status code {html.status_code}")

In [None]:
# Parse the raw bytes of the "html" response with Python's built-in
# 'html.parser' so the page can be searched as a BeautifulSoup tree.
soup = BeautifulSoup(markup=html.content, features='html.parser')

In [None]:
# The brand menu is searched as: div.vertical-menu -> <label> -> div.show1,
# inside which every <a class="brnd"> is taken as one brand entry.
brand_menu = soup.find("div", {"class": "vertical-menu"}).label.find("div", {"class": "show1"})
mydivs = brand_menu.find_all("a", {"class": "brnd"})

# This list stores the links of all car brands given on the website.
# Anchors without an href are dropped: a None entry here would later fail
# the brand-name regex instead of producing a usable link.
brand_links = [anchor.get("href") for anchor in mydivs if anchor.get("href")]

print(f'The website "https://www.ccarprice.com/au/" has data of {len(brand_links)} car brands.')
print('\nAll Brands Links:')
for n, each_brand_link in enumerate(brand_links, start=1):
    print(f'{n}. {each_brand_link}')

**Next, I'll build a dictionary whose keys are the car brands and whose values are lists of links to each car of the corresponding brand.**

In [None]:
# Start from an empty mapping: brand name -> list of car links for that brand.
all_brands_car_links = dict()

In [None]:
# Fill all_brands_car_links: for every brand link, derive a display name from
# the URL and collect the links of that brand's cars (entries whose last text
# line is "Coming soon" are left out).
brand_name_pattern = re.compile(r'https:\/\/www\.ccarprice\.com\/au\/(.+)\-car')

for each_brand_link in brand_links:

    # The brand slug is the part of the URL between the site prefix and "-car".
    brand_match = brand_name_pattern.search(str(each_brand_link))
    if brand_match is None:
        # Calling .group(1) on a failed match would raise and abort the whole loop.
        print(f'Skipping unrecognised brand link: {each_brand_link}')
        continue

    # Hyphens separate words in the slug; title-case every word
    # (replace() is a no-op when the slug has no hyphen).
    brand_name = brand_match.group(1).replace('-', ' ').title()

    try:
        # Timeout so a stalled server cannot hang the loop indefinitely.
        html1 = requests.get(str(each_brand_link), timeout=10)
    except requests.exceptions.RequestException as request_error:
        print(f'Skipping {brand_name}: request failed ({request_error})')
        continue
    soup1 = BeautifulSoup(html1.content, 'html.parser')

    # The car listings are read from the LAST div#pbox.price-cover under div#page.
    page_body = soup1.body
    page_div = page_body.find("div", {"id": "page"}) if page_body is not None else None
    price_covers = page_div.find_all("div", {"id": "pbox", "class": "price-cover"}) if page_div is not None else []
    if not price_covers:
        # Page did not have the expected layout; record the brand with no cars.
        print(f'No car listings found for {brand_name}')
        all_brands_car_links[brand_name] = []
        continue
    all_cars_of_this_brand = price_covers[-1].find_all("div", {"id": "pbox", "class": "listing"})

    all_cars_links_of_this_brand = []  # Links of all cars corresponding to the current car brand

    for listing in all_cars_of_this_brand:  # Extracting just the links of each needed car
        last_text_line = listing.getText().strip().split("\n")[-1]
        anchor = listing.a
        # Skip "Coming soon" entries and listings that carry no usable link.
        if last_text_line != "Coming soon" and anchor is not None and anchor.get("href"):
            all_cars_links_of_this_brand.append(anchor.get("href"))

    all_brands_car_links[brand_name] = all_cars_links_of_this_brand

In [None]:
# Number of collected car links per brand, in the order the brands were scraped.
brand_car_counts = {brand: len(car_links) for brand, car_links in all_brands_car_links.items()}

print('The website "https://www.ccarprice.com/au/" has the following data:\n')

# One numbered line per brand.
for position, (brand, car_count) in enumerate(brand_car_counts.items(), start=1):
    print(f'{position}. {brand} has {car_count}')

# Grand total across all brands.
total_cars_details = sum(brand_car_counts.values())

print(f'\nThe website has a total of {total_cars_details} car details.')

In [None]:
# Empty placeholder DataFrame; the scraping loop fills in the car details.
all_car_details = pd.DataFrame(data=None)

In [None]:
# Scrape every car page collected in all_brands_car_links and assemble the
# results into the all_car_details DataFrame (one row per successfully
# fetched car page).

REQUEST_TIMEOUT_SECONDS = 10  # give up on a car page if the server does not respond in time
REQUEST_DELAY_SECONDS = 0.1   # pause between consecutive requests

# Spec rows whose label is listed here are not stored.
data_not_needed = ['Available Colors', 'Warranty']

# Captures the text after "Price in AUD:" (group 1) and after "Price in USD:" (group 2).
price_pattern = re.compile(r'Price in AUD\:\n?\s*(.+)\n?\<br\/\>\s*Price in USD\:\s*\$?(.+)\n?')


def _extract_prices(car_soup):
    """Return (price_in_aud, price_in_usd) read from the page's price box.

    Both values are strings with commas removed. (np.nan, np.nan) is returned
    when the price text cannot be found, or when both values read 'N/A'.
    """
    price_details = car_soup.select_one('html body div#page.main div div#pbox.detail-cover div#pbox.image-detail div#pbox.detail-price')
    price_match = price_pattern.search(str(price_details))
    if price_match is None:
        return np.nan, np.nan
    price_in_aud = price_match.group(1).replace(',', '')
    price_in_usd = price_match.group(2).replace(',', '')
    if price_in_aud == 'N/A' and price_in_usd == 'N/A':
        return np.nan, np.nan
    return price_in_aud, price_in_usd


def _extract_year(car_soup):
    """Return a four-digit year string taken from the first 'td2' spec cell, or np.nan."""
    year_cell = car_soup.select_one('html body div#page.main div div#spec div#pbox.detail-cover div.tr div.td2')
    if year_cell is None:
        # Page has no spec table in the expected place.
        return np.nan
    # The leading greedy ".*" makes this pick the LAST run of four digits in the text.
    year_match = re.search(r'.*(\d{4}).*', year_cell.text.strip())
    return year_match.group(1) if year_match else np.nan


def _extract_features(car_soup):
    """Return {feature_name: feature_value} for every wanted row of the spec table.

    Feature names are the row labels lower-cased with spaces replaced by
    underscores. Rows listed in data_not_needed, and rows with fewer than two
    cells, are skipped.
    """
    spec_table = car_soup.select_one('html body div#page.main div div#spec div#pbox.detail-cover')
    if spec_table is None:
        return {}
    features = {}
    for spec_row in spec_table.find_all("div", {"class": "tr"}):
        cells = spec_row.find_all("div")
        if len(cells) < 2:
            continue
        label = cells[0].text.strip()
        if label in data_not_needed:
            continue
        features[label.lower().replace(" ", '_')] = cells[1].text.strip()
    return features


index_number = 0   # number of car pages scraped so far
car_records = []   # one dict per scraped car; turned into a DataFrame once at the end

try:
    for each_brand, list_of_cars_links in all_brands_car_links.items():

        for link_of_each_car in list_of_cars_links:

            try:
                html2 = requests.get(link_of_each_car, timeout=REQUEST_TIMEOUT_SECONDS)
            except requests.exceptions.RequestException:
                # Timeout or other request failure: move on to the next page.
                # Only request errors are caught, so interrupting the kernel
                # still stops the scrape.
                continue

            soup2 = BeautifulSoup(html2.content, 'html.parser')

            price_in_aud, price_in_usd = _extract_prices(soup2)

            car_details_grouped_by_brand = {
                'brand': each_brand,
                'year': _extract_year(soup2),
                'price_in_aud': price_in_aud,
                'price_in_usd': price_in_usd,
            }
            # Spec-table features are added last, so a feature with the same
            # name as one of the keys above replaces that value.
            car_details_grouped_by_brand.update(_extract_features(soup2))

            car_records.append(car_details_grouped_by_brand)
            index_number = len(car_records)

            time.sleep(REQUEST_DELAY_SECONDS)

            clear_output(wait=True)
            print(f"Number of scraped car data: {index_number}")
finally:
    # Build the frame in one go (instead of concatenating row by row), and do
    # it even if the loop is interrupted so partial results are not lost.
    # Columns are the union of all keys; missing values become NaN.
    all_car_details = pd.DataFrame(car_records)

In [None]:
# Show the scraped DataFrame as this cell's output.
all_car_details

In [None]:
# Persist the raw, uncleaned scrape results to CSV; the row index is not written.
all_car_details.to_csv(path_or_buf='CarPricesRaw.csv', index=False)