<h2><font face=verdona color=yellow><center>Web Extraction Pattern</center></font></h3>
<h4><font face=verdona >
Structure of Extracting Data from websites<br>
⦿ Searching Reputed Used Car Selling Organization Website - Website1_HomeURL, Website2_HomeURL, ...., WebsiteN_HomeURL<br>
⦿ Searching all Cities each organization operates in and fetching links filtering only that city - Website1 : City1_URL,...,CityN_URL<br>
⦿ Fetching links of all cars sold in each city - City1 : Car1_URL, Car2_URL, Car3_URL,........,CarN_URL<br>
⦿ Extracting only required data from each Car link.<br>
</font></h4>
<pre>
                       Website1                             .....                         WebsiteN
                          |                                                                  |
                 -------------------                                                ------------------------
                 |        |         |                                               |                      | 
              City1     City2     City3                                          City1     .....         CityN
                |         |         |                                              |                       |
    -------------      --------     -----------------                  -------------                   --------------
    |     |     |      |      |         |     |     |                  |           |                   |            |                       
  Car1  Car2  Car3    Car1   Car2      Car1  Car2  Car3              Car1  ....  CarN                 Car1  ....  CarN
    ↓     ↓    ↓       ↓      ↓         ↓      ↓    ↓                  ↓          ↓                     ↓           ↓                                     
      Extracted        Extracted           Extracted                     Extracted                        Extracted                                     
        Data             Data                 Data                         Data                              Data                                       


<h4><font face=verdona>DATA CREDITS - Cars24, Carwaale

<H2><font face=georgia color=cyan><center>Importing Modules

In [5]:
import os
import ast
import time
import requests
import re
from bs4 import BeautifulSoup
from pathlib import Path


from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException

In [6]:
def fetch_web_homepage(mainpage_link, output_directory,
                   headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'}
):
    try:
        os.makedirs(output_directory, exist_ok=True)
        available_cities = requests.get(mainpage_link, headers=headers)
        available_cities.raise_for_status() 
    except requests.exceptions.RequestException as e:
        print(f"Error fetching main page {mainpage_link}: {e}")
        return {} 

    soup = BeautifulSoup(available_cities.content, 'lxml')
    with open(os.path.join(output_directory, 'main_cities.txt'), 'w', encoding='utf-8')as f:
        f.write(soup.prettify())

In [7]:
organizations = ['cars24', 'carwale', 'cartrade', 'cardekho', 'spinny']

mainpage_cars24 = 'https://www.cars24.com/buy-used-cars/'
mainpage_carwale = 'https://www.carwale.com/used/'
mainpage_cartrade = 'https://www.cartrade.com/second-hand/'
mainpage_cardekho = 'https://www.cardekho.com/used-cars/'
mainpage_spinny = 'https://www.spinny.com/used-cars/'

websites = {organizations[0] : mainpage_cars24, organizations[1] : mainpage_carwale, organizations[2]: mainpage_cartrade, organizations[3] : mainpage_cardekho, organizations[4]: mainpage_spinny}

In [8]:
# for organization, website_URL in websites.items():
#     fetch_web_homepage(mainpage_link=website_URL, output_directory='organized_cars_structure/'+organization+'_data')

In [9]:
def extract_citywise_links(root_directory, required_links_extractor, city_extractor, html_tag='a'):
    with open(os.path.join(root_directory, 'main_cities.txt'), 'r', encoding='utf-8')as f:
        content = f.read()

    soup = BeautifulSoup(content, 'lxml')
    href_values = [a.get('href') for a in soup.select(html_tag, href=True)]
    city_links = [required_links_extractor(link) for link in href_values]
    cities = [city_extractor(link) for link in city_links]
    citywise_links = dict()

    for city, link in zip(cities, city_links):
        if city.strip() != '':
            citywise_links[city] = link
    return citywise_links

In [10]:
extract_city_links_cars24 = lambda x: (x if 'buy-used-cars-' in x else '')
city_extractor_cars24 = lambda x: x[len('https://www.cars24.com/buy-used-cars-'):-1]

extract_city_links_carwale = lambda href_val : 'https://www.carwale.com' + href_val
city_extractor_carwale = lambda href_val: href_val[len('https://www.carwale.com/used/'):-1]

extract_city_links_cartrade = lambda x: 'https://www.cartrade.com'+x if '-' not in  x.split('/')[2] else '//'
city_extractor_cartrade = lambda x: x.split('/')[-2]

extract_city_links_cardekho = lambda x: (x if 'used-cars+in+' in x and '?' not in x else '')
city_extractor_cardekho = lambda x: x[len('https://www.cardekho.com/used-cars+in+'):]

extract_city_links_spinny = lambda x: ('https://www.spinny.com'+x if 'in' in x and '/s/' in x else '')
city_extractor_spinny = lambda x: x.split('-in-')[-1][:-3]

link_city_selector_pairs =  {
    organizations[0] : [extract_city_links_cars24, city_extractor_cars24, 'a.sc-fuUopJ.ieLSJK'],
    organizations[1] : [extract_city_links_carwale, city_extractor_carwale, 'a.o-aw.o-aE.o-co.o-ml.o-lT.o-mo.o-mO.o-bS.EW3D5f.o-kY.o-f'],
    organizations[2] : [extract_city_links_cartrade, city_extractor_cartrade, 'a.font-muli-bold.pt-3.pr-3.pb-3.pl-3.font-13.color-comet.display-block.full-width.full-height.nav-tab-link'],
    organizations[3] : [extract_city_links_cardekho, city_extractor_cardekho, 'a'],
    organizations[4] : [extract_city_links_spinny, city_extractor_spinny, 'a.styles__navItemTitle'],
}

cars24_citylinks, carwale_citylinks, cardekho_citylinks, spinny_citylinks, cartrade_citylinks = {},{},{},{},{}

orgs_cities = [cars24_citylinks, carwale_citylinks, cardekho_citylinks, spinny_citylinks, cartrade_citylinks]


for org,org_cities in zip(organizations, orgs_cities):
    org_cities.update(extract_citywise_links(
    root_directory=f'organized_cars_structure/{org}_data/', required_links_extractor=link_city_selector_pairs[org][0], 
    city_extractor=link_city_selector_pairs[org][1], html_tag=link_city_selector_pairs[org][2]))
    print(f'\n{org.capitalize()} Available cities and their links \n','-'*30)
    if org_cities == organizations[-1]:
        org_cities = org_cities[:5]
    print(org_cities)
    print(f'Total cities : {len(org_cities)}')


Cars24 Available cities and their links 
 ------------------------------
{'delhi-ncr': 'https://www.cars24.com/buy-used-cars-delhi-ncr/', 'bangalore': 'https://www.cars24.com/buy-used-cars-bangalore/', 'chennai': 'https://www.cars24.com/buy-used-cars-chennai/', 'hyderabad': 'https://www.cars24.com/buy-used-cars-hyderabad/', 'mumbai': 'https://www.cars24.com/buy-used-cars-mumbai/', 'ahmedabad': 'https://www.cars24.com/buy-used-cars-ahmedabad/', 'pune': 'https://www.cars24.com/buy-used-cars-pune/', 'new-delhi': 'https://www.cars24.com/buy-used-cars-new-delhi/', 'noida': 'https://www.cars24.com/buy-used-cars-noida/', 'kochi': 'https://www.cars24.com/buy-used-cars-kochi/', 'ghaziabad': 'https://www.cars24.com/buy-used-cars-ghaziabad/', 'gurgaon': 'https://www.cars24.com/buy-used-cars-gurgaon/', 'kolkata': 'https://www.cars24.com/buy-used-cars-kolkata/', 'lucknow': 'https://www.cars24.com/buy-used-cars-lucknow/', 'chandigarh-tricity': 'https://www.cars24.com/buy-used-cars-chandigarh-tricit

In [11]:
price_range_pairs = {0 : 3, 3 : 10, 10 : 30, 30 : 100}

filter_links = {
    organizations[0]: 
    lambda city, new_min, new_max : f'https://www.cars24.com/buy-used-cars-{city}/?f=listingPrice%3Abw%3A{new_min}%2C{new_max}&sort=bestmatch&serveWarrantyCount=true&listingSource=TabFilter&storeCityId=1',
    
    organizations[1]: 
    lambda city, new_min, new_max : f'https://www.carwale.com/used/{city}/?kms=0-&year=0-&budget={new_min}-{new_max}&state=-1&so=-1&sc=-1',
     
    organizations[2]: 
    lambda city, new_min, new_max : f'https://www.cartrade.com/second-hand/{city}/#so=-1&sc=-1&budget={new_min}-{new_max}',

    organizations[3]: 
    lambda city, new_min, new_max : f'https://www.cardekho.com/used-cars+{new_min}-lakh-to-{new_max}-lakh+in+{city}',
    
    organizations[4]: 
    lambda city, new_min, new_max : f'https://www.spinny.com/used-cars-over-{new_min}-lakh-rs-under-{new_max}-lakh-rs-in-{city}/s/',
    }

def extract_filterurls(price_range_pairs, organizations, orgs_city_links, filter_links, output_directory):

    for min, max in price_range_pairs.items():
        for organization, orgwise_links, filter in zip(organizations, orgs_city_links, filter_links.values()):
            filter_name_url = dict()
            os.makedirs('organized_cars_structure/'+organization+'_data/'+output_directory, exist_ok=True)

            for city in orgwise_links.keys():
                if  organization == 'cars24' :
                    new_min, new_max = min*100000, max*100000
                else :
                    new_min, new_max = min, max

                filter_url = filter(city, new_min, new_max)
                filtername = f'{city}-{str(min)}-{str(max)}-lakhs_cars.txt'

                filter_name_url[organization+'-'+filtername] = filter_url

            output_file = f"organized_cars_structure/{organization}_data/{output_directory}/filterurls-{str(min)}-{str(max)}-lakhs.txt"
            
            with open(output_file, 'w', encoding='utf-8')as f:
                f.write(str(filter_name_url))

In [12]:
extract_filterurls(price_range_pairs, organizations, orgs_cities, filter_links, output_directory='cityprice_urls')

In [13]:
def setup_selenium_driver():
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')  # Run in background
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36')
    
    try:
        driver = webdriver.Chrome(options=chrome_options)
        return driver
    except Exception as e:
        print(f"Failed to initialize Chrome driver: {e}")
        return None

def wait_for_container(driver, container, timeout=30):
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME,container))
        )
        return True
    except TimeoutException:
        print(f"{container} not found within timeout period")
        return False

def scrape_with_selenium(url, driver, container=None):
    try:
        driver.get(url)
        if container is not None:
            if wait_for_container(driver, container):
                time.sleep(2)
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'lxml')
                return soup
            else:
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'lxml')
                return soup
        else :
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            return soup            
                    
    except WebDriverException as e:
        print(f"Selenium error: {e}")
        return None

def fetch_filtered_data(organizations, org_start_index=0, file_start_index=0, start_index=0,
                      headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'}):
    file_num = 0
    iteration = 0
    driver = None
    
    try:
        for org_id, organization in enumerate(organizations):
            if org_id < org_start_index:
                continue

            if organization == 'spinny':
                if driver is None:
                    print("Setting up Selenium driver for dynamic content...")
                    driver = setup_selenium_driver()
                    if driver is None:
                        print("Failed to setup Selenium driver, falling back to requests")

            import_folder = os.path.join('organized_cars_structure/', organization + '_data/cityprice_urls')
            output_folder = os.path.join('organized_cars_structure/', organization + '_data/cityprice_data')
            filter_urls_file = os.listdir(import_folder)
            
            for file_num, file in enumerate(filter_urls_file):
                if file_num < file_start_index:
                    continue
                
                import_file = Path(os.path.join(import_folder, file))
                if import_file.is_file():
                    file_content = import_file.read_text()
                    filter_name_url = ast.literal_eval(file_content)
                    
                    for iteration, (filter_name, filter_link) in enumerate(filter_name_url.items()):
                        filter_name = filter_name[len(organization) + 1:]
                        if iteration < start_index:
                            continue
                        
                        time.sleep(5)
                        output_file = Path(os.path.join(output_folder, filter_name))
                        os.makedirs(output_folder, exist_ok=True)
                        output_file.touch(exist_ok=True)
                        
                        if organization == 'spinny' and driver is not None:
                            print(f"Using Selenium for {organization} - {filter_name}")
                            soup = scrape_with_selenium(filter_link, driver, container='Ripple__container')
                            if soup:
                                output_file.write_text(soup.prettify(), encoding='utf-8')
                            else:
                                print(f"Selenium failed for {filter_name}, skipping...")
                                continue
                        else:
                            try:
                                content = requests.get(filter_link, headers=headers, timeout=30)
                                soup = BeautifulSoup(content.content, 'lxml')
                                output_file.write_text(soup.prettify(), encoding='utf-8')
                            except requests.RequestException as e:
                                print(f"Request failed for {filter_name}: {e}")
                                continue
                        
            print(f'{organization.capitalize()}: Scraped {file_num + file_start_index + 1}/{len(filter_urls_file)} files')
            
            if organization == 'spinny' and driver is not None:
                driver.quit()
                driver = None
                print("Selenium driver closed")
            
    except Exception as ex:
        print(f"Error: {ex}")
        print(f"Execution stopped at {organizations[org_id + org_start_index]} - File: {filter_urls_file[file_num + file_start_index]}, Link: {iteration}")
    finally:
        if driver is not None:
            driver.quit()
        return org_id + org_start_index, file_num + file_start_index, iteration

In [14]:
# org_resume_index, file_resume_index, iteration = fetch_filtered_data(organizations)

In [15]:
html_tags = {
    organizations[0] : 'a.styles_carCardWrapper__sXLIp',
    organizations[1] : 'a.o-C',
    organizations[2] : 'div.titlebox.hover h3.title a',
    organizations[3] : 'h3.ListingBrandModelDetail__makeModelInfo.ListingBrandModelDetail__medium a',
    organizations[4] : 'a.font-muli'
    }

links_extractor = {
    organizations[0] : lambda x: x,
    organizations[1] : lambda x: 'https://www.carwale.com'+x,
    organizations[2] : lambda x: 'https://www.cardekho.com'+x,
    organizations[3] : lambda x: 'https://www.spinny.com'+x,
    organizations[4] : lambda x: 'https://www.cartrade.com'+x}


def extract_carurls(file, links_extractor, html_tag='a'):
    with open(file, 'r', encoding='utf-8')as f:
        content = f.read()

    soup = BeautifulSoup(content, 'lxml')
    href_values = [a.get('href') for a in soup.select(html_tag, href=True)]
    links = [links_extractor(link) for link in href_values]
    links = [link for link in links if link.strip() != '']
    return links

def pass_path_to_carurls_extractor(organizations):
    try : 
        for org in organizations:
            root_folder = os.path.join('organized_cars_structure/',org+'_data/cityprice_data/')
            output_folder = os.path.join('organized_cars_structure/',org+'_data/cars_urls/')
            os.makedirs(output_folder, exist_ok=True)
            files = os.listdir(root_folder)
            for file in files:
                import_file = Path(os.path.join(root_folder, file))
                output_file = Path(os.path.join(output_folder, file.replace('_cars','-cars-urls')))
                urls = extract_carurls(import_file, links_extractor[org], html_tag=html_tags[org])
                output_file.touch(exist_ok=True)
                output_file.write_text(str(urls), encoding='utf-8')
            print(f'{org.capitalize()} : Extracted all links \n ')    
    except Exception as ex: 
        print(ex)

In [16]:
# pass_path_to_carurls_extractor([organizations[3]])

In [17]:
CHROMEDRIVER_PATH = r'c:\Users\antho\Downloads\chromedriver-win64 (1)\chromedriver-win64\chromedriver.exe'
EDGEDRIVER_PATH = r'c:\Users\antho\Downloads\edgedriver_win64\msedgedriver.exe'
GECKODRIVER_PATH = r'c:\Users\antho\Downloads\geckodriver-v0.36.0-win32\geckodriver.exe'


extract_file_name = {
    organizations[0]: lambda url: url[len('https://www.cars24.com/buy-used-'):-1] + '.txt',
    organizations[1]: lambda url: '_'.join(url[len('https://www.carwale.com//used/'):-1].split('/')[1:]) + '.txt',
    organizations[2]: lambda url: url[len('https://www.cartrade.com/second-hand/'):-len('/?dc=0')].replace('/', '-') + '.txt',
    organizations[3]: lambda url: url.split('/used-')[-1].split('.htm')[0][:-len('-c6bf-4d66-9cee-5042109ec0f5')] + '.txt',
    organizations[4]: lambda url: '-'.join(url.split('/')[5:][:-1]) + '.txt'
}

def initialize_driver(browser_to_use,
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}):
    
    if browser_to_use == 'chrome':
        chrome_options = ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument(f"user-agent={headers['User-Agent']}")
        service = ChromeService(executable_path=CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=chrome_options)
    elif browser_to_use == 'edge':
        edge_options = EdgeOptions()
        edge_options.add_argument("--headless")
        edge_options.add_argument("--disable-gpu")
        edge_options.add_argument(f"user-agent={headers['User-Agent']}")
        service = EdgeService(executable_path=EDGEDRIVER_PATH)
        driver = webdriver.Edge(service=service, options=edge_options)
    elif browser_to_use == 'firefox':
        firefox_options = FirefoxOptions()
        firefox_options.add_argument("--headless")
        firefox_options.add_argument("--disable-gpu")
        firefox_options.set_preference("general.useragent.override", headers['User-Agent'])
        firefox_options.binary_location = r'C:\Program Files\Mozilla Firefox\firefox.exe'
        service = FirefoxService(executable_path=GECKODRIVER_PATH)
        driver = webdriver.Firefox(service=service, options=firefox_options)
    return driver

def fetch_carsdata_cardekho(driver, url):
    try:
        driver.set_page_load_timeout(30)
        driver.get(url)

        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.outer-card-container.specsCard")))
        time.sleep(3)

        try:
            view_features_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'cta-text') and contains(text(), 'View all Features')]")))
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", view_features_button)
            time.sleep(1)
            driver.execute_script("arguments[0].click();", view_features_button)
        except (TimeoutException, NoSuchElementException) as e:
            pass
        except Exception as e:
            print(f"  - General error clicking 'View all Features' button: {e}")


        try:
            view_specs_button = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'cta-text') and contains(text(), 'View all Specifications')]")))
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", view_specs_button)
            time.sleep(1.5)
            driver.execute_script("arguments[0].click();", view_specs_button)

            try:
                WebDriverWait(driver, 5).until(
                    EC.visibility_of_element_located((By.CSS_SELECTOR, "div.accordianSec div.content[style*='height: auto'][data-gsp-accordion-content]")))
            except TimeoutException:
                pass

        except TimeoutException:
            pass
        except ElementClickInterceptedException:
            print(f"  - 'View all Specifications' button click intercepted. Retrying with direct JS.")
            try:
                time.sleep(1)
                driver.execute_script("arguments[0].click();", view_specs_button)
                time.sleep(4)
            except Exception as retry_e:
                print(f"  - Failed to click 'View all Specifications' button even on retry: {retry_e}")
        except Exception as e:
            print(f"  - General error clicking 'View all Specifications' button: {e}")
        
    
        accordian_headers_after_expand = driver.find_elements(By.CSS_SELECTOR, "h3.accordianheader")
        clicked_h3_count_after_expand = 0
        for i, header in enumerate(accordian_headers_after_expand):
            try:
                WebDriverWait(driver, 5).until(EC.visibility_of(header))
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", header)
                time.sleep(0.2)
                driver.execute_script("arguments[0].click();", header)
                clicked_h3_count_after_expand += 1
                time.sleep(0.5)
            except (ElementClickInterceptedException, StaleElementReferenceException, TimeoutException, Exception) as inner_e:
                print(f"  - Could not click h3 '{header.text}' after expansion: {inner_e}")
            time.sleep(2)

        content = driver.page_source
        soup = BeautifulSoup(content, 'lxml')
        return soup

    except Exception as e:
        print(f"\n!!! Major error during Selenium processing for URL ({url}): {e}")
        return False


def fetch_carsdata(organizations, org_start_index=None, file_start_index=None, url_start_index=None, 
                    urls_per_browser=5,
                   headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}):
    file_num = 0
    url_id = 0

    browsers = ['edge', 'chrome', 'firefox']
    current_browser_index = 0
    driver = None

    try:
        list_of_not_exec = []
        for org_id, org in enumerate(organizations):
            if org_id < org_start_index and org_start_index is not None:
                continue
            
            root_folder = os.path.join('organized_cars_structure/', org + '_data/cars_urls/')
            output_folder = os.path.join('organized_cars_structure/', org + '_data/cars_data/')
            os.makedirs(output_folder, exist_ok=True)
            files = os.listdir(root_folder)
            print(f"\n--- Processing Organization: {org} ---")

            for file_num, file in enumerate(files):
                
                if file_num < file_start_index and file_start_index is not None:
                    continue

                import_file = Path(os.path.join(root_folder, file))
                file_content = import_file.read_text(encoding='utf-8')
                urls = ast.literal_eval(file_content)
                print(f"\n--- Processing File no.: {file_num} ---",end='\t')
                
                for url_id, url in enumerate(urls):
                    if url_id < url_start_index and url_start_index is not None:
                        continue
                    print(f"\n--- Processing Url no.: {url_id} ---")

                    if org == 'cardekho' or org == 'spinny':
                        browser_to_use = browsers[current_browser_index % len(browsers)]
                        if driver is None or (url_id > 0 and (url_id % urls_per_browser == 0)):
                            if driver:
                                print(f"Closing previous {browsers[(current_browser_index-1)%len(browsers)]} driver.")
                                driver.quit()
                            try:
                                driver = initialize_driver(browser_to_use)
                                current_browser_index += 1 
                            except Exception as e:
                                print(f"!!! Failed to initialize {browser_to_use} driver: {e}")
                                continue 

                    if org == 'cardekho':
                        soup = fetch_carsdata_cardekho(driver, url)
                        if not soup : 
                            soup=BeautifulSoup('', 'lxml')
                            list_of_not_exec.append([ org_id, file_num, url_id])
                    elif org == 'spinny':
                        unique_url_num = url[:-1].split('/')[-1]
                        detailed_specs_url = url.replace('com/buy',f'com/car-specification/{unique_url_num}?referrer=/buy')
                        soup = scrape_with_selenium(detailed_specs_url, driver)                     
                    else: 
                        time.sleep(5)
                        content = requests.get(url=url, headers=headers)
                        content.raise_for_status()
                        soup = BeautifulSoup(content.content, 'lxml')
                    
                    try:
                        filename_extractor = extract_file_name[org]
                        output_file = filename_extractor(url)
                        output_path = Path(os.path.join(output_folder, output_file))
                        print(output_path)
                        output_path.touch(exist_ok=True)
                        output_path.write_text(soup.prettify(), encoding='utf-8')
                        print(f"Successfully scraped and saved to: {output_path}")

                    except requests.exceptions.RequestException as e:
                        print(f"Error fetching URL with requests ({url}): {e}")
                        continue

    except Exception as ex:
        print(f"\n!!! An unhandled error occurred in the main loop: {ex}")
        current_org = organizations[org_id] if 'org_id' in locals() and org_id < len(organizations) else "N/A"
        current_file_num = file_num if 'file_num' in locals() else "N/A"
        current_url_id = url_id if 'url_id' in locals() else "N/A"
        print(f'!!! Execution stopped at organization: {current_org}, File no. : {current_file_num}, Url no. : {current_url_id}')
    else:
        final_org_name = organizations[org_id] if 'org_id' in locals() and org_id < len(organizations) else "Unknown"
        final_file_count = file_num + 1 if 'file_num' in locals() else 0
        final_url_count = url_id + 1 if 'url_id' in locals() else 0
        print(f'\n--- Execution completed for {final_org_name} ---')
        print(f'Processed Files: {final_file_count}/{len(files)}, Last URL no. : {final_url_count}')
    finally:
        if driver:
            print("Closing final browser driver.")
            driver.quit()
    return list_of_not_exec

In [164]:
# resume_exec_for = fetch_carsdata(organizations)

In [296]:
feature_names = ['Reg_yr', 'Manufacturing_yr', 'Km_driven', 'Engine capacity', 'Prev_owners', 'Cylinders', 'Max Power', 'Seater', 'Ground_clearance', 'Bootspace', 'Fueltank', 'Airbags', 'Fueltype','RTO', 'Transmission', 'Insurance_expiry', 'Insurance_type', 'Drivetrain', 'Bodytype', 'Model','Brand', 'City', 'Price']

cars24_labels = ['Reg. year', 'Make year', 'KM driven', 'Engine capacity', 'Ownership', 'Cylinders', 'Max Power (bhp)', 'Seating Capacity', 'Ground Clearance (mm)', 'Bootspace (litres)', 'Fueltank Capacity (litres)','Airbags', 'Fuel', 'Reg number','Transmission', 'insuranceExpiry', 'insuranceType', 'Drivetrain',  'bodyType', 'model', 'make', 'city', 'price']

carwale_labels = ['formattedRegistrationDate', 'Manufacturing_yr', 'kilometers', 'Engine', 'noOfOwners', 'Engine', 'Max Power (bhp@rpm)', 'Seating Capacity', 'Ground Clearance (unladen)', 'Bootspace(litres)', 'Fuel Tank Capacity', 'Airbags', 'Fuel Type', 'registrationNumber', 'Transmission', 'insuranceExpiry', 'insurance', 'Drivetrain', 'bodyType', 'modelName', 'makeName', 'cityName', 'priceNumeric']

cartrade_labels = ['Reg_yr','productionDate','KMS Driven', 'Engine', 'numberOfPreviousOwners', 'Engine', 'Max Power (bhp@rpm)', 'vehicleSeatingCapacity', 'Ground Clearance (unladen)', 'Bootspace', 'Fuel Tank Capacity', 'Airbs', 'fuelType', 'RTO LOCATION', 'vehicleTransmission', 'INSURANCE EXPIRY', 'INSURANCE TYPE', 'Drivetrain', 'bodyType', 'model', 'Brandname', 'addressLocality', 'price']

cardekho_labels = ['Registration Year', 'Year of Manufacture', 'Kms Driven', 'Engine Displacement', 'Ownership', 'No. of Cylinders', 'Max Power', 'Seating Capacity', 'Ground Clearance Unladen', 'Boot Space',  'Fuel Tank Capacity', 'Airbags','Fuel Type', 'RTO', 'Transmission Type', 'Insurance_status','Insurance', 'Drive Type', 'car_segment', 'model_name', 'brand_name', 'city', 'price']

spinny_labels = ['Registration Year', 'Make Year', 'Km driven', 'Displacement', 'No. of Owner', 'Number of cylinders', 'Max power (bhp)', 'Seating capacity', 'Ground clearance', 'Boot space',  'Fuel tank capacity', 'Airbags','Fuel Type', 'RTO', 'vehicleTransmission', 'Insurance Validity','Insurance Type', 'Drivetrain', 'BodyType','model', 'brand', 'Car Location', 'Price']

org_labels = {
    organizations[0] : cars24_labels,
    organizations[1] : carwale_labels,
    organizations[2] : cartrade_labels,
    organizations[3] : cardekho_labels,
    organizations[4] : spinny_labels    
}

In [305]:
from datetime import datetime

def cars24_data2dict(content, features=feature_names, keys=cars24_labels):

    dict_data = {}
    for key, feature in zip(keys, features):
        dict_data[feature] = None

    for key, feature in zip(keys, features):
        val = None
        pattern = f'\\\"label\\\":\\\"{key}\\\",\\\"value\\\":\\\"'
        if pattern in content : 
            val = content.split(pattern)[-1].split('\\\"')[0]
       
        if val == None : 
            pattern2 = f'"{key}":"'
            if pattern2 in content:
                val = content.split(pattern2)[-1].split('",')[0]

        if val == None:
            pattern3 = f'\\\"{key}\\\":\\\"'
            if pattern3 in content:
                val = content.split(pattern3)[-1].split('\\\",')[0]
        
        dict_data[feature] = val

    if dict_data['Max Power'] is not None : 
        rpm = content.split('\\\"label\\\":\\\"Max Power (rpm)\\\",\\\"value\\\":\\\"')[-1].split('\\\"')[0]
        dict_data['Max Power'] = dict_data['Max Power']+'@'+rpm

    if dict_data['Insurance_expiry'] is not None : 
        dt = datetime.fromtimestamp(int(dict_data['Insurance_expiry']))
        dict_data['Insurance_expiry'] = dt.strftime('%Y-%m-%d')
    
    brand_pattern = r'"Brand","name":"([^\\\"]*?)"'
    dict_data['Brand'] = re.search(brand_pattern, content).group(1)

    price_patterns = [
        r'\\\"finalPrice\\\":{\\\"amount\\\":(\d+(?:\.\d+)?),',  
        r'\\\"listingPrice\\\":(\d+(?:\.\d+)?),'                      
    ]

    dict_data['Price'] = None
    for pattern in price_patterns:
        price_match = re.search(pattern, content)
        if price_match:
            dict_data['Price'] = price_match.group(1)
            break
    return dict_data


In [306]:
def carwale_data2dict(content, features=feature_names, keys=carwale_labels):

    dict_data = {}

    if content.strip() == '':
        return dict_data
    
    reqd_data = content.split('usedCarDetails')[-1].split('"similarCars":')[0]

    for i, (key, feature) in enumerate(zip(keys, features)): 
        element_present = True if len(reqd_data.split(f'"specName":"{key}","specValue":"')) > 1 else False
        val = reqd_data.split(f'"specName":"{key}","specValue":"')[-1].split('"')[0] 
        dict_data[feature] = val if val.strip() != '' and element_present else None
        if dict_data[feature] == None:
            element_present = True if len(reqd_data.split(f'"{key}":')) > 1 else False
            val = reqd_data.split(f'"{key}":')[-1].split('"')[1]
            dict_data[feature] = val if val.strip() != '' and element_present else None
    
    if dict_data['Engine capacity'] is not None and ',' in dict_data['Engine capacity']:
        dict_data['Engine capacity'] = dict_data['Engine capacity'].split(',')[0].strip()
        dict_data['Cylinders'] = dict_data['Cylinders'].split(',')[1].split('Cylinders')[0].strip()

    dict_data['Bodytype'] = content.split('"bodyType": "')[1].split('",')[0]

    dict_data['Airbags'] = 0 if 'No Airbags' in reqd_data else 1

    make_yr_pattern, make_mnt_pattern = r'"makeYear":"([^\\\"]*?)"', r'"makeMonth":"([^\\\"]*?)"'
    make_yr, make_mnt = re.search(make_yr_pattern, reqd_data).group(1),re.search(make_mnt_pattern, reqd_data).group(1)    
    dict_data['Manufacturing_yr'] = make_mnt +' '+ make_yr

    return dict_data

In [307]:
def cartrade_data2dict(content, features=feature_names, keys=cartrade_labels):
    clean_dict = {}

    key_pattern = '|'.join(re.escape(key) for key in keys)
    pattern = rf'"({key_pattern})":\s*("([^"]*)"|-?\d+\.?\d*),'
    matches = re.findall(pattern, content)

    found_keys = {}
    for match in matches:
        key = match[0] 
        value = match[1].replace('"','')
        found_keys[key] = value

    reqd_data = content.split('<tbody>')[-1].split('</tbody')[0]
    soup = BeautifulSoup(reqd_data, 'lxml')
    key_vals1 = [tag.get_text(strip=True) for tag in soup.select('td')]

    for key,feature in zip(keys, features):
        clean_dict[feature] = found_keys.get(key, None)

        if clean_dict[feature] is None:        
            if key not in key_vals1 :
                continue
            index = key_vals1.index(key)
            if not index%2 :
                clean_dict[feature] = key_vals1[index+1]
    
    for key, feature in zip(keys, features):
        if clean_dict[feature] is None:

            if len(content.split(key)) < 2 :
                continue
            if key=='Engine' and 'Engine Type' in content:
                val = ' '.join(content.split('Engine Type')[:-1]).split(key)[-1].split('</span>')[1].split('>')[-1].strip()
            else : 
                val = content.split(key)[-1].split('</span>')[1].split('>')[-1].strip()
            clean_dict[feature] = val
            
    clean_dict['Model'] = content.split('.setTargeting("Model",')[-1].split(')')[0].replace("'", "").strip()

    clean_dict['Manufacturing_yr'] = clean_dict['Manufacturing_yr'].split('T')[0]
    clean_dict['Brand'] = content.split('"url":')[0].split('"name": "')[-1].strip()[:-2]
    clean_dict['Airbags'] = 0 if 'No Airbags' in content else 1
    clean_dict['Bodytype'] = content.split('"bodyType": "')[-1].split('",')[0]

    if clean_dict['Engine capacity'] is not None and ',' in clean_dict['Engine capacity']:
        clean_dict['Engine capacity'] = clean_dict['Engine capacity'].split(',')[0].strip()
        clean_dict['Cylinders'] = clean_dict['Cylinders'].split(',')[1].split('Cylinders')[0].strip()

    return clean_dict

In [308]:
def cardekho_data2dict(content, features=feature_names, keys=cardekho_labels):
   clean_dict = {}

   for key, feature in zip(keys, features):
      data = content
      if key == 'Airbags':
         clean_dict[feature] = len(data.split('Airbag'))-1
      elif 'Insurance' in key :
         clean_dict[feature] = None
      elif len(data.split(key)) > 1:
         data = ' '.join(data.split(key)[1:])
         val = data.split('</span')[0].split('">')[-1].strip()
         val = val if val != '' else None
         clean_dict[feature] = val
      else : 
         clean_dict[feature] = None

   for key, feature in zip(keys, features):
      if clean_dict[feature] is None and len(content.split(f'"{key}":"'))>1:
         clean_dict[feature] = content.split(f'"{key}":"')[-1].split('",')[0]

   return clean_dict

In [309]:
def spinny_data2dict(content, features=feature_names, keys=spinny_labels):
   clean_dict = {}
   if '<div class="ResponsiveToolTip__responsiveToolTipContainer DesktopOverview__responsiveTooltip DesktopOverview__increaseBR">' in content :
      reqd_data = content.split('<div class="ResponsiveToolTip__responsiveToolTipContainer DesktopOverview__responsiveTooltip DesktopOverview__increaseBR">')[0]
   else :
      reqd_data = content.split('<div class="ResponsiveToolTip__responsiveToolTipContainer DescriptionModal__featureDescriptionDesktopTooltip"')[0]
   val = None

   for key, feature in zip(keys, features):
      val = None

      if key == 'Airbags':
         val = 1 if 'Airbags' in reqd_data else 0
      elif key == 'BodyType':
         pass
      elif key == 'Displacement' or key == 'Number of cylinders' or key == 'Seating capacity' or key ==  'Max power (bhp)' or key == 'Ground clearance' or  key == 'Boot space' or key == 'Fuel tank capacity' or key == 'Drivetrain':
         if key in reqd_data:
            data = reqd_data.split(key)[-1]
            val = data.split('</')[1].split('">')[-1].strip()
      elif key == 'RTO':
         if f'"{key}","unitText":"' in reqd_data :
            val = reqd_data.split(f'"{key}","unitText":"')[-1].split('"')[0]
      elif key == 'vehicleTransmission' or key == "Price" or key == "brand" or key== "model":
         if f'"{key}":' in reqd_data :
            val = reqd_data.split(f'"{key}":')[-1].split(',')[0].replace('"','')
      elif key in reqd_data:
         data = reqd_data.split(key)[-1]
         val = data.split('">')[1].split('</div>')[0].strip()
      else : 
         val = None
      val = val if val != '' else None
      if val == None :
         if f'"{key}":"' in reqd_data :
            val  = reqd_data.split(f'"{key}":"')[-1].split('"')[0]
      clean_dict[feature] = val

   clean_dict['City'] = clean_dict['City'].split(',')[-1].strip() if clean_dict['City'] is not None else None
   clean_dict['Bodytype'] = None

   return clean_dict

In [310]:
more_specs = {
    organizations[0]: cars24_data2dict,
    organizations[1]: carwale_data2dict,
    organizations[2]: cartrade_data2dict,
    organizations[3]: cardekho_data2dict,
    organizations[4]: spinny_data2dict
    }

In [313]:
def extract_data2dict():
    file_num=0
    try : 
        for org in organizations[:]:
            import_folder = f'organized_cars_structure/{org}_data/cars_data/'
            output_folder = f'organized_cars_structure/{org}_data/cars_dict_data/'
            files = os.listdir(import_folder)
            os.makedirs(output_folder, exist_ok=True)
            for file_num, file in enumerate(files) :
                import_file = Path(os.path.join(import_folder, file))
                output_file = Path(os.path.join(output_folder, file.replace('.txt','_dict.txt')))
                output_file.touch(exist_ok=True)
                if import_file.is_file(): 
                    file_content = import_file.read_text(encoding='utf-8')
                    data_extractor = more_specs[org]
                    specs = data_extractor(file_content)
                    # print(specs)
                    output_file.write_text(str(specs), encoding='utf-8')
                    # break
    except Exception as ex :
        print(f'Execution stopped at : {org} File no. : {file_num+1} due to {ex}, {file} \n{specs}')
    else :
        print(f'Organization : {org} Extracted files : {file_num+1}/{len(files)} ')

In [314]:
extract_data2dict()

Organization : spinny Extracted files : 896/896 


In [317]:
import numpy as np
all_data = {}
for feature in feature_names:
    all_data[feature] = []

for org in organizations:
    root_folder = f'organized_cars_structure/{org}_data/cars_dict_data'
    print('Processing Organization : ', org)
    print(len(os.listdir(root_folder)))
    org_wise = {}
    for feature in feature_names:
        org_wise[feature] = []
    for file in os.listdir(root_folder):
        file = Path(os.path.join(root_folder, file))
        if file.is_file():
            dict_data = file.read_text(encoding='utf-8')
            dict_data = ast.literal_eval(dict_data)
            for feature in feature_names:
                val = dict_data.get(feature, None)
                val = np.nan if val is None else val
                org_wise[feature].append(val)
                all_data[feature].append(val)

    print(org_wise)

Processing Organization :  cars24
815
{'Reg_yr': ['Jul 2018', 'Feb 2014', 'Jun 2015', 'Jun 2017', 'Sep 2020', 'Mar 2023', 'Oct 2018', 'Nov 2018', 'Apr 2019', 'May 2015', 'Mar 2023', 'Jul 2017', 'Sep 2017', 'Jun 2018', 'Oct 2020', 'Jul 2021', 'Sep 2022', 'Sep 2023', 'Aug 2017', 'Aug 2018', 'Jul 2018', 'Nov 2019', 'Mar 2018', 'Jun 2024', 'Dec 2024', 'Feb 2018', 'Oct 2021', 'Feb 2021', 'Jul 2022', 'Apr 2022', 'May 2017', 'Sep 2020', 'Oct 2020', 'Jan 2017', 'Jun 2017', 'Jul 2018', 'Jun 2018', 'Oct 2016', 'Aug 2016', 'Jun 2017', 'Mar 2019', 'Jun 2018', 'Aug 2018', 'Jul 2019', 'Mar 2020', 'Jun 2019', 'Oct 2019', 'Feb 2021', 'Sep 2011', 'May 2015', 'Apr 2016', 'Oct 2016', 'Aug 2016', 'Mar 2017', 'Mar 2019', 'Oct 2019', 'Dec 2020', 'Jun 2021', 'Jun 2010', 'Jan 2012', 'Oct 2013', 'Sep 2013', 'Apr 2014', 'Aug 2014', 'Apr 2015', 'Jul 2014', 'Aug 2014', 'Nov 2015', 'Dec 2019', 'Jun 2019', 'Jan 2020', 'Aug 2021', 'Jan 2023', 'May 2023', 'Mar 2024', 'May 2019', 'Oct 2013', 'Jul 2015', 'Dec 2015', 'O

In [None]:
import pandas as pd
pd.DataFrame(all_data).to_csv('datasets/cars_dataset.csv', index=0)