In [10]:
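# Uncomment on a fresh runtime (bs4 and requests are imported below as well):
# %pip install selenium pandas openpyxl beautifulsoup4 requests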

In [38]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import pandas as pd
import requests
import re

# Function to fetch car details from each car's page
def getDetailData(link):
    patterns = {
        'Reg year': r'Reg year (\w+ \d{4})',
        'Make year': r'Make year (\d{4})',
        'Reg number': r'Reg number (\w+-\w+)',
        'Engine capacity': r'Engine capacity (\d+ cc)',
        'Spare key': r'Spare key (\w+)',
        'Transmission': r'Transmission (\w+)',
        'KM driven': r'KM driven ([\d,]+ km)',
        'Fuel type': r'Fuel type (\w+)'
    }

    base_url = 'https://www.cars24.com'  # If the link is relative
    full_link = base_url + link if not link.startswith('http') else link

    page = requests.get(full_link)
    soup = BeautifulSoup(page.content, 'html.parser')

    extracted_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, soup.text)
        if match:
            extracted_data[key] = match.group(1)

    return extracted_data

# Setting up Chrome WebDriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # Run in headless mode
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

# Open the Cars24 page for Honda cars in Delhi NCR
url  ="https://www.cars24.com/buy-used-car?f=make%3A%3D%3Amaruti%3AOR%3Amake%3A%3D%3Atoyota%3AOR%3Amake%3A%3D%3Askoda&sort=bestmatch&serveWarrantyCount=true&gaId=349842771.1725638054&listingSource=TabFilter&storeCityId=3686"

driver.get(url)

# Scroll the page to load more cars (adjust the number of scrolls if necessary)
for i in range(5):  # Adjust the range for the number of scrolls
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(2)  # Wait for new content to load

# Save the page source to file
with open("output.html", "w", encoding="utf-8") as file:
    file.write(driver.page_source)

driver.quit()

# Read the saved HTML file
with open('output.html', 'r', encoding="utf-8") as file:
    data = file.read()

# Parse the HTML data
soup = BeautifulSoup(data, 'html.parser')
tables = soup.find_all("a", {"class": "_1_1Uy"})  # Change this class based on the actual class found in the source

Cars = []
for table in tables:
    link = table['href']
    h3=table.find('h3',class_='_2Out2').get_text()
    make=h3.split()[1]
    model = " ".join(h3.split()[2:])  # Car model
    year_of_manufacture = h3.split()[0]

    li_items = table.find('ul', class_='_3jRcd').find_all('li')  # Car details
    km_driven = li_items[-3].text  # Example: '5,061 km'
    fuel_type = li_items[-2].text  # Example: 'Petrol'
    ownership = li_items[-1].text  # Example: '1st owner'
    price = table.find('strong', class_='_37WXy').text  # Price


    location_string=table.find('p',class_='_2rxhF').get_text()
    location=" ".join(location_string.split()[5:])
    #location_string = table.find('p', class_='_2rxhF').text  # Extract location

    # Handling location with error check
    #location_match = re.search(r'Free Test Drive Today at(.*)', location_string)
    #if location_match:
        #location = location_match.group(1).strip()
    #else:
        #location = "Unknown"  # Default to "Unknown" if no match is found

    # Add car data
    car_data = {
        "link": link,
        "make":make,
        "model": model,
        "year_of_manufacture": year_of_manufacture,
        "km_driven": km_driven,
        "fuel_type": fuel_type,
        "ownership": ownership,
        "price": price,
        "location": location
    }

    # Fetch more details from the car's page
    more_details = getDetailData(link)
    car_data.update(more_details)

    Cars.append(car_data)

# Save the data to a CSV file
df = pd.DataFrame(Cars)
csv_file = 'cars_details.csv'
df.to_csv(csv_file, index=False)

print(f"Data successfully written to {csv_file}")


Data successfully written to cars_details.csv


In [39]:
# Show the scraped listings built by the scraping cell; the rendered output
# below is abbreviated to the first and last rows.
df

Unnamed: 0,link,make,model,year_of_manufacture,km_driven,fuel_type,ownership,price,location,Reg year,Make year,Reg number,Engine capacity,Spare key,Transmission,KM driven,Fuel type
0,https://www.cars24.com/buy-used-maruti-swift-d...,Maruti,Swift Dzire VDI,2013,"1,09,035 km",Diesel,2nd owner,₹4.25L,"Bachupally, Hyderabad",Apr 2013,2013,AP10-BD4428Engine,1248 cc,YesTransmission,ManualKM,"109,035 km",Diesel2013
1,https://www.cars24.com/buy-used-maruti-ertiga-...,Maruti,Ertiga VXI AT SHVS,2022,"35,378 km",Petrol,1st owner,₹10.84L,"Bachupally, Hyderabad",May 2022,2022,TS12-EU4555Engine,1462 cc,YesTransmission,AutomaticKM,"35,378 km",Petrol2022
2,https://www.cars24.com/buy-used-maruti-alto-k1...,Maruti,Alto K10 VXI,2022,"22,864 km",Petrol,1st owner,₹4.15L,"Bachupally, Hyderabad",Nov 2022,2022,TS07-JM3858Engine,998 cc,YesTransmission,ManualKM,"22,864 km",Petrol2022
3,https://www.cars24.com/buy-used-maruti-baleno-...,Maruti,Baleno DELTA PETROL 1.2,2016,"55,511 km",Petrol,2nd owner,₹4.91L,"Kompally, Hyderabad",Oct 2016,2016,TS07-FE5376Engine,1197 cc,YesTransmission,ManualKM,"55,511 km",Petrol2016
4,https://www.cars24.com/buy-used-maruti-alto-80...,Maruti,Alto 800 LXI,2014,"43,780 km",Petrol,1st owner,₹2.39L,"Kompally, Hyderabad",Aug 2014,2014,TS08-EC0838Engine,796 cc,NoTransmission,ManualKM,"43,780 km",Petrol2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,https://www.cars24.com/buy-used-maruti-ertiga-...,Maruti,Ertiga VXI AT SHVS,2022,"29,143 km",Petrol,1st owner,₹10.80L,"Bachupally, Hyderabad",Jul 2022,2022,TS07-JG7401Engine,1462 cc,YesTransmission,AutomaticKM,"29,143 km",Petrol2022
210,https://www.cars24.com/buy-used-maruti-ertiga-...,Maruti,Ertiga VXI AT SHVS,2020,"25,009 km",Petrol,1st owner,₹8.93L,"Bachupally, Hyderabad",Sep 2020,2020,TS09-FN6588Engine,1462 cc,YesTransmission,AutomaticKM,"25,009 km",Petrol2020
211,https://www.cars24.com/buy-used-maruti-s-press...,Maruti,S PRESSO VXI PLUS AMT,2020,"29,955 km",Petrol,1st owner,₹4.37L,"Kompally, Hyderabad",Jun 2020,2020,TS07-HF2983Engine,998 cc,YesTransmission,AutomaticKM,"29,955 km",Petrol2020
212,https://www.cars24.com/buy-used-maruti-dzire-2...,Maruti,Dzire ZDI PLUS,2018,"90,164 km",Diesel,1st owner,₹7.18L,"Kompally, Hyderabad",Dec 2018,2018,TS11-EP5868Engine,1248 cc,YesTransmission,ManualKM,"90,164 km",Diesel2018


In [10]:
# NOTE(review): `google.colab` is presumably only importable inside a Google
# Colab runtime — confirm before running this cell elsewhere.
from google.colab import files

# Hand the CSV written by the scraping cell ('cars_details.csv') to Colab's
# download helper.
files.download('cars_details.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>