# Car Price Prediction: Data Collection

`Mostly Required Features: `
1. Brand
2. Model
3. Variant
3. Manufacturing Year
4. Driven kilometers
5. Fuel
6. Number of Owners
7. Location
8. Price of the Car

`Sources:`
- Olx
- Cardekho
- Cars24

`Body Type:`
- Hatchback
- Sedan
- SUV
- Luxury Sedan
- Luxury SUV
- Coupe
- Mini Van

In [61]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import time, sys
import tqdm.notebook as tqdm
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### Defining functions for webscraping

In [9]:
#for starting driver
def start_driver():
    return webdriver.Chrome('chromedriver.exe')

#for scraping data from https://www.cars24.com/
def scrape_from_cars24(url='https://www.cars24.com',location='delhi',body_type='Hatchback',limit=1000):
    #starting driver
    driver = start_driver()
    driver.maximize_window()
    
    #initializing driver with url
    driver.get(url)
    
    try:
        #clicking on select manually button
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,'//button[contains(text(),"SELECT MANUALLY")]'))).click()

        #initializing search box with location
        searchBox = WebDriverWait(driver,10).until(ec.presence_of_element_located((By.XPATH,'//div[@class="_6QaMX"]/input')))
        searchBox.send_keys(location)

        #clicking on searched location
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,'//ul[@class="_16Bvy"]/li[1]'))).click()

        #clicking on VIEW ALL CARS link
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,'//a[contains(text(),"VIEW ALL CARS")]'))).click()

        #clicking on filter By Body Type
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,'//h3[contains(text(),"By Body Type")]'))).click()

        #clicking on body type
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,f'//div[@class="_23XSz"]/div[contains(text(),"{body_type}")]'))).click()
    except:
        driver.close()
        return pd.DataFrame({
            "Brand": [],
            "Model": [],
            "Varient": [],
            "Manufacture Year": [],
            "Driven Kilometers": [],
            "Fuel": [],
            "Number of Owner": [],
            "Body Type": [],
            "Location": [],
            "Price": []
        })
    
    #scraping the required data
    brands = []
    models = []
    varients = []
    manufacture_years = []
    driven_kms = []
    fuels = []
    owners = []
    locations = []
    body_types = []
    prices = []
    
    time.sleep(5)
    container = WebDriverWait(driver,10).until(ec.presence_of_element_located((By.XPATH,'/html/body/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]')))
        
    count = 0
    x = 0
    y = 200
    success = 0
    failed = 0
    for i in tqdm.tqdm(range(limit),desc=f"Scraping [{body_type} in {location}]"):
        try:
            #moving to next sibling data
            time.sleep(2)
            container = container.find_element_by_xpath('./following-sibling::div')
            
            #scraping data
            title = container.find_element_by_xpath('.//h2[@class="_3ENhq"]').text
            driven_km = container.find_element_by_xpath('.//div[@class="_Ecri"]/p/span[1]').text
            fuel = container.find_element_by_xpath('.//div[@class="_Ecri"]/p/span[2]/span').text
            owner = container.find_element_by_xpath('.//div[@class="_Ecri"]/p/span[3]').text
            price = container.find_element_by_xpath('.//h3[@class="_6KkG6"]').text
            
            #storing scraped data
            title_list = title.split()
                        
            #for year
            manufacture_years.append(title_list[0])
            
            #for brand
            brand = title_list[1]
            ct = 2
            if title_list[ct].lower() in ['suzuki']:
                brand += " "+title_list[ct]
                ct += 1
            
            brands.append(brand)
            
            #for model
            models.append(title_list[ct])
            ct += 1
            
            #for varient
            varients.append(' '.join(title_list[ct:]))
            
            #for driven kilometers
            driven_kms.append(driven_km)
            
            #for fuel type
            fuels.append(fuel)
            
            #for number of owner
            owners.append(owner)
            
            #for location
            locations.append(location)
            
            #for body type
            body_types.append(body_type)
            
            #for price
            prices.append(price)
            
            count = 0
            success += 1
        except BaseException as e:
            count += 1
            failed += 1
            if(count > 3):
                print("No more records to scrape!")
                break
            
        sys.stdout.flush()
        sys.stdout.write(f"\rSuccess: {success} | Error: {failed}              ")
        if i % 3 == 0:
            driver.execute_script(f'window.scrollBy({x},{y})')
            x = y
            y = y + 200
    
    #creating dataframe of scraped data
    length = min([len(brands),len(models),len(varients),len(manufacture_years),len(driven_kms),len(fuels),len(owners),len(body_types),len(locations),len(prices)])
    df = pd.DataFrame({
        "Brand": brands[:length],
        "Model": models[:length],
        "Varient": varients[:length],
        "Manufacture Year": manufacture_years[:length],
        "Driven Kilometers": driven_kms[:length],
        "Fuel": fuels[:length],
        "Number of Owner": owners[:length],
        "Body Type": body_types[:length],
        "Location": locations[:length],
        "Price": prices[:length]
    })
    
    driver.close()
    return df




#for scraping data from https://www.cardekho.com/
def scrape_from_cardekho(url='https://www.cardekho.com/',location='Delhi NCR',body_type='Hatchback',limit=1000):
    #starting driver
    driver = start_driver()
    driver.maximize_window()
    
    #starting sub driver
    sub_driver = start_driver()
    sub_driver.maximize_window()
    
    #initializing driver with url
    driver.get(url)
    driver.refresh()
    
    try:
        #clicking on select Used Car
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,'//ul[@class="gsc-ta-clickWrap "]/li[contains(text(),"Used Car")]'))).click()

        #initializing search box with location
        #searchBox = WebDriverWait(driver,10).until(ec.presence_of_element_located((By.XPATH,'//input[@id="budCityName"]')))
        #searchBox.send_keys(location)

        #clicking on search button
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,'//div[@id="used_budget"]/div/ul/li[3]/button'))).click()

        #clicking on location
        WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,f'//*[@id="cityPopUp"]/div/div[2]/section/div[2]/ul/li/a/span[3][contains(text(),"{location}")]'))).click()

        #clicking on filter By Body Type
        #WebDriverWait(driver,10).until(ec.element_to_be_clickable((By.XPATH,'//h3[contains(text(),"By Body Type")]'))).click()

        #clicking on body type
        WebDriverWait(driver,20).until(ec.element_to_be_clickable((By.XPATH,f'//li/label[@title="{body_type}"]'))).click()
    except:
        sub_driver.close()
        driver.close()
        return pd.DataFrame({
            "Brand": [],
            "Model": [],
            "Varient": [],
            "Manufacture Year": [],
            "Driven Kilometers": [],
            "Fuel": [],
            "Number of Owner": [],
            "Body Type": [],
            "Location": [],
            "Price": []
        })
    
    #scraping the required data
    brands = []
    models = []
    varients = []
    manufacture_years = []
    driven_kms = []
    fuels = []
    owners = []
    locations = []
    body_types = []
    prices = []
    
    x = 0
    y = 500
    driver.execute_script(f'window.scrollBy({x},{y})')
    try:
        container = WebDriverWait(driver,10).until(ec.presence_of_element_located((By.XPATH,'//*[@id="rf01"]/div[1]/div/div/main/div[1]/div[1]/div[4]/div[1]')))
    except:
        container = WebDriverWait(driver,10).until(ec.presence_of_element_located((By.XPATH,'/html/body/div[2]/div/div[1]/div/div/main/div[1]/div[1]/div[5]/div[1]')))
    
    count = 0
    success = 0
    failed = 0
    for i in tqdm.tqdm(range(limit),desc=f"Scraping [{body_type} in {location}]"):
        try:
            #moving to next sibling data
            time.sleep(3)
            container = container.find_element_by_xpath('./following-sibling::div')
            container.get_attribute('innerHTML')
            
            #scraping data
            title = container.find_element_by_xpath('.//*[@class="gsc_col-xs-7 carsName"]/a')
            title_text = title.text
            
            #initializing sub_driver with current car url
            sub_url = title.get_attribute('href')
            sub_driver.get(sub_url)
            
            varient = container.find_element_by_xpath('.//div[@class="holder"]/div[1]/div[1]/div[1]').text
            driven_km = container.find_element_by_xpath('.//div[@class="gsc_col-xs-7 carsName"]/div[2]/span[1]').text
            fuel = container.find_element_by_xpath('.//div[@class="gsc_col-xs-7 carsName"]/div[2]/span[2]').text
            price = container.find_element_by_xpath('.//span[@class="amnt "]').text
            
            #getting number of owners from sub_driver
            try:
                owner = WebDriverWait(sub_driver,5).until(ec.presence_of_element_located((By.XPATH,'//*[@id="widget-Overview"]/div/div/div/div[1]/ul/li[4]/div/div'))).text
            except:
                owner = WebDriverWait(sub_driver,5).until(ec.presence_of_element_located((By.XPATH,'//*[@id="rf01"]/div[1]/div/div/main/div/div/div[1]/div[3]/div/div/div/ul/li[6]/div/div[2]'))).text
                
            
            #storing scraped data
            title_list = title_text.split()
                        
            #for year
            manufacture_years.append(title_list[0])
            
            #for brand
            brand = title_list[1]
            ct = 2
            if title_list[ct].lower() in ['suzuki']:
                brand += " "+title_list[ct]
                ct += 1
            
            brands.append(brand)
            
            #for model
            models.append(' '.join(title_list[ct:]))
                        
            #for varient
            varients.append(varient)
            
            #for driven kilometers
            driven_kms.append(driven_km)
            
            #for fuel type
            fuels.append(fuel)
            
            #for number of owner
            owners.append(owner)
            
            #for location
            locations.append(location)
            
            #for body type
            body_types.append(body_type)
            
            #for price
            prices.append(price)
            
            count = 0
            success += 1
        except BaseException as e:
            #print(str(e))
            count += 1
            failed += 1
            if(count > 3):
                print("No more records to scrape!")
                break
            
        sys.stdout.flush()
        sys.stdout.write(f"\rSuccess: {success} | Error: {failed}              ")
        if i % 3 == 0:
            driver.execute_script(f'window.scrollBy({x},{y})')
            x = y
            y = y + 300
    
    #creating dataframe of scraped data
    length = min([len(brands),len(models),len(varients),len(manufacture_years),len(driven_kms),len(fuels),len(owners),len(body_types),len(locations),len(prices)])
    df = pd.DataFrame({
        "Brand": brands[:length],
        "Model": models[:length],
        "Varient": varients[:length],
        "Manufacture Year": manufacture_years[:length],
        "Driven Kilometers": driven_kms[:length],
        "Fuel": fuels[:length],
        "Number of Owner": owners[:length],
        "Body Type": body_types[:length],
        "Location": locations[:length],
        "Price": prices[:length]
    })
    
    sub_driver.close()
    driver.close()
    return df



#function to merge dataframes
def merge_df(df_dict=None):
    p_keys = list(df_dict.keys())
    s_keys = list(df_dict[p_keys[0]].keys())
    ct = 0
    df = None
    
    for pk in p_keys:
        for sk in s_keys:
            if ct == 0:
                df = df_dict[pk][sk]
                ct += 1
                
            df = pd.merge(df,df_dict[pk][sk],how='outer')
        
    return df


In [5]:
#scraping data from cars24.com
df_cars24_dict = {}
location = ['Delhi','Noida','Gurgaon','Mumbai','Pune']
body_type = ['Hatchback','Sedan','SUV','Luxury Sedan','Luxury SUV']
for loc in location:
    df_cars24_dict[loc] = {}
    for bt in body_type:
        df_cars24_dict[loc][bt] = scrape_from_cars24(location=loc,body_type=bt,limit=800)
        

Scraping [Hatchback in Delhi]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 686 | Error: 8              No more records to scrape!


Scraping [Sedan in Delhi]:   0%|          | 0/800 [00:03<?, ?it/s]

Success: 290 | Error: 8              No more records to scrape!


Scraping [SUV in Delhi]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 231 | Error: 8              No more records to scrape!


Scraping [Luxury Sedan in Delhi]:   0%|          | 0/800 [00:02<?, ?it/s]

Success: 38 | Error: 8              No more records to scrape!


Scraping [Luxury SUV in Delhi]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 46 | Error: 8              No more records to scrape!


Scraping [Hatchback in Noida]:   0%|          | 0/800 [00:03<?, ?it/s]

Success: 589 | Error: 8              No more records to scrape!


Scraping [Sedan in Noida]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 235 | Error: 8              No more records to scrape!


Scraping [SUV in Noida]:   0%|          | 0/800 [00:02<?, ?it/s]

Success: 195 | Error: 8              No more records to scrape!


Scraping [Luxury Sedan in Noida]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 29 | Error: 8              No more records to scrape!


Scraping [Luxury SUV in Noida]:   0%|          | 0/800 [00:03<?, ?it/s]

Success: 37 | Error: 8              No more records to scrape!


Scraping [Hatchback in Gurgaon]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 586 | Error: 8              No more records to scrape!


Scraping [Sedan in Gurgaon]:   0%|          | 0/800 [00:02<?, ?it/s]

Success: 234 | Error: 8              No more records to scrape!


Scraping [SUV in Gurgaon]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 194 | Error: 8              No more records to scrape!


Scraping [Luxury Sedan in Gurgaon]:   0%|          | 0/800 [00:02<?, ?it/s]

Success: 29 | Error: 8              No more records to scrape!


Scraping [Luxury SUV in Gurgaon]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 37 | Error: 8              No more records to scrape!


Scraping [Hatchback in Mumbai]:   0%|          | 0/800 [00:03<?, ?it/s]

Success: 694 | Error: 8              No more records to scrape!


Scraping [Sedan in Mumbai]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 287 | Error: 8              No more records to scrape!


Scraping [SUV in Mumbai]:   0%|          | 0/800 [00:03<?, ?it/s]

Success: 247 | Error: 8              No more records to scrape!


Scraping [Luxury Sedan in Mumbai]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 47 | Error: 8              No more records to scrape!


Scraping [Luxury SUV in Mumbai]:   0%|          | 0/800 [00:22<?, ?it/s]

Success: 43 | Error: 8              No more records to scrape!


Scraping [Hatchback in Pune]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 579 | Error: 8              No more records to scrape!


Scraping [Sedan in Pune]:   0%|          | 0/800 [00:02<?, ?it/s]

Success: 226 | Error: 8              No more records to scrape!


Scraping [SUV in Pune]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 182 | Error: 8              No more records to scrape!


Scraping [Luxury Sedan in Pune]:   0%|          | 0/800 [00:03<?, ?it/s]

Success: 37 | Error: 8              No more records to scrape!


Scraping [Luxury SUV in Pune]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 27 | Error: 8              No more records to scrape!


In [10]:
#merging data to a single dataframe
df_cars24 = merge_df(df_cars24_dict)
df_cars24

Unnamed: 0,Brand,Model,Varient,Manufacture Year,Driven Kilometers,Fuel,Number of Owner,Body Type,Location,Price
0,Maruti,Zen,Estilo LXI,2011,"24,337km",Petrol,1st Owner,Hatchback,Delhi,"₹ 2,10,499"
1,Maruti,Alto,K10 VXI,2018,"5,198km",Petrol,1st Owner,Hatchback,Delhi,"₹ 3,43,799"
2,Maruti,Celerio,VXI AMT,2015,"27,739km",Petrol,1st Owner,Hatchback,Delhi,"₹ 3,71,199"
3,Hyundai,i20,SPORTZ 1.2 VTVT,2014,"25,187km",Petrol,1st Owner,Hatchback,Delhi,"₹ 3,60,099"
4,Maruti,S,PRESSO VXI PLUS,2020,"7,556km",Petrol,1st Owner,Hatchback,Delhi,"₹ 4,15,999"
...,...,...,...,...,...,...,...,...,...,...
5820,Mahindra,XUV500,W6 4X2,2015,"52,094km",Diesel,1st Owner,Luxury SUV,Pune,"₹ 7,18,499"
5821,BMW,X1,SDRIVE 20D,2011,"89,880km",Diesel,2nd Owner,Luxury SUV,Pune,"₹ 7,78,199"
5822,Mahindra,XUV500,W6 4X2,2011,"1,25,124km",Diesel,2nd Owner,Luxury SUV,Pune,"₹ 5,04,199"
5823,Toyota,Fortuner,2.8 4x2 AT,2016,"1,46,630km",Diesel,1st Owner,Luxury SUV,Pune,"₹ 22,82,499"


In [11]:
#scraping data from cardekho.com
df_cardekho_dict = {}
cardekho_loc = ['Ahmedabad','Bangalore','Chennai','Hyderabad','Delhi NCR']
cardekho_bt = ['Hatchback','Sedan','SUV','Luxury','Minivan','Coupe','Super Luxury','MUV']
for loc in cardekho_loc:
    df_cardekho_dict[loc] = {}
    for bt in cardekho_bt:
        df_cardekho_dict[loc][bt] = scrape_from_cardekho(location=loc,body_type=bt,limit=800)

Scraping [Hatchback in Ahmedabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 454 | Error: 8              No more records to scrape!


Scraping [Sedan in Ahmedabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 78 | Error: 8              No more records to scrape!


Scraping [SUV in Ahmedabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 119 | Error: 7              No more records to scrape!


Scraping [Luxury in Ahmedabad]:   0%|          | 0/800 [00:02<?, ?it/s]

Success: 100 | Error: 7              No more records to scrape!


Scraping [Minivan in Ahmedabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 29 | Error: 8              No more records to scrape!


Scraping [Super Luxury in Ahmedabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 6 | Error: 3              No more records to scrape!


Scraping [MUV in Ahmedabad]:   0%|          | 0/800 [00:04<?, ?it/s]

Success: 76 | Error: 8              No more records to scrape!


Scraping [Hatchback in Bangalore]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 369 | Error: 12              No more records to scrape!


Scraping [Sedan in Bangalore]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 19 | Error: 7              No more records to scrape!


Scraping [SUV in Bangalore]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 214 | Error: 9              No more records to scrape!


Scraping [Luxury in Bangalore]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 156 | Error: 7              No more records to scrape!


Scraping [Minivan in Bangalore]:   0%|          | 0/800 [00:05<?, ?it/s]

Success: 27 | Error: 8              No more records to scrape!


Scraping [Super Luxury in Bangalore]:   0%|          | 0/800 [00:04<?, ?it/s]

Success: 6 | Error: 3              No more records to scrape!


Scraping [MUV in Bangalore]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 59 | Error: 8              No more records to scrape!


Scraping [Hatchback in Chennai]:   0%|          | 0/800 [00:05<?, ?it/s]

Success: 77 | Error: 7              No more records to scrape!


Scraping [Sedan in Chennai]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 0 | Error: 3              No more records to scrape!


Scraping [SUV in Chennai]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 199 | Error: 7              No more records to scrape!


Scraping [Luxury in Chennai]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 63 | Error: 7              No more records to scrape!


Scraping [Minivan in Chennai]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 17 | Error: 6              No more records to scrape!


Scraping [Sedan in Hyderabad]:   0%|          | 0/800 [00:14<?, ?it/s]

Success: 109 | Error: 9              No more records to scrape!


Scraping [SUV in Hyderabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 215 | Error: 7              No more records to scrape!


Scraping [Luxury in Hyderabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 0 | Error: 3              No more records to scrape!


Scraping [Minivan in Hyderabad]:   0%|          | 0/800 [00:02<?, ?it/s]

Success: 16 | Error: 5              No more records to scrape!


Scraping [Coupe in Hyderabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 6 | Error: 3              No more records to scrape!


Scraping [Super Luxury in Hyderabad]:   0%|          | 0/800 [00:05<?, ?it/s]

Success: 6 | Error: 3              No more records to scrape!


Scraping [MUV in Hyderabad]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 69 | Error: 7              No more records to scrape!


Scraping [Hatchback in Delhi NCR]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 798 | Error: 2              

Scraping [Sedan in Delhi NCR]:   0%|          | 0/800 [00:04<?, ?it/s]

Success: 136 | Error: 6              No more records to scrape!


Scraping [SUV in Delhi NCR]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 373 | Error: 6              No more records to scrape!


Scraping [Luxury in Delhi NCR]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 138 | Error: 4              No more records to scrape!


Scraping [Minivan in Delhi NCR]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 18 | Error: 5              No more records to scrape!


Scraping [Coupe in Delhi NCR]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 30 | Error: 4              No more records to scrape!


Scraping [Super Luxury in Delhi NCR]:   0%|          | 0/800 [00:05<?, ?it/s]

Success: 10 | Error: 3              No more records to scrape!


Scraping [MUV in Delhi NCR]:   0%|          | 0/800 [00:00<?, ?it/s]

Success: 244 | Error: 5              No more records to scrape!


In [57]:
#merging data to a single dataframe
df_cardekho = merge_df(df_cardekho_dict)
df_cardekho

Unnamed: 0,Brand,Model,Varient,Manufacture Year,Driven Kilometers,Fuel,Number of Owner,Body Type,Location,Price
0,Maruti,Swift,VXI,2019,"20,853 kms",Petrol,1st Owner,Hatchback,Ahmedabad,6.2 Lakh
1,Maruti,Baleno,Delta,2017,"34,632 kms",Petrol,1st Owner,Hatchback,Ahmedabad,5.1 Lakh
2,Maruti,Swift,VXI,2021,"2,242 kms",Petrol,1st Owner,Hatchback,Ahmedabad,6.38 Lakh
3,Hyundai,Verna,CRDi 1.6 AT SX Plus,2017,"70,000 kms",Diesel,First Owner,Hatchback,Ahmedabad,9.5 Lakh
4,Hyundai,i20 Active,1.4 SX,2015,"85,000 kms",Diesel,First Owner,Hatchback,Ahmedabad,5.9 Lakh
...,...,...,...,...,...,...,...,...,...,...
4253,Toyota,Innova,2.5 EV CS 7 STR BSIV,2012,"1,83,000 kms",Diesel,First Owner,MUV,Delhi NCR,5.25 Lakh
4254,Toyota,Innova,2.5 GX (Diesel) 7 Seater BS IV,2012,"70,000 kms",Diesel,First Owner,MUV,Delhi NCR,6.5 Lakh
4255,Toyota,Innova,2.5 VX 8 STR BSIV,2010,"2,20,000 kms",Diesel,First Owner,MUV,Delhi NCR,4.95 Lakh
4256,Mahindra,Bolero,SLE,2010,"1,00,000 kms",Diesel,Second Owner,MUV,Delhi NCR,2.6 Lakh


In [71]:
#merging the all scraped data to a single data frame
final_df = pd.merge(df_cars24,df_cardekho,how='outer')
final_df

Unnamed: 0,Brand,Model,Varient,Manufacture Year,Driven Kilometers,Fuel,Number of Owner,Body Type,Location,Price
0,Maruti,Zen,Estilo LXI,2011,"24,337km",Petrol,1st Owner,Hatchback,Delhi,"₹ 2,10,499"
1,Maruti,Alto,K10 VXI,2018,"5,198km",Petrol,1st Owner,Hatchback,Delhi,"₹ 3,43,799"
2,Maruti,Celerio,VXI AMT,2015,"27,739km",Petrol,1st Owner,Hatchback,Delhi,"₹ 3,71,199"
3,Hyundai,i20,SPORTZ 1.2 VTVT,2014,"25,187km",Petrol,1st Owner,Hatchback,Delhi,"₹ 3,60,099"
4,Maruti,S,PRESSO VXI PLUS,2020,"7,556km",Petrol,1st Owner,Hatchback,Delhi,"₹ 4,15,999"
...,...,...,...,...,...,...,...,...,...,...
10078,Toyota,Innova,2.5 EV CS 7 STR BSIV,2012,"1,83,000 kms",Diesel,First Owner,MUV,Delhi NCR,5.25 Lakh
10079,Toyota,Innova,2.5 GX (Diesel) 7 Seater BS IV,2012,"70,000 kms",Diesel,First Owner,MUV,Delhi NCR,6.5 Lakh
10080,Toyota,Innova,2.5 VX 8 STR BSIV,2010,"2,20,000 kms",Diesel,First Owner,MUV,Delhi NCR,4.95 Lakh
10081,Mahindra,Bolero,SLE,2010,"1,00,000 kms",Diesel,Second Owner,MUV,Delhi NCR,2.6 Lakh


In [87]:
#coverting price to proper number format
#multiplying records with 100000 which contains xx Lakh
final_df.Price[final_df.Price.str.contains('Lakh')] = final_df.Price[final_df.Price.str.contains('Lakh')].apply(lambda x: str(float(x.split()[0])*100000))

#multiplying records with 10000000 which contains xx Cr
final_df.Price[final_df.Price.str.contains('Cr')] = final_df.Price[final_df.Price.str.contains('Cr')].apply(lambda x: str(float(x.split()[0])*10000000))

#extracting numbers only and discarding other characters
final_df.Price = final_df.Price.str.replace('[^0-9.]','').astype('float64')

final_df

Unnamed: 0,Brand,Model,Varient,Manufacture Year,Driven Kilometers,Fuel,Number of Owner,Body Type,Location,Price
0,Maruti,Zen,Estilo LXI,2011,"24,337km",Petrol,1st Owner,Hatchback,Delhi,210499.0
1,Maruti,Alto,K10 VXI,2018,"5,198km",Petrol,1st Owner,Hatchback,Delhi,343799.0
2,Maruti,Celerio,VXI AMT,2015,"27,739km",Petrol,1st Owner,Hatchback,Delhi,371199.0
3,Hyundai,i20,SPORTZ 1.2 VTVT,2014,"25,187km",Petrol,1st Owner,Hatchback,Delhi,360099.0
4,Maruti,S,PRESSO VXI PLUS,2020,"7,556km",Petrol,1st Owner,Hatchback,Delhi,415999.0
...,...,...,...,...,...,...,...,...,...,...
10078,Toyota,Innova,2.5 EV CS 7 STR BSIV,2012,"1,83,000 kms",Diesel,First Owner,MUV,Delhi NCR,525000.0
10079,Toyota,Innova,2.5 GX (Diesel) 7 Seater BS IV,2012,"70,000 kms",Diesel,First Owner,MUV,Delhi NCR,650000.0
10080,Toyota,Innova,2.5 VX 8 STR BSIV,2010,"2,20,000 kms",Diesel,First Owner,MUV,Delhi NCR,495000.0
10081,Mahindra,Bolero,SLE,2010,"1,00,000 kms",Diesel,Second Owner,MUV,Delhi NCR,260000.0


In [90]:
#converting 1st, 2nd, 3rd, 4th to First, Second, Third, Fourth in Number of Owners to keep data in same constraits.
for x, y in zip(['1st','2nd','3rd','4th'],['First','Second','Third','Fourth']):
    final_df["Number of Owner"] = final_df["Number of Owner"].str.replace(x,y)
final_df["Number of Owner"].unique()

array(['First Owner', 'Second Owner', 'Third Owner', 'Fourth Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [91]:
#saving scraped data to a csv file
final_df.to_csv('data-car-price-prediction.csv')

### _________EOF_____