### Web Scraping

In [19]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
from datetime import datetime

In [28]:
# Build the URLs of the paginated search-result pages.
# The BRSR value advances by 100 per page (0, 100, 200, 300); it appears to be
# the result offset matching RPG=100 results per page -- TODO confirm against the site.
listing_page_template = "https://www.sgcarmart.com/used_cars/listing.php?BRSR={}&RPG=100&AVL=2&VEH=2"
car_page_listing_list = [
    listing_page_template.format(page_number * 100)
    for page_number in range(4)
]

In [29]:
# Sanity check: show the generated page URLs, then how many there are.
page_url_count = len(car_page_listing_list)
print(car_page_listing_list, '\n', page_url_count)

['https://www.sgcarmart.com/used_cars/listing.php?BRSR=0&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=100&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=200&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=300&RPG=100&AVL=2&VEH=2'] 
 4


In [31]:
# Collect the URL of every individual car listing linked from the
# search-result pages in car_page_listing_list.
base_url = 'https://www.sgcarmart.com/used_cars/'
listing_urls = []
for car_page in car_page_listing_list:
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops us from silently parsing an HTTP error page
    # (which would just yield zero links).
    page = requests.get(car_page, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')

    # href=True skips anchors that have no href attribute. For those,
    # link.get('href') is None and the substring checks below would raise
    # TypeError.
    for link in soup.find_all('a', href=True):
        attach = link['href']

        # Keep only links whose href contains both 'ID=' and 'DL=';
        # the href is relative, so prefix it with base_url.
        if ('ID=' in attach) and ('DL=' in attach):
            listing_urls.append(base_url + attach)

    # prevent from getting blocked from the website
    time.sleep(1)

# Remove duplicates once, after all pages have been scraped. dict.fromkeys
# keeps first-seen order, so the list is identical on every run (a round-trip
# through set() does not guarantee any order).
listing_urls = list(dict.fromkeys(listing_urls))

# This cell has always defined set_listing_urls; keep it for any later cell
# that may read it.
set_listing_urls = set(listing_urls)

In [32]:
# Peek at the first ten scraped listing URLs.
first_ten_listing_urls = listing_urls[:10]
print(first_ten_listing_urls)

['https://www.sgcarmart.com/used_cars/info.php?ID=926740&DL=3093', 'https://www.sgcarmart.com/used_cars/info.php?ID=921465&DL=2097', 'https://www.sgcarmart.com/used_cars/info.php?ID=924961&DL=2689', 'https://www.sgcarmart.com/used_cars/info.php?ID=926333&DL=2305', 'https://www.sgcarmart.com/used_cars/info.php?ID=926752&DL=2792', 'https://www.sgcarmart.com/used_cars/info.php?ID=926631&DL=3626', 'https://www.sgcarmart.com/used_cars/info.php?ID=926694&DL=1034', 'https://www.sgcarmart.com/used_cars/info.php?ID=893687&DL=1277', 'https://www.sgcarmart.com/used_cars/info.php?ID=911185&DL=1198', 'https://www.sgcarmart.com/used_cars/info.php?ID=918538&DL=3287']


### Creating Data Frame

In [33]:
# Start from an empty frame that already has one column per scraped field.
listing_columns = [
    'LISTING_URL',
    'MAKE',
    'PRICE',
    'DEPRE_VALUE_PER_YEAR',
    'REG_DATE',
    'ENGINE_CAP_CC',
    'MILEAGE_KM',
    'VEHICLE_TYPE',
]
df = pd.DataFrame(columns=listing_columns)

In [37]:
filename = 'sgcarmart_used_cars_prices'


# The extractors are defined once, ahead of the loop, instead of being
# re-created for every listing. Each one takes the parsed listing page and
# raises (AttributeError / IndexError / ValueError) when the markup it expects
# is missing or malformed; the scraping loop turns such failures into NaN.

def get_car_make(parsed_url):
    """Return the first word of the text of the first 'link_redbanner' element."""
    return parsed_url.find(class_='link_redbanner').text.split()[0]


def get_price(parsed_url):
    """Return the text of the first 'font_red' element as an int, ignoring '$' and ','."""
    price = parsed_url.find_all(class_='font_red')[0].text.strip()
    return int(price.replace('$', '').replace(',', ''))


def get_depreciation(parsed_url):
    """Return the yearly depreciation as an int.

    The value is read from the sibling that follows the second 'label'
    element: the text after the first '$', with thousands separators and the
    trailing '/yr' (plus the 'View models with similar depre' link text)
    removed.
    """
    label = parsed_url.find_all(class_="label")[1]
    # find_next_sibling is the current name of the deprecated findNextSibling alias.
    depre = label.find_next_sibling().text.strip().split('$')
    depre1 = depre[1].replace(',', '').replace('/yrView models with similar depre', '').replace('/yr', '')
    return int(depre1.strip())


def get_reg_date(parsed_url):
    """Return the registration date string, or NaN when the page shows 'N.A.'.

    The date is the first whitespace-separated token of the fourth <td> in the
    second 'row_bg' element, cut at any '('.
    """
    cell_text = parsed_url.find_all(class_='row_bg')[1].find_all('td')[3].text
    reg_date = cell_text.split()[0].split('(')[0]
    if reg_date in ('N.A.', 'N.A'):
        return np.nan
    return reg_date


def get_eng_cap(parsed_url):
    """Return the fifth 'row_info' element's text as an int, ignoring 'cc' and ','."""
    eng_cap = parsed_url.find_all(class_='row_info')[4].text.strip()
    return int(eng_cap.replace('cc', '').replace(',', '').strip())


def get_mileage(parsed_url):
    """Return the number that precedes 'km' in the first 'row_info' element as an int."""
    mileage = parsed_url.find_all(class_='row_info')[0].text.strip().replace(',', '').split('km')
    return int(mileage[0].replace(' ', ''))


def get_veh_type(parsed_url):
    """Return the text of the first <a> inside the first 'row_bg1' element."""
    return parsed_url.find(class_='row_bg1').find_all('a')[0].text


def _extract_or_nan(extractor, parsed_url):
    """Call extractor(parsed_url); return NaN if it raises.

    Only Exception is caught (not a bare except), so Ctrl-C / kernel interrupt
    still stops the scrape.
    """
    try:
        return extractor(parsed_url)
    except Exception:
        return np.nan


# Output column -> function that extracts it, in CSV column order.
field_extractors = (
    ('MAKE', get_car_make),
    ('PRICE', get_price),
    ('DEPRE_VALUE_PER_YEAR', get_depreciation),
    ('REG_DATE', get_reg_date),
    ('ENGINE_CAP_CC', get_eng_cap),
    ('MILEAGE_KM', get_mileage),
    ('VEHICLE_TYPE', get_veh_type),
)

records = []  # one dict per listing; turned into the DataFrame in one go
try:
    for listingurl in listing_urls:
        record = {'LISTING_URL': listingurl}
        try:
            # The timeout keeps a stalled connection from hanging the scrape.
            response = requests.get(listingurl, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Keep the row (URL only) so one unreachable listing does not
            # abort the whole scrape.
            print("Could not fetch {}: {}".format(listingurl, exc))
            for column, _ in field_extractors:
                record[column] = np.nan
        else:
            listing_soup = BeautifulSoup(response.text, 'lxml')
            # Every field, MAKE included, falls back to NaN when its markup
            # is missing.
            for column, extractor in field_extractors:
                record[column] = _extract_or_nan(extractor, listing_soup)

        records.append(record)
        time.sleep(1)  # Prevents us from getting locked out of the website
finally:
    # Runs on normal completion and on interruption alike, so whatever was
    # scraped so far is saved -- without rewriting the CSV after every listing.
    # dtype=object keeps integer fields from being upcast to float in columns
    # that also contain NaN.
    df = pd.DataFrame(
        records,
        columns=['LISTING_URL'] + [column for column, _ in field_extractors],
        dtype=object,
    )
    df.to_csv("{}.csv".format(filename))


In [38]:
# Reload the scraped data from disk, using the CSV's first column as the row
# index, and display it.
scraped_csv_path = 'sgcarmart_used_cars_prices.csv'
df = pd.read_csv(scraped_csv_path, index_col=0)
df

Unnamed: 0,LISTING_URL,MAKE,PRICE,DEPRE_VALUE_PER_YEAR,REG_DATE,ENGINE_CAP_CC,MILEAGE_KM,VEHICLE_TYPE
0,https://www.sgcarmart.com/used_cars/info.php?I...,Kia,61800,8120.0,24-Apr-2017,1591,65000.0,Mid-Sized Sedan
1,https://www.sgcarmart.com/used_cars/info.php?I...,Subaru,60800,8810.0,26-Aug-2016,1600,82740.0,SUV
2,https://www.sgcarmart.com/used_cars/info.php?I...,BMW,72777,7300.0,25-Jan-2011,2497,149563.0,Luxury Sedan
3,https://www.sgcarmart.com/used_cars/info.php?I...,Volvo,48800,9990.0,20-Jun-2014,1560,120000.0,Luxury Sedan
4,https://www.sgcarmart.com/used_cars/info.php?I...,Toyota,58800,8800.0,06-Apr-2016,1598,49980.0,Mid-Sized Sedan
...,...,...,...,...,...,...,...,...
395,https://www.sgcarmart.com/used_cars/info.php?I...,BMW,33800,11330.0,28-Jun-2012,1598,92300.0,Hatchback
396,https://www.sgcarmart.com/used_cars/info.php?I...,Honda,25800,6810.0,25-Jun-2009,1339,,Hatchback
397,https://www.sgcarmart.com/used_cars/info.php?I...,Honda,66888,6690.0,25-Oct-2011,2354,108000.0,MPV
398,https://www.sgcarmart.com/used_cars/info.php?I...,Volkswagen,51600,9310.0,30-Sep-2015,1395,30000.0,Hatchback
