In this notebook, I scrape fresh data from the same website to check whether the model still works on more recent posts. Note that the code is similar to that in part 1 of the notebook. The goal is to have an up-to-date dataset for validating the model that we will build in the next notebook.

In [1]:
#import packages
import requests
import pandas as pd
import time
import random
from bs4 import BeautifulSoup

import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

In [2]:
#sanity check: confirm the listing endpoint answers before scraping
base = 'https://www.sgcarmart.com/used_cars/listing.php'
pre_model = '?MOD='
model = ''
search = '&PRC=0&DEP=0&RGD=0&VEH=0&AVL=0'

#query string is assembled as: base + model filter + remaining search filters
url = ''.join([base, pre_model, model, search])
res = requests.get(url, headers={'User-agent': 'Applebot'})

#HTTP status code of the response, shown as the cell output
res.status_code

200

In [3]:
#scraping the listing pages for the link of every individual car listing
car_data = []

#looping over the listing pages, 100 results at a time
#(BRSR is presumably the offset of the first result on the page - TODO confirm)
for offset in range(0, 1000, 100):

    #url for only passenger vehicle listings by 100s
    current_url = base + '?BRSR=' + str(offset) + '&RPG=100&VEH=2'

    #requesting the page
    res = requests.get(current_url, headers={'User-agent': 'Applebot'})

    #converting html to text
    html = res.text

    #parsing the html with BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')

    #finding the results table in the html text
    table = soup.find('table', {'style': 'margin-top:1px;'})

    #soup.find returns None when nothing matches; report the page and carry on
    #instead of failing with an AttributeError on table.find_all
    if table is None:
        print('no listing table found at ' + current_url + ' (status ' + str(res.status_code) + ')')
        continue

    #every <strong> that wraps an <a> is recorded as one entry;
    #this also picks up banner ads, which are filtered out of the dataframe later
    for row in table.find_all('strong'):
        anchor = row.find('a')
        if anchor is not None:
            car_data.append({'model': anchor.text, 'link': anchor.attrs['href']})

In [4]:
#checking for errors in extraction of text
#a short sample is enough to spot problems; the full list has roughly a thousand entries
car_data[:15]

[{'model': 'Toyota Wish 1.8A (COE till 12/2025)',
  'link': 'info.php?ID=960505&DL=1194'},
 {'model': 'Lexus ES250 Executive Sunroof',
  'link': 'info.php?ID=929909&DL=2296'},
 {'model': 'Volvo V40 T2', 'link': 'info.php?ID=942028&DL=2296'},
 {'model': 'Opel Astra 1.0A Turbo', 'link': 'info.php?ID=955385&DL=2296'},
 {'model': 'Nissan Qashqai 2.0A Premium Moonroof',
  'link': 'info.php?ID=944287&DL=3043'},
 {'model': 'Volkswagen Golf 1.4A TSI Highline',
  'link': 'info.php?ID=950829&DL=2296'},
 {'model': 'BMW 2 Series 216i Gran Tourer Luxury',
  'link': 'info.php?ID=962366&DL=1000'},
 {'model': 'Nissan Qashqai 1.2A DIG-T Premium',
  'link': 'info.php?ID=956793&DL=2296'},
 {'model': 'Audi A4 2.0A TFSI S-tronic', 'link': 'info.php?ID=934961&DL=2296'},
 {'model': 'Toyota Wish 1.8A', 'link': 'info.php?ID=961824&DL=1000'},
 {'model': 'Save on Petrol with OCBC 365 Card!',
  'link': 'https://www.sgcarmart.com/phpads/www/delivery/ck.php?bannerid=3410'},
 {'model': 'Subaru Forester 2.0i-L Sunroo

In [5]:
#building a dataframe with one row per scraped link
car_df = pd.DataFrame(data=car_data)

In [6]:
#allowing up to 500 rows to be shown before pandas truncates the display
pd.set_option('display.max_rows', 500)

#eyeballing the dataframe for any errors
car_df

Unnamed: 0,model,link
0,Toyota Wish 1.8A (COE till 12/2025),info.php?ID=960505&DL=1194
1,Lexus ES250 Executive Sunroof,info.php?ID=929909&DL=2296
2,Volvo V40 T2,info.php?ID=942028&DL=2296
3,Opel Astra 1.0A Turbo,info.php?ID=955385&DL=2296
4,Nissan Qashqai 2.0A Premium Moonroof,info.php?ID=944287&DL=3043
...,...,...
1015,Hyundai Avante 1.6A GLS,info.php?ID=949870&DL=2855
1016,BMW 5 Series 528i Luxury,info.php?ID=961982&DL=2296
1017,Toyota C-HR Hybrid 1.8A G,info.php?ID=961981&DL=2425
1018,Honda Civic Hybrid 1.3A (COE till 02/2022),info.php?ID=957062&DL=1000


In [7]:
#removing rows that are exact duplicates of an earlier row
car_df = car_df.drop_duplicates()

In [8]:
#removing ads from dataframe
#real listings link to a relative 'info.php?ID=...' page, while the ads seen in the
#scraped sample link to an absolute 'https://' url.
#regex=False matters here: read as a regular expression, the '.' and '?' in
#'info.php?ID' are metacharacters, so the pattern never matches a real listing link
#(and negating that match, as before, silently kept every row).
is_listing = car_df.link.str.contains("info.php?ID", regex=False)
is_external = car_df.link.str.contains("https://", regex=False)
car_df = car_df[is_listing & ~is_external]

In [10]:
#renumbering the rows from 0 and discarding the old index,
#so that the rows line up by position with the scraped details later on
car_df = car_df.reset_index(drop=True)

In [11]:
#keys of every scraped record, in the column order of the resulting dataframe
INFO_FIELDS = ['price', 'depreciation', 'reg_date', 'mileage', 'road_tax', 'coe',
               'eng_cap', 'curb_weight', 'manufactured', 'transmission', 'omv',
               'power', 'num_owners', 'type', 'category']


def _blank_record():
    """Return a record with every field in INFO_FIELDS set to an empty string."""
    return dict.fromkeys(INFO_FIELDS, '')


def _parse_car_info(table):
    """Extract the features of one listing from its 'carInfo' table.

    Parameters
    ----------
    table : the BeautifulSoup element found for <table id="carInfo">.

    Returns
    -------
    dict mapping every key of INFO_FIELDS to the text of the matching
    element, with leading/trailing tabs and line breaks stripped.

    Raises
    ------
    IndexError / AttributeError when an expected element is missing.

    NOTE(review): the features are located by fixed position, so a page that
    omits one entry shifts every later field; the saved output of this
    notebook already contains a row whose 'power' column holds 'More than 6'.
    """
    #each lookup is done once per page instead of once per field
    top_cells = table.find_all('td', {'valign': 'top'})
    row_infos = table.find_all('div', {'class': 'row_info'})
    bg1_rows = table.find_all('tr', {'class': 'row_bg1'})

    raw = {
        'price': table.find('tr', {'class': 'row_bg'}).find('td', {'class': 'font_red'}).text,
        'depreciation': top_cells[0].text,
        'reg_date': top_cells[2].text,
        'mileage': row_infos[0].text,
        'road_tax': row_infos[1].text,
        'coe': row_infos[3].text,
        'eng_cap': row_infos[4].text,
        'curb_weight': row_infos[5].text,
        'manufactured': row_infos[6].text,
        'transmission': row_infos[7].text,
        'omv': row_infos[8].text,
        'power': row_infos[10].text,
        'num_owners': row_infos[11].text,
        'type': bg1_rows[0].find_all('td')[1].text,
        'category': bg1_rows[-3].find('td').text,
    }
    return {field: text.strip('\t\r\n') for field, text in raw.items()}


info = []

#for loop to scrape individual listing page from the dataframe
for link in car_df['link']:
    url = 'https://www.sgcarmart.com/used_cars/' + link
    res = requests.get(url, headers={'User-agent': 'Applebot'})
    html = res.text
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', {'id': 'carInfo'})

    #condition of not found: the record stays blank
    dr = _blank_record()

    #condition and location of features we are interested in
    if table is not None:
        try:
            dr = _parse_car_info(table)
        except (AttributeError, IndexError):
            #a page with an unexpected layout is treated like a missing table, so one
            #odd page does not abort the whole scrape; the page is reported for follow-up
            print('unexpected page layout, storing blank record for ' + url)

    #exactly one record per link keeps info aligned row-for-row with car_df
    info.append(dr)

In [12]:
#building a dataframe with one row per scraped listing page and one column per feature
info_df = pd.DataFrame(data=info)

#displaying the result for inspection
info_df

Unnamed: 0,price,depreciation,reg_date,mileage,road_tax,coe,eng_cap,curb_weight,manufactured,transmission,omv,power,num_owners,type,category
0,"$40,800","$8,250 /yr ...",27-Jan-2011(4yrs 11mths 9days COE left) ...,N.A. ...,\t$976 /yr,"$20,357","1,798 cc","1,360 kg ...",2010,Auto,"$21,503",106.0 kW (142 bhp),3,MPV,"Category\nCOE Car, Premium Ad Car"
1,"$125,800","$14,550 /yrVie...",28-Mar-2018(7yrs 2mths 5days COE left) ...,"50,000...","\t$1,792 /yr","$37,010","2,494 cc","1,615 kg ...",2018,Auto,"$36,195",135.0 kW (181 bhp),1,Luxury Sedan,"Category\nPARF Car, Premium Ad Car"
2,"$78,500","$10,050 /yrVie...",16-Nov-2017(6yrs 9mths 24days COE left) ...,"51,000...",\t$684 /yr,"$47,112","1,498 cc","1,454 kg ...",2017,Auto,"$23,530",90.0 kW (120 bhp),1,Hatchback,"Category\nPARF Car, Premium Ad Car"
3,"$59,500","$8,460 /yrView...",17-Oct-2017(6yrs 8mths 24days COE left) ...,"62,000...",\t$392 /yr,"$41,761",999 cc,"1,188 kg ...",2017,Auto,"$19,155",77.0 kW (103 bhp),1,Hatchback,"Category\nPARF Car, Premium Ad Car"
4,"$67,800","$9,770 /yrView...",28-Dec-2016(5yrs 11mths 5days COE left) ...,"75,200...","\t$1,210 /yr","$51,109","1,997 cc","1,379 kg ...",2016,Auto,"$19,653",106.0 kW (142 bhp),2,SUV,Category\nPARF Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,"$328,000","$39,640 /yr ...",26-Apr-1993(8yrs 3mths 8days COE left) ...,"69,000...",\t$858 /yr,"$26,175","1,308 cc",N.A. ...,1992,Manual,"$52,183",More than 6,,Sports Car,"Category\nCOE Car, Low Mileage Car"
997,"$78,800","$7,670 /yrView...",23-Jun-2020(9yrs 5mths COE left) ...,"1,000 ...",\t$738 /yr,"$31,210","1,591 cc","1,345 kg ...",2019,Auto,"$13,151",93.8 kW (125 bhp),1,Mid-Sized Sedan,"Category\nPARF Car, Almost New Car, Low Mileag..."
998,"$86,800","$17,360 /yrVie...",01-Apr-2014(3yrs 2mths 9days COE left) ...,"109,00...","\t$1,210 /yr","$84,001","1,997 cc","1,615 kg ...",2013,Auto,"$53,265",180.0 kW (241 bhp),2,Luxury Sedan,"Category\nPARF Car, Premium Ad Car"
999,"$75,888","$11,410 /yrVie...",08-Jun-2017(6yrs 4mths 16days COE left) ...,"32,101...",\t$974 /yr,"$54,556","1,797 cc","1,440 kg ...",2017,Auto,"$31,675",90.0 kW (120 bhp),1,SUV,"Category\nPARF Car, Premium Ad Car, Low Mileag..."


In [13]:
#checking shape before merging: (number of rows, number of columns) of the links dataframe
car_df.shape

(1001, 2)

In [14]:
#checking shape before merging
#the merge that follows pairs rows by index, so a row-count mismatch would silently drop listings
assert len(info_df) == len(car_df), 'info_df and car_df must each hold one row per listing'
info_df.shape

(1001, 15)

In [15]:
#joining the links and the scraped features side by side on their shared row index
merge_df = pd.merge(car_df, info_df, left_index=True, right_index=True)

In [18]:
#Data scraped on 22 January 2021
#saving the merged validation set to disk without the index column
merge_df.to_csv('../data/cars_info_validation.csv', index=False)