In [24]:
# start scraping information from just one page

from urllib.request import urlopen
from bs4 import BeautifulSoup

url='https://boston.craigslist.org/search/cta?s=0'

# Read the page inside a `with` block so the HTTP response is closed
# as soon as its body has been consumed.
with urlopen(url) as html:
    bs = BeautifulSoup(html.read(),'html.parser')
cars=bs.find_all('li',{ 'class':'result-row'})

# Each entry is [title, posting date, price] for one listing.
scrapedCarsList=[]
for car in cars:
    salesTitle=car.find('a',{'class':'result-title hdrlnk'})
    price=car.find('span',{'class':'result-price'})
    postingDate=car.find('time',{'class':'result-date'})
    # Some listings do not have a price. find() returns None for any tag
    # that is absent, so also skip rows without a title or date instead of
    # crashing on None.get_text().
    if price is None or salesTitle is None or postingDate is None:
        continue
    new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
    scrapedCarsList.append(new_car)
print(scrapedCarsList[0:3]) # preview the first three scraped cars
len(scrapedCarsList)

[['2018 Chevrolet Silverado 3500HD', 'Feb 19', '$45000'], ['2006 Mercedes Benz C280 4matic', 'Feb 19', '$4995'], ['Car', 'Feb 19', '$3500']]


120

In [25]:
# now let's revise the code to write the results of the first page into a csv file named 'CarCraglist.csv'.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

url='https://boston.craigslist.org/search/cta?s=0'
# Read the page inside a `with` block so the HTTP response is closed
# as soon as its body has been consumed.
with urlopen(url) as html:
    bs = BeautifulSoup(html.read(),'html.parser')
cars=bs.find_all('li',{ 'class':'result-row'})

# Each entry is [title, posting date, price] for one listing.
scrapedCarsList=[]
for car in cars:
    salesTitle=car.find('a',{'class':'result-title hdrlnk'})
    price=car.find('span',{'class':'result-price'})
    postingDate=car.find('time',{'class':'result-date'})
    # Some listings do not have a price. find() returns None for any tag
    # that is absent, so also skip rows without a title or date instead of
    # crashing on None.get_text().
    if price is None or salesTitle is None or postingDate is None:
        continue
    new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
    scrapedCarsList.append(new_car)

# Write the header and the rows through a single file handle with one
# explicit encoding. Opening the file only after scraping succeeded also
# avoids leaving a header-only file behind when the request fails.
with open('CarCraglist.csv', 'w',newline='',encoding='utf-8') as myFile:
    writer = csv.writer(myFile)
    writer.writerow(["sales Title", "Listing Date", "Price"])
    writer.writerows(scrapedCarsList)

In [26]:
# create the list of URLs for the most recent postings (result offsets 0 through 1200 in steps of 120, i.e. 11 result pages)

# One search URL per result page: the `s` query parameter takes the
# offsets 0, 120, 240, ..., 1200.
baseURL='https://boston.craigslist.org/search/cta?s='
urlList=[baseURL+str(offset) for offset in range(0,1201,120)]

print(urlList[0:50]) # preview the generated URLs
len(urlList)

['https://boston.craigslist.org/search/cta?s=0', 'https://boston.craigslist.org/search/cta?s=120', 'https://boston.craigslist.org/search/cta?s=240', 'https://boston.craigslist.org/search/cta?s=360', 'https://boston.craigslist.org/search/cta?s=480', 'https://boston.craigslist.org/search/cta?s=600', 'https://boston.craigslist.org/search/cta?s=720', 'https://boston.craigslist.org/search/cta?s=840', 'https://boston.craigslist.org/search/cta?s=960', 'https://boston.craigslist.org/search/cta?s=1080', 'https://boston.craigslist.org/search/cta?s=1200']


11

In [27]:
# turn the scraping script into a function that takes the result offset (0, 120, 240, ...) as input and returns all the cars on that page in a list-of-lists format.

def craigslistCarsScrape(pageNumber):
    """Scrape one page of Boston Craigslist car listings.

    Parameters
    ----------
    pageNumber : int
        Result offset appended to the search URL as the ``s`` query
        parameter (0, 120, 240, ...). The page number shown in the progress
        message is derived as ``pageNumber/120 + 1``.

    Returns
    -------
    list of list of str
        One ``[title, posting date, price]`` entry per listing that has a
        title, a date and a price.

    Notes
    -----
    Network errors raised by ``urlopen`` are not handled here and propagate
    to the caller.
    """
    print('*** Scraping cars on page:',int(pageNumber/120+1),'***\n\n')

    baseURL='https://boston.craigslist.org/search/cta?s='
    url=baseURL+str(pageNumber)
    # Read the page inside a `with` block so the HTTP response is closed
    # as soon as its body has been consumed.
    with urlopen(url) as html:
        bs = BeautifulSoup(html.read(),'html.parser')
    cars=bs.find_all('li',{ 'class':'result-row'})
    scrapedCarsList=[]
    for car in cars:
        salesTitle=car.find('a',{'class':'result-title hdrlnk'})
        price=car.find('span',{'class':'result-price'})
        postingDate=car.find('time',{'class':'result-date'})
        # Some listings do not have a price. find() returns None for any
        # tag that is absent, so also skip rows without a title or date
        # instead of crashing on None.get_text().
        if price is None or salesTitle is None or postingDate is None:
            continue
        new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
        scrapedCarsList.append(new_car)
    return scrapedCarsList

In [28]:
# add error handling to make the code more robust

from urllib.error import HTTPError
from urllib.error import URLError

def craigslistCarsScraper(pageNumber):
    """Scrape one page of Boston Craigslist car listings, tolerating failures.

    Parameters
    ----------
    pageNumber : int
        Result offset appended to the search URL as the ``s`` query
        parameter (0, 120, 240, ...). The page number shown in the progress
        message is derived as ``pageNumber/120 + 1``.

    Returns
    -------
    list of list of str, or None
        One ``[title, posting date, price]`` entry per listing that has all
        three fields. ``None`` is returned when the page could not be
        fetched (``HTTPError`` or ``URLError``), so callers must check for
        it before iterating over the result.
    """
    print('*** Scraping cars on page:',int(pageNumber/120+1),'***\n\n')

    baseURL='https://boston.craigslist.org/search/cta?s='
    url=baseURL+str(pageNumber)

    try:
        # Read the body inside the `with` block so the HTTP response is
        # closed as soon as it has been consumed.
        with urlopen(url) as html:
            pageSource = html.read()

    # HTTPError is a subclass of URLError, so it has to be caught first.
    except HTTPError as e:
        print(e)
        print('-----------------------HTTPError----------------------')
        return None
    except URLError as e:
        print('Server cound not be found')
        print('-----------------------URLError----------------------')
        return None

    bs = BeautifulSoup(pageSource,'html.parser')
    cars=bs.find_all('li',{ 'class':'result-row'})

    scrapedCarsList=[]
    for car in cars:
        salesTitle=car.find('a',{'class':'result-title hdrlnk'})
        price=car.find('span',{'class':'result-price'})
        postingDate=car.find('time',{'class':'result-date'})
        #Some listings do not have a price.
        if price is None:
            continue
        try:
            # find() returns None for a missing tag, so get_text() is the
            # call that can actually raise AttributeError. Guard it per row
            # so one malformed listing does not abort the whole page.
            new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
        except AttributeError as e:
            print('Tag was not found')
            print('-----------------------AttributeError----------------------')
            continue

        scrapedCarsList.append(new_car)

    return scrapedCarsList

In [29]:
# Try the scraper on the result page at offset 600 (reported as page 6);
# the return value is displayed as the cell output.
craigslistCarsScraper(600)

*** Scraping cars on page: 6 ***




[['2009 Honda Civic LX sedan', 'Feb 19', '$3700'],
 ['2006 Ford F150 Crew Cab XLT 4x4', 'Feb 19', '$7500'],
 ['1998 Toyota Camry Runs Mechanic Special', 'Feb 19', '$700'],
 ['06 volvo xc90 awd seats 7! loaded cheap 4x4!', 'Feb 19', '$3475'],
 ['2005 Lexus SC 430 Hard Top Convertible 81K', 'Feb 19', '$16356'],
 ['2009 Infinity G 37x Silver on tan Leather Just 76K Very ,Very Nice!',
  'Feb 19',
  '$10250'],
 ['2005 Lexus SC 430 Hard Top Convertible 81K', 'Feb 19', '$16356'],
 ['2005 TOYOTA TUNDRA 4WD DOUBLE CAB SR5/TRD PKG CLEAN SHARP SERVICED',
  'Feb 19',
  '$8200'],
 ['2008 Pontiac Solstice Convertible 66K Stick shift', 'Feb 19', '$8750'],
 ['2005 Lexus SC 430 Hard Top Convertible 81K', 'Feb 19', '$16256'],
 ['2009 FORD F150 V8 4WD SUPR CREWCAB XLT CHRMES FBGLASS CAP XCLEAN SHARP',
  'Feb 19',
  '$13500'],
 ['2004 Toyota Tundra', 'Feb 19', '$6995'],
 ['2012 Mercedes-Benz C-Class  sedan C300 4MATIC (SILVER)',
  'Feb 19',
  '$12495'],
 ['2012 Mercedes-Benz C-Class  sedan C300 4MATIC (SI

In [30]:
# run the function in a loop and write the results to a csv file

# Write the header and every scraped page through a single file handle with
# one explicit encoding, instead of re-opening the file in append mode with
# a different encoding than the header was written with.
with open('craigslist_cars_final.csv', 'w',newline='',encoding='utf-8') as myFile:
    writer = csv.writer(myFile)
    writer.writerow(["Listing Title", "Listing Date", "Price"])
    for i in range(0,1201,120):
        scrapedCarsList=craigslistCarsScraper(i)
        # The scraper returns None when a page could not be fetched;
        # writerows(None) would raise TypeError and abort the whole run,
        # so skip that page and carry on with the next one.
        if scrapedCarsList is None:
            continue
        writer.writerows(scrapedCarsList)

print('----------------------------------------Well done---------------------------------------------- ')
print('-----------------------------------Scraping completed------------------------------------------ ')
print('------------Please find the csv file in the folder where this scraping file exists------------- ')

*** Scraping cars on page: 1 ***


*** Scraping cars on page: 2 ***


*** Scraping cars on page: 3 ***


*** Scraping cars on page: 4 ***


*** Scraping cars on page: 5 ***


*** Scraping cars on page: 6 ***


*** Scraping cars on page: 7 ***


*** Scraping cars on page: 8 ***


*** Scraping cars on page: 9 ***


*** Scraping cars on page: 10 ***


*** Scraping cars on page: 11 ***


----------------------------------------Well done---------------------------------------------- 
-----------------------------------Scraping completed------------------------------------------ 
------------Please find the csv file in the folder where this scraping file exists------------- 
