In [24]:
# start scraping information from just one page

import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup

url='https://boston.craigslist.org/search/cta?s=0'

# Fetch the first results page. The context manager closes the HTTP response
# as soon as the markup has been read and parsed.
with urlopen(url) as html:
    bs = BeautifulSoup(html.read(),'html.parser')

# Each <li class="result-row"> element is treated as one car listing.
cars=bs.find_all('li',{ 'class':'result-row'})

scrapedCarsList=[]
for car in cars:
    salesTitle=car.find('a',{'class':'result-title hdrlnk'})
    price=car.find('span',{'class':'result-price'})
    postingDate=car.find('time',{'class':'result-date'})
    # Some listings do not have a price. find() returns None for any missing
    # tag, so rows lacking a title or date are skipped as well instead of
    # raising AttributeError on .get_text().
    if price is None or salesTitle is None or postingDate is None:
        continue
    new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
    scrapedCarsList.append(new_car)

print(scrapedCarsList[0:3]) # preview of the first three scraped rows
# Last expression of the cell: number of rows kept from this page.
len(scrapedCarsList)

[['2007 Jeep Grand Cherokee Limited Trail Rated Edition 4.7 V8', 'Oct 10', '$1,000'], ['chrysler 300s', 'Oct 10', '$18,000'], ['Very Low Miles 2004 Ford F-150 4X4, V-8 Truck For Sale', 'Oct 10', '$6,500']]


120

In [25]:
# now let's revise the code to write the results of the first page into a csv file named 'CarCraglist.csv'.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

url='https://boston.craigslist.org/search/cta?s=0'

# Scrape first, write afterwards: if the request fails, an existing
# 'CarCraglist.csv' is left untouched instead of being truncated to a header.
with urlopen(url) as html:
    bs = BeautifulSoup(html.read(),'html.parser')
cars=bs.find_all('li',{ 'class':'result-row'})

scrapedCarsList=[]
for car in cars:
    salesTitle=car.find('a',{'class':'result-title hdrlnk'})
    price=car.find('span',{'class':'result-price'})
    postingDate=car.find('time',{'class':'result-date'})
    # Some listings do not have a price. find() returns None for any missing
    # tag, so rows lacking a title or date are skipped too.
    if price is None or salesTitle is None or postingDate is None:
        continue
    new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
    scrapedCarsList.append(new_car)

# Header and rows go through one handle so both use the same (UTF-8) encoding.
# newline='' lets the csv module control line endings itself.
with open('CarCraglist.csv', 'w',newline='',encoding='utf-8') as myFile:
    writer = csv.writer(myFile)
    writer.writerow(["sales Title", "Listing Date", "Price"])
    writer.writerows(scrapedCarsList)


In [26]:
pwd

'/Users/avijames/Dropbox/Mac (2)/Documents/Course/capstone'

In [27]:
# Build one search URL per results page: the `s` query parameter is stepped
# from 0 to 1200 in increments of 120, giving 11 URLs.
baseURL='https://boston.craigslist.org/search/cta?s='
urlList=[baseURL+str(offset) for offset in range(0,1201,120)]

print(urlList[0:50]) # show the generated URLs (the slice caps the printout at 50)
len(urlList)

['https://boston.craigslist.org/search/cta?s=0', 'https://boston.craigslist.org/search/cta?s=120', 'https://boston.craigslist.org/search/cta?s=240', 'https://boston.craigslist.org/search/cta?s=360', 'https://boston.craigslist.org/search/cta?s=480', 'https://boston.craigslist.org/search/cta?s=600', 'https://boston.craigslist.org/search/cta?s=720', 'https://boston.craigslist.org/search/cta?s=840', 'https://boston.craigslist.org/search/cta?s=960', 'https://boston.craigslist.org/search/cta?s=1080', 'https://boston.craigslist.org/search/cta?s=1200']


11

In [28]:
# Turn the scraping script into a function that takes the page number (0, 120, 240, ...) as input and returns all the cars on that page in a list-of-lists format.

def craigslistCarsScrape(pageNumber):
    """Scrape one page of the Boston Craigslist `cta` search results.

    Parameters
    ----------
    pageNumber : int
        Value appended to the `s=` query parameter (0, 120, 240, ...). The
        progress banner reports it as page ``int(pageNumber/120+1)``.

    Returns
    -------
    list of list of str
        One ``[title, posting date, price]`` entry per listing that has all
        three tags. Listings missing any of them are skipped.

    Notes
    -----
    No network error handling is done here; whatever ``urlopen`` raises
    propagates to the caller.
    """
    print('*** Scraping cars on page:',int(pageNumber/120+1),'***\n\n')

    baseURL='https://boston.craigslist.org/search/cta?s='
    url=baseURL+str(pageNumber)
    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(url) as html:
        bs = BeautifulSoup(html.read(),'html.parser')
    cars=bs.find_all('li',{ 'class':'result-row'})

    scrapedCarsList=[]
    for car in cars:
        salesTitle=car.find('a',{'class':'result-title hdrlnk'})
        price=car.find('span',{'class':'result-price'})
        postingDate=car.find('time',{'class':'result-date'})
        # Some listings do not have a price. find() returns None for any
        # missing tag, so guard all three before calling .get_text().
        if price is None or salesTitle is None or postingDate is None:
            continue
        scrapedCarsList.append([salesTitle.get_text(),postingDate.get_text(),price.get_text()])
    return scrapedCarsList

In [29]:
# Add error handling to make the code more robust

from urllib.error import HTTPError
from urllib.error import URLError

def craigslistCarsScraper(pageNumber):
    """Scrape one page of the Boston Craigslist `cta` search results, with error handling.

    Parameters
    ----------
    pageNumber : int
        Value appended to the `s=` query parameter (0, 120, 240, ...). The
        progress banner reports it as page ``int(pageNumber/120+1)``.

    Returns
    -------
    list of list of str, or None
        One ``[title, posting date, price]`` entry per listing that has all
        three tags. ``None`` is returned (after printing a diagnostic) when
        the page cannot be fetched (``HTTPError`` / ``URLError``) or when the
        result lookup raises ``AttributeError``. Callers must check for it.
    """
    print('*** Scraping cars on page:',int(pageNumber/120+1),'***\n\n')

    baseURL='https://boston.craigslist.org/search/cta?s='
    url=baseURL+str(pageNumber)

    try:
        # The context manager closes the HTTP response even if reading fails.
        with urlopen(url) as html:
            page = html.read()
    except HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        print(e)
        print('-----------------------HTTPError----------------------')
        return None
    except URLError:
        print('Server cound not be found')
        print('-----------------------URLError----------------------')
        return None

    bs = BeautifulSoup(page,'html.parser')

    try:
        cars=bs.find_all('li',{ 'class':'result-row'})
    except AttributeError:
        print('Tag was not found')
        print('-----------------------AttributeError----------------------')
        # Explicit, so this failure path returns the same sentinel as the others.
        return None

    scrapedCarsList=[]
    for car in cars:
        salesTitle=car.find('a',{'class':'result-title hdrlnk'})
        price=car.find('span',{'class':'result-price'})
        postingDate=car.find('time',{'class':'result-date'})
        # Some listings do not have a price. find() returns None for any
        # missing tag, so guard all three before calling .get_text();
        # otherwise one malformed row would abort the whole page.
        if price is None or salesTitle is None or postingDate is None:
            continue
        scrapedCarsList.append([salesTitle.get_text(),postingDate.get_text(),price.get_text()])

    return scrapedCarsList

In [30]:
# Spot-check the scraper on a single page: 600 is reported as page 6
# by the banner (int(600/120+1)). The returned list is the cell's output.
craigslistCarsScraper(600)


*** Scraping cars on page: 6 ***




[['2011 FORD RANGER', 'Oct  8', '$4,495'],
 ['2004 TOYOTA COROLLA', 'Oct  8', '$3,995'],
 ['2008 Nissan Altima, 92k, 1 owner', 'Oct  8', '$5,990'],
 ['2004 Honda Accord Ex Coupe, 5 speed manual', 'Oct  8', '$4,490'],
 ['2010 Acura ZDX SH AWD w/Tech 4dr SUV w/Technology Package',
  'Oct  8',
  '$15,995'],
 ['2010 mercedes E350', 'Oct  8', '$9,500'],
 ['2007 Subaru Outback 2.5i Limited 4dr Wagon AWD', 'Oct  8', '$1,800'],
 ['2016 Chevrolet Chevy Silverado 1500 High Country 4x4 4dr Crew Cab 5.8 ft. SB - S',
  'Oct  8',
  '$39,995'],
 ['2018 Chrysler Pacifica Touring L', 'Oct  8', '$31,990'],
 ['2018 Hyundai Tucson SEL AWD 4dr SUV BAD CREDIT FINANCING ',
  'Oct  8',
  '$20,995'],
 ['2015 Jeep Wrangler Sport - BAD CREDIT OK!', 'Oct  8', '$99'],
 ['2014 HONDA CR-V EX-L Gray', 'Oct  8', '$14,900'],
 ['2016 Toyota Tacoma GRN305L *Diesel Truck / Trucks*', 'Oct  8', '$24,990'],
 ['2007 Honda Civic EX', 'Oct  8', '$5,750'],
 ['2006 GMC Savana 3500 12ft Trademaster Utility Van 6.0L Gas SKU:14021',

In [31]:
# Run the function in a loop and write the results to a CSV file

# Header and rows are written through one handle so both use the same
# (UTF-8) encoding. newline='' lets the csv module control line endings.
with open('craigslist_cars_final.csv', 'w',newline='',encoding='utf-8') as myFile:
    writer = csv.writer(myFile)
    writer.writerow(["Listing Title", "Listing Date", "Price"])
    # Same values as used to build urlList: 0, 120, ..., 1200 (11 pages).
    for i in range(0,1201,120):
        scrapedCarsList=craigslistCarsScraper(i)
        # The scraper returns None when a page could not be fetched or parsed.
        # Skip that page rather than letting writerows(None) raise TypeError
        # and abort the remaining pages.
        if scrapedCarsList is None:
            continue
        writer.writerows(scrapedCarsList)

print('----------------------------------------Well done---------------------------------------------- ')
print('-----------------------------------Scraping completed------------------------------------------ ')
print('------------Please find the csv file in the folder where this scraping file exists------------- ')

*** Scraping cars on page: 1 ***


*** Scraping cars on page: 2 ***


*** Scraping cars on page: 3 ***


*** Scraping cars on page: 4 ***


*** Scraping cars on page: 5 ***


*** Scraping cars on page: 6 ***


*** Scraping cars on page: 7 ***


*** Scraping cars on page: 8 ***


*** Scraping cars on page: 9 ***


*** Scraping cars on page: 10 ***


*** Scraping cars on page: 11 ***


----------------------------------------Well done---------------------------------------------- 
-----------------------------------Scraping completed------------------------------------------ 
------------Please find the csv file in the folder where this scraping file exists------------- 


In [32]:
# Load the single-page scrape written earlier to 'CarCraglist.csv'.
df = pd.read_csv('CarCraglist.csv')

In [33]:
# Display the DataFrame currently bound to `df` as the cell's output.
df

Unnamed: 0,sales Title,Listing Date,Price
0,2007 Jeep Grand Cherokee Limited Trail Rated E...,Oct 10,"$1,000"
1,chrysler 300s,Oct 10,"$18,000"
2,"Very Low Miles 2004 Ford F-150 4X4, V-8 Truck ...",Oct 10,"$6,500"
3,2017 RAM Ram Pickup 1500 Express 4x4 4dr Quad ...,Oct 10,"$31,995"
4,2018 Chrysler 300 Limited - BAD CREDIT OK!,Oct 10,$99
...,...,...,...
115,"2009 Buick Lucerne CX - 67,000 miles",Oct 10,"$6,000"
116,2006 Honda Civic Lx,Oct 9,"$5,500"
117,2014 Nissan Rogue Select S 4wd/Everyone is APP...,Oct 9,$0
118,1998 GMC RCSB 2WD,Oct 9,"$18,000"


In [34]:
# Load the multi-page scrape from 'craigslist_cars_final.csv'.
# NOTE: this rebinds `df`, replacing the frame read from 'CarCraglist.csv'.
df = pd.read_csv('craigslist_cars_final.csv')

In [35]:
# Display the DataFrame currently bound to `df` as the cell's output.
df

Unnamed: 0,Listing Title,Listing Date,Price
0,2007 Jeep Grand Cherokee Limited Trail Rated E...,Oct 10,"$1,000"
1,chrysler 300s,Oct 10,"$18,000"
2,"Very Low Miles 2004 Ford F-150 4X4, V-8 Truck ...",Oct 10,"$6,500"
3,2017 RAM Ram Pickup 1500 Express 4x4 4dr Quad ...,Oct 10,"$31,995"
4,2018 Chrysler 300 Limited - BAD CREDIT OK!,Oct 10,$99
...,...,...,...
1315,2018 Chrysler 300 Limited - BAD CREDIT OK!,Oct 6,$99
1316,2019 GMC Sierra 3500HD Denali 4x4 4dr Crew Cab...,Oct 6,"$79,990"
1317,2017 Jeep Wrangler Unlimited,Oct 6,"$30,000"
1318,2004 Ford F-350 F350 F 350 Super Duty XLT 4dr ...,Oct 6,"$22,995"


In [36]:
# Preview the first rows of `df` (head() defaults to five rows).
df.head()

Unnamed: 0,Listing Title,Listing Date,Price
0,2007 Jeep Grand Cherokee Limited Trail Rated E...,Oct 10,"$1,000"
1,chrysler 300s,Oct 10,"$18,000"
2,"Very Low Miles 2004 Ford F-150 4X4, V-8 Truck ...",Oct 10,"$6,500"
3,2017 RAM Ram Pickup 1500 Express 4x4 4dr Quad ...,Oct 10,"$31,995"
4,2018 Chrysler 300 Limited - BAD CREDIT OK!,Oct 10,$99


In [37]:
# Print the index range, column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1320 entries, 0 to 1319
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Listing Title  1320 non-null   object
 1   Listing Date   1320 non-null   object
 2   Price          1320 non-null   object
dtypes: object(3)
memory usage: 31.1+ KB
