# Webscrape Sprinter Camper Vans

In [51]:
from time import sleep
from datetime import datetime
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display, Markdown

# import get to call a get request on the site
from requests import get
from jinja2 import Template

In [1]:
# Fetch one Portland results page (query "sprinter 4wd", $40k-$100k) to explore the markup
response = get(
    'https://portland.craigslist.org/search/sss'
    '?max_price=100000&query=sprinter+4wd&min_price=40000'
)
html_soup = BeautifulSoup(response.text, 'html.parser')

# Listings are matched as <li class="result-row"> elements
posts = html_soup.find_all('li', class_='result-row')
print(type(posts))  # sanity check: should be a bs4 ResultSet
print(len(posts))   # sanity check: how many listings this page returned

<class 'bs4.element.ResultSet'>
18


In [2]:
#grab the first post from the results as a sample for exploring the markup
post_one = posts[0]

In [3]:
#grab the price of the first post: it is the text of the first <a> tag inside the result row
post_one_price = post_one.a.text
#strip the surrounding whitespace for display (the stripped value is shown, not stored)
post_one_price.strip()

'$42995'

In [4]:
#grab the posting time from the 'datetime' attribute of the <time class="result-date"> tag;
#it comes as a 'YYYY-MM-DD HH:MM' string, which saves on cleaning efforts
post_one_time = post_one.find('time', class_= 'result-date')
post_one_datetime = post_one_time['datetime']
post_one_datetime

'2020-01-28 07:10'

In [5]:
# The title anchor (<a class="result-title hdrlnk">) holds both the title text and the post URL
post_one_title = post_one.find('a', class_='result-title hdrlnk')

# Title text is the anchor's text content; the link is its href attribute
post_one_title_text = post_one_title.text
post_one_link = post_one_title['href']

post_one_title_text

'2016 Porsche Cayenne All Wheel Drive S E-Hybrid AWD 26k Mi Vented Seats Pano Roo'

In [None]:
#NOTE(review): everything in this cell is commented out and never runs. It reads housing-specific
#fields (bedrooms, sqft, neighborhood) and appears to be left over from a housing-listing scrape -- confirm before reuse.

#grabs the whole segment of housing details. We will need missing value handling in the loop as this kind of detail is not common in posts
#the text can be split, and we can use indexing to grab the elements we want. number of bedrooms is the first element.
#sqft is the third element

#post_one_num_bedrooms = post_one.find('span', class_ = 'housing').text.split()[0]

#post_one_sqft = post_one.find('span', class_ = 'housing').text.split()[2][:-3] #cleans the ft2 at the end

#the neighborhood is grabbed by finding the span class 'result-hood' and pulling the text element from that
#post_one_hood = posts[0].find('span', class_='result-hood').text

In [87]:
#build out the loop
#NOTE: pagination is currently disabled. The commented-out lines below computed the page offsets
#from the total result count; the request loop only asks for the first results page (s=0).
#results_num = html_soup.find('div', class_= 'search-legend')
#results_total = int(results_num.find('span', class_='totalcount').text) #pulled the total count of posts as the upper bound of the pages array

#result pages are addressed by offset: s=0, s=120, s=240, s=360, and so on. So we need to step in size 120 in the np.arange function
#pages = np.arange(0, results_total+1, 120)

iterations = 0    # counts the (city, search) result pages processed

# Parallel lists with one entry per kept post; they become the DataFrame columns after the scrape
days_ago = []            # whole days between the posting time and now
post_title_texts = []    # post title text
post_links = []          # href of the post's title link
post_prices = []         # price as an int
search_ixs = []          # index into `searches` of the query that found the post
post_ixs = [] 

# Craigslist cities to search (each is used as the subdomain of craigslist.org)
cities = (
    'anchorage',
    'fairbanks',
    'portland',
    'seattle',
    'denver',
    'madison',
    'boulder',
    'bozeman',
    'boise',
    'sfbay',
    'phoenix',
    'saltlakecity',
    'albuquerque',
    'minneapolis',
    'wyoming',
    'losangeles',
)

# Search strings: URL query parameters appended to the search URL; commented-out entries are disabled searches
searches = (
    'min_price=40000&max_price=100000&query=sprinter+4x4+camper',
    'min_price=40000&max_price=100000&query=sprinter+4x4+conversion',
    #'min_price=40000&max_price=100000&query=sprinter+4wd+camper',
    #'query=electric+bicycle&min_price=700&max_price=2000',
)
post_ix = 0    # counts the posts kept so far, across all cities and searches

# Posts whose (lower-cased) title contains any of these words are dropped.
# Defined once here rather than rebuilt for every post.
excl_words = (
    'promaster',
    'ford',
    'porsche',
    'dodge',
    'isuzu',
    'roadster',
    'toyota',
    'jeep',
)

# Run every search against every city and collect the matching posts into the parallel lists.
for city in cities:

    for search_ix, search in enumerate(searches):

        # Only the first results page is requested (s=0 is the page-offset parameter).
        search_url = f"https://{city}.craigslist.org/search/sss?{search}&s=0"
        response = get(search_url)
        # random pause between requests to avoid being throttled
        sleep(randint(1, 5))

        # throw warning for status codes that are not 200; report the URL that was requested
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(search_url, response.status_code))

        # parse the page and pull out the result rows
        page_html = BeautifulSoup(response.text, 'html.parser')
        posts = page_html.find_all('li', class_='result-row')

        # extract data item-wise
        for post in posts:

            # posting date: the 'datetime' attribute is a 'YYYY-MM-DD HH:MM' string
            post_datetime = post.find('time', class_='result-date')['datetime']
            dt = datetime.strptime(post_datetime, '%Y-%m-%d %H:%M')
            ago = (datetime.now() - dt).days

            # title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text

            # Filter out posts whose title mentions an excluded word
            title_lower = post_title_text.lower()
            if any(wd in title_lower for wd in excl_words):
                continue

            # post link
            post_link = post_title['href']

            # The first anchor's text holds the price: strip whitespace, the currency
            # symbol and any thousands separators, then convert to int.
            # Posts without a parseable price are skipped (best effort).
            try:
                price_text = post.a.text.strip().replace("$", "").replace(",", "")
                post_price = int(price_text)
            except (AttributeError, ValueError):
                continue

            days_ago.append(ago)
            post_title_texts.append(post_title_text)
            post_links.append(post_link)
            post_prices.append(post_price)
            search_ixs.append(search_ix)
            post_ixs.append(post_ix)
            post_ix += 1

        iterations += 1
        print('.', end='')    # progress: one dot per (city, search) page

print("\nScrape complete!")

# Assemble the parallel scrape lists into one table, one row per kept post
scraped_columns = {
    'days_ago': days_ago,
    'title': post_title_texts,
    'URL': post_links,
    'price': post_prices,
    'search_ix': search_ixs,
    'post_ix': post_ixs,
}

# The same URL can come back from several cities/searches (and people are spammy on
# Craigslist), so keep only the first row seen for each URL.
vans = pd.DataFrame(scraped_columns).drop_duplicates(subset='URL')

# Download the detailed post page for the remaining items and save a trimmed copy locally.
# Assumes the 'posts/' directory already exists.
print('Downloading Posts...')
local_urls = []
for row_ix, row in vans.iterrows():
    post_lines = get(row.URL).text.splitlines()

    # Find the span to cut out: from the first line containing class="tryapp" up to
    # (not including) the first line containing class="postingtitle".
    # None means the marker is not on this page.
    start_ix = next(
        (ix for ix, line in enumerate(post_lines) if 'class="tryapp"' in line),
        None,
    )
    end_ix = next(
        (ix for ix, line in enumerate(post_lines) if 'class="postingtitle"' in line),
        None,
    )

    # Only trim when both markers are present and in the expected order; otherwise keep
    # the page as downloaded instead of reusing indexes from a previous page.
    if start_ix is not None and end_ix is not None and start_ix < end_ix:
        post_lines = post_lines[:start_ix] + post_lines[end_ix:]

    # The file is named after the DataFrame index label, which is unique per row.
    local_url = f'posts/{row_ix}.html'
    # Explicit UTF-8 so pages with non-ASCII text can be written regardless of locale;
    # the context manager guarantees the file is flushed and closed.
    with open(local_url, 'w', encoding='utf-8') as post_file:
        post_file.write('\n'.join(post_lines))
    local_urls.append(local_url)
    print('.', end='')    # progress: one dot per downloaded post
    sleep(randint(1, 5))  # random pause between requests to avoid being throttled
vans['local_url'] = local_urls

# Build a Markdown report: one "###" section per search string, with each post
# rendered as a link, ordered by days since posting (most recent first).
vans_by_age = vans.sort_values(by=['days_ago'])
report_parts = []
for ix, srch in enumerate(searches):
    report_parts.append(f'\n### {srch}\n\n')
    section_rows = vans_by_age.query(f'search_ix == {ix}')
    for _, row in section_rows.iterrows():
        report_parts.append(
            f'[${row.price:.0f}, {row.days_ago} days ago, {row.title}]({row.URL})\n\n'
        )
result = ''.join(report_parts)
Markdown(result)

................................
Scrape complete!
Downloading Posts...
...................................


### min_price=40000&max_price=100000&query=sprinter+4x4+camper

[$86000, 5 days ago, 4x4 sprinter crew camper conversion van](https://portland.craigslist.org/wsc/for/d/tacoma-4x4-sprinter-crew-camper/7065037633.html)

[$86000, 5 days ago, 2019 Mercedes Sprinter 4x4 camper conversion van](https://seattle.craigslist.org/tac/for/d/tacoma-2019-mercedes-sprinter-4x4/7064917221.html)

[$86000, 6 days ago, 2019 Mercedes Sprinter 4x4 Crew Camper van](https://seattle.craigslist.org/tac/for/d/tacoma-2019-mercedes-sprinter-4x4-crew/7064275055.html)

[$84000, 7 days ago, 2017 Mercedes Benz Sprinter 2500 4X4  **Price Reduced **](https://bozeman.craigslist.org/ctd/d/belgrade-2017-mercedes-benz-sprinter-x4/7063350440.html)

[$61000, 9 days ago, Brand new 4x4 Sprinter](https://portland.craigslist.org/mlt/rvs/d/sherwood-brand-new-4x4-sprinter/7062408953.html)

[$42000, 10 days ago, 2016 4x4 Mercedes Sprinter 170 Crew](https://sfbay.craigslist.org/sfc/cto/d/san-francisco-x4-mercedes-sprinter-170/7061971670.html)

[$45000, 16 days ago, 1989 VW Vanagon Syncro Westfalia - CARB legal 1.8t - 200hp](https://losangeles.craigslist.org/lgb/cto/d/la-mirada-1989-vw-vanagon-syncro/7058237079.html)

[$90000, 21 days ago, 2016 Sprinter 4x4 Camper Van](https://ventura.craigslist.org/cto/d/ventura-2016-sprinter-4x4-camper-van/7055020507.html)


### min_price=40000&max_price=100000&query=sprinter+4x4+conversion

[$84500, 3 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7065874797.html)

[$84500, 3 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7065873655.html)

[$69999, 3 days ago, 2019 Mercedes-Benz Sprinter 2500 Cargo Standard Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-mercedes-benz-sprinter-2500/7065872653.html)

[$67500, 3 days ago, 2019 Freightliner Sprinter 2500 Cargo High Roof w/170 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-freightliner-sprinter-2500/7065872274.html)

[$69998, 3 days ago, 2016 Mercedes-Benz Sprinter 2500 Crew High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2016-mercedes-benz-sprinter-2500/7065875623.html)

[$69999, 6 days ago, 2019 Mercedes-Benz Sprinter 2500 Cargo High Roof w/170 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-mercedes-benz-sprinter-2500/7064172509.html)

[$84500, 6 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7064177849.html)

[$84500, 6 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7064176802.html)

[$69998, 6 days ago, 2016 Mercedes-Benz Sprinter 2500 Crew High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2016-mercedes-benz-sprinter-2500/7064174801.html)

[$67500, 6 days ago, 2019 Freightliner Sprinter 2500 Cargo High Roof w/170 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-freightliner-sprinter-2500/7064175223.html)

[$69998, 9 days ago, 2019 Freightliner Sprinter 2500 Cargo High Roof w/170 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-freightliner-sprinter-2500/7062114231.html)

[$69998, 9 days ago, 2016 Mercedes-Benz Sprinter 2500 Crew High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2016-mercedes-benz-sprinter-2500/7062114512.html)

[$61000, 9 days ago, Brand New 2019 Sprinter 4x4](https://sfbay.craigslist.org/sby/rvs/d/san-jose-brand-new-2019-sprinter-4x4/7062406165.html)

[$84500, 9 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7062114980.html)

[$69999, 9 days ago, 2019 Mercedes-Benz Sprinter 2500 Cargo High Roof w/170 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-mercedes-benz-sprinter-2500/7062118370.html)

[$84500, 9 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7062117664.html)

[$58500, 16 days ago, Sprinter 2018 4x4 Passenger 144](https://bozeman.craigslist.org/cto/d/bozeman-sprinter-x4-passenger-144/7058034388.html)

[$69999, 18 days ago, 2019 Mercedes-Benz Sprinter 2500 Cargo High Roof w/170 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-mercedes-benz-sprinter-2500/7056668133.html)

[$82500, 20 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7055543154.html)

[$79999, 20 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7055537077.html)

[$74500, 20 days ago, 2019 Mercedes-Benz Sprinter 2500 Cargo Standard Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-mercedes-benz-sprinter-2500/7055537254.html)

[$69998, 20 days ago, 2016 Mercedes-Benz Sprinter 2500 Crew High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2016-mercedes-benz-sprinter-2500/7055537396.html)

[$69998, 20 days ago, 2019 Freightliner Sprinter 2500 Cargo High Roof w/170 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2019-freightliner-sprinter-2500/7055535194.html)

[$89900, 23 days ago, 2016 4x4 Fully-Loaded 170" Sprinter Van: Seat 4/Sleep 4 Conversion](https://denver.craigslist.org/cto/d/golden-x4-fully-loaded-170-sprinter-van/7053600219.html)

[$79999, 25 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7052207080.html)

[$82500, 25 days ago, 2018 Mercedes-Benz Sprinter 2500 Passenger High Roof w/144 WB Van 3D](https://boise.craigslist.org/ctd/d/boise-2018-mercedes-benz-sprinter-2500/7052205181.html)

[$89000, 27 days ago, 2017 Sprinter Van 4x4 Conversion Build](https://phoenix.craigslist.org/nph/cto/d/flagstaff-2017-sprinter-van-4x4/7050776154.html)



In [89]:
# Render the scraped posts into results.html using the Jinja2 template results_tmpl.html.
# `results` is a list of (search heading, items) pairs; each item is a
# (label, post URL, local copy path) tuple.
results = []
for ix, srch in enumerate(searches):
    # Make the query string readable in HTML: one parameter per line, '+' shown as a space
    heading = srch.replace('&', '<br>').replace('+', ' ')
    matches = vans.query(f'search_ix == {ix}').sort_values(by=['days_ago'])
    items = [
        (f'${row.price:.0f}, {row.days_ago} days ago, {row.title}', row.URL, row.local_url)
        for _, row in matches.iterrows()
    ]
    results.append((heading, items))

# Context managers make sure both files are closed (and the output flushed) promptly.
with open('results_tmpl.html') as tmpl_file:
    t = Template(tmpl_file.read())
with open('results.html', 'w') as out_file:
    out_file.write(t.render(results=results))

10553

In [88]:
# Number of rows (posts) currently in the vans table
len(vans)

35