In [None]:
# Upper price bound in EUR: inserted into the site search URLs and used to filter the combined listings.
max_price=500000
# When True, none_on_error prints the failures it swallows and scrape results are not memoized.
debug=False

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import pickle
from datetime import date
from pathlib import Path
import json
import pycountry
import functools

import requests_cache
# Install a cache for HTTP responses with expire_after=3600 so repeated runs do not re-download pages.
requests_cache.install_cache(expire_after=3600)
# Purge entries that have already expired.
# NOTE(review): the `requests_cache.core` module path is version-specific - confirm against the installed release.
requests_cache.core.remove_expired_responses()

# Directory that receives the CSV and PDF files written by summary().
Path('out').mkdir(parents=True, exist_ok=True)

In [None]:
def memoize(f):
    """Return a wrapper that caches the results of ``f`` per distinct call.

    Works for any combination of positional and keyword arguments as long as
    every argument value is hashable. The cache lives as long as the returned
    wrapper and is never invalidated.

    Parameters:
        f: the function whose results should be cached.

    Returns:
        A function with the same signature (and metadata) as ``f``.
    """
    memo = {}

    @functools.wraps(f)
    def helper(*args, **kwargs):
        # Sort keyword items so that f(a=1, b=2) and f(b=2, a=1) share one entry.
        key = (args, tuple(sorted(kwargs.items())))
        if key not in memo:
            memo[key] = f(*args, **kwargs)
        return memo[key]
    return helper

def none_on_error(func):
    """Decorator: call ``func`` and turn any ``Exception`` into a ``None`` result.

    When the module-level ``debug`` flag is truthy, the function name and the
    exception text are printed before ``None`` is returned.
    """
    def guarded(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except Exception as exc:
            if debug:
                print(func.__name__+' FAILED with '+str(exc))
            result = None
        return result
    # Copy name/docstring of the wrapped function onto the guard.
    return functools.wraps(func)(guarded)

# Browser-like User-Agent sent with every request.
req_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15'
}
def get_page(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Parameters:
        url: address to fetch; redirects are followed.
        timeout: seconds passed to ``requests.get``. Without a timeout a
            stalled server would block the notebook indefinitely.

    Returns:
        The decoded response text (the HTTP status is not checked).
    """
    r = requests.get(url, headers=req_headers, allow_redirects=True, timeout=timeout)
    return r.text

def get_soup(url, parser='html.parser'):
    """Fetch ``url`` with get_page and parse it into a BeautifulSoup tree.

    ``parser`` is handed to BeautifulSoup as the parser name.
    """
    markup = get_page(url)
    return BeautifulSoup(markup, parser)

In [None]:
# Nettivene.com

@none_on_error
def nv_year(div):
    """Year of a Nettivene listing card, or None when it cannot be parsed.

    The value is the text of the first list item inside the
    'vehicle_other_info' div, converted to int.
    """
    first_item = div.find('div', class_='vehicle_other_info').ul.li
    return int(first_item.text)

def nv_country_city(div):
    """Return ``(country, city)`` for a Nettivene listing card.

    The first whitespace-separated token of the bold text inside the
    'location_info' div is inspected: when it equals a pycountry country name
    the result is ``(token, None)``, otherwise the token is taken as a city
    and the result is ``('Finland', token)``.

    Always returns a 2-tuple so callers can unpack the result; on any parsing
    error the result is ``(None, None)`` (and the error is printed when the
    module-level ``debug`` flag is set).
    """
    try:
        sub_div = div.find('div',class_='location_info')
        s = sub_div.b.text.split()[0].strip()
        # Generator instead of a freshly built list: stops at the first match.
        if any(c.name == s for c in pycountry.countries):
            return s, None
        return 'Finland', s
    except Exception as e:
        if debug: print('nv_country_city FAILED with '+str(e))
        return None, None

@none_on_error
def nv_price(div):
    """Price of a Nettivene listing card as int, or None when unparseable.

    Spaces and the euro sign are removed from the 'main_price' div text
    before the conversion.
    """
    price_text = div.find('div', class_='main_price').text
    for junk in (' ', '€'):
        price_text = price_text.replace(junk, '')
    return int(price_text)


def nv_parse_list_page(make,soup):
    """Extract all listings from one Nettivene result page.

    Parameters:
        make: manufacturer name; it is removed from the link text to obtain
            the model column.
        soup: parsed result page.

    Returns:
        A list of ``(url, model, year, country, city, price)`` tuples, one per
        'listingVifUrl' div.
    """
    divs = soup.find_all('div', class_='listingVifUrl')
    l = []
    for div in divs:
        # nv_country_city may yield None when the location cannot be parsed;
        # fall back to an empty location instead of failing on the unpacking.
        country, city = nv_country_city(div) or (None, None)
        link = div.div.a
        l.append(
            (
                link['href'],
                link.text.replace(make,''),
                nv_year(div),
                country,
                city,
                nv_price(div)
            )
        )
    return l

@none_on_error
def nv_next_page_url(soup):
    """href of the 'pageNavigation next_link' anchor, or None if there is none."""
    next_link = soup.find('a', class_='pageNavigation next_link')
    return next_link['href']


def nv_listings(make):
    """Collect Nettivene listings for ``make`` across all result pages.

    Paging stops when no next-page link is found, or as soon as a page
    contains a div with id 'msg' (its text is printed and the rows gathered
    so far are returned).
    """
    url = 'https://www.nettivene.com/en/purjevene/'+make.replace(' ','-').lower()
    rows = []
    while url:
        soup = get_soup(url)
        msg = soup.find('div', id='msg')
        if msg:
            print(msg.text)
            break
        rows.extend(nv_parse_list_page(make, soup))
        url = nv_next_page_url(soup)
    return rows


In [None]:
#yachtworld

@none_on_error
def yw_redux_state_json(soup):
    """Decode the ``window.__REDUX_STATE__`` object embedded in a page.

    The JSON text runs from just after the assignment marker up to and
    including the last '}' of the script content. Returns None on failure.
    """
    marker = 'window.__REDUX_STATE__ = '
    script_text = soup.find('script',string=re.compile('__REDUX_STATE__')).contents[0]
    start = script_text.index(marker) + len(marker)
    end = script_text.rfind('}') + 1
    return json.loads(script_text[start:end])

@none_on_error
def yw_price(record):
    """EUR amount stored under price/type/amount of a record, or None if missing."""
    amounts = record['price']['type']['amount']
    return amounts['EUR']

def yw_country(record):
    """Country name for the record's location country code.

    Falls back to the raw code when pycountry has no entry for it.
    """
    code = record['location']['countryCode']
    match = pycountry.countries.get(alpha_2=code)
    return match.name if match else code

@none_on_error
def yw_parse_record(r):
    """Convert one search record into a listing tuple.

    Returns ``(url, model, year, country, city, price)``; the city is passed
    through ``str``. Returns None when a required key is missing.
    """
    url, model, year = r['mappedURL'], r['model'], r['year']
    # The nominal length (r['boat']['specifications']['dimensions']['lengths']['nominal']['m'])
    # is currently not collected.
    country = yw_country(r)
    city = str(r['location']['city'])
    return (url, model, year, country, city, yw_price(r))

def yw_collect_listings(js):
    """Return the listing tuples for all records of a decoded search state.

    Records that yw_parse_record could not convert (it yields None for them)
    are skipped, so the result only contains tuples that can be loaded into
    the listings DataFrame.
    """
    records = js['search']['searchResults']['search']['records']
    parsed = (yw_parse_record(r) for r in records)
    return [row for row in parsed if row is not None]

def yw_has_next(js):
    """True when the search state's current page number is below its last page."""
    search = js['search']['searchResults']['search']
    return int(search['currentPage']) < int(search['lastPage'])


def yw_listings(make):
    """Collect YachtWorld listings for ``make`` priced up to ``max_price`` EUR.

    Pages are requested by appending '&page=N' to the base URL until the
    embedded state cannot be decoded or reports no further page.
    """
    url_template='https://www.yachtworld.com/boats-for-sale/condition-used/type-sail/make-{}/?currency=EUR&price=0-{}'
    base_url = url_template.format(make.lower().replace(' ','-'),max_price)

    rows = []
    page = 1
    url = base_url
    while True:
        js = yw_redux_state_json(get_soup(url))
        if not js:
            break
        rows += yw_collect_listings(js)
        if not yw_has_next(js):
            break
        page += 1
        url = base_url+'&page='+str(page)

    return rows

In [None]:
#boat24

# Matches e.g. "Year Built 2004" and captures the four digits.
re_year = re.compile(r'Year Built (\d\d\d\d)')

@none_on_error
def b24_year(div):
    """Build year found in the 'blurb__description' list, or None if absent."""
    description = div.find('ul', {'class':'blurb__description'}).text
    return int(re_year.search(description).group(1))

# "EUR <digits><sep><3 digits>" followed by any number of further
# "<sep><3 digits>" groups. The first separator may be any non-digit character;
# later ones must be '.', ',' or "'" so that unrelated trailing text is not
# swallowed. Capturing the extra groups keeps prices of a million or more
# (e.g. "EUR 1.250.000") from being truncated to their first two groups.
re_price = re.compile(r"EUR (\d+)\D(\d{3})((?:[.,']\d{3})*)")

@none_on_error
def b24_price(div):
    """Price in EUR from the 'blurb__price' paragraph, or None if unparseable.

    All captured digit groups are concatenated, thousands separators removed.
    """
    text = div.find('p', {'class':'blurb__price'}).text
    g = re_price.search(text).groups()
    return int(g[0] + g[1] + re.sub(r'\D', '', g[2]))

def b24_scrape(make,soup):
    """Extract all listings from one boat24 result page.

    The 'blurb__location' text is split on '»': the first part is used as the
    country and the second part, when present, as the city.

    Parameters:
        make: manufacturer name, removed from the title to obtain the model.
        soup: parsed result page.

    Returns:
        A list of ``(url, model, year, country, city, price)`` tuples; ``city``
        is None when the location has no second part.
    """
    divs = soup.find_all('div', class_='blurb')

    l = []
    for div in divs:
        loc_a = div.find('p', {'class':'blurb__location'}).text.split('»')
        # Plain None (a trailing comma here would create the tuple (None,)).
        city = loc_a[1].strip() if len(loc_a) > 1 else None
        l.append(
            (
                div['data-link'],
                div['title'].replace(make,''),
                b24_year(div),
                loc_a[0].strip(),
                city,
                b24_price(div),
            )
        )
    return l
    
@none_on_error
def b24_next_url(soup):
    """href of the 'pagination__next' anchor, or None on the last page."""
    next_link = soup.find('a', class_='pagination__next')
    return next_link['href']

def b24_listings(make):
    """Collect boat24 listings for ``make`` priced up to ``max_price`` EUR.

    Follows the next-page link until none is found.
    """
    url = 'https://www.boat24.com/en/sailboats/?src={}&mode=AND&whr=EUR&prs_min=&prs_max={}'.format(
        make.replace(' ','+'),
        max_price
    )
    rows = []
    while url:
        soup = get_soup(url)
        rows.extend(b24_scrape(make, soup))
        url = b24_next_url(soup)
    return rows


In [None]:
#yachtmarket

# Four digits followed by a non-digit character.
re_tym_year = re.compile(r'(\d\d\d\d)\D')

@none_on_error
def tym_year(ov):
    """First four-digit number (followed by a non-digit) in the overview text, or None."""
    found = re_tym_year.search(ov.text)
    return int(found.group(1))


@none_on_error
def tym_price(div):
    """Price from the span inside the 'pricing' div as int, or None if unparseable.

    The euro sign, the text 'EUR' and commas are removed before conversion.
    """
    text = div.find('div',class_='pricing').span.text
    for token in ('€', 'EUR', ','):
        text = text.replace(token, '')
    return int(text)


def tym_scrape(make, soup):
    """Extract all listings from one TheYachtMarket result page.

    The location text is split on ','; its last part is used as the country
    and its first part as the city. Query strings are cut from listing URLs.

    Returns a list of ``(url, model, year, country, city, price)`` tuples.
    """
    rows = []
    for div in soup.find_all('div',class_='result'):
        anchor = div.find('a', class_='boat-name')
        overview = div.find('div', class_='overview')
        location_parts = div.find('div', class_='location').text.split(',')

        url = 'https://www.theyachtmarket.com'+anchor['href'].split('?')[0]
        model = anchor.text.replace(make,'').strip()
        rows.append(
            (url, model, tym_year(overview), location_parts[-1], location_parts[0], tym_price(div))
        )

    return rows


@none_on_error
def tym_next_url(soup):
    """Absolute URL of the rel='next' link, or None when there is no such link."""
    href = soup.find('a', rel='next')['href']
    return 'https://www.theyachtmarket.com/en/boats-for-sale/search/'+href

def tym_listings(make):
    """Collect TheYachtMarket listings for ``make``, following next-page links."""
    url = 'https://www.theyachtmarket.com/en/boats-for-sale/search/?manufacturermodel={}&currency=eur&lengthunit=metres&showsail=1'.format(
            make.replace(' ','+').lower()
        )

    rows = []
    while url:
        soup = get_soup(url)
        rows.extend(tym_scrape(make, soup))
        url = tym_next_url(soup)
    return rows


In [None]:
#scanboat.com
@none_on_error
def sb_price(s):
    """Int price from the text of the first <p> under ``s``, or None if unparseable.

    'EUR' and commas are stripped before conversion.
    """
    digits = s.p.text.replace('EUR','').replace(',','')
    return int(digits)

re_sb_year = re.compile(r'Year : (\d\d\d\d)')
re_sb_country= re.compile(r'Country : (.*)')

@none_on_error
def _sb_year(body):
    """Year from the first <p> of the item body, or None if absent/unparseable."""
    return int(re_sb_year.search(body.p.text).group(1))

@none_on_error
def _sb_country(body):
    """Country from the first <p> of the item body, or None if absent."""
    return re_sb_country.search(body.p.text).group(1)

@none_on_error
def _sb_listing_price(price_tags):
    """Int price from the last <p> of the item header, or None if unparseable."""
    return int(price_tags[-1].text.replace('EUR','').replace(',',''))

def sb_scrape(make, soup):
    """Extract all listings from one scanboat result page.

    Items whose header contains no <p> element are skipped. Year, country and
    price are parsed defensively: a value that cannot be parsed becomes None
    instead of aborting the scrape.

    Returns a list of ``(url, model, year, country, None, price)`` tuples
    (no city is extracted).
    """
    rows = []
    for div in soup.find_all('div', class_='item'):
        header = div.find('header',class_='item__header')
        body = div.find('section',class_='item__body')
        url = 'https://www.scanboat.com'+div.a['href']
        price_tags = header.find_all('p')

        if not price_tags:
            continue
        rows.append(
            (
                url,
                header.section.text.replace(make,'').replace(' - ',''),
                _sb_year(body),
                _sb_country(body),
                None,
                _sb_listing_price(price_tags),
            )
        )
    return rows

@none_on_error
def sb_next_url(soup):
    """Absolute URL of the anchor whose text is 'Next', or None if there is none."""
    href = soup.find('a',string='Next')['href']
    return 'https://www.scanboat.com'+href

    
def sb_listings(make):
    """Collect scanboat listings for ``make``, following 'Next' links."""
    url = 'https://www.scanboat.com/en/boats?SearchCriteria.BoatModelText={}&SearchCriteria.BoatTypeID=1&SearchCriteria.Searched=true&SearchCriteria.ExtendedSearch=False'.format(make.replace(' ','+').lower())

    rows = []
    while url:
        soup = get_soup(url)
        rows.extend(sb_scrape(make, soup))
        url = sb_next_url(soup)
    return rows



In [None]:
#boats.com
# Euro sign followed by comma-grouped digits. One or more ",ddd" groups are
# captured so that prices of a million or more (e.g. "€1,250,000") are not
# truncated to their first two groups.
re_bcom_price = re.compile(r'€(\d{1,3}(?:,\d{3})+)')

@none_on_error
def bcom_price(div):
    """Price in EUR from the 'price' div as int, or None if unparseable."""
    match = re_bcom_price.search(div.find('div',class_='price').text)
    return int(match.group(1).replace(',', ''))

@none_on_error
def bcom_country(loc_a):
    """Country for a boats.com location split on ','.

    The last part is stripped of surrounding whitespace (parts after a comma
    start with a space, which would otherwise never equal a country name) and
    returned if it is a pycountry country name; any other value yields
    'United States'. Returns None when ``loc_a`` is empty.
    """
    s = loc_a[-1].strip()
    # Generator instead of a freshly built list: stops at the first match.
    if any(c.name == s for c in pycountry.countries): return s
    return 'United States'

@none_on_error
def bcom_city(loc_a):
    """First part of the split location, or None when there is none."""
    first_part = loc_a[0]
    return first_part


def bcom_scrape(make, soup):
    """Extract all listings from one boats.com result page.

    Every <li> carrying a 'data-listing-id' attribute is one listing; its
    'country' div text is split on ',' and handed to bcom_country/bcom_city.

    Returns a list of ``(url, model, year, country, city, price)`` tuples.
    """
    rows = []
    for li in soup.find_all('li', {'data-listing-id': True}):
        anchor = li.div.a
        details = li.find('div', class_='details')
        location_parts = li.find('div',class_='country').text.split(',')

        url = 'https://www.boats.com'+anchor['href']
        model = details.div.h2.text.replace(make,'').strip()
        year = int(details.find('div',class_='year').text)
        rows.append(
            (url, model, year, bcom_country(location_parts), bcom_city(location_parts), bcom_price(details))
        )
    return rows

@none_on_error
def bcom_next_url(soup):
    """Absolute URL taken from the last anchor with class 'next', or None if absent."""
    next_links = soup.find_all('a',class_='next')
    return 'https://www.boats.com'+next_links[-1]['href']

def bcom_listings(make):
    """Collect boats.com listings for ``make`` priced up to ``max_price`` EUR.

    Pages are parsed with the 'lxml' parser; paging follows the 'next' link.
    """
    url_template = 'https://www.boats.com/boats-for-sale/?boat-type=sail&make={}&price-to={}&currency=eur'
    url = url_template.format(make.lower().replace(' ','-'), max_price)
    rows = []
    while url:
        soup = get_soup(url,parser='lxml')
        rows.extend(bcom_scrape(make, soup))
        url = bcom_next_url(soup)
    return rows

In [None]:
# Euro sign, one whitespace character, then comma-grouped digits. One or more
# ",ddd" groups are captured so prices of a million or more are not truncated.
re_euro_price = re.compile(r'€\s(\d{1,3}(?:,\d{3})+)')
@none_on_error
def ya_price(div):
    """Price in EUR of a yachtall listing box as int, or None if unparseable.

    The price is searched in the string form of the contents of the parent of
    the span whose text matches 'price'.
    """
    span = div.find('span', string=re.compile('price.*'))
    m = re_euro_price.search(str(span.parent.contents))
    return int(m.group(1).replace(',', ''))

@none_on_error
def ya_city(loc_span):
    """Text of the second sibling after the location span, minus its first character.

    Returns None when that sibling does not exist.
    """
    city_node = loc_span.next_sibling.next_sibling
    return city_node.text[1:]

def ya_scrape(make,soup):
    """Extract the listings of ``make`` from one yachtall result page.

    Boxes whose yard text (span in the second sibling after the <h3>) does
    not contain ``make`` are skipped and reported on stdout.

    Returns a list of ``(url, model, year, country, city, price)`` tuples.
    """
    divs = soup.find_all('div', class_='boatlist-subbox')
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    re_four_digits = re.compile(r'\d\d\d\d')

    l = list()
    for div in divs:
        yard_str = div.find('h3').next_sibling.next_sibling.span.text
        if make in yard_str:
            a = div.find('a',class_='js-hrefBoat')
            loc_span = div.find('span', string=re.compile('location.*'))
            l.append(
                (
                    'https://www.yachtall.com'+a['href'],
                    a.text.replace(make,''),
                    int(div.find('b',string=re_four_digits).text),
                    loc_span.next_sibling.text,
                    ya_city(loc_span),
                    ya_price(div)
                )
            )
        else:
            print('skipped a boat from: '+yard_str)
    return l

@none_on_error
def ya_next_url(soup):
    """Absolute URL of the anchor whose text is '►', or None if there is none."""
    href = soup.find('a', string='►')['href']
    return 'https://www.yachtall.com'+href

def ya_listings(make):
    """Collect yachtall listings for ``make`` priced up to ``max_price``.

    Spaces in ``make`` are encoded as '%20'; paging follows the '►' link.
    """
    url_template = 'https://www.yachtall.com/en/sailboats/used-boats?q={}&sprct={}'
    url = url_template.format(make.replace(' ','%20'),max_price)

    rows = []
    while url:
        soup = get_soup(url)
        rows.extend(ya_scrape(make, soup))
        url = ya_next_url(soup)
    return rows

In [None]:
from forex_python.converter import CurrencyRates
# Look up the EUR->DKK rate once when this cell runs. The dba.dk scraper divides
# listing prices by it and multiplies max_price by it for the search URL.
fx = CurrencyRates()
eur_dkk = fx.get_rate('EUR', 'DKK')

@none_on_error
def dba_year(tr):
    """Int value of the table cell titled 'Modelår', or None if missing/unparseable."""
    year_cell = tr.find('td',title='Modelår')
    return int(year_cell.text)

def dba_scrape(make,soup):
    """Extract the listings of ``make`` from one dba.dk result page.

    A row is used only if the text of its second 'listingLink' anchor contains
    ``make`` and it carries an 'application/ld+json' script. The price is
    converted with ``eur_dkk`` and truncated to int.

    Returns a list of ``(url, model, year, 'Denmark', None, price)`` tuples.
    """
    rows = []
    for tr in soup.find_all('tr', class_='dbaListing'):
        desc = tr.find_all('a', class_='listingLink')[1].text
        if make not in desc:
            continue
        script_tag = tr.find('script',{'type':'application/ld+json'})
        if not script_tag:
            continue
        j = json.loads(script_tag.contents[0])
        url = j['url']
        model = j['name'].split(',')[0].replace(make,'')
        year = dba_year(tr)
        price_eur = int(int(j['offers']['price'])/eur_dkk)
        rows.append((url, model, year, 'Denmark', None, price_eur))
    return rows

@none_on_error
def dba_next_url(soup):
    """Absolute URL of the anchor whose text is 'Næste ', or None if there is none."""
    href = soup.find('a', string='Næste ')['href']
    return 'https://www.dba.dk'+href

def dba_listings(make):
    """Collect dba.dk listings for ``make``, following 'Næste' links.

    The price cap in the search URL is ``max_price`` converted with
    ``eur_dkk`` and rounded to a whole number, so the URL does not carry the
    long fractional part of the float product.
    """
    url_template = 'https://www.dba.dk/baade/baade/sejlbaade/?soeg={}&pris=(-{})'
    next_url = url_template.format(make.replace(' ','+'),round(max_price*eur_dkk))

    l = []
    while next_url:
        soup = get_soup(next_url)
        l += dba_scrape(make,soup)
        next_url = dba_next_url(soup)
    return l

In [None]:
def scrape_listings(make):
    """Scrape every supported site for ``make`` and merge the results.

    Parameters:
        make: manufacturer name handed to each site-specific scraper.

    Returns:
        A DataFrame with the columns url, model, year, country, city and
        price. Text columns are stripped, only rows with
        ``price <= max_price`` are kept, and year/price are rounded to whole
        numbers.
    """
    # Same order as the resulting rows: nv, yw, b24, tym, sb, bcom, ya, dba.
    scrapers = (
        nv_listings, yw_listings, b24_listings, tym_listings,
        sb_listings, bcom_listings, ya_listings, dba_listings,
    )
    rows = []
    for scraper in scrapers:
        # A scraper may hand back None for a record it failed to parse;
        # such entries cannot become DataFrame rows, so leave them out.
        rows.extend(row for row in scraper(make) if row is not None)

    df = pd.DataFrame(
        rows,
        columns=['url','model','year','country','city','price']
    )

    for column in ('model', 'country', 'city'):
        df[column] = df[column].str.strip()

    df = df[df.price <= max_price]

    df = df.round({
        'year': 0,
        'price': 0
    })

    return df

# Outside debug mode, cache the scrape result per make so repeated queries in the
# same kernel session do not hit the sites again; in debug mode always scrape afresh.
if not debug: listings_make = memoize(scrape_listings)   
else: listings_make = scrape_listings

In [None]:
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# (width, height) tuples passed as matplotlib ``figsize``; the values correspond
# to A4 paper measured in inches.
a4_landscape = (11.7, 8.27)
a4_portrait = (8.27,11.7)
    
def scatter_year(df, 
                 ref=None, 
                 hline=100000):
    """Scatter price against year with a regression line on top.

    Marker style follows the first run of digits/dots in the model name and
    colour follows the country.

    Parameters:
        df: listings with the columns year, price, model and country.
        ref: optional (x, y) pair drawn as a large red 'x' marker.
        hline: optional price level drawn as a dashed red horizontal line.

    Returns:
        The matplotlib Axes the plot was drawn on.
    """
    fig, ax = plt.subplots(figsize=a4_landscape,facecolor='w')

    # Plain lists are passed for style/hue, see
    # https://github.com/mwaskom/seaborn/issues/2194
    # (a size mapping via df.loa with sizes=(200,400) is currently disabled).
    model_numbers = df.model.str.extract(r'\D*([\d\.]*)')[0].tolist()
    countries = df.country.tolist()

    ax = sns.scatterplot(
        ax=ax,
        data=df,
        x='year',
        y='price',
        style=model_numbers,
        hue=countries,
        s=300,
        legend='brief'
    )
    sns.regplot(ax=ax, data=df, x='year', y='price', scatter=False)

    ax.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
    if ref:
        ref_x, ref_y = ref[0], ref[1]
        ax.plot(ref_x, ref_y, 'rx', markersize=25)
    if hline:
        ax.axhline(hline,ls='--',color='r')
    return ax


import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

def regplot(df,ax=None):
    """Draw a robust regression of price on year.

    Parameters:
        df: listings with the columns year and price.
        ax: Axes to draw on; a new A4-landscape figure is created when None.

    Returns:
        The Axes returned by seaborn's regplot.
    """
    if ax is None: fig, ax = plt.subplots(figsize=a4_landscape)
    # Silence ConvergenceWarning only while fitting, instead of changing the
    # warning filters for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        return sns.regplot(ax = ax, x="year", y="price", data=df, robust=True)

In [None]:
def listings(make, model=None, model_excl=None, min_year=None, max_year=None):
    """Return the listings of ``make`` filtered and sorted by price.

    Parameters:
        make: manufacturer name passed to listings_make.
        model: pattern a model must contain (case-insensitive) to be kept.
        model_excl: pattern; models containing it (case-insensitive) are dropped.
        min_year, max_year: inclusive year bounds; falsy values disable a bound.

    Returns:
        The filtered DataFrame sorted by ascending price.
    """
    df = listings_make(make)

    if min_year: df = df[df.year >= min_year]
    if max_year: df = df[df.year <= max_year]

    # na=False: a listing without a model name simply does not match the
    # pattern, instead of putting NaN into the boolean mask (which cannot be
    # used for indexing or negation).
    if model: df = df[df.model.str.contains(model,case=False,na=False)]
    if model_excl: df = df[~df.model.str.contains(model_excl,case=False,na=False)]

    return df.sort_values(by='price')

def url_to_html_anchor(url):
    """Wrap ``url`` in an HTML anchor that opens in a new tab and shows the URL as text."""
    template = '<a target="_blank" href="{}">{}</a>'
    return template.format(url, url)

def diplay_listings(df):
    """Show ``df`` as a styled table.

    URLs are rendered through url_to_html_anchor; year and price use the
    '{:n}' number format, price with a trailing euro sign.
    """
    column_formats = {
        'url': url_to_html_anchor,
        'year': '{:n}',
        'price': '{:n} €',
    }
    styled = df.style.format(column_formats)
    display(styled)

def summary(make, 
            model=None, 
            model_excl=None, 
            min_year=None, 
            max_year=None, 
            ref=None, 
            hline=None, 
            excl_ids=None):
    """Query, export, plot and display the listings of one make.

    Writes 'out/listings-<title>.csv' with all matching listings, then removes
    likely duplicates and the rows named in ``excl_ids``, saves the scatter
    plot as 'out/<title>.pdf', shows it and displays the remaining listings.

    Parameters:
        make, model, model_excl, min_year, max_year: passed to listings().
        ref: optional (year, price) reference point for the plot.
        hline: optional price level drawn as a horizontal line.
        excl_ids: optional index labels to drop before plotting.
    """
    title = make
    if model: title += '_model({})'.format(model)
    if min_year or max_year:
        title += '_year[{},{}]'.format(min_year,max_year)

    df = listings(make, model, model_excl, min_year, max_year)
    df.to_csv('out/listings-'+title+'.csv')

    df = df.drop_duplicates(subset=['year','country','price'])
    # NOTE(review): listings of the same year that all lack a city share the
    # same (year, city) key here - confirm collapsing them is intended.
    df = df.drop_duplicates(subset=['year','city'])

    if excl_ids is not None:
        df = df.drop(excl_ids)

    ax = scatter_year(df,ref,hline)
    ax.set_title(title)
    # Save before showing: once plt.show() has run, the current figure may be
    # gone and savefig would write an empty page. bbox_inches='tight' keeps
    # the legend, which is anchored outside the axes, inside the saved page.
    ax.figure.savefig('out/'+title+'.pdf', bbox_inches='tight')
    plt.show()
    diplay_listings(df)
    

# Example Queries

In [None]:
# Elan listings whose model contains '40', built up to 2008, with a reference
# point at year 2002 / price 57500 marked in the plot.
summary(
    'Elan',
    model='40',
    ref=(2002,57500),
    max_year=2008
)