In [1]:
# Upper price bound: inserted into the yachtworld/boat24 search URLs and used
# to filter the merged listings in scrape_listings.
max_price=500000
# When True, request/cache/parse-failure diagnostics are printed and the
# memoizing wrappers for get_soup / listings_make are bypassed.
debug=False

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import pickle
from datetime import date
from pathlib import Path
import json
import pycountry
import functools

In [3]:
# Make sure the output and cache directories exist before anything reads or writes them.
Path('out').mkdir(parents=True, exist_ok=True)
Path('cache').mkdir(parents=True, exist_ok=True)


# The cache file name embeds today's date, so each day starts with a fresh cache.
cache_filename = 'cache/page_cache-'+date.today().isoformat()+'.pickle'
# Maps URL -> page text; filled by get_page and persisted at the end of the notebook.
page_cache = {}
try:
    # NOTE(security): pickle.load can execute arbitrary code. Only load cache
    # files that this notebook wrote itself.
    with open(cache_filename, 'rb') as f:
        page_cache = pickle.load(f)
except FileNotFoundError:
    print(cache_filename+' not found, starting fresh')
except (EOFError, pickle.UnpicklingError) as e:
    # A truncated or corrupt cache (e.g. an interrupted save) must not block the run.
    print(cache_filename+' is unreadable ('+repr(e)+'), starting fresh')
    page_cache = {}
    
def save_obj(obj, name):
    """Pickle `obj` to the file at path `name`.

    Args:
        obj: any picklable object (the notebook passes the page cache dict).
        name: destination file path; the file is overwritten.
    """
    # Write to the path the caller asked for instead of silently ignoring it.
    with open(name, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [4]:
def memoize(f):
    """Return a wrapper that caches the results of `f` per argument set.

    Results are kept for the lifetime of the wrapper. All arguments must be
    hashable. Positional and keyword arguments are both supported, so the
    original single-argument usage keeps working unchanged.
    """
    memo = {}

    @functools.wraps(f)  # keep f's name/docstring, as none_on_error does
    def helper(*args, **kwargs):
        key = (args, frozenset(kwargs.items()))
        if key not in memo:
            memo[key] = f(*args, **kwargs)
        return memo[key]
    return helper

def none_on_error(func):
    """Decorator that turns any `Exception` raised by `func` into a `None` result.

    When the module-level `debug` flag is set, the failure is printed before
    `None` is returned.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except Exception as err:
            if debug:
                print(func.__name__+' FAILED with '+str(err))
            result = None
        return result
    return wrapper

def get_page(url, timeout=30):
    """Return the body of `url` as text, using `page_cache` when possible.

    Args:
        url: address to fetch with GET. Redirects are not followed.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The response text for a 200 response (also stored in `page_cache`),
        or '' when the request fails or another status code is returned.
        Failures are not cached.
    """
    if url in page_cache:
        if debug: print('CACHE HIT '+url)
        return page_cache[url]

    if debug: print('GET '+url)
    try:
        # Without a timeout a stalled server would hang the notebook indefinitely.
        r = requests.get(url, allow_redirects=False, timeout=timeout)
    except requests.RequestException as e:
        # Treat transport errors like a non-200 response: report and return an empty page.
        print('FAILED GET '+url+': '+str(e))
        return ''

    if r.status_code != 200:
        print('GOT '+str(r.status_code)+' for GET '+url)
        print(r.headers)
        return ''

    page = r.text
    page_cache[url] = page
    return page

def make_soup(url):
    """Fetch `url` through `get_page` and parse the text with 'html.parser'."""
    markup = get_page(url)
    return BeautifulSoup(markup, 'html.parser')

# Parsed pages are memoized per URL, except in debug mode where every call re-parses.
get_soup = make_soup if debug else memoize(make_soup)

In [5]:
# Nettivene.com

re_loa = re.compile(r'(\D*)([\d\.\,]+)(\D*)')

@none_on_error
def nv_loa(url):
    """Read the 'Length' value from the listing detail page at `url`.

    Returns the number as a float (a decimal comma is accepted), or None via
    `none_on_error` if the page does not have the expected structure.
    """
    page = get_soup(url)
    label_cell = page.find('td',string='Length')
    # The value sits two siblings after the label cell.
    raw_length = label_cell.next_sibling.next_sibling.text
    number = re_loa.match(raw_length).group(2)
    return float(number.replace(',','.'))

@none_on_error
def nv_year(div):
    """Return the integer in the first <li> of the listing's 'vehicle_other_info' div."""
    info = div.find('div',class_='vehicle_other_info')
    first_item = info.ul.li
    return int(first_item.text)

@none_on_error
def nv_country_city(div):
    """Derive a (country, city) pair from the listing's 'location_info' div.

    If the first word of the location is a pycountry country name, it is
    returned as the country with city None. Otherwise the word is taken as a
    city and the country is 'Finland'.
    """
    location = div.find('div',class_='location_info')
    first_word = location.b.text.split()[0].strip()
    known_countries = [country.name for country in pycountry.countries]
    if first_word not in known_countries:
        return 'Finland', first_word
    return first_word, None

@none_on_error
def nv_price(div):
    """Return the listing's 'main_price' text as an int, with spaces and '€' removed."""
    price_text = div.find('div',class_='main_price').text
    for junk in (' ', '€'):
        price_text = price_text.replace(junk, '')
    return int(price_text)


def nv_parse_list_page(make,soup):
    """Extract one row per listing from a nettivene result page.

    Args:
        make: manufacturer name; removed from the link text to leave the model.
        soup: parsed result page.

    Returns:
        List of (url, model, year, loa, country, city, price) tuples. Fields
        that cannot be parsed are None.
    """
    l = []
    for div in soup.find_all('div', class_='listingVifUrl'):
        # nv_country_city yields None (not a pair) when parsing fails, so
        # substitute an empty pair instead of crashing on the unpack.
        country, city = nv_country_city(div) or (None, None)
        link = div.div.a
        l.append(
            (
                link['href'],
                link.text.replace(make,''),
                nv_year(div),
                nv_loa(link['href']),  # length is only on the detail page
                country,
                city,
                nv_price(div)
            )
        )
    return l

@none_on_error
def nv_next_page_url(soup):
    """Return the href of the 'pageNavigation next_link' anchor, or None if there is none."""
    next_link = soup.find('a', class_='pageNavigation next_link')
    return next_link['href']


def nv_listings(make):
    """Collect nettivene listing rows for `make` across all result pages.

    Pagination stops when there is no next link, or when a page contains a
    div with id 'msg' (its text is printed and that page is not parsed).
    """
    slug = make.replace(' ','-').lower()
    page_url = 'https://www.nettivene.com/en/purjevene/'+slug
    rows = []
    while page_url:
        soup = get_soup(page_url)
        notice = soup.find('div', id='msg')
        if notice:
            print(notice.text)
            break
        rows.extend(nv_parse_list_page(make, soup))
        page_url = nv_next_page_url(soup)
    return rows


In [6]:
#yachtworld

@none_on_error
def yw_redux_state_json(soup):
    """Decode the JSON assigned to window.__REDUX_STATE__ in the page's script tag.

    The JSON is taken from just after the assignment marker up to and
    including the last '}' of the script text. Returns None on any failure.
    """
    marker = 'window.__REDUX_STATE__ = '
    script_text = soup.find('script',string=re.compile('__REDUX_STATE__')).contents[0]
    start = script_text.index(marker)+len(marker)
    end = script_text.rfind('}')+1
    return json.loads(script_text[start:end])

@none_on_error
def yw_price(record):
    """Return the EUR amount nested under record['price']['type']['amount'], or None."""
    amounts = record['price']['type']['amount']
    return amounts['EUR']

def yw_country(record):
    """Map the record's location country code to a pycountry name.

    Falls back to the raw code when pycountry has no alpha-2 match.
    """
    code = record['location']['countryCode']
    match = pycountry.countries.get(alpha_2=code)
    return match.name if match else code

@none_on_error
def yw_parse_record(r):
    """Turn one yachtworld search record into a listing row.

    Returns a (url, model, year, loa, country, city, price) tuple, or None if
    any required key is missing.
    """
    url = r['mappedURL']
    model = r['model']
    year = r['year']
    loa = r['boat']['specifications']['dimensions']['lengths']['nominal']['m']
    country = yw_country(r)
    city = str(r['location']['city'])
    return (url, model, year, loa, country, city, yw_price(r))

def yw_collect_listings(js):
    """Parse every search record in the decoded redux state `js`.

    Returns:
        List of listing tuples. Records that yw_parse_record could not parse
        (it returns None for them) are dropped, so the result holds rows only
        and can be concatenated with the other scrapers' rows.
    """
    records = js['search']['searchResults']['search']['records']
    parsed = (yw_parse_record(r) for r in records)
    return [row for row in parsed if row is not None]

def yw_has_next(js):
    """Return True when the state's currentPage is below its lastPage."""
    paging = js['search']['searchResults']['search']
    return int(paging['currentPage']) < int(paging['lastPage'])


def yw_listings(make):
    """Collect yachtworld listing rows for `make`, priced up to `max_price`.

    Pages are requested with an increasing '&page=' suffix until the redux
    state cannot be read or it reports that the last page was reached.
    """
    url_template='https://www.yachtworld.com/boats-for-sale/condition-used/type-sail/make-{}/?currency=EUR&price=0-{}'
    first_page_url = url_template.format(make,max_price)

    collected = []
    page_no = 1
    current_url = first_page_url
    while True:
        state = yw_redux_state_json(get_soup(current_url))
        if not state:
            break
        collected += yw_collect_listings(state)
        if not yw_has_next(state):
            break
        page_no += 1
        current_url = first_page_url+'&page='+str(page_no)

    return collected

In [7]:
#boat24

@none_on_error
def b24_year(div):
    """Return the integer that follows the 'Year Built' label, or None."""
    label = div.find('label',string='Year Built')
    return int(label.next_sibling)

re_b24_loa = re.compile(r'([\d\.]+) x .*')
@none_on_error
def b24_loa(div):
    """Return the first number of the '<a> x <b>' pattern in the 'details' div, as a float."""
    details_text = div.find('div',class_='details').text
    first_dimension = re_b24_loa.search(details_text).group(1)
    return float(first_dimension)

@none_on_error
def b24_country_city(div):
    """Split the 'location' div text on '»' into (country, city).

    The country is the first part up to any '('. The city is the last part.
    """
    parts = div.find('div',class_='location').text.split('»')
    country, _, _ = parts[0].partition('(')
    return country, parts[-1]

re_b24_price = re.compile(r'EUR ([\d\.]*),-')
@none_on_error
def b24_price(div):
    """Return the 'EUR <digits>,-' amount in the 'price' paragraph as an int, dots removed."""
    price_text = div.find('p',class_='price').text
    digits = re_b24_price.search(price_text).group(1)
    return int(digits.replace('.',''))

def b24_scrape(make,soup):
    """Extract one row per listing from a boat24 result page.

    Args:
        make: manufacturer name; removed from the link title to leave the model.
        soup: parsed result page.

    Returns:
        List of (url, model, year, loa, country, city, price) tuples. Fields
        that cannot be parsed are None.
    """
    l = []
    for div in soup.find_all('div', class_='resultViewEntry'):
        # b24_country_city yields None (not a pair) when parsing fails, so
        # substitute an empty pair instead of crashing on the unpack.
        country, city = b24_country_city(div) or (None, None)
        link = div.div.a
        l.append(
            (
                link['href'],
                link['title'].replace(make,''),
                b24_year(div),
                b24_loa(div),
                country,
                city,
                b24_price(div),
            )
        )
    return l
    
@none_on_error
def b24_next_url(soup):
    """Return the href of the anchor with class 'next', or None when absent."""
    next_anchor = soup.find('a', class_='next')
    return next_anchor['href']

def b24_listings(make):
    """Collect boat24 listing rows for `make`, following 'next' links until none remain."""
    query = make.replace(' ','+')
    page_url = 'https://www.boat24.com/en/sailboats/?src={}&mode=AND&whr=EUR&prs_min=&prs_max={}'.format(
        query,
        max_price
    )
    rows = []
    while page_url:
        soup = get_soup(page_url)
        rows.extend(b24_scrape(make,soup))
        page_url = b24_next_url(soup)
    return rows


In [8]:
#yachtmarket

# Require at least one digit/dot before the unit. With '*' the pattern also
# matched whitespace directly followed by 'M'/'m' (e.g. " Monohull"); that
# empty capture won the leftmost match and hid a real length later in the text.
re_tym_loa = re.compile(r'\s([\d\.]+)[Mm]')

@none_on_error
def tym_loa(ov):
    """Return the first '<number>m' / '<number>M' value in the overview text as a float.

    Returns None (via none_on_error) when no such value is present.
    """
    return float(re_tym_loa.search(ov.text).group(1))

re_tym_year = re.compile(r'(\d\d\d\d)\D')

@none_on_error
def tym_year(ov):
    """Return the first four-digit run followed by a non-digit in the overview text, as an int."""
    found = re_tym_year.search(ov.text)
    return int(found.group(1))


@none_on_error
def tym_price(div):
    """Return the 'pricing' span text as an int after removing '€', 'EUR' and commas."""
    amount = div.find('div',class_='pricing').span.text
    for token in ('€', 'EUR', ','):
        amount = amount.replace(token, '')
    return int(amount)


def tym_scrape(make, soup):
    """Extract one row per listing from a theyachtmarket result page.

    Args:
        make: manufacturer name; removed from the boat name to leave the model.
        soup: parsed result page.

    Returns:
        List of (url, model, year, loa, country, city, price) tuples. Country
        is the last comma-separated part of the location and city the first.
        Fields that cannot be parsed are None.
    """
    l = []
    for div in soup.find_all('div',class_='result'):
        a = div.find('a', class_='boat-name')
        if a is None:
            # A 'result' block without a boat link is not a listing. Skip it
            # rather than aborting the whole scrape.
            continue
        ov = div.find('div', class_='overview')
        loc_div = div.find('div', class_='location')
        # A missing location must not lose the listing.
        loc_a = loc_div.text.split(',') if loc_div is not None else [None]
        l.append(
            (
                'https://www.theyachtmarket.com'+a['href'].split('?')[0],
                a.text.replace(make,''),
                tym_year(ov),
                tym_loa(ov),
                loc_a[-1],
                loc_a[0],
                tym_price(div)
            )
        )

    return l


@none_on_error
def tym_next_url(soup):
    """Return the absolute search URL built from the rel='next' anchor, or None when absent."""
    search_base = 'https://www.theyachtmarket.com/en/boats-for-sale/search/'
    next_anchor = soup.find('a', rel='next')
    return search_base+next_anchor['href']

def tym_listings(make):
    """Collect theyachtmarket listing rows for `make`, following rel='next' links until none remain."""
    query = make.replace(' ','+').lower()
    page_url = 'https://www.theyachtmarket.com/en/boats-for-sale/search/?manufacturermodel={}&currency=eur&lengthunit=metres&showsail=1'.format(
            query
        )

    rows = []
    while page_url:
        soup = get_soup(page_url)
        rows.extend(tym_scrape(make,soup))
        page_url = tym_next_url(soup)
    return rows


In [9]:
#scanboat.com
@none_on_error
def sb_loa(url):
    """Read the value two siblings after the 'Length' paragraph on the detail page, as a float."""
    length_label = get_soup(url).find('p',string='Length')
    value_node = length_label.next_sibling.next_sibling
    return float(value_node.text)

@none_on_error
def sb_price(s):
    """Return the text of the first <p> under `s` as an int, with 'EUR' and commas removed."""
    digits = s.p.text.replace('EUR','').replace(',','')
    return int(digits)

re_sb_year = re.compile(r'Year : (\d\d\d\d)')
re_sb_country= re.compile(r'Country : (.*)')

@none_on_error
def _sb_title(header, make):
    """Model text from the item header with `make` and ' - ' removed."""
    return header.section.text.replace(make,'').replace(' - ','')

@none_on_error
def _sb_year(body):
    """Year parsed from the 'Year : NNNN' text of the item body."""
    return int(re_sb_year.search(body.p.text).group(1))

@none_on_error
def _sb_country(body):
    """Country parsed from the 'Country : ...' text of the item body."""
    return re_sb_country.search(body.p.text).group(1)

@none_on_error
def _sb_last_price(price_tags):
    """Integer price from the last price paragraph, 'EUR' and commas removed."""
    return int(price_tags[-1].text.replace('EUR','').replace(',',''))

def sb_scrape(make, soup):
    """Extract one row per priced item from a scanboat result page.

    Args:
        make: manufacturer name; removed from the header text to leave the model.
        soup: parsed result page.

    Returns:
        List of (url, model, year, loa, country, None, price) tuples. The city
        is always None. Items whose header has no <p> tag are skipped, as are
        items without a link or header. Any other field that cannot be parsed
        becomes None instead of aborting the whole scrape.
    """
    l = []
    for div in soup.find_all('div', class_='item'):
        header = div.find('header',class_='item__header')
        body = div.find('section',class_='item__body')
        if header is None or div.a is None:
            # Not a listing block. Nothing to link to or to price.
            continue
        url = 'https://www.scanboat.com'+div.a['href']
        price_tags = header.find_all('p')

        if price_tags:
            l.append(
                (
                    url,
                    _sb_title(header, make),
                    _sb_year(body),
                    sb_loa(url),  # length is only on the detail page
                    _sb_country(body),
                    None,
                    _sb_last_price(price_tags),
                )
            )
    return l

@none_on_error
def sb_next_url(soup):
    """Return the absolute URL of the anchor whose text is 'Next', or None when absent."""
    next_anchor = soup.find('a',string='Next')
    return 'https://www.scanboat.com'+next_anchor['href']

    
def sb_listings(make):
    """Collect scanboat listing rows for `make`, following 'Next' links until none remain."""
    query = make.replace(' ','+').lower()
    page_url = 'https://www.scanboat.com/en/boats?SearchCriteria.BoatModelText={}&SearchCriteria.BoatTypeID=1&SearchCriteria.Searched=true&SearchCriteria.ExtendedSearch=False'.format(query)

    rows = []
    while page_url:
        soup = get_soup(page_url)
        rows.extend(sb_scrape(make,soup))
        page_url = sb_next_url(soup)
    return rows



In [10]:
def scrape_listings(make):
    """Scrape all five sites for `make` and merge the rows into one DataFrame.

    Text columns are whitespace-stripped, rows priced above `max_price` are
    dropped, and year/price are rounded to 0 decimals and loa to 2.

    Returns:
        DataFrame with columns url, model, year, loa, country, city, price.
    """
    rows = []
    for site_listings in (nv_listings, yw_listings, b24_listings, tym_listings, sb_listings):
        rows += site_listings(make)

    df = pd.DataFrame(
        rows,
        columns=['url','model','year','loa','country','city','price']
    )

    for text_col in ('model', 'country', 'city'):
        df[text_col] = df[text_col].str.strip()

    within_budget = df['price'] <= max_price
    decimals = {'year': 0, 'loa': 2, 'price': 0}
    return df[within_budget].round(decimals)

# Scrape results are memoized per make, except in debug mode where every call re-scrapes.
listings_make = scrape_listings if debug else memoize(scrape_listings)

In [11]:
from forex_python.converter import CurrencyRates
# Shared forex_python rate client; sokbat_history uses it for the SEK -> EUR rate.
fx = CurrencyRates()

def sokbat_history(make,model):
    """Plot sokbat.se price history for a make/model as price (EUR) against age.

    Args:
        make: manufacturer name; lower-cased, with spaces turned into '-'.
        model: model name; normalized the same way.

    Returns:
        The seaborn lmplot grid (robust fit of price_eur on age), restricted
        to rows with ItemYear > 0.

    Note:
        `sns` is imported in a later notebook cell, so that cell must have
        run before this function is called.
    """
    from io import StringIO  # local import: only needed to feed read_json

    make = make.lower().replace(' ','-')
    model = model.lower().replace(' ','-')
    url = 'https://www.sokbat.se/Modell/{}/{}'.format(make,model)
    page = get_page(url)
    item_id = re.search(r'CurentItemId = (\d+);',page).group(1)
    # Bound the request so a stalled server cannot hang the notebook.
    str_json = requests.post('https://www.sokbat.se/DataBase/GetPrices?itemId='+item_id, timeout=30).text

    # The slice drops an 8-character prefix and 1-character suffix around the
    # JSON array. Presumably a wrapper in the response; verify against the endpoint.
    # Newer pandas deprecates passing literal JSON strings, so use a file-like object.
    df = pd.read_json(StringIO(str_json[8:-1]),orient='records')

    df['age'] = df.SalesYear.astype(int) - df.ItemYear.astype(int)
    # regex=True is required: the pandas 2.x default is regex=False, which
    # rejects regex patterns (compiled ones raise ValueError).
    df['price_sek'] = df.SalesPrice.str.replace(r'\s', '', regex=True)
    df['price_eur'] = df.price_sek.astype(float) * fx.get_rate('SEK', 'EUR')

    return sns.lmplot(x="age", y="price_eur", data=df[df.ItemYear > 0], robust=True)

In [12]:
ba_re_year_sold = re.compile(r'Sold: (\d\d\d\d-\d\d-\d\d)')
@none_on_error
def ba_date_sold(td):
    """Return the 'Sold: YYYY-MM-DD' date from the cell's <font> text as a `date`, or None."""
    iso_day = ba_re_year_sold.search(td.font.text).group(1)
    return date.fromisoformat(iso_day)

ba_re_price = re.compile(r'(\d[\d\s]+)\sEUR')
@none_on_error
def ba_price(td):
    """Return the '<digits> EUR' amount from the cell's <p> text as an int, or None.

    The pattern allows any whitespace between digit groups, so all of it is
    removed before conversion, not just the non-breaking space.
    """
    digits = re.search(ba_re_price,td.p.text).group(1)
    return int(re.sub(r'\s', '', digits))

def ba_listings(make):
    """Scrape boatagent.com search results for `make` into a DataFrame.

    Returns:
        DataFrame with columns url, model, year, date_sold, price. `year` is
        numeric, like the other scrapers' year columns, so it can be used in
        arithmetic and plots. year, date_sold and price are None/NaN for
        entries where they cannot be parsed.
    """
    soup = get_soup('http://www.boatagent.com/?sajt=kopbat_sokmotor&sokord='+make.lower().replace(' ','+'))
    tds = soup.find_all('td', class_='batkatalog')

    re_year = re.compile(r'Year of production: (\d\d\d\d)')

    @none_on_error
    def year_built(td):
        # One entry without a production year must not abort the whole scrape.
        return int(re_year.search(td.p.text).group(1))

    rows = [
        (
            'http://www.boatagent.com'+td.a['href'],
            td.h2.text.replace(make,''),
            year_built(td),
            ba_date_sold(td),
            ba_price(td),
        )
        for td in tds
    ]

    return pd.DataFrame(data=rows,columns=['url','model','year','date_sold','price',])


In [13]:
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# (width, height) pairs passed as matplotlib `figsize`; presumably A4 paper in inches.
a4_landscape = (11.7, 8.27)
a4_portrait = (8.27,11.7)
    
def scatter_year(df, price_line=100000):
    """Scatter price against year with a regression line on a landscape figure.

    Args:
        df: listings frame with year, price, model and country columns.
        price_line: y value of the dashed red horizontal reference line.
            Pass None to omit the line.

    Returns:
        The matplotlib Axes holding the plot.
    """
    fig, ax = plt.subplots(figsize=a4_landscape,facecolor='w')
    ax = sns.scatterplot(
        ax=ax,
        data=df,
        x='year',
        y='price',
        # Marker style is keyed on the first run of digits/dots in the model name.
        style=df.model.str.extract(r'\D*([\d\.]*)')[0].tolist(), #https://github.com/mwaskom/seaborn/issues/2194
        hue=df.country.tolist(), #https://github.com/mwaskom/seaborn/issues/2194
#        size=df.loa.tolist(), #https://github.com/mwaskom/seaborn/issues/2194
#        sizes=(200,400),
        s=300,
        legend='brief'
    )
    sns.regplot(
        ax=ax,
        data=df,
        x='year',
        y='price',
        scatter=False
    )
    # The legend is placed outside the axes, to the right.
    ax.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
    if price_line is not None:
        ax.axhline(price_line,ls='--',color='r')
    return ax


import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

def regplot(df,ax=None):
    """Draw a robust regression of price on year.

    Args:
        df: frame with 'year' and 'price' columns.
        ax: Axes to draw on; a new landscape figure is created when None.

    Returns:
        The Axes returned by seaborn.
    """
    if ax is None: fig, ax = plt.subplots(figsize=a4_landscape)
    # Silence ConvergenceWarning only for this call, so the process-wide
    # warning filters are left unchanged.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        return sns.regplot(ax = ax, x="year", y="price", data=df, robust=True)

In [14]:
def listings(make, model=None, min_year=None, max_year=None, min_loa=None, max_loa=None):
    """Return the listings for `make`, optionally filtered, sorted by price.

    Args:
        make: manufacturer name passed to `listings_make`.
        model: pattern matched case-insensitively against the model column
            (handled by `str.contains`, so regex alternation like 'a|b' works).
        min_year, max_year: inclusive bounds on the year column.
        min_loa, max_loa: inclusive bounds on the loa column.

    Falsy filter values (None, 0, '') mean "no filter".
    """
    df = listings_make(make)

    if min_year: df = df[df.year >= min_year]
    if max_year: df = df[df.year <= max_year]
    if min_loa: df = df[df.loa >= min_loa]
    if max_loa: df = df[df.loa <= max_loa]

    # na=False: rows with a missing model count as non-matching. Without it
    # the mask contains NaN and cannot be used for boolean indexing.
    if model: df = df[df.model.str.contains(model,case=False,na=False)]

    return df.sort_values(by='price')

def url_to_html_anchor(url):
    """Render `url` as an HTML anchor that opens in a new tab.

    The URL comes from scraped pages, so it is HTML-escaped (quotes included)
    before being placed in the attribute and the link text.
    """
    import html  # local import: only needed here

    safe = html.escape(str(url), quote=True)
    return '<a target="_blank" href="{}">{}</a>'.format(safe,safe)

def diplay_listings(df):
    """Display `df` as a styled table: clickable urls, loa in metres, price in euros."""
    column_formats = {
        'url': url_to_html_anchor,
        'year': '{:n}',
        'loa': '{:.2f} m',
        'price': '{:n} €',
    }
    styled = df.style.format(column_formats)
    display(styled)

def summary(make, model=None, min_year=None, max_year=None, min_loa=None, max_loa=None, ref=None):
    """Export, plot and display the listings matching the given filters.

    Writes 'out/listings-<title>.csv' with every matching listing, then
    de-duplicates, draws the year/price scatter, saves it as
    'out/<title>.pdf' and displays the table.

    Args:
        make, model, min_year, max_year, min_loa, max_loa: passed to `listings`;
            they also form the title and file names.
        ref: optional (year, price) pair drawn as a large red 'x' marker.
    """
    title = make
    if model: title += '_model({})'.format(model)
    if min_year or max_year:
        title += '_year[{},{}]'.format(min_year,max_year)

    if min_loa or max_loa:
        title += '_loa[{},{}]'.format(min_loa,max_loa)

    df = listings(make, model, min_year, max_year, min_loa, max_loa)
    df.to_csv('out/listings-'+title+'.csv')

    # Drop rows that share year/country/price, then rows that share year/city.
    # Presumably to collapse the same boat listed on several sites.
    df = df.drop_duplicates(subset=['year','country','price'])
    df = df.drop_duplicates(subset=['year','city'])

    ax = scatter_year(df)
    if ref: ax.plot(ref[0], ref[1], 'rx', markersize=25)
    ax.set_title(title)
    # Save before plt.show(): saving afterwards writes an empty figure.
    # bbox_inches='tight' keeps the legend, which sits outside the axes, in the PDF.
    ax.figure.savefig('out/'+title+'.pdf', bbox_inches='tight')
    plt.show()
    diplay_listings(df)
    

In [None]:
# Example run: Salona listings whose model name contains '37' or '38'
# (the model filter is applied with str.contains, so '|' is regex alternation).
summary(
    'Salona',
    model='37|38'
)

In [None]:
# Persist the in-memory page cache to today's cache file so that later runs
# on the same day can reuse the fetched pages.
save_obj(page_cache,cache_filename)