# crautoscrape

A naive and blunt web scraper for http://crautos.com/, along with some trivial graphs courtesy of pandas and matplotlib - in case you ever want to buy or sell a used car in Costa Rica!

Disclaimer: Whipped together over the weekend, mainly by copy-pasting examples... whatever. It's incredibly inefficient but solves the task at hand ;)


# Installation

Set up a python venv and get dependencies - these (shell) instructions work on Fedora 28 - adapt as needed:
```
python3 -m venv crauto
cd crauto
. bin/activate
pip install jupyter pandas matplotlib beautifulsoup4 dateparser requests
jupyter notebook
```


# Part 1 - utilities

Run this first - after here, run parts selectively depending on what's changed

In [None]:
from requests import get, post
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import dateparser
from IPython.display import display

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a notebook progress widget.

    Adapted from https://github.com/alexanderkuk/log-progress

    Args:
        sequence: Iterable to walk through.
        every: Refresh the widget on every ``every``-th item (and on the
            first one). When omitted it is 1 for up to 200 items, otherwise
            ``int(size / 200)``. Must be given when the length is unknown.
        size: Number of items; taken from ``len(sequence)`` when omitted.
        name: Label shown in front of the counter.

    Yields:
        Each item of ``sequence``, unchanged and in order.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Only sequences that have no len() *and* no explicit size count as
    # "unknown length"; they get an indeterminate-looking bar.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if unknown_length:
                    caption.value = f'{name}: {count} / ?'
                else:
                    bar.value = count
                    caption.value = f'{name}: {count} / {size}'
            yield item
    except BaseException:
        # Anything that interrupts the iteration turns the bar red before
        # the exception continues to propagate.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = f"{name}: {count or '?'}"


def simple_get(url):
    """Fetch ``url`` with an HTTP GET and return the response body.

    Args:
        url: Address to request.

    Returns:
        The response body (``resp.content``).

    Raises:
        ValueError: If ``is_good_response`` rejects the response, i.e. it is
            not an HTTP 200 response with an HTML content type.
    """
    with closing(get(url, stream=True)) as resp:
        if not is_good_response(resp):
            # Same exception type as before, but now it says what went wrong.
            raise ValueError(
                'Unexpected response for GET {}: HTTP {}'.format(
                    url, resp.status_code))
        return resp.content
def simple_post(url, data):
    """Send ``data`` to ``url`` with an HTTP POST and return the response body.

    Args:
        url: Address to request.
        data: Payload handed to ``requests.post`` as its ``data`` argument.

    Returns:
        The response body (``resp.content``).

    Raises:
        ValueError: If ``is_good_response`` rejects the response, i.e. it is
            not an HTTP 200 response with an HTML content type.
    """
    with closing(post(url, stream=True, data=data)) as resp:
        if not is_good_response(resp):
            # Same exception type as before, but now it says what went wrong.
            raise ValueError(
                'Unexpected response for POST {}: HTTP {}'.format(
                    url, resp.status_code))
        return resp.content

def is_good_response(resp):
    """Return True when ``resp`` is an HTTP 200 response carrying HTML.

    Args:
        resp: Response-like object exposing ``status_code`` and a ``headers``
            mapping.

    Returns:
        bool: True only if the status code is 200 and the Content-Type header
        contains ``html`` (case-insensitive). A missing Content-Type header
        yields False.
    """
    # Look the header up with .get(): a response without Content-Type should
    # be rejected, not crash with KeyError.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

def get_urls(raw_html):
    """Extract car ids from the ``cardetail.cfm`` links of an HTML page.

    Args:
        raw_html: Markup of a page, as accepted by ``BeautifulSoup``.

    Returns:
        list: One id string per matching link, in document order (duplicates
        are kept). The id is the text that follows ``cardetail.cfm?c=`` up to
        the next ``&``.
    """
    page_name = 'cardetail.cfm'
    id_prefix = 'cardetail.cfm?c='
    html = BeautifulSoup(raw_html, 'html.parser')

    car_ids = []
    for anchor in html.select('a'):
        # Anchors without an href (e.g. named anchors) are skipped; indexing
        # them with ['href'] raised KeyError.
        href = anchor.get('href') or ''
        start = href.find(page_name)
        if start == -1:
            continue
        # Slice relative to where the page name occurs, so links with a
        # leading path or host work as well as bare relative ones.
        car_ids.append(href[start + len(id_prefix):].split('&')[0])
    return car_ids


# Part 2 - download list of ids

Each car on crautos has an id - this will download the list and put it in a variable

In [None]:
def get_auto_ids(pages=400, search=None):
    """Collect the ids of all cars found by the upstream search form.

    Posts the search form once per result page and gathers the ids that
    ``get_urls`` finds in each response.

    Args:
        pages: Number of result pages to request, starting at page 1.
            Defaults to 400, the value that used to be hard-coded.
        search: Optional dict of form fields that override the defaults
            below. A ``"p"`` entry has no effect because the page number is
            set for every request.

    Returns:
        set: The distinct ids found across all requested pages.
    """
    # Search pattern - edit it (or pass ``search``) to limit search results
    # from upstream
    data = {
        "brand": "00",
        "doors": "4",
        "financed": "00",
        "fuel": "0",
        "modelstr": "",
        "orderby": "0",
        "p": "3",  # placeholder; replaced with the page number below
        "pricefrom": "100000",
        "priceto": "200000000",
        "province": "7",
        "style": "00",
        "trans": "0",
        "yearfrom": "1960",
        "yearto": "2018",
    }
    if search:
        data.update(search)

    results = set()

    for page in log_progress(range(1, pages + 1), every=1):
        data['p'] = page
        results.update(get_urls(simple_post(
            "http://crautos.com/rautosusados/searchresults.cfm", data)))
    return results

# Run the search and keep the resulting set of car ids for the later parts.
ids=get_auto_ids()
# Last expression of the cell: the number of distinct ids collected.
len(ids)


# Part 3 - download car data

Each car from the list downloaded in Part 2 will be downloaded and cached in a local database.
TODO: could prune the database too, of old cars that have already been sold

In [None]:
import shelve

def get_car(id):
    """Download the detail page of the car with the given id.

    Returns whatever ``simple_get`` returns for the car's ``cardetail.cfm``
    address; errors raised by ``simple_get`` propagate unchanged.
    """
    detail_url = "http://crautos.com/rautosusados/cardetail.cfm?c=" + str(id)
    return simple_get(detail_url)

# Download every car page that is not in the local shelf yet. Entries already
# stored under their id are skipped, so re-running this cell only fetches
# what is still missing.
with shelve.open('crautos') as cache:
    # ``car_id`` rather than ``id``: a loop variable at notebook level stays
    # bound afterwards and would shadow the builtin id() in later cells.
    for car_id in ids:
        if car_id in cache:
            continue
        display("Getting " + str(car_id))
        cache[car_id] = get_car(car_id)



# Part 4 - load and convert to HTML objects

Load the database from Part 3 and parse into BeautifulSoup html objects

In [None]:
import pandas as pd
import shelve

# Parse every cached page into a BeautifulSoup document, keyed by car id.
parsed = {}
with shelve.open('crautos') as d:
    for k, v in d.items():
        try:
            # Name the parser explicitly (the same one get_urls uses) so the
            # resulting tree does not depend on which optional parsers happen
            # to be installed, and no "no parser specified" warning is shown.
            parsed[k] = BeautifulSoup(v, 'html.parser')
        except AttributeError:
            # Best effort: entries that cannot be parsed are left out.
            pass


# Part 5 - parse into pandas DataFrame

Pick out interesting parts of HTML and build a DataFrame

In [None]:
def parse_car(html, exchange_rate=567):
    """Pick the interesting fields out of one parsed car detail page.

    Args:
        html: Parsed detail page (a BeautifulSoup document).
        exchange_rate: Divisor applied to the listed price. The default, 567,
            is the value that used to be hard-coded here (presumably colones
            per US dollar when this was written - confirm before relying on
            the resulting prices).

    Returns:
        dict: ``'name'``, ``'year'`` and ``'price'`` taken from the page
        headings, plus one entry per two-cell row of the ``technical`` table
        (label -> value, both stripped). ``'Kilometraje'`` is converted to
        int and ``'Fecha de ingreso'`` is run through ``dateparser.parse``
        when present.

    Raises:
        AttributeError: If an expected element is missing (``find`` returned
            None).
        ValueError: If the heading cannot be unpacked or a number cannot be
            converted.
    """
    results = {}

    heading = html.find('div', class_='col-lg-8 col-md-8 col-sm-8 col-xs-12').find('h2')
    # The first child of the heading is used as the name, the last as the year.
    name, *_, year = heading
    results['name'] = str(name)
    results['year'] = int(year)

    price = html.find('div', class_='col-lg-4 col-md-4 col-sm-4 text-right').find('h2')
    # Drop the leading character (presumably the currency symbol) and the
    # thousands separators before converting.
    results['price'] = int(price.text[1:].replace(',', '')) / exchange_rate

    tbl = html.find('table', attrs={'class': 'technical'})

    for row in tbl.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            continue
        results[cells[0].text.strip()] = cells[1].text.strip()

    if 'Kilometraje' in results:
        results['Kilometraje'] = int(results['Kilometraje'].replace(' km', '').replace(',', ''))
    if 'Fecha de ingreso' in results:
        results['Fecha de ingreso'] = dateparser.parse(results['Fecha de ingreso'], locales=['es-CR'])

    return results


# Build one single-row frame per car, then join them in a single step.
# DataFrame.append re-copied the whole frame on every iteration and no longer
# exists in pandas 2.x.
car_frames = []

for k, v in parsed.items():
    try:
        v = parse_car(v)
        car_frames.append(pd.DataFrame(v, index=[k]))
    except (ValueError, AttributeError):
        # Best effort: pages that do not match the expected layout are skipped.
        pass

# pd.concat refuses an empty list, so fall back to an empty frame.
df_raw = pd.concat(car_frames) if car_frames else pd.DataFrame()


# Part 6 - look at data

This is where the fun starts :)

In [None]:
%matplotlib notebook

with pd.option_context('display.max_rows', None):
    # Keep only cars that have both a price and a mileage.
    df = df_raw.copy().query('price>0 & Kilometraje>0')
    # numeric_only=True: without it, recent pandas raises on the text columns
    # instead of silently leaving them out of the mean.
    # NOTE(review): the result of this expression is not assigned or
    # displayed - wrap it in display() if the table is meant to be shown.
    df.groupby(['year','name']).mean(numeric_only=True).sort_values('price')
    # df is already filtered on price and mileage, so no second query is
    # needed here.
    # NOTE(review): the variable is called ``rav`` but selects 'BMW X3'.
    rav = df[df['name'].str.contains('BMW X3')]
    rav.plot.scatter(x='Kilometraje', y='year')
    rav.plot.scatter(x='price', y='year')
    rav.plot.scatter(x='price', y='Kilometraje')

    display(rav.sort_values(['Fecha de ingreso', 'year','price']))

In [None]:
# For the cars whose 'Estilo' text contains 'no 4x4': count the non-null
# values per column for every name and list the names, largest 'year' count
# first.
with pd.option_context('display.max_rows', None, 'display.max_columns', 4):
    display(
        df.loc[df['Estilo'].str.contains('no 4x4')]
        .groupby(['name'])
        .count()
        .reset_index()
        .sort_values('year', ascending=False)
    )


In [None]:
# Cars from 2000 onwards that have both a price and a mileage.
df = df_raw.copy().query('price>0 & Kilometraje>0 & year >= 2000')
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    # numeric_only=True: without it, recent pandas raises on the text columns
    # instead of silently leaving them out of the mean.
    display(
        df.groupby(['year', 'name', 'Estilo'])
        .mean(numeric_only=True)
        .sort_values(['year', 'Estilo', 'price'])
    )
