# Web scraping for flats

In [1]:
import urllib
from pprint import pprint
from bs4 import BeautifulSoup as BS

In [2]:
# ---------------------------------------
#             get the 1st 100
# Root URL of the site; the search-page and listing URLs are built from it.
base_url='http://paris.craigslist.fr'
# NOTE(review): never read anywhere in the visible notebook -- presumably a
# leftover placeholder for an RSS feed URL; confirm before removing.
CL_paris_apa_rss=""
# ----------------------------------------
#            a shortcut
# Every download in this notebook goes through the global name `urlget`
# (urllib.urlopen is the Python 2 API). The final cell rebinds it to a
# `urlgetter` instance that sleeps before each request.
urlget = urllib.urlopen

## Download page number

The default search criteria are for flats with 1 bedroom, size > 10 sqm, in Paris.
The function returns the HTML of a given page, parsed with `BeautifulSoup`.
Each page contains a list of flats.

In [613]:
def get_page_from_url(pgnr,base_url):
    """Download one page of search results and return it parsed.

    The query string is fixed (``bedrooms=1``, ``minSqft=10``); ``pgnr`` is
    sent as the ``s`` parameter.

    Parameters
    ----------
    pgnr : int
        Value of the ``s`` query parameter (must be an int: it is formatted
        with ``:d``).
    base_url : str
        Site root the search path is appended to.

    Returns
    -------
    The result of ``BS(html)``; ``BS('')`` when the download raised
    ``IOError``.
    """
    search_url = "{0:s}/search/apa?lang=en&cc=gb&bedrooms=1&minSqft=10&s={1:d}".format(base_url,pgnr)
    try:
        response = urlget(search_url)
        try:
            html = response.read()
        finally:
            # always release the connection, even if read() fails half-way
            response.close()
    except IOError: # in case of network error
        html = ''
    return BS(html)
# Fetch the results page with `s=0` so the parsing cell below has input to work on.
human_page=get_page_from_url(0,base_url)

## List flats urls from page

From each page given by the previous function, we record the list of flats, their publication date, and pid.

In [614]:
def get_listings_from_html(human_page,base_url):
    """List the flats advertised on one parsed search page.

    Parameters
    ----------
    human_page : parsed page, as returned by ``get_page_from_url``.
    base_url : str
        Site root used to build each listing URL.

    Returns
    -------
    list of (time, pid, url) tuples, one per ``<p class="row">`` found inside
    ``<div id="sortable-results">``. An empty list is returned when that div
    is missing (e.g. the empty page produced after a network error), so that
    callers can test ``len()`` instead of catching ``IndexError``.
    """
    containers = human_page.findAll('div',attrs={'id':'sortable-results'})
    if not containers:
        return []
    results = []
    for row in containers[0].findAll('p',attrs={"class":'row'}):
        posted = row.time['datetime']
        pid = row['data-pid']
        listing_url = '{0:s}/apa/{1:s}.html'.format(base_url,pid)
        results.append((posted,pid,listing_url))
    return results
# Parse the page fetched above and display the first (time, pid, url) tuple as a sanity check.
apas=get_listings_from_html(human_page,base_url)
apas[0]

('2016-07-24 22:05',
 '5685419671',
 'http://paris.craigslist.fr/apa/5685419671.html')

## From flat url to flat record

We have the URLs of all the flats listed on CL... well, we have the functions to get the URLs; the actual work will be done at the end.
Finally, we can extract the information from each flat's individual page. This is really boring, but needed. The result is a list of features for each flat. These features will need further refinement, which I will do in future notebooks.

In [615]:
# -----------------------------------------------------------------------------
#                 MATCH SOME INFO with regex magics
import re
# 'rue'/'avenue'/'boulevard', a space, then lazily up to the first run of
# digits, commas or dots
re_rue = re.compile(r'(?:rue|avenue|boulevard) .*?[\d,.]+[^.,]*?',re.IGNORECASE)
# 'Paris 11', 'Paris 11th', ...
re_arr1 = re.compile(r'(Paris \d{1,2}\s*(?:th|st|nd|rd){0,1})',re.IGNORECASE)
# '11th arr...' up to the end of the line
re_arr2 = re.compile(r'(\d{1,2}\s*(?:th|st|nd|rd){0,1} arr.*)',re.IGNORECASE)
# 'Metro station X' / 'Metro line X', where X runs up to the next whitespace
# (kept in the match) or the end of a line. The end-of-line case needs a real
# alternation: inside a character class '|' and '$' are literal characters,
# so '[\s|$]' never matched a mention that closes the text.
re_metro = re.compile(r'Metro\s*(?:station|line)\s*.+?(?:\s|$)',re.IGNORECASE|re.MULTILINE)
# 'ground floor', '3 floor', '3rd floor', ...
re_floor = re.compile(r'(?:ground|\d+|\d+[a-z]{2,2})\s*floor',re.IGNORECASE|re.MULTILINE)

def get_floor(x):
    """Return every floor mention found in ``x``, joined with ', ' ('' if none)."""
    return ', '.join(re_floor.findall(x))

def get_metro(x):
    """Return every metro mention found in ``x``, joined with ', ' ('' if none)."""
    return ', '.join(re_metro.findall(x))

def extloc(text):
    """Extract location hints (street, arrondissement) from free text.

    Runs ``re_rue``, ``re_arr1`` and ``re_arr2`` over ``text`` in that order
    and returns all their hits joined with ', ' ('' when nothing matches).
    The hits are collected first and joined once, so that results coming from
    different patterns are separated too instead of being glued together.
    """
    hits = []
    for rg in (re_rue,re_arr1,re_arr2):
        hits.extend(rg.findall(text))
    return ', '.join(hits)

# -----------------------------------------------------------------------------
#                 REMOVE DOUBLE ENTER
# Matches any run of two or more consecutive newlines; apafeatures() replaces
# each run with a single '\n'. (re.MULTILINE has no effect here: the pattern
# contains neither '^' nor '$'.)
no_doubleenter = re.compile('\n{2,}',re.MULTILINE)
# =============================================================================
def findgetremove(token,lst):
    """Pop and return the first item of ``lst`` that contains ``token``.

    The comparison is case-insensitive. ``lst`` is modified in place; when no
    item matches it is left untouched and '' is returned.
    """
    needle = token.lower()
    position = next(
        (idx for idx, item in enumerate(lst) if needle in item.lower()),
        None,
    )
    if position is None:
        return ''
    return lst.pop(position)

def apafeatures(apainfo):
    """Download one listing page and flatten it into a list of features.

    Parameters
    ----------
    apainfo : (time, pid, url) tuple as built by ``get_listings_from_html``;
        only ``apainfo[2]`` (the url) is used for the download.

    Returns
    -------
    list
        ``[time, pid, url, size, price, floor, loc, metro, movein,
        <remaining attribute spans...>, title, description]``.
        ``movein`` is '' when the listing carries no move-in date.

    Raises
    ------
    IndexError
        if the page has no ``mapAndAttrs`` div or no ``postingbody`` section.
    AttributeError
        if the price or the title span is missing.
    """
    url = apainfo[2]
    response = urlget(url)
    try:
        apa = BS(response.read())
    finally:
        # release the connection whether or not the download/parsing succeeded
        response.close()

    attrs_box = apa.findAll('div',attrs={'class':'mapAndAttrs'})[0]
    # a real list (not a lazy map): findgetremove() pops items out of it
    features = [span.text for span in attrs_box.findAll('span')]

    # the bedroom summary is only removed, so that it does not show up again
    # among the free-form features; its value is not stored
    findgetremove('Br',features)
    size = findgetremove('m2',features)
    # not every listing has a move-in date: keep the record and leave it blank
    movein_tag = attrs_box.find('span',attrs={'class':'housing_movein_now'})
    movein = movein_tag['date'] if movein_tag is not None else ''
    desc_raw = apa.findAll('section',attrs={'id':'postingbody'})[0].text.strip()
    desc_raw = no_doubleenter.sub('\n',desc_raw)
    # drop the first character of the price text (the currency symbol)
    price = apa.find('span',attrs={'class':'price'}).text[1:]
    title =  apa.find('span',attrs={'id':'titletextonly'}).text
    desc_raw = title + '\n' + desc_raw

    floor = get_floor(desc_raw)
    metro = get_metro(desc_raw)
    # location is a bit complex -- let's put together more info from the title
    try:
        loc = apa.find('span',attrs={'class':'postingtitletext'}).small.text.strip()
    except AttributeError:
        # no title span, or no <small> element inside it
        loc = ''
    if u'\u20ac' in loc or loc == '':
        # the <small> text holds a euro amount or nothing: mine the description
        loc = extloc(desc_raw)
    else:
        from_title = extloc(title)
        if from_title:
            # append only real hits, never a dangling ', '
            loc += ', ' + from_title

    out = list(apainfo)
    out += [size,price,floor,loc,metro,movein]
    out.extend(features)
    out += [title,desc_raw]

    return out

## A memoizing page getter
As the docstring says: `sort of memoizing for apafeatures, specialized for the apas tuple`.

This class saves its cache to disk as a pickle file.

In [620]:
import cPickle as pkl
from datetime import datetime as DT
import os
class get_features_cache(object):
    """
        sort of memoizing for apafeatures, specialized for the apas tuple

        Records live in ``self.db`` (a list) and are persisted with pickle in
        the file ``fname``. ``self.clids`` holds element ``[1]`` of every
        record (the listing id), in the same order, and is used to recognise
        listings that were already downloaded.
    """
    def __init__(self,fname,maxdt=-1):
        """Load the cache from ``fname``; start empty if it cannot be opened.

        ``maxdt`` is accepted for interface compatibility but is not used by
        the current code.
        """
        self.fname=fname
        # set every attribute before touching the disk, so that __del__ and
        # _save still work if loading raises
        self.db=[]
        self.clids=[]
        self.dirty = 0
        try:
            # NOTE: unpickling can execute arbitrary code -- only point this
            # at cache files written by this notebook.
            with open(fname,'rb') as cache_file:
                self.db=pkl.load(cache_file)
        except IOError:
            # no cache file yet (or it cannot be opened): keep the empty db
            pass
        self.clids=[j[1] for j in self.db]

    def __call__(self,apainfo):
        """Return the record for ``apainfo``, downloading it only if unknown.

        ``apainfo`` must be a 3-tuple whose second element is the listing id.
        """
        _,clid,url=apainfo
        # check if this app was already retrieved
        if clid in self.clids:
            #TODO check if retrieved version is too old/invalid
            return self.db[self.clids.index(clid)]
        #retrieve data, store, update self.clids, and finally return
        out = apafeatures(apainfo)

        self.clids.append(clid)
        self.db.append(out)
        self.dirty = 1
        return out

    def __len__(self):
        """Number of cached records."""
        return len(self.db)

    def __del__(self):
        # last-chance save; do not rely on it, call _save() explicitly
        self._save()

    def _save(self):
        """Write the cache back to ``self.fname`` if it changed since the last save."""
        if not self.dirty:
            return
        print("saving apas db")
        print("db rows: {0:d}".format(len(self.db)))
        # create the target directory on first use instead of failing (and
        # losing everything that was scraped) when it does not exist yet
        folder = os.path.dirname(self.fname)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)
        with open(self.fname,'wb') as cache_file:
            pkl.dump(self.db,cache_file)
        self.dirty = 0

# Cache backed by data/locations.pkl; calling apagetter(apa) downloads a listing only if its id is not cached yet.
apagetter = get_features_cache(fname='data/locations.pkl',)

## Ask pages at random times

This is a class that behaves like a function (see `__call__`): it waits a random time before retrieving the requested URL.
The waiting times are drawn from an exponential distribution, i.e. the requests are spaced like the events of a Poisson process.

In [617]:
# ----------------------------------------------------------
#            Poissonian waiting time in the urlget function
class urlgetter(object):
    """Callable that sleeps a random time and then opens the requested URL.

    The sleep is exponentially distributed (the spacing of the events of a
    Poisson process) with mean ``waiting_time`` seconds.
    """
    def __init__(self,waiting_time):
        # mean waiting time [s]
        self.mean=waiting_time

    def __call__(self,url):
        """Wait, then return ``urllib.urlopen(url)``."""
        import time
        #waiting time [s]
        wt = self.poisson()
        time.sleep(wt)
        return urllib.urlopen(url)

    def poisson(self):
        """Draw one waiting time [s] by inverse-transform sampling.

        For U uniform in [0, 1), ``-mean * ln(1 - U)`` is exponential with the
        requested mean. The mean has to multiply the logarithm: dividing by it
        would give an average wait of ``1 / mean`` instead.
        """
        from math import log
        from random import random
        return -log(1.0 - random()) * self.mean


## Put it all together

The culmination of all these efforts, the loop that rules them all, where the work is truly done.

In [1]:
# Values sent as the `s` query parameter: 0, 100, ..., 900.
pages=range(0,1000,100)
apagetter = get_features_cache(fname='data/locations.pkl',)
print('db {0:s} contains {1:d} locations'.format(apagetter.fname,len(apagetter)))
# From here on every download goes through the random-wait getter
# (waiting_time = 2.0).
urlget = urlgetter(1/.5)
failed=[]
try:
    for page in pages:
        human_page=get_page_from_url(page,base_url)
        apas=get_listings_from_html(human_page,base_url)
        print('page x 100 {0:d} {1:d}'.format(page,len(apas)))
        # an empty page means there are no more results: stop
        if not len(apas): break
        for apa in apas:
            # NOTE(review): not read anywhere visible -- presumably kept so the
            # last attempted listing can be inspected after a crash
            last = apa
            try:
                _ = apagetter(apa)
            except Exception:
                # a broken listing must not stop the crawl; `Exception` rather
                # than a bare except, so that Ctrl-C still interrupts the loop
                failed.append(apa)
                print('failed {0:s}'.format(apa))
finally:
    # persist whatever was collected, even if the loop was interrupted
    apagetter._save()

NameError: name 'get_features_cache' is not defined

In [623]:
# Report which cache file is in use and how many records it holds.
print 'db {0:s} contains {1:d} locations'.format(apagetter.fname,len(apagetter))

db locations.pkl contains 601 locations


## Styling <<to 'elp the medicine go down>>

In [1]:
from IPython.display import HTML
import urllib2

# Inject the notebook stylesheet; the `with` block closes style.css as soon as
# it has been read. The HTML(...) expression must stay last so that the cell
# displays it.
with open('style.css','r') as css_file:
    style=css_file.read()
HTML("""
<style>{0}</style>
""".format(style))