In [78]:
import logging
try:
    from Queue import Queue  # PY2
except ImportError:
    from queue import Queue  # PY3
from threading import Thread
try:
    from urlparse import urljoin  # PY2
except ImportError:
    from urllib.parse import urljoin  # PY3

from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
from six import iteritems
from six.moves import range

# Page that get_all_sites() downloads and scrapes for site subdomains.
sites_url = 'http://www.craigslist.org/about/sites'


def get_all_sites():
    """
    Download the Craigslist "sites" page and return the set of site
    subdomains found in it (e.g. 'sfbay' for 'https://sfbay.craigslist.org').

    Only anchors inside <div class="box"> elements are considered. Anchors
    without an href, or whose href has no '//' separator (relative links),
    are skipped instead of aborting the whole scrape.

    Raises requests.exceptions.HTTPError if the page request fails.
    """
    response = requests.get(sites_url)
    response.raise_for_status()  # Something failed?
    soup = BeautifulSoup(response.content, 'html.parser')
    sites = set()

    for box in soup.find_all('div', {'class': 'box'}):
        for a in box.find_all('a'):
            href = a.attrs.get('href', '')
            if '//' not in href:
                continue  # No host part to extract a subdomain from
            # Remove protocol and get subdomain
            site = href.rsplit('//', 1)[1].split('.')[0]
            if site:
                sites.add(site)

    return sites
# NOTE: get_all_sites() issues an HTTP request, so this line hits the network
# every time the cell is executed.
ALL_SITES = get_all_sites()  # All the Craigslist sites
RESULTS_PER_REQUEST = 100  # Craigslist returns 100 results per request

In [79]:
def requests_get(*args, **kwargs):
    """
    Thin wrapper around requests.get() that makes one extra attempt when the
    first one raises a RequestException (e.g. connection error or timeout).

    An optional 'logger' keyword is consumed here (it is not forwarded to
    requests) and, when given, receives a warning before the retry. A failure
    of the second attempt propagates to the caller.
    """

    log = kwargs.pop('logger', None)
    try:
        response = requests.get(*args, **kwargs)
    except RequestException as error:
        if log:
            log.warning('Request failed (%s). Retrying ...', error)
        # Second and last attempt; any exception raised here is not caught.
        response = requests.get(*args, **kwargs)
    return response

In [80]:
def get_list_filters(url):
    """
    Fetch `url` and collect the multiple-choice filters it advertises.

    Every <div class="search-attribute"> contributes one entry keyed by its
    'data-attr' attribute; the entry holds that same key as 'url_key' and the
    stripped texts of the div's <label> elements as 'value'.
    """
    page = requests_get(url)
    document = BeautifulSoup(page.content, 'html.parser')

    filters_by_key = {}
    for block in document.find_all('div', class_='search-attribute'):
        key = block.attrs['data-attr']
        choices = []
        for label in block.find_all('label'):
            choices.append(label.text.strip())
        filters_by_key[key] = {'url_key': key, 'value': choices}
    return filters_by_key

In [139]:
from requests_futures.sessions import FuturesSession
import re
class CraigslistBase(object):
    """
    Base class for all Craigslist wrappers.

    Builds the search URL for a site/area/category, translates friendly
    filter names into Craigslist query parameters, and scrapes the result
    pages (plus every posting's detail page) into plain dicts.
    """

    url_templates = {
        'base': 'http://%(site)s.craigslist.org',
        'no_area': 'http://%(site)s.craigslist.org/search/%(category)s',
        'area': 'http://%(site)s.craigslist.org/search/%(area)s/%(category)s'
    }

    default_site = 'sfbay'
    default_category = None

    # Each filter maps a friendly name to its query-string key ('url_key').
    # 'value' is None when the caller supplies the value, a list when the
    # caller picks among options, or the fixed value sent for a truthy flag.
    base_filters = {
        'query': {'url_key': 'query', 'value': None},
        'search_titles': {'url_key': 'srchType', 'value': 'T'},
        'has_image': {'url_key': 'hasPic', 'value': 1},
        'posted_today': {'url_key': 'postedToday', 'value': 1},
        'search_distance': {'url_key': 'search_distance', 'value': None},
        'zip_code': {'url_key': 'postal', 'value': None},
    }
    extra_filters = {}

    # Set to True if the subclass defines the customize_result() method
    custom_result_fields = False

    sort_by_options = {
        'newest': 'date',
        'price_asc': 'priceasc',
        'price_desc': 'pricedsc',
    }

    # Tags the handler installed by set_logger() so it is attached only once
    # to the shared logger, however many instances are created.
    _handler_name = 'python-craiglist-stream'

    # Matches any character in the U+4E00..U+9FFF range (CJK ideographs).
    # A plain u'' literal is used because ur'' is a syntax error on Python 3.
    _cjk_pattern = re.compile(u'[\u4e00-\u9fff]')

    def __init__(self, site=None, area=None, category=None, filters=None,
                 log_level=logging.WARNING):
        """
        Validate site/area, build the search URL and translate `filters`.

        site: Craigslist subdomain; defaults to `default_site`.
        area: optional sub-area of the site, checked with is_valid_area().
        category: search category; defaults to `default_category`.
        filters: dict of friendly filter name -> value. Unknown filters and
            unknown list options are logged as warnings and ignored.
        log_level: level applied to the logger and its stream handler.

        Raises ValueError for an unknown site or area. Performs HTTP
        requests (area validation and list-filter discovery).
        """
        # Logging
        self.set_logger(log_level, init=True)

        self.site = site or self.default_site
        if self.site not in ALL_SITES:
            msg = "'%s' is not a valid site" % self.site
            self.logger.error(msg)
            raise ValueError(msg)

        if area:
            if not self.is_valid_area(area):
                # Report self.site: `site` is None when the default is used.
                msg = ("'%s' is not a valid area for site '%s'"
                       % (area, self.site))
                self.logger.error(msg)
                raise ValueError(msg)
        self.area = area

        self.category = category or self.default_category

        url_template = self.url_templates['area' if area else 'no_area']
        self.url = url_template % {'site': self.site, 'area': self.area,
                                   'category': self.category}

        list_filters = get_list_filters(self.url)

        self.filters = {}
        for key, value in iteritems(filters or {}):
            filter_spec = (self.base_filters.get(key) or
                           self.extra_filters.get(key) or
                           list_filters.get(key))
            if filter_spec is None:
                self.logger.warning("'%s' is not a valid filter", key)
                continue

            url_key = filter_spec['url_key']
            expected = filter_spec['value']
            if expected is None:
                # Free-form filter: pass the caller's value through.
                self.filters[url_key] = value
            elif isinstance(expected, list):
                # On Python 3 a str has __iter__, so check it explicitly to
                # avoid splitting a single option into characters.
                if isinstance(value, str) or not hasattr(value, '__iter__'):
                    value = [value]  # Force to list
                options = []
                for opt in value:
                    try:
                        # Craigslist numbers list options starting at 1
                        options.append(expected.index(opt) + 1)
                    except ValueError:
                        self.logger.warning(
                            "'%s' is not a valid option for %s", opt, key)
                self.filters[url_key] = options
            elif value:  # Don't add filter if ...=False
                self.filters[url_key] = expected

    def set_logger(self, log_level, init=False):
        """
        Set the level of the logger and of its stream handler.

        With init=True the shared 'python-craiglist' logger is looked up and
        a StreamHandler is attached to it, unless one installed by a previous
        instance is already there (otherwise every new instance would
        duplicate each log line).
        """
        if init:
            self.logger = logging.getLogger('python-craiglist')
            for existing in self.logger.handlers:
                if getattr(existing, 'name', None) == self._handler_name:
                    self.handler = existing
                    break
            else:
                self.handler = logging.StreamHandler()
                self.handler.name = self._handler_name
                self.logger.addHandler(self.handler)
        self.logger.setLevel(log_level)
        self.handler.setLevel(log_level)

    def is_valid_area(self, area):
        """
        Return True if the site's front page lists `area` as a link inside
        its <ul class="sublinks"> element, False otherwise.
        """
        base_url = self.url_templates['base']
        response = requests_get(base_url % {'site': self.site},
                                logger=self.logger)
        soup = BeautifulSoup(response.content, 'html.parser')
        sublinks = soup.find('ul', {'class': 'sublinks'})
        if sublinks is None:
            return False
        return sublinks.find('a', text=area) is not None

    def get_results(self, limit=None, start=0, sort_by=None, geotagged=False):
        """
        Get results from Craigslist based on the specified filters.

        This is a generator yielding one dict per posting with the keys
        'name', 'url', 'datetime', 'price', 'where', 'has_image' plus the
        detail-page fields 'mapaddress', 'detail_content', 'chinese_content',
        'attr_text', 'geolocation_latitude' and 'geolocation_longitude'.

        limit: maximum number of results to yield (None for no limit).
        start: offset of the first result to request.
        sort_by: one of the keys of `sort_by_options`.
        geotagged: if True, the results will include the (lat, lng) in the
            'geotag' attrib (this will make the process a little bit longer).

        Raises ValueError for an unknown sort_by and HTTPError if a result
        page cannot be fetched. Every posting costs one extra HTTP request
        for its detail page.
        """

        if sort_by:
            try:
                self.filters['sort'] = self.sort_by_options[sort_by]
            except KeyError:
                msg = ("'%s' is not a valid sort_by option, "
                       "use: 'newest', 'price_asc' or 'price_desc'" % sort_by)
                self.logger.error(msg)
                raise ValueError(msg)

        total_so_far = start
        results_yielded = 0
        total = 0

        while True:
            self.filters['s'] = start
            response = requests_get(self.url, params=self.filters,
                                    logger=self.logger)
            self.logger.info('GET %s', response.url)
            self.logger.info('Response code: %s', response.status_code)
            response.raise_for_status()  # Something failed?

            soup = BeautifulSoup(response.content, 'html.parser')
            if not total:
                totalcount = soup.find('span', {'class': 'totalcount'})
                total = int(totalcount.text) if totalcount else 0

            for row in soup.find_all('p', {'class': 'result-info'}):
                if limit is not None and results_yielded >= limit:
                    break
                self.logger.debug('Processing %s of %s results ...',
                                  total_so_far + 1, total)

                result = self._parse_result_row(row)
                result.update(self._fetch_posting_details(result['url']))

                if self.custom_result_fields:
                    self.customize_result(result, row)

                # 'has_map' is not part of the result dict any more; when it
                # is absent, attempt geotagging anyway.
                if geotagged and result.get('has_map', True):
                    self.geotag_result(result)

                yield result
                results_yielded += 1
                total_so_far += 1

            if results_yielded == limit:
                break
            # A page shorter than a full request means there are no more.
            if (total_so_far - start) < RESULTS_PER_REQUEST:
                break
            start = total_so_far

    def _parse_result_row(self, row):
        """ Extract the listing fields available on the search-result row. """
        link = row.find('a', {'class': 'hdrlnk'})
        name = link.text
        url = urljoin(self.url, link.attrs['href'])

        time_tag = row.find('time')
        if time_tag is not None:
            posted_at = time_tag.attrs['datetime']
        else:
            # Fallback: take the text before the first ':' of the 'pl' span
            pl = row.find('span', {'class': 'pl'})
            posted_at = pl.text.split(':')[0].strip() if pl else None

        price = row.find('span', {'class': 'result-price'})
        where = row.find('span', {'class': 'result-hood'})
        if where:
            where = where.text.strip()[1:-1]  # remove ()
        tags_span = row.find('span', {'class': 'result-tags'})
        tags = tags_span.text if tags_span else ''

        # TODO: Look into 'has_map', looks like all postings show a map now
        return {
            'name': name,
            'url': url,
            'datetime': posted_at,
            'price': price.text if price else None,
            'where': where,
            'has_image': 'pic' in tags,
        }

    def _fetch_posting_details(self, url):
        """
        Download the posting at `url` and return its detail-page fields.

        All keys are always present; a field whose element is missing from
        the page is left at its default (None, False or '') instead of
        raising IndexError/KeyError.
        """
        details = {
            'mapaddress': None,
            'detail_content': None,
            'chinese_content': False,
            'attr_text': '',
            'geolocation_latitude': None,
            'geolocation_longitude': None,
        }

        response = requests_get(url, logger=self.logger)
        soup = BeautifulSoup(response.text, 'lxml')

        address_tag = soup.find(['div', 'p'], {'class': 'mapaddress'})
        if address_tag is not None:
            details['mapaddress'] = address_tag.text.replace('\n\n', '\n')

        body_tag = soup.find(id='postingbody')
        if body_tag is not None:
            content = body_tag.text.replace(
                '\n\nQR Code Link to This Post\n\n\n', '')
            content = content.replace('\n\n', '\n')
            details['detail_content'] = content
            details['chinese_content'] = bool(
                self._cjk_pattern.search(content))

        details['attr_text'] = ''.join(
            group.text.replace('\n\n', '\n')
            for group in soup.find_all(['div', 'p'], {'class': 'attrgroup'})
        )

        viewposting = soup.find(['div', 'p'], {'class': 'viewposting'})
        if viewposting is not None:
            details['geolocation_latitude'] = viewposting.attrs.get(
                'data-latitude')
            details['geolocation_longitude'] = viewposting.attrs.get(
                'data-longitude')

        return details

    def customize_result(self, result, html_row):
        """ Add custom/delete/alter fields to result. """
        pass  # Override in subclass to add category-specific fields.

    def geotag_result(self, result):
        """
        Adds (lat, lng) to result under the 'geotag' key and returns result.

        Nothing is fetched when result['has_map'] is present and falsy; a
        result without that key is geotagged. 'geotag' is only set when the
        posting page could be fetched and contains a <div id="map">.
        """

        self.logger.debug('Geotagging result ...')

        if result.get('has_map', True):
            response = requests_get(result['url'], logger=self.logger)
            self.logger.info('GET %s', response.url)
            self.logger.info('Response code: %s', response.status_code)

            if response.ok:
                soup = BeautifulSoup(response.content, 'html.parser')
                map_div = soup.find('div', {'id': 'map'})
                if map_div:
                    result['geotag'] = (
                        float(map_div.attrs['data-latitude']),
                        float(map_div.attrs['data-longitude']))

        return result

    def geotag_results(self, results, workers=8):
        """
        Add (lat, lng) to each result. This process is done using N threads,
        where N is the amount of workers defined (default: 8).

        Returns the results as a list (the same dict objects, updated in
        place by geotag_result()).
        """
        try:
            from Queue import Empty  # PY2
        except ImportError:
            from queue import Empty  # PY3

        results = list(results)
        queue = Queue()

        for result in results:
            queue.put(result)

        def geotagger():
            while True:
                # A non-blocking get avoids the race of checking empty() and
                # then blocking forever in get() after another worker took
                # the last item.
                try:
                    pending = queue.get_nowait()
                except Empty:
                    return
                self.logger.debug('%s results left to geotag ...',
                                  queue.qsize())
                self.geotag_result(pending)
                queue.task_done()

        threads = []
        for _ in range(workers):
            thread = Thread(target=geotagger)
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()
        return results

    @classmethod
    def show_filters(cls, category=None):
        """
        Print the filters accepted by this class: the base filters, the
        class-specific extra filters, and the list filters scraped from the
        search page of `category` (or `default_category`) on `default_site`.
        Performs one HTTP request.
        """
        print('Base filters:')
        for key, options in iteritems(cls.base_filters):
            value_as_str = '...' if options['value'] is None else 'True/False'
            print('* %s = %s' % (key, value_as_str))
        print('Section specific filters:')
        for key, options in iteritems(cls.extra_filters):
            value_as_str = '...' if options['value'] is None else 'True/False'
            print('* %s = %s' % (key, value_as_str))
        url = cls.url_templates['no_area'] % {
            'site': cls.default_site,
            'category': category or cls.default_category,
        }
        list_filters = get_list_filters(url)
        for key, options in iteritems(list_filters):
            value_as_str = ', '.join([repr(opt) for opt in options['value']])
            print('* %s = %s' % (key, value_as_str))

            

In [140]:
class CraigslistHousing(CraigslistBase):
    """
    Craigslist housing wrapper.

    Adds the housing-specific filters and extends every result with the
    'bedrooms' and 'area' fields parsed from the search-result row.
    """

    default_category = 'hhh'
    custom_result_fields = True

    extra_filters = {
        'private_room': {'url_key': 'private_room', 'value': 1},
        'private_bath': {'url_key': 'private_bath', 'value': 1},
        'cats_ok': {'url_key': 'pets_cat', 'value': 1},
        'dogs_ok': {'url_key': 'pets_dog', 'value': 1},
        'min_price': {'url_key': 'min_price', 'value': None},
        'max_price': {'url_key': 'max_price', 'value': None},
        'min_ft2': {'url_key': 'minSqft', 'value': None},
        'max_ft2': {'url_key': 'maxSqft', 'value': None},
        'min_bedrooms': {'url_key': 'min_bedrooms', 'value': None},
        'max_bedrooms': {'url_key': 'max_bedrooms', 'value': None},
        'min_bathrooms': {'url_key': 'min_bathrooms', 'value': None},
        'max_bathrooms': {'url_key': 'max_bathrooms', 'value': None},
        'no_smoking': {'url_key': 'no_smoking', 'value': 1},
        'is_furnished': {'url_key': 'is_furnished', 'value': 1},
        'wheelchair_access': {'url_key': 'wheelchaccess', 'value': 1},
        # Misspelled name kept so existing callers keep working.
        'wheelchair_acccess': {'url_key': 'wheelchaccess', 'value': 1},
    }

    def customize_result(self, result, html_row):
        """
        Add 'bedrooms' and 'area' to `result` (both default to None).

        The values come from the '-'-separated text of the row's
        <span class="housing">: a piece ending in 'br' gives the bedroom
        count (kept as a string), a piece ending in '2' gives the area.
        """
        housing_info = html_row.find('span', {'class': 'housing'})
        # Default values
        result.update({'bedrooms': None, 'area': None})
        if housing_info:
            for elem in housing_info.text.split('-'):
                elem = elem.strip()
                if elem.endswith('br'):
                    # Don't convert to int, too risky
                    result['bedrooms'] = elem[:-2]
                if elem.endswith('2'):
                    # NOTE(review): presumably matches 'ft2'/'m2' suffixes
                    result['area'] = elem

In [141]:
# Search category 'roo' in area 'sby' of site 'sfbay', keeping postings with
# a private room and a price of at most 2000.
# NOTE: the constructor performs HTTP requests (area validation and
# list-filter discovery), so this cell needs network access.
cl_h = CraigslistHousing(site='sfbay', area='sby', category='roo',
                         filters={'max_price': 2000, 'private_room': True})

In [143]:
import pandas as pd
# Show full cell contents instead of truncating long text columns.
# NOTE(review): -1 is the legacy spelling; newer pandas releases expect None
# here -- confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)
# get_results() is a generator; DataFrame() consumes it, which triggers one
# search request plus one detail-page request per posting (up to `limit`).
df = pd.DataFrame(cl_h.get_results(sort_by='newest', limit=20))
# NOTE(review): this displays the whole frame; df.head() keeps output short.
df

Unnamed: 0,area,attr_text,bedrooms,chinese_content,datetime,detail_content,geolocation_latitude,geolocation_longitude,has_image,mapaddress,name,price,url,where
0,1275ft2,\n1275ft2\navailable jun 3\n\nno private bath\nprivate room\n,,False,2017-07-03 14:38,"Sunnyvale Apartment at Encasa near the heart of downtown San Jose. \nWe are looking for another roommate for our 2 BD/2 BA. The apartment has a brand new modern interior finishes with luxury flooring, custom tile baths and showers, and built-in stainless-steel appliances, keyless entry, Nest thermostats, and LED lighting. Community wise it has large private garages, community graden, Indoor-outdoor lounges with cozy fireplaces\nTwo expansive fitness centers with yoga, spinning and cross training areas, and a off-lash brake park. This is a great place to share for an unbeatable price. Super close to San Jose, sp you can still have fun without breaking the wallet \nGet more info here: http://homesharehomie.com/encasa?c=clp_be8.31.63",37.3861,-122.0839,True,\n (google map)\n \n,Single Room for Rent in Sunnyvale in a 2BD/2BR,$1060,http://sfbay.craigslist.org/sby/roo/6161026117.html,mountain view
1,1795ft2,\n1795ft2\navailable jul 1\n\nfurnished\nlaundry on site\nno smoking\nstreet parking\nprivate bath\nprivate room\n,,False,2017-07-03 14:31,"available \n1BR + 1BA furnished clean peaceful home / homeowner easy going low key \nmature professional working full time / + PGE usage \n\location close proximity to restaurants banks , shop easy access 17/85 \nhistorical DT /pruneyard upscale neighborhood dry creek farmer market\n / looking for long term flexible/\n phone number for reply - appointment / \nhave your own P.O Box, no mail delivered to address \n NOT ALLOW pets visitor. alcohol drugs smokers/ n/storage \n\n\n\n",37.282065,-121.927699,False,\n (google map)\n \n,1BR + PRIV BA __ share House w/1person ___,$1400,http://sfbay.craigslist.org/sby/roo/6203516470.html,san jose west
2,1795ft2,\n1795ft2\navailable jul 1\n\nfurnished\nlaundry on site\nno smoking\nstreet parking\nprivate bath\nprivate room\n,,False,2017-07-03 14:30,"available \nshare house with 1 person\n1BR + 1BA furnished clean peaceful home/ easy going low key, looking for similar\nhistorical DT /pruneyard upscale neighborhood dry creek farmer market\nsome utilities usages w/reference / easy access mayor HWY\nplus some utilities duration someone to stay awhile\n phone number for reply - appointment / \nhave your own P.O Box, no mail delivered to address \n NOT ALLOW pets visitor. alcohol drugs smokers/ n/storage \n\n\n\n",37.282065,-121.927699,False,\n (google map)\n \n,1BR + PRIV BA __pruneyard DT/bascom __(_HOUSE ),$1300,http://sfbay.craigslist.org/sby/roo/6203525408.html,campbell
3,400ft2,\n400ft2\navailable jun 22\n\nlaundry on site\nno smoking\nstreet parking\nno private bath\nprivate room\n,,True,2017-07-03 14:19,"Renraw Dr, San Jose, 95127\n全新豪华装修, 一个雅房分租。\n房间全新装修干净明亮，价钱1000刀/月，欢迎看房。\n3个月以上Lease，Deposit $500，提前1个月遷出通知。\n無寵物, 不煙不酒和派对\n水，电，煤气，网络，全包。\n免费提供洗衣机 烘干机冰箱使用，免费Wi-Fi，免费停车\nshow contact info\n",37.36234,-121.801663,True,Renraw,雅房分租,$1000,http://sfbay.craigslist.org/sby/roo/6188373774.html,san jose east
4,200ft2,\n200ft2\navailable jul 1\n\nw/d in unit\nattached garage\nprivate bath\nprivate room\n,,False,2017-07-03 14:16,"Looking to share a spacious and airy 3 bedroom/3 bathroom townhouse. This is a new house with 9 foot high ceiling for all rooms. There are no shared walls between bedrooms. \nYours will be on the third floor and your very own bathroom. Has a nice view. The room is very spacious and has a huge walk in closet. Parking is included in the attached garage. \nThe property is 3 minutes from highway 87, and 5-7 minutes from 280 and 85. It is in 20 minute range from Apple, AMD, Oracle, Google. Close to Mountain View, Sunnyvale, Santa Clara, Cupertino, Los Gatos, Campbell. Short drive to the San Jose airport. \nStarting date: now\nUtilities are split 3 ways, usually end up below $100/month per person (summer time with AC ~$100, spring, winter, fall ~$60).\nDeposit is one month rent.\nTotal moving in cost is 2 months rent (deposit + one month rent)\nLooking for:\nSingle professional with good credit, non-smoker, clean (especially for common area), no pets, no regular overnight guests. This is a sublet to pick up 2 months left on the lease. It is possible to get long term lease as well. \nIf you see this ad, it means the room is still available. Text, email, or call me. If text/email, please, introduce yourself, describe your situation, and when you plan to move in. Please include your phone number, so I can call you back.",37.281634,-121.859236,True,hillsdale ave,"big room with private bathroom, garage parking",$1100,http://sfbay.craigslist.org/sby/roo/6197927926.html,san jose south
5,,\navailable jul 3\n\nlaundry in bldg\ndetached garage\nprivate bath\nprivate room\n,,True,2017-07-03 14:10,主卧7月至九月空出。主卧自带私人厕所淋浴间洗手台。外带私人露天停车位。空间宽敞，舒适清凉。附近交通便利，环境幽静安全，适合爱干净喜好安静的通勤单身人士。有意者可短信联系。,37.352347,-122.032385,True,1279 Sunnyvale Saratoga Rd.,短租主卧，靠El Camino Real，近,$900,http://sfbay.craigslist.org/sby/roo/6203500726.html,sunnyvale
6,1100ft2,\n1100ft2\navailable aug 1\n\ndogs are OK - wooof\nfurnished\nw/d in unit\nstreet parking\nno private bath\nprivate room\n,,False,2017-07-03 14:07,"3 bdrm house in West San Jose with large fenced in yard to share with one roommate -- Rent $1475 month/each. Utilities, cable, internet and cleaning service total about $150/mo each. \nBusy professional roommate seeking like-wise. I'm in my 30s and have a great house in a great location. My present roommate is moving away and I want to find another to move in starting August 1 (one year lease). The place will be available to move stuff in the week prior if you like. The house is furnished, but I am open to accommodating your furniture/moving stuff as needed. The bedroom is unfurnished.\nPlease include a description of yourself and if you have a dog. I have a dog (lab mix) and am open to having another if they get along.\n• NO live-in girlfriends/boyfriends, NO poor credit.\n• 5 minute walk to Westgate Shopping Center, and Planet Fitness, Walgreens, many restaurants on next block!\n• 10 minutes to downtown Campbell/Santana Row/Macy's Valley Fair, Hwy 280/85 5 minutes away.\n• 1100+ sq ft with huge backyard for dogs (also, there is a dog park 5 min away too).\n• Spacious living/dining room w/fireplace & hardwood floors.\n• Laundry (washer, dryer) in garage.\n• Front lawn area gardener maintained. Clean, safe and centrally located.\n• No smoking.\n• Current credit report and references required with application.",37.298841,-121.983004,True,Saratoga at Latimer,Dog-friendly! 3BR House w/Large Yard to share $1475/mo,$1475,http://sfbay.craigslist.org/sby/roo/6203496677.html,san jose west
7,1267ft2,\n1267ft2\navailable jul 3\n\nno private bath\nprivate room\n,,False,2017-07-03 14:07,"We have a converted (11 x 9) single room available created using heavy duty, almost floor to ceiling partitions (not completely floor to ceiling due to fire code regulations). You will be sharing the common area (living room & kitchen) with 2 other housemates.\nThe building is brand new, built in 2016, in the heart of Sunnyvale. The unit has around 1150 square feet.\nCommunity Amenities\n24 hour Expansive Fitness Center\nLounge with conference rooms\nBicycle parking area\nWiFi access in lounges\nParking available with garage\nCourtyard with Fountain and Fireplace\nApartment Amenities:\nLarge closet\nBrand New, built in 2016\nIn-home, full-size Washer and Dryer\n9 foot ceilings and huge windows\nStainless Steel, new appliances, built in 2016\nPowerful A/C and Heater",37.398839,-122.013511,True,E Weddell Dr at Fair Oaks Avenue,[OFFER] Converted bedroom in Sunnyvale,$1050,http://sfbay.craigslist.org/sby/roo/6203486704.html,sunnyvale
8,1600ft2,\n0BR / sharedBa\n1600ft2\navailable sep 2\n\nfurnished\nw/d in unit\nno smoking\nstreet parking\nprivate bath\nprivate room\n,,False,2017-07-03 14:01,"Fully furnished bedroom available for rent in a beautiful 1600 sqr ft (3 bed, 2 bath) 3 story town home with large and comfortable living space in a safe and friendly neighborhood in Mountain View. \nRoom is furnished with double size bed, computer desk, computer chair, and closet\nRoom is available from SEPTEMBER 2nd to DECEMBER 31st, 2017\n$1300/month (including utilities)\nAmenities:\n- In-house washer/dryer\n- Large shared living space with home theater system\n- Kitchen is shared for general light cooking only\n- Jacuzzi and swimming pool right outside of the unit\n- Street parking available outside of apartment complex\n- High Speed Wifi\n- Utilities included\nPlaces nearby:\n- Google Mountain View campus: <10 minute bike ride\n- Downtown Mountain View: <10 minute minute bike ride\n- San Antonio Cal train station: 10 minute walk\n- Stanford University/Palo Alto: 10 minute drive / 20 minute bike ride\n- Nearest grocery store: 5 minute walk\n- Costco: 5 minute drive\nMinimum requirements for potential roommate:\n- Safe, friendly, mature, responsible, and respectful professional or working student intern\n- Non-smoker\n- Must have a good habit of cleaning up right after him/herself.\n- No outdoor shoes inside the house. \n- Be considerate. No loud music or noises after 10 PM. \n- Single occupant only. No overnight guests.\n- No pets\n\nPlease email if interested.",37.40694,-122.095173,True,N Rengstorff Ave at Montecito,Mountain View Bedroom for rent,$1300,http://sfbay.craigslist.org/sby/roo/6185734668.html,mountain view
9,,\navailable jul 3\n\nw/d in unit\nno smoking\nstreet parking\nno private bath\nprivate room\n,,False,2017-07-03 14:01,"-Married couple looking for roommate\n-$1,400 includes utilities (cable TV, high-speed internet, gas, electric, etc)\n-No smoking, drugs, or pets\n-Available immediately\n-$1,000 security deposit required upon lease signing\n-Community has wi-fi lounge, movie theatre, and gym\n-Washer/dryer in unit\n-Community is extremely safe and quiet\n-Easy access to Hwy 101 & 237, Lawrence & Central Expressway, & Caltrain\nWe are very easy-going, quiet, friendly people who are looking for a friendly, clean, quiet person to rent a bedroom to. You will have access to your own bathroom, and bedroom comes with a bed frame (with storage) if you need it, free of charge.\nPlease feel free to reach out to me if you have any questions.",37.38824,-122.003371,True,e duane ave at deguigne ave,"$1,400 / month (utilities included) - BR in beautiful 2br condo",$1400,http://sfbay.craigslist.org/sby/roo/6203478380.html,sunnyvale


In [128]:
# Select the fourth listing by position. `df[3]` looks up a *column*
# labelled 3, which is why the original cell failed with `KeyError: 3`.
df.iloc[3]