### Initial scrape of Craigslist rideshare page

In [41]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import time
import pickle

import numpy as np

In [54]:
def _return_soup_UScities():
    """ Return BeautifulSoup object of the Craigslist site-directory page.

    Downloads 'https://www.craigslist.org/about/sites#US' and parses the
    response text with the built-in "html.parser".

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error status.
        requests.exceptions.Timeout: if no response arrives within 30 seconds.
    """

    url = 'https://www.craigslist.org/about/sites#US'
    # a timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    # fail here with a clear HTTP error instead of parsing an error page and
    # hitting an obscure AttributeError further downstream
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def links_US_locations():
    """ Build a lookup of craigslist URL -> location name for the US sites """

    directory_page = _return_soup_UScities()

    # take the first 'colmask' div on the page (expected to be the US section)
    us_section = directory_page.find('div', {'class': 'colmask'})

    # keyed by URL rather than by name, since city names may repeat
    url_to_name = {}
    for anchor in us_section.find_all('a'):
        url_to_name[str(anchor['href'])] = str(anchor.text)

    return url_to_name

def choose_random_city(US_locations_dict):
    """ Pick one location uniformly at random.

    Args:
        US_locations_dict: dict mapping craigslist URL -> location name.

    Returns:
        Tuple (city_URL, city): the chosen key and the value stored under it.

    Raises:
        ValueError: if the dict is empty (raised by np.random.choice).
    """

    # np.random.choice needs a real 1-D sequence; a Python 3 dict view is not
    # one, so materialise the keys first (a no-op difference on Python 2)
    candidate_urls = list(US_locations_dict)

    city_URL = np.random.choice(candidate_urls)
    city = US_locations_dict[city_URL]

    return city_URL, city

In [58]:
# build the dictionary of Craigslist URL -> US location name
# (keys are URLs; this call performs a live HTTP request)
US_locations = links_US_locations()

In [4]:
# sample master database file
# pickles are binary data, so the file must be opened in 'rb' mode
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself
with open('./data/houston.pkl', 'rb') as f:
    master_df = pickle.load(f)

In [67]:
# tag every row of the sample frame with its city name and site URL;
# cityRideshare selects the rows of master_db by comparing this 'city' column
# NOTE: this mutates master_df in place, so the cell depends on the load cell having run first
master_df['city'] = 'houston'
master_df['url'] = '//houston.craigslist.org/'

In [94]:
class cityRideshare:
    """ Scrape the rideshare listings of one Craigslist city.

    Attributes (all set in __init__):
        url: base URL of the city site without the scheme,
            e.g. '//houston.craigslist.org/'.
        city: location name belonging to that URL.
        master_db: DataFrame of previously collected rides; needs a 'city' column.
        master_db_this_city: the rows of master_db whose 'city' equals self.city.
    """

    # seconds to wait for any single HTTP response before giving up
    _REQUEST_TIMEOUT = 30

    def __init__(self, US_locations_dict, master_db, url=None):
        """ Select the city to scrape and the matching slice of master_db.

        Args:
            US_locations_dict: dict mapping craigslist URL -> location name.
            master_db: DataFrame of known rides with a 'city' column.
            url: key of US_locations_dict to scrape; when omitted (or empty)
                a random entry is drawn with choose_random_city.

        Raises:
            KeyError: if url is given but is not a key of US_locations_dict.
        """

        if url:
            self.url = url
            self.city = US_locations_dict[url]
        else:
            self.url, self.city = choose_random_city(US_locations_dict)

        self.master_db = master_db
        self.master_db_this_city = self.master_db[self.master_db['city'] == self.city]

        # single-argument print() behaves the same on Python 2 and 3;
        # the double space reproduces the original 'print "scraping: ", city' output
        print("scraping:  {0}".format(self.city))

    @staticmethod
    def _page_count(total_posts, per_page=100):
        """ Number of result pages needed to list total_posts posts (never less than 1) """

        # ceiling division: an exact multiple of per_page needs no extra page
        return max(1, (total_posts + per_page - 1) // per_page)

    def _return_soup(self, rid=None, page=0):
        """ Return BeautifulSoup object of a rideshare page.

        With rid, fetch that single posting; otherwise fetch the search
        results page whose offset is page * 100.
        """

        if rid:
            url = 'https:{0}rid/{1}.html'.format(self.url, rid)
        else:
            # appending '00' to the page number yields the offset 0, 100, 200, ...
            url = 'https:{0}search/rid?s={1}00'.format(self.url, page)
        # a timeout keeps one stalled connection from hanging a long scrape
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, "html.parser")

    def _scrape_one_page(self, soup):
        """ Returns pandas DataFrame with RID, title and timestamp of each posting"""

        info = []
        for line in soup.find_all('span', {'class': 'pl'}):
            info.append({
                'id': str(line.find('a', {'class': 'hdrlnk'})['data-id']),
                'title': line.find('span', {'id': 'titletextonly'}).text,
                'timestamp': line.find('time')['datetime'],
            })

        # explicit columns so that a page without postings still has an 'id' column
        return pd.DataFrame(info, columns=['id', 'title', 'timestamp'])

    def _scrape_all_pages(self):
        """ Returns pandas DataFrame with information from all rideshare pages of URL.

        The frame is indexed by posting id and carries 'title', 'timestamp',
        'url' and 'city' columns.
        """

        first_page = self._return_soup()
        # attrs must be a dict; the original passed the set {'class', 'totalcount'}
        count_tag = first_page.find('span', {'class': 'totalcount'})
        # no counter on the page: fall back to scraping just the first page
        total_posts = int(count_tag.text) if count_tag is not None else 0
        n_pages = self._page_count(total_posts)

        # collect the per-page frames and concatenate once (appending to a
        # DataFrame inside the loop copies all rows on every iteration)
        frames = []
        for page in range(n_pages):
            print("scraping page: {0}".format(page))
            # page 0 was already downloaded to read the total count; reuse it
            soup = first_page if page == 0 else self._return_soup(page=page)
            frames.append(self._scrape_one_page(soup))  # id, timestamp, title of each ride
            time.sleep(1)  # avoid cease and desist from Craigslist

        all_posts = pd.concat(frames).set_index('id')

        all_posts['url'] = self.url
        all_posts['city'] = self.city

        return all_posts

    def _RIDS_to_maintain(self, df):
        """ Check master dataset (to be stored in a db) for existence of RIDs
        For now use master_db as placeholder, assume index is set to RID

        Returns the union of the RIDs that are new in df and the RIDs present
        in both df and the master slice.
        NOTE(review): that union always equals the set of RIDs in df, so no
        posting is filtered out yet and every post body is re-scraped.
        """

        master_db_this_city = self.master_db_this_city

        new_rids = df.index.difference(master_db_this_city.index)
        same_rids = master_db_this_city.index.intersection(df.index)
        #old_rids = master_db_this_city.index.difference(df.index)

        keep_rids = new_rids.union(same_rids)

        return keep_rids

    def _scrape_posting_body(self, df):
        """ Scrape each post for posting body and type of ride.
        Add info to existing DataFrame using previously collected RIDs.
        Return DataFrame

        df is modified in place: 'text' and 'ride_type' columns are filled
        row by row, so rows scraped before an interruption keep their values.
        A post whose page lacks the expected element gets None in that column.
        """

        # create the target columns up front so the row-wise writes below
        # never have to enlarge the frame
        for column in ('text', 'ride_type'):
            if column not in df.columns:
                df[column] = None

        for n, rid in enumerate(df.index):
            if n % 10 == 0:
                print("scraping text from post {0} of {1}".format(n, len(df)))

            soup_post = self._return_soup(rid=rid)
            body = soup_post.find('section', {'id': 'postingbody'})
            attr_group = soup_post.find('p', {'class': 'attrgroup'})

            # the page may lack these elements (presumably when a posting was
            # removed after it was listed); record None instead of aborting
            text = body.text.replace('\n', '') if body is not None else None
            if attr_group is not None and attr_group.span is not None:
                ride_type = str(attr_group.span.text)
            else:
                ride_type = None

            df.loc[rid, 'text'] = text
            df.loc[rid, 'ride_type'] = ride_type
            time.sleep(1)  # avoid cease and desist from Craigslist

        return df

    def update_rides(self):
        """ Scrape the listing pages, then every retained posting.

        Returns the DataFrame of rides (indexed by RID) including the
        'text' and 'ride_type' columns.
        """

        rides_without_body = self._scrape_all_pages()

        RIDS_to_maintain = self._RIDS_to_maintain(rides_without_body)

        print("original number of rides: {0}".format(len(self.master_db_this_city)))
        print("updated number of rides: {0}".format(len(RIDS_to_maintain)))

        # .loc replaces the removed .ix indexer; .copy() gives the body scraper
        # an independent frame to write into
        rides_to_scrape = rides_without_body.loc[RIDS_to_maintain].copy()
        rides_with_body = self._scrape_posting_body(rides_to_scrape)

        return rides_with_body

In [95]:
%%timeit
df_houston = cityRideshare(US_locations, master_df, url='//houston.craigslist.org/').update_rides()

scraping:  houston
scraping page: 0
scraping page: 1
scraping page: 2
scraping page: 3
original number of rides: 359
updated number of rides: 370
scraping text from post 0 of 370
scraping text from post 10 of 370
scraping text from post 20 of 370
scraping text from post 30 of 370
scraping text from post 40 of 370
scraping text from post 50 of 370
scraping text from post 60 of 370
scraping text from post 70 of 370
scraping text from post 80 of 370
scraping text from post 90 of 370
scraping text from post 100 of 370
scraping text from post 110 of 370
scraping text from post 120 of 370
scraping text from post 130 of 370
scraping text from post 140 of 370
scraping text from post 150 of 370
scraping text from post 160 of 370
scraping text from post 170 of 370
scraping text from post 180 of 370
scraping text from post 190 of 370
scraping text from post 200 of 370
scraping text from post 210 of 370
scraping text from post 220 of 370
scraping text from post 230 of 370
scraping text from post 2

KeyboardInterrupt: 

In [None]:
#todo: re-scrape every x days while keeping the previously collected data
#compare newly scraped RIDs against the existing "active" posts and update accordingly
#three datasets: A) newly scraped RIDs, B) existing RIDs, C) old RIDs
#1) only scrape the posting body for RIDs in A that are not in B
#2) move RIDs in B that are not in A to C

#todo: use https://www.craigslist.org/about/sites#US to navigate to different cities. 
#auto-fill options to only show these cities

#inherit from master soup class

In [94]:
# persist the scraped Houston rides as a pickle
# NOTE(review): this writes 'houston.pkl' to the working directory, while the
# sample master file is read from './data/houston.pkl' -- confirm which path is intended
with open('houston.pkl','wb') as f:
    pickle.dump(df_houston,f)