### Initial scrape of Craigslist rideshare page

In [91]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import time
import pickle

In [44]:
def _return_soup_UScities():
    """ Return BeautifulSoup object of the Craigslist sites directory page.

    Raises requests.exceptions.HTTPError when Craigslist answers with an
    error status, and requests.exceptions.Timeout when it does not answer
    within 30 seconds.
    """

    url = 'https://www.craigslist.org/about/sites#US'
    # without a timeout requests.get can block indefinitely
    response = requests.get(url, timeout=30)
    # fail loudly on an HTTP error instead of parsing an error page
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def links_US_locations():
    """ Build a dictionary mapping each US location name to its Craigslist URL """

    directory = _return_soup_UScities()

    # the first 'colmask' div on the page is taken to be the US section
    us_section = directory.find('div', {'class': 'colmask'})

    # link text becomes the key, the link target becomes the value
    mapping = {}
    for anchor in us_section.find_all('a'):
        mapping[str(anchor.text)] = str(anchor['href'])
    return mapping

In [45]:
# load dictionary of US locations mapped to URLs on Craigslist
# (links_US_locations() fetches the Craigslist sites page over the network)
US_locations = links_US_locations()

In [79]:
class cityRideshare:
    """ Scrape the Craigslist rideshare listings of one city.

    city: location name; it is lower-cased and looked up in dict_cities_links.
    dict_cities_links: dictionary mapping location names to Craigslist URLs
        (the URL is spliced in directly after 'https:').
    delay: seconds to sleep after each request (default 1).
    """

    # seconds to wait for a response before giving up on a request
    REQUEST_TIMEOUT = 30
    # the search offset is built as '<page>00', i.e. 100 postings per page
    POSTS_PER_PAGE = 100

    def __init__(self, city, dict_cities_links, delay=1):
        self.city = city.lower()
        self.dict_cities_links = dict_cities_links
        self.delay = delay

        # fetch Craigslist URL from dictionary (None when the city is unknown)
        self.url = self.dict_cities_links.get(self.city)

    def _return_soup(self, rid=None, page=0):
        """ Return BeautifulSoup object of a rideshare page.

        With rid, fetch that single posting; otherwise fetch search page
        number `page`. Raises ValueError when the city has no known URL.
        """

        if self.url is None:
            # otherwise the request would go to 'https:Nonesearch/...'
            raise ValueError(
                "no Craigslist URL known for city {0!r}".format(self.city))

        if rid:
            url = 'https:{0}rid/{1}.html'.format(self.url, rid)
        else:
            url = 'https:{0}search/rid?s={1}00'.format(self.url, page)

        # without a timeout requests.get can block indefinitely
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, "html.parser")

    def _scrape_one_page(self, soup):
        """ Returns pandas DataFrame with RID, title and timestamp of each posting"""

        info = []
        for line in soup.find_all('span', {'class': 'pl'}):
            info.append({
                'id': str(line.find('a', {'class': 'hdrlnk'})['data-id']),
                'title': line.find('span', {'id': 'titletextonly'}).text,
                'timestamp': line.find('time')['datetime'],
            })

        return pd.DataFrame(info)

    def _scrape_posts(self, df):
        """ Scrape each post for posting body and type of ride.
        Add info to existing DataFrame using previously collected RIDs.
        Posts whose page lacks the body or the ride type get None for
        that field instead of aborting the scrape.
        Return DataFrame """

        if df.empty:
            return df

        total = len(df)
        texts = []
        ride_types = []
        for position, rid in enumerate(df['id']):
            if position % 10 == 0:
                print("scraping text from post {0} of {1}".format(position, total))

            soup_post = self._return_soup(rid=rid)

            body = soup_post.find('section', {'id': 'postingbody'})
            texts.append(body.text.replace('\n', '') if body is not None else None)

            attrgroup = soup_post.find('p', {'class': 'attrgroup'})
            label = attrgroup.span if attrgroup is not None else None
            ride_types.append(str(label.text) if label is not None else None)

            time.sleep(self.delay)  # avoid cease and desist from Craigslist

        # assign whole columns at once rather than mutating row by row
        df['text'] = texts
        df['ride_type'] = ride_types
        return df

    def scrape_all_pages(self):
        """ Returns pandas DataFrame with information from all pages of search"""

        soup = self._return_soup()
        count_tag = soup.find('span', {'class': 'totalcount'})
        total = int(count_tag.text) if count_tag is not None else 0

        # ceiling division, but always look at the first page
        per_page = self.POSTS_PER_PAGE
        pages = max(1, (total + per_page - 1) // per_page)

        frames = []
        for page in range(pages):
            print("scraping page: {0}".format(page))
            if page > 0:
                # page 0 was already downloaded to read the total count
                soup = self._return_soup(page=page)
            df = self._scrape_one_page(soup)
            if not df.empty:
                frames.append(df)
            time.sleep(self.delay)

        if not frames:
            return pd.DataFrame()

        # a single concat also renumbers the rows 0..n-1
        all_posts = pd.concat(frames, ignore_index=True)

        return self._scrape_posts(all_posts)

In [80]:
%timeit
df_houston = cityRideshare('houston',US_locations).scrape_all_pages()

scraping page: 0
scraping page: 1
scraping page: 2
scraping page: 3
scraping text from post 0 of 359
scraping text from post 10 of 359
scraping text from post 20 of 359
scraping text from post 30 of 359
scraping text from post 40 of 359
scraping text from post 50 of 359
scraping text from post 60 of 359
scraping text from post 70 of 359
scraping text from post 80 of 359
scraping text from post 90 of 359
scraping text from post 100 of 359
scraping text from post 110 of 359
scraping text from post 120 of 359
scraping text from post 130 of 359
scraping text from post 140 of 359
scraping text from post 150 of 359
scraping text from post 160 of 359
scraping text from post 170 of 359
scraping text from post 180 of 359
scraping text from post 190 of 359
scraping text from post 200 of 359
scraping text from post 210 of 359
scraping text from post 220 of 359
scraping text from post 230 of 359
scraping text from post 240 of 359
scraping text from post 250 of 359
scraping text from post 260 of 35

In [None]:
#todo: find/write code to scrape dates in all different formats

#todo: detect origin and destination location

#todo: scrape every x days but store past data

#todo: use https://www.craigslist.org/about/sites#US to navigate to different cities. 
#auto-fill options to only show these cities

#todo: inherit from master soup class

In [92]:
# preview the first rows of the Houston postings DataFrame
df_houston.head()

Unnamed: 0,id,timestamp,title,text
0,5519837630,2016-04-11 22:23,DO YOU NEED RIDE TO ANY PART OF HOUSTON AND OT...,For trips to functions and jobs call jason. Fo...
1,5519835301,2016-04-11 22:23,DO YOU NEED RIDE TO ANY PART OF HOUSTON AND OT...,For trips to functions and jobs call jason. Fo...
2,5493065802,2016-04-11 22:23,DO YOU NEED A RIDE TO ANY PART OF HOUSTON,For trips to functions and jobs call jason. Fo...
3,5535002861,2016-04-11 21:59,Seeking ride to Dallas.,"I posted once before, seeking ride to clear up..."
4,5534974217,2016-04-11 21:18,ISO Houston to Austin on 4/29 (Levitation Fest),Yo!I will be flying into Houston around noon o...


In [94]:
# persist the Houston postings DataFrame to houston.pkl
with open('houston.pkl', 'wb') as pickle_file:
    pickle.dump(df_houston, pickle_file)