# Scraping multiple pages of listings

In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

from random import randint
from time import sleep

%matplotlib inline
import matplotlib as plt

from scrape_craigslist import *

## Steps

1. URL formatting
    * pagination
    * timing between pages
2. Calling on `full_page_scrape()` to get listings & post details
3. Return list of dataframes
4. Concatenate the list of dataframes into one `super_df`

### URL Formatting

Note: search parameters = `bundle duplicates`

URL formatting: 

```
start_url = 'https://sfbay.craigslist.org/search/apa?availabilityMode=0&bundleDuplicates=1'
```

```
page_2 = 'https://sfbay.craigslist.org/search/apa?availabilityMode=0&bundleDuplicates=1&s=120'
```

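Difference = `&s=120` — the `s` query parameter is the offset of the first listing on the page, and it advances by 120 for each results page.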

In [8]:
def get_results_urls(start_url, results_per_page=120, timeout=30):
    """Build the URL of every results page for a search.

    Fetches ``start_url`` once, reads the total number of listings from the
    ``<span class="totalcount">`` element, and returns one URL per results
    page by appending the ``s`` (listing offset) query parameter.

    Parameters
    ----------
    start_url : str
        URL of the first page of search results.
    results_per_page : int, default 120
        Number of listings shown per results page; used as the offset step.
    timeout : float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list of str
        One URL per results page, with offsets ``0, results_per_page, ...``.
        The list is empty when the search reports zero listings.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If ``results_per_page`` is smaller than 1, or the total-count
        element is missing or does not contain an integer.
    """
    if results_per_page < 1:
        raise ValueError(
            "results_per_page must be >= 1, got {}".format(results_per_page)
        )

    response = requests.get(start_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    count_tag = soup.find('span', class_='totalcount')
    if count_tag is None:
        raise ValueError(
            "Could not find the 'totalcount' element at {}".format(start_url)
        )
    total_listings = int(count_tag.text)

    # The page at offset k starts with listing k + 1, so valid offsets are
    # 0 <= k < total_listings. This keeps the final, partially filled page
    # (e.g. 250 listings -> offsets 0, 120, 240) and yields a single page
    # when there are fewer listings than fit on one page.
    offsets = range(0, total_listings, results_per_page)

    # Use '?' when start_url has no query string yet, '&' otherwise.
    separator = '&' if '?' in start_url else '?'

    return [
        '{}{}s={}'.format(start_url, separator, offset)
        for offset in offsets
    ]

In [24]:
def full_listings_scrape(start_url, max_pages=2):
    """Scrape several results pages of a search into a single DataFrame.

    Resolves the results-page URLs with ``get_results_urls(start_url)``,
    calls ``full_page_scrape(url)`` on each page (pausing a random 1-3
    seconds before each one), and concatenates the per-page DataFrames.

    Parameters
    ----------
    start_url : str
        URL of the first page of search results.
    max_pages : int or None, default 2
        Maximum number of results pages to scrape. The default keeps the
        run short while developing; pass ``None`` to scrape every page.

    Returns
    -------
    pandas.DataFrame
        All scraped listings with a fresh 0..n-1 index. An empty DataFrame
        is returned when there is nothing to scrape.

    Raises
    ------
    ValueError
        If ``max_pages`` is negative.

    Notes
    -----
    The "of N" figure in the progress messages is the total number of
    results pages found, not the number of pages that will be scraped.
    """
    if max_pages is not None and max_pages < 0:
        raise ValueError("max_pages must be >= 0 or None, got {}".format(max_pages))

    results_urls = get_results_urls(start_url)
    total_pages = len(results_urls)

    urls_to_scrape = results_urls if max_pages is None else results_urls[:max_pages]

    df_list = []
    for page_counter, url in enumerate(urls_to_scrape, start=1):
        # Random pause so requests are not fired back-to-back at the server.
        sleep(randint(1, 3))

        print("Scraping page {} of {}...".format(page_counter, total_pages))
        print("")
        # full_page_scrape is handed the URL itself, so the page is not
        # downloaded here first: the response was never used and only
        # added an extra request per page.
        df_list.append(full_page_scrape(url))

        print("")
        print("Page {} of {} scrape complete!".format(page_counter, total_pages))
        print("")

    # pd.concat raises on an empty list; return an empty frame instead.
    if not df_list:
        return pd.DataFrame()

    # Assumes each per-page frame carries a plain positional index (TODO
    # confirm against full_page_scrape); renumbering avoids duplicate labels.
    return pd.concat(df_list, ignore_index=True)

In [25]:
# First results page of the search to scrape; the `bundleDuplicates=1`
# query parameter is set on this URL.
start_url = 'https://sfbay.craigslist.org/search/eby/apa?bundleDuplicates=1'

In [26]:
# Run the multi-page scrape; the result is a single DataFrame that combines
# the listings from every page that was scraped.
df = full_listings_scrape(start_url)

Scraping page 1 of 25...

Listing page scrape complete!
Number of postings scraped: 120
Individual posts scrape complete!
Number of posts scraped:  120

Page 1 of 25 scrape complete!

Scraping page 2 of 25...

Listing page scrape complete!
Number of postings scraped: 121
Individual posts scrape complete!
Number of posts scraped:  121

Page 2 of 25 scrape complete!



In [22]:
# Preview the first rows of the combined listings.
df.head()

Unnamed: 0,date,title,link,price,brs,sqft,hood,bath,amenities
0,Oct 1,Home for rent in Crow Canyon Country Club A ga...,https://sfbay.craigslist.org/eby/apa/d/danvill...,3400,2.0,1496.0,danville / san ramon,2Ba,"[townhouse, w/d in unit, no smoking, attached ..."
1,Oct 1,Find Your Studio Home Today (So You Can Use th...,https://sfbay.craigslist.org/eby/apa/d/walnut-...,1742,,466.0,walnut creek,1Ba,"[application fee details: $25.00 per adult, ca..."
2,Oct 1,"ASAP,STUDIO/InLaw,Furnish,Negotiable,BathW/Tub...",https://sfbay.craigslist.org/eby/apa/d/berkele...,1595,,,berkeley,1Ba,"[application fee details: $25.00 per adult, ca..."
3,Oct 1,Massive and Remodeled Top Floor 1 BD w/Small D...,https://sfbay.craigslist.org/eby/apa/d/berkele...,1925,1.0,600.0,berkeley,1Ba,[application fee details: 30.00 (non-refundabl...
4,Oct 1,*Stylish Apt w/hw floors(2440 8th Ave #17),https://sfbay.craigslist.org/eby/apa/d/oakland...,2250,2.0,,oakland lake merritt / grand,1Ba,"[apartment, no smoking]"


In [23]:
# Number of rows collected across the scraped pages.
len(df)

242