# Scraping multiple pages of listings

In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

from random import randint
from time import sleep

%matplotlib inline
# `plt` is the conventional alias for the pyplot interface, not the top-level package.
import matplotlib.pyplot as plt

from scrape_craigslist import *

## Steps

1. URL formatting
    * pagination
    * timing between pages
2. Calling on `full_page_scrape()` to get listings & post details
3. Return list of dataframes
4. Concatenate list of dataframes into one super_df

### URL Formatting

Note: search parameters = `bundle duplicates`

URL formatting: 

```
start_url = 'https://sfbay.craigslist.org/search/apa?availabilityMode=0&bundleDuplicates=1'
```

```
page_2 = 'https://sfbay.craigslist.org/search/apa?availabilityMode=0&bundleDuplicates=1&s=120'
```

Difference = `&s=120`

In [2]:
def get_results_urls(start_url, results_per_page=120):
    """Build the URL of every results page for a search.

    Fetches ``start_url`` once, reads the total number of listings from the
    ``<span class="totalcount">`` element, and returns one URL per results
    page by appending an ``s=<offset>`` query parameter to ``start_url``.

    Parameters
    ----------
    start_url : str
        URL of the first page of search results.
    results_per_page : int, optional
        Offset step between consecutive results pages. Defaults to 120,
        the step seen in the example page-2 URL (``&s=120``).

    Returns
    -------
    list of str
        One URL per results page, starting at offset 0. Empty when the
        search reports zero listings.

    Raises
    ------
    requests.HTTPError
        If the request for ``start_url`` returns an error status.
    ValueError
        If ``results_per_page`` is not positive, or the total-count element
        is missing from the page.
    """
    if results_per_page < 1:
        raise ValueError("results_per_page must be a positive integer")

    response = requests.get(start_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    count_tag = soup.find('span', class_='totalcount')
    if count_tag is None:
        raise ValueError(
            "Could not find the 'totalcount' element at {}".format(start_url)
        )
    total_listings = int(count_tag.text)

    # Use '&' when the URL already carries a query string, '?' otherwise.
    separator = '&' if '?' in start_url else '?'

    # range(0, total, step) yields exactly one offset per page that holds at
    # least one listing, including a final partially filled page.
    return [
        '{}{}s={}'.format(start_url, separator, offset)
        for offset in range(0, total_listings, results_per_page)
    ]

In [25]:
def full_listings_scrape(start_url, max_pages=2):
    """Scrape several results pages and combine them into one DataFrame.

    Looks up every results-page URL with ``get_results_urls(start_url)``,
    calls ``full_page_scrape(url)`` on each page to be scraped, and
    concatenates the per-page frames.

    Parameters
    ----------
    start_url : str
        URL of the first page of search results.
    max_pages : int or None, optional
        Maximum number of results pages to scrape. Defaults to 2 so that a
        run stays short; pass ``None`` to scrape every page.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-page frames. ``reset_index()`` keeps each
        frame's previous index as an ``index`` column. An empty DataFrame is
        returned when there are no pages to scrape.

    Notes
    -----
    ``full_page_scrape`` is not defined in this notebook (presumably it comes
    from ``scrape_craigslist``); it is expected to return a DataFrame for one
    results page.
    """
    results_urls = get_results_urls(start_url)
    # Progress messages report the total number of available pages, even when
    # max_pages limits how many are actually scraped.
    total_pages = len(results_urls)
    urls_to_scrape = results_urls if max_pages is None else results_urls[:max_pages]

    df_list = []
    for page_counter, url in enumerate(urls_to_scrape, start=1):
        # Random 1-4 second pause before each page.
        sleep(randint(1, 4))

        print("Scraping page {} of {}...".format(page_counter, total_pages))
        print("")
        df_list.append(full_page_scrape(url))

        print("")
        print("Page {} of {} scrape complete!".format(page_counter, total_pages))
        print("")

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not df_list:
        return pd.DataFrame()

    return pd.concat(df_list).reset_index()

In [26]:
# First results page of the search; `bundleDuplicates=1` is the "bundle duplicates" search option.
start_url = 'https://sfbay.craigslist.org/search/eby/apa?bundleDuplicates=1'

In [27]:
# Runs against the live site (network I/O plus random pauses); only the first two results pages are scraped.
df = full_listings_scrape(start_url)

Scraping page 1 of 25...

Listing page scrape complete!
Number of postings scraped: 122
Individual posts scrape complete!
Number of posts scraped:  122

Page 1 of 25 scrape complete!

Scraping page 2 of 25...

Listing page scrape complete!
Number of postings scraped: 120
Individual posts scrape complete!
Number of posts scraped:  120

Page 2 of 25 scrape complete!



In [29]:
# Preview the first rows; the `index` column is produced by `reset_index()` in `full_listings_scrape`.
df.head()

Unnamed: 0,index,date,title,link,price,brs,sqft,hood,bath,amenities
0,0,Oct 1,Crazy Deals On Cozy 1Beds!! Tour The Bridge&Ge...,https://sfbay.craigslist.org/eby/apa/d/walnut-...,1999,1,600.0,walnut creek,1Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
1,1,Oct 1,"Spa, near EMC Corp.",https://sfbay.craigslist.org/eby/apa/d/pleasan...,2531,2,1073.0,dublin / pleasanton / livermore,2Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
2,2,Oct 1,"Pet friendly, renovated townhome with two mont...",https://sfbay.craigslist.org/eby/apa/d/newark-...,2229,2,850.0,fremont / union city / newark,1Ba,"[application fee details: 39, cats are OK - pu..."
3,3,Oct 1,2BR / 2 Ba available now $800.00 Sec. deposit ...,https://sfbay.craigslist.org/eby/apa/d/walnut-...,2175,2,900.0,walnut creek,2Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
4,4,Oct 1,"Pet friendly, renovated townhome with in home ...",https://sfbay.craigslist.org/eby/apa/d/newark-...,2397,2,850.0,fremont / union city / newark,1Ba,"[application fee details: 39, cats are OK - pu..."


In [30]:
# The `index` column holds each row's position within its own results page, so here it differs from the row labels.
df.tail()

Unnamed: 0,index,date,title,link,price,brs,sqft,hood,bath,amenities
237,115,Oct 1,Price Reduction on Luxury Living on South Faci...,https://sfbay.craigslist.org/eby/apa/d/hercule...,1959,,549.0,"hercules, pinole, san pablo, el sob",1Ba,"[EV charging, cats are OK - purrr, dogs are OK..."
238,116,Oct 1,Studio Home Close to UC Berkeley! Apt 610 8WKS...,https://sfbay.craigslist.org/eby/apa/d/berkele...,2000,,,berkeley,1Ba,"[EV charging, cats are OK - purrr, dogs are OK..."
239,117,Oct 1,Ground Floor Two Bedroom Beauty! W/D Inside Too!,https://sfbay.craigslist.org/eby/apa/d/concord...,2380,2.0,825.0,concord / pleasant hill / martinez,2Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
240,118,Oct 1,"Quiet, Clean and Peaceful, below market rents!...",https://sfbay.craigslist.org/eby/apa/d/oakland...,1914,,990.0,Oakland,1Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
241,119,Oct 1,Beautiful Upgrades for This Large One Bedroom ...,https://sfbay.craigslist.org/eby/apa/d/dublin-...,2190,1.0,650.0,danville / san ramon,1Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."


In [31]:
# Total rows across the scraped pages (122 + 120 in the recorded run).
len(df)

242

In [33]:
# Pull the `amenities` column out on its own (a pandas Series, despite the name).
amens_list = df['amenities']

In [34]:
# One entry per listing row, so this matches len(df).
len(amens_list)

242

In [36]:
# for item in amens_list:
#     print(item)
#     print("")