# Scrape individual posts for amentities

In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib as plt

In [2]:
from scrape_craigslist import get_listing_details, get_page_listings, clpage_to_df

## Getting some URLs to test a few posts

In [3]:
# PDX Housing/Apartment Search Page
url = 'https://portland.craigslist.org/search/mlt/apa?s=120&availabilityMode=0&bundleDuplicates=1'

response = requests.get(url)
    
if response.status_code == 200:
    page = response.text

    # Create soup object from URL
    soup = BeautifulSoup(page, 'html.parser')
    
    # Create DF
    df = clpage_to_df(soup)

Scrape Complete!
Number of Postings Scraped: 124


In [4]:
df.head()

Unnamed: 0,date,title,link,price,brs,sqft,hood
0,Sep 30,"Penthouse, Full size refrigerator with ice mak...",https://portland.craigslist.org/mlt/apa/d/port...,2287,2,1185.0,1030 SW Jefferson St
1,Sep 30,How Does 1 Month Free Sound?! Snag This 1 BR w...,https://portland.craigslist.org/mlt/apa/d/port...,1459,1,,
2,Sep 30,1 bedroom available now! Rent before it's gone!,https://portland.craigslist.org/mlt/apa/d/port...,1420,1,765.0,910 N. Harbour Drive
3,Sep 30,Fall In Love in w/ This Corner 1 bedroom Home ...,https://portland.craigslist.org/mlt/apa/d/port...,1785,1,823.0,"Slabtown, Alphabet District, Pearl, NW Portland"
4,Sep 30,"The Emery, The ideal space to balance life, wo...",https://portland.craigslist.org/mlt/apa/d/port...,1795,2,718.0,Portland South Waterfront


## Function to scrape URL listings for amenities details 

In [11]:
def get_post_amenities(url_list):
    '''
    Function to scrape a list of Craigslist URLs for bathroom & amenities details. 
    Returns bathroom_list for bathrooms
    Returns amenties_list for amenities 
    '''
    
    bathrooms_list = []
    amenities_list = []
    
    index = 0
    
    for url in url_list:
        
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'html.parser')
        
        
        # Each
        # Test how many groups there are: 
        
        # If 2 groups
        
        if len(soup.find_all('p', class_='attrgroup')) > 1:
    
            group1 = soup.find_all('p', class_='attrgroup')[0].text.split('\n')
            item_list1 = [item for item in group1 if item != '']
            
            # Check to see if first grouping contains number of bathrooms
            
            if item_list1[0][-2:] == 'Ba':
                brba = item_list1[0].split(' / ')
                bath = brba[-1]
            
            # if not bathrooms, then NaN and move on
            else:
                bath = np.nan
    
            # Second group should contain amenities - append raw list to amentities list
            
            group2 = soup.find_all('p', class_='attrgroup')[1].text.split('\n')
            amenities = [item for item in group2 if item != '']
        
        # If only 1 group 
        
        elif len(soup.find_all('p', class_='attrgroup')) == 1:
            
            items = soup.find_all('p', class_='attrgroup')[0].text.split('\n')
            item_list = [item for item in group1 if item != '']
            
            # Check to see if that group contains number of bathrooms
            if item_list[0][-2:] == 'Ba':
                brba = item_list1[0].split(' / ')
                bath = brba[-1]
            
            # otherwise, we just have amenities
            else:
                amenities = item_list
        
        # If no details on post page, fill with NaN
        else:
            bath = np.nan
            amenities = np.nan
        
        print("Index: ", index)      # Comment all this out later
        print("Baths: ", bath)
        print("Amens: ", amenities)
        print("")
        
        bathrooms_list.append(bath)
        amenities_list.append(amenities)

        index += 1
        
    print("Number of Posts Scraped: ", index)        
    return bathrooms_list, amenities_list

In [12]:
urls = list(df.link)

post_details = get_post_amenities(urls)

Index:  0
Baths:  2Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage']

Index:  1
Baths:  1Ba
Amens:  ['EV charging', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage']

Index:  2
Baths:  1Ba
Amens:  ['open house dates', 'thursday 2020-10-01', 'friday 2020-10-02']

Index:  3
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage', 'wheelchair accessible']

Index:  4
Baths:  2Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'off-street parking', 'wheelchair accessible']

Index:  5
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 'street parking']

Index:  6
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 'street parking

Index:  59
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking']

Index:  60
Baths:  2Ba
Amens:  ['application fee details: 45.00 per adult applicant', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'carport']

Index:  61
Baths:  2Ba
Amens:  ['application fee details: 45.00 per adult applicant', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'carport']

Index:  62
Baths:  2Ba
Amens:  ['condo', 'w/d in unit', 'no smoking']

Index:  63
Baths:  2Ba
Amens:  ['EV charging', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'street parking', 'wheelchair accessible']

Index:  64
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'flat', 'w/d in unit', 'no smoking', 'off-street parking']

Index:  65
Baths:  1Ba
Amens:  ['application fee details: $45.00 Screening Fee', 'cats are OK - purrr', 'dogs are OK - wooof', 'furni

Index:  116
Baths:  1Ba
Amens:  ['condo', 'w/d in unit', 'no smoking', 'detached garage']

Index:  117
Baths:  1Ba
Amens:  ['apartment', 'w/d in unit', 'no smoking']

Index:  118
Baths:  1Ba
Amens:  ['condo', 'w/d in unit', 'no smoking', 'street parking']

Index:  119
Baths:  1.5Ba
Amens:  ['apartment', 'w/d in unit', 'no smoking', 'street parking']

Index:  120
Baths:  2Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'carport']

Index:  121
Baths:  2Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage']

Index:  122
Baths:  1Ba
Amens:  ['application fee details: $40', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d hookups', 'no smoking']

Index:  123
Baths:  2Ba
Amens:  ['EV charging', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage', 'wheelchair accessible']

Number of Posts Scraped:  124


In [7]:
len(urls)

124

In [14]:
links = urls

index = 0

ones = []
twos = []
threes = []

for url in urls:
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'html.parser')
    
    items = soup.find_all('p', class_='attrgroup')
    groupings = len(items)
    if groupings == 1:
        ones.append((index, groupings, list(items)))
    elif groupings == 2:
        twos.append((index, groupings, list(items)))
    else:
        threes.append((index, groupings, list(items)))
    
    index += 1  

threes have **"OPEN HOUSE"**

In [17]:
print("Number of 3-groups: ", len(threes))

for group in threes:
    print("index: ", group[0])
    print(group[2])
    print("")

Number of 3-groups:  4
index:  2
[<p class="attrgroup">
<span class="shared-line-bubble"><b>1BR</b> / <b>1Ba</b></span>
<span class="shared-line-bubble"><b>765</b>ft<sup>2</sup></span>
<span class="housing_movein_now property_date shared-line-bubble" data-date="2020-09-30" data-today_msg="available now">available sep 30</span>
</p>, <p class="attrgroup">
<b>open house dates</b>
<br/>
<span class="otherpostings">
<a href="/search/apa?sale_date=2020-10-01">thursday 2020-10-01</a>
</span>
<br/>
<span class="otherpostings">
<a href="/search/apa?sale_date=2020-10-02">friday 2020-10-02</a>
</span>
</p>, <p class="attrgroup">
<span>cats are OK - purrr</span>
<br/>
<span>dogs are OK - wooof</span>
<br/>
<span>apartment</span>
<br/>
<span>w/d in unit</span>
<br/>
<span>no smoking</span>
<br/>
<span>carport</span>
<br/>
</p>]

index:  29
[<p class="attrgroup">
<span class="shared-line-bubble"><b>2BR</b> / <b>2Ba</b></span>
<span class="shared-line-bubble"><b>966</b>ft<sup>2</sup></span>
<span cl