# Scrape individual posts for amenities

In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib as plt

In [2]:
from scrape_craigslist import get_listing_details, get_page_listings, clpage_to_df

## Getting some URLs to test a few posts

In [3]:
# PDX Housing/Apartment Search Page
url = 'https://portland.craigslist.org/search/mlt/apa?s=120&availabilityMode=0&bundleDuplicates=1'

response = requests.get(url)

# Fail loudly instead of silently skipping: otherwise `df` would be left
# undefined (or stale from an earlier run) and the next cell would break
# with a confusing NameError or show outdated data.
if response.status_code != 200:
    raise RuntimeError(
        "Craigslist search request failed with HTTP status {}".format(response.status_code)
    )

page = response.text

# Create soup object from the search-results HTML
soup = BeautifulSoup(page, 'html.parser')

# Create DF (one row per listing) with the project helper
df = clpage_to_df(soup)

Scrape Complete!
Number of Postings Scraped: 121


In [4]:
# Preview the first rows of the DataFrame built from the search page
df.head()

Unnamed: 0,date,title,link,price,brs,sqft,hood
0,Sep 30,Immaculate 2 bed 2 bath w/ an open concept!,https://portland.craigslist.org/mlt/apa/d/lake...,1792,2.0,1176.0,"1 Jefferson Parkway Lake Oswego, OR"
1,Sep 30,Upscale condo in the heart of the Pearl,https://portland.craigslist.org/mlt/apa/d/port...,1725,,780.0,Pearl
2,Sep 30,2 Bedroom Apartment Available with Washer/Drye...,https://portland.craigslist.org/mlt/apa/d/port...,1750,2.0,887.0,Portland
3,Sep 30,Classic 1 Bedroom Downtown Apartment *Special*,https://portland.craigslist.org/mlt/apa/d/port...,999,1.0,624.0,221 NW Fifth Avenue
4,Sep 30,"Soak In The Amenities! Fitness Room, Spa, Saun...",https://portland.craigslist.org/mlt/apa/d/port...,1077,,,


## Function to scrape URL listings for amenities details 

In [13]:
def _split_attr_lines(group):
    '''Return the non-empty text lines of a single attrgroup tag.'''
    return [line for line in group.text.split('\n') if line != '']


def get_post_amenities(url_list, verbose=True):
    '''
    Scrape a list of Craigslist post URLs for bathroom & amenities details.

    Each post page is searched for <p class="attrgroup"> blocks:
      * the first block may start with a "<N>BR / <N>Ba" entry, from which
        the bathroom token (e.g. '2Ba') is taken;
      * with exactly 2 blocks, the second block holds the amenities;
      * with 3 or more blocks, the second block is skipped (open house
        dates) and the third block holds the amenities;
      * with a single block, it is either the BR/Ba block (no amenities)
        or the amenities block (no bathrooms).

    Parameters
    ----------
    url_list : iterable of str
        URLs of individual posts.
    verbose : bool, default True
        When True, print the per-post index, bathrooms and amenities.

    Returns
    -------
    bathrooms_list : list
        One entry per URL: the bathroom string, or np.nan when missing.
    amenities_list : list
        One entry per URL: list of amenity strings, or np.nan when missing.
    '''

    bathrooms_list = []
    amenities_list = []

    for index, url in enumerate(url_list):

        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Query the page once instead of re-running find_all in every branch
        groups = soup.find_all('p', class_='attrgroup')

        # Reset for every post so values never leak from the previous
        # iteration; NaN is the "not available" marker for both fields.
        bath = np.nan
        amenities = np.nan

        if groups:
            first_items = _split_attr_lines(groups[0])

            # Guard against an empty first group before indexing into it
            has_bath = bool(first_items) and first_items[0][-2:] == 'Ba'
            if has_bath:
                bath = first_items[0].split(' / ')[-1]

            if len(groups) > 1:
                # 2 groups -> amenities are second; otherwise the second
                # group is open house dates and amenities are third.
                amenities_group = groups[1] if len(groups) == 2 else groups[2]
                amenities = _split_attr_lines(amenities_group)
            elif not has_bath:
                # A lone group without BR/Ba info is the amenities list
                amenities = first_items

        if verbose:
            print("Index: ", index)
            print("Baths: ", bath)
            print("Amens: ", amenities)
            print("")

        bathrooms_list.append(bath)
        amenities_list.append(amenities)

    print("Number of Posts Scraped: ", len(bathrooms_list))
    return bathrooms_list, amenities_list

In [14]:
# Gather every listing link from the search-results frame
urls = [link for link in df['link']]

# Scrape each post; the result is a (bathrooms_list, amenities_list) tuple
post_details = get_post_amenities(urls)

Index:  0
Baths:  2Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit']

Index:  1
Baths:  1Ba
Amens:  ['loft', 'w/d in unit', 'no smoking', 'attached garage']

Index:  2
Baths:  1Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking']

Index:  3
Baths:  1Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'no smoking']

Index:  4
Baths:  1Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'no smoking']

Index:  5
Baths:  2Ba
Amens:  ['EV charging', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage', 'wheelchair accessible']

Index:  6
Baths:  1Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 

Index:  59
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'no smoking', 'off-street parking']

Index:  60
Baths:  1Ba
Amens:  ['apartment']

Index:  61
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage', 'wheelchair accessible']

Index:  62
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 'street parking']

Index:  63
Baths:  1Ba
Amens:  ['apartment', 'laundry on site', 'no smoking', 'no parking']

Index:  64
Baths:  1Ba
Amens:  ['apartment', 'laundry on site', 'no smoking', 'no parking']

Index:  65
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit']

Index:  66
Baths:  1Ba
Amens:  ['EV charging', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage', 'wheelchair accessible']

Index:  67
Baths:  1Ba
Ame

Index:  114
Baths:  2Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'apartment', 'laundry in bldg', 'no smoking', 'street parking']

Index:  115
Baths:  1Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'apartment', 'laundry in bldg', 'no smoking', 'off-street parking']

Index:  116
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'no smoking', 'attached garage', 'wheelchair accessible']

Index:  117
Baths:  1Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment']

Index:  118
Baths:  1Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'apartment', 'laundry in bldg', 'no smoking', 'off-street parking']

Index:  119
Baths:  2Ba
Amens:  ['cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 'off-street parking']

Index:  120
Baths:  1Ba
Amens:  ['application fee details: 45.00', 'cats are OK - purrr', 'apartment', 'laundry in bldg'

In [15]:
# Number of listing URLs collected from the search page
len(urls)

121

In [16]:
links = urls

# Bucket every post by how many <p class="attrgroup"> blocks its page has.
# Each entry is a tuple: (position in `urls`, group count, list of tags).
ones = []
twos = []
threes = []
others = []   # zero groups, or more than three -- kept out of `threes`

buckets = {1: ones, 2: twos, 3: threes}

for index, url in enumerate(urls):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'html.parser')

    items = soup.find_all('p', class_='attrgroup')
    groupings = len(items)

    # Only exact matches land in ones/twos/threes; previously a bare `else`
    # put posts with 0 or 4+ groups into `threes` as well.
    buckets.get(groupings, others).append((index, groupings, list(items)))

Posts with three attribute groups (`threes`) have an **"OPEN HOUSE"** section as their second group.

In [9]:
# Report how many posts have three attribute groups, then dump each one
three_group_count = len(threes)
print("Number of 3-groups: ", three_group_count)

for entry in threes:
    post_index = entry[0]
    tags = entry[2]
    print("index: ", post_index)
    print(tags)
    print("")

Number of 3-groups:  3
index:  5
[<p class="attrgroup">
<span class="shared-line-bubble"><b>2BR</b> / <b>2Ba</b></span>
<span class="shared-line-bubble"><b>1208</b>ft<sup>2</sup></span>
<span class="housing_movein_now property_date shared-line-bubble" data-date="2020-09-30" data-today_msg="available now">available sep 30</span>
</p>, <p class="attrgroup">
<b>open house dates</b>
<br/>
<span class="otherpostings">
<a href="/search/apa?sale_date=2020-10-01">thursday 2020-10-01</a>
</span>
<br/>
<span class="otherpostings">
<a href="/search/apa?sale_date=2020-10-02">friday 2020-10-02</a>
</span>
<br/>
<span class="otherpostings">
<a href="/search/apa?sale_date=2020-10-03">saturday 2020-10-03</a>
</span>
</p>, <p class="attrgroup">
<span>EV charging</span>
<br/>
<span>cats are OK - purrr</span>
<br/>
<span>dogs are OK - wooof</span>
<br/>
<span>apartment</span>
<br/>
<span>w/d in unit</span>
<br/>
<span>no smoking</span>
<br/>
<span>attached garage</span>
<br/>
<span>wheelchair accessible<