In [1]:
# Import libraries:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
from uszipcode import SearchEngine, SimpleZipcode, Zipcode
import re

In [2]:
# Function to gather housing data from trulia.com based on page URL.
def gather_housing_data(url, zipcode):
    """Scrape one search-result page and return its listings.

    Parameters
    ----------
    url : str
        Address of the result page to download.
    zipcode
        Value stored unchanged under the 'zipcode' key of every listing.

    Returns
    -------
    tuple
        ``(results, url)`` where ``results`` is a list with one dict per
        price element found on the page (keys: 'zipcode', 'price',
        'qualities', 'address', 'city/state') and ``url`` is the argument
        handed back unchanged.
    """
    # Download and parse the page.
    header = {'User-Agent':'cjbratkov'}
    res = requests.get(url, headers = header)
    soup = BeautifulSoup(res.content, 'lxml')

    def texts_of(tag, attrs):
        # Text of every element matching the tag/attribute filter, in page order.
        return [node.text for node in soup.find_all(tag, attrs)]

    # Price of each house; the number of prices decides the number of rows.
    prices = texts_of('span', {'class': 'cardPrice h5 man pan typeEmphasize noWrap typeTruncate'})
    # Street address of each house.
    addresses = texts_of('div', {'class': 'h6 typeWeightNormal typeTruncate typeLowlight mvn'})
    # City/state of each house.
    cities = texts_of('div', {'class': 'typeTruncate typeLowlight'})
    # Bedrooms, bathrooms, and area of each house, with thousands separators removed.
    info = [text.replace(',', '') for text in texts_of('ul', {'data-testid': 'cardDescription'})]

    def value_at(values, position):
        # Fields are matched to prices purely by position; a field list that is
        # shorter than the price list yields NaN for the trailing rows.
        return values[position] if position < len(values) else np.nan

    results = [
        {
            'zipcode': zipcode,
            'price': value_at(prices, position),
            'qualities': value_at(info, position),
            'address': value_at(addresses, position),
            'city/state': value_at(cities, position),
        }
        for position in range(len(prices))
    ]

    # Return the results from the page and url.
    return results, url

In [3]:
# Function to update the URL to the next page.
def update_url(page_url):
    """Return the URL of the page that follows ``page_url``.

    The page is downloaded and its pagination anchors (``<a>`` elements with
    class 'pvl phm') are inspected. The href of the first anchor whose
    'aria-label' is 'Next page' is returned.

    Parameters
    ----------
    page_url : str
        Address of the page currently being scraped.

    Returns
    -------
    str
        The href of the "Next page" anchor, or ``page_url`` itself when the
        page has no such anchor.
    """
    # Initialize header, request, and beautifulsoup.
    header = {'User-Agent':'cjbratkov'}
    res = requests.get(page_url, headers = header)
    soup = BeautifulSoup(res.content, 'lxml')

    for anchor in soup.find_all('a', {'class': 'pvl phm'}):
        # .get() tolerates anchors that carry no aria-label/href attribute.
        # Returning on the first hit keeps later, unrelated anchors from
        # overwriting the link that was just found.
        if anchor.attrs.get('aria-label') == 'Next page' and anchor.attrs.get('href'):
            return anchor.attrs['href']

    # No next page: stay on the current page.
    return page_url

In [4]:
# Function to clean scraped data and save it to a temporary data frame.
def data_cleaning(df):
    """Turn the scraped text columns into numeric features.

    The frame is modified in place and also returned. It must contain the
    columns 'qualities', 'price' and 'zipcode'; any other columns are left
    untouched.

    Columns written:
      * 'studio' - 1.0 when 'qualities' contains the word 'Studio', else 0.0.
      * 'bath'   - number directly preceding 'ba' in 'qualities'.
      * 'bed'    - number directly preceding 'bd' in 'qualities'.
      * 'price'  - 'price' with '$', ',' and '+' removed, as float.
      * 'zipcode'- cast to int.
      * 'sqft'   - the token that follows the bathroom count, as float.
    The 'qualities' column is dropped afterwards.

    Values that cannot be parsed become NaN instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw scraped listings.

    Returns
    -------
    pandas.DataFrame
        The same frame object, cleaned.
    """
    qualities = df['qualities']
    # One or more digits with an optional decimal part, so multi-digit and
    # fractional counts are captured whole.
    count_pattern = r'(\d+(?:\.\d+)?)'

    # Feature engineering binary representation of whether the property is a studio.
    is_studio = qualities.str.contains('Studio', na=False)
    df['studio'] = is_studio.astype(float)

    # Extract number of bathrooms and bedrooms for each property.
    df['bath'] = pd.to_numeric(
        qualities.str.extract(count_pattern + 'ba', expand=False), errors='coerce'
    ).astype(float)
    df['bed'] = pd.to_numeric(
        qualities.str.extract(count_pattern + 'bd', expand=False), errors='coerce'
    ).astype(float)

    # Assign assumed values of bedroom and bathroom if the property is a studio.
    df.loc[is_studio, ['bed', 'bath']] = 1.0

    # Extract and clean price values for each property.
    price_text = df['price'].astype(str).str.replace(r'[$,+]', '', regex=True)
    df['price'] = pd.to_numeric(price_text, errors='coerce').astype(float)

    # Zipcode feature.
    df['zipcode'] = df['zipcode'].astype(int)

    # Area: everything after the first "<digit>ba" up to the next space.
    sqft_text = qualities.str.extract(r'[0-9]ba([^ ]*)', expand=False)
    df['sqft'] = pd.to_numeric(sqft_text, errors='coerce').astype(float)

    # Drop the raw text column now that every feature has been extracted.
    df.drop(columns=['qualities'], inplace=True)

    # Return cleaned temporary dataframe.
    return df

In [5]:
# Custom dictionary class to track the last page to scrape.
class last_page_dict(dict):
    """Dictionary mapping a pagination link to its page number."""

    def __init__(self):
        # Start out as an empty dict. Rebinding ``self`` to a new dict would
        # only change a local name and leave the instance untouched.
        super().__init__()

    def add_link_with_num(self, link, num):
        """Store ``num`` as the page number for ``link``."""
        self[link] = num

In [6]:
# Driver function for the entire process.
def driver(zipcode):
    """Scrape every result page for one zipcode.

    The zipcode is looked up with ``uszipcode.SearchEngine`` to build the
    first search URL. The pagination links on that page give the highest
    page number, and pages are then walked through ``update_url`` and
    scraped with ``gather_housing_data``.

    Parameters
    ----------
    zipcode
        Zipcode to look up (passed to ``SearchEngine.by_zipcode``).

    Returns
    -------
    list
        One entry per scraped page, each entry being the list of listing
        dicts returned by ``gather_housing_data``. An empty list is returned
        when the lookup does not yield a city, state and zipcode.
    """
    # Initialize list of all results.
    all_results = []
    # Search Engine from uszipcode library to validate zipcode.
    search = SearchEngine()
    zip_search = search.by_zipcode(zipcode)
    base_url = 'https://www.trulia.com/'

    # Gather URL elements if they exist for a zipcode.
    try:
        city = zip_search.major_city
        state = zip_search.state_abbr
        city = city.replace(' ', '_')
        zcode = zip_search.zipcode

    # If not, skip to next zipcode.
    except AttributeError:
        return all_results

    # Initialize url based on zipcode.
    first_url = (base_url + str(state) + '/' + str(city) + '/' + str(zcode) + '/')

    # Initialize header, request, and beautifulsoup.
    header = {'User-Agent':'cjbratkov'}
    res = requests.get(first_url, headers = header)
    soup = BeautifulSoup(res.content, 'lxml')

    # Find the highest page number among the pagination links and remember
    # its URL. Defaults describe a result set with a single page.
    last_page_num = 0
    last_page_url = first_url
    for anchor in soup.find_all('a', {'class': 'pvl phm'}):
        link = anchor.attrs.get('href')
        if not link:
            continue
        # Page links end in "<number>_p/"; anchors without that suffix are
        # skipped instead of raising on a failed match.
        page_match = re.search(r'(\d+)_p/', link)
        if page_match is None:
            continue
        page_num = int(page_match.group(1))
        if page_num > last_page_num:
            last_page_num = page_num
            last_page_url = link

    # URLs already scraped, so no page is downloaded and stored twice.
    scraped_urls = set()

    def scrape(target_url):
        # Gather one page's listings unless that exact URL was already handled.
        if target_url in scraped_urls:
            return
        results, _ = gather_housing_data(target_url, zcode)
        all_results.append(results)
        scraped_urls.add(target_url)

    # Gather data from the first zipcode search result.
    scrape(first_url)
    page_url = first_url

    # Follow the "next page" links, at most once per known page number.
    for _ in range(last_page_num):
        next_url = update_url(page_url)
        # update_url hands back the current URL when there is no next page.
        if next_url in scraped_urls:
            break
        scrape(next_url)
        page_url = next_url

    # Make sure the last page is included even if the walk stopped early.
    scrape(last_page_url)

    return all_results

In [7]:
# DataFrame builder based on results gathered from trulia.com.
def df_builder(all_results):
    """Flatten the gathered results into one DataFrame.

    Parameters
    ----------
    all_results : list
        Nested as zipcode -> page -> listing dict.

    Returns
    -------
    pandas.DataFrame
        One row per listing, keeping only the first row for each distinct
        'address' value. When no listings were gathered the frame is empty
        and has no columns.
    """
    # Unpack the nested values from the gathered data.
    listings = [
        listing
        for zip_pages in all_results
        for page in zip_pages
        for listing in page
    ]

    # Instantiate dataframe.
    df = pd.DataFrame(listings)

    # Remove duplicate values. With no listings there is no 'address' column
    # to de-duplicate on, so the empty frame is returned as is.
    if 'address' in df.columns:
        df = df.drop_duplicates(subset='address')

    # Return the dataframe.
    return df

In [8]:
# Function to generate summary statistics based on gathered data.
def summary_stats(df):
    """Compute summary statistics for the float64 columns of ``df``.

    The 'studio' column is skipped and null values are ignored. As a side
    effect the pandas 'display.float_format' option is set to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned listings.

    Returns
    -------
    list of dict
        One dict per column with the keys 'name', 'sum', 'mean', 'min',
        'max' and 'median'.
    """
    # Set float formatting for readability.
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    all_stats = []
    for column_name in df.select_dtypes(include=['float64']).columns:
        # Exclude studio from summary statistics.
        if column_name == 'studio':
            continue

        # Exclude null values.
        values = df[column_name].dropna()
        all_stats.append({
            'name': column_name,
            'sum': np.sum(values),
            'mean': np.mean(values),
            'min': np.min(values),
            'max': np.max(values),
            'median': np.median(values),
        })

    # Return the summary statistics.
    return all_stats

In [9]:
def big_driver(zipcode_list):
    """Scrape every zipcode in ``zipcode_list`` and summarise the listings.

    Parameters
    ----------
    zipcode_list : list
        Zipcodes to scrape; each one is passed to ``driver``.

    Returns
    -------
    tuple
        ``(stats, invalid_zips)``. ``stats`` is the list produced by
        ``summary_stats`` (empty when no listing was gathered at all) and
        ``invalid_zips`` holds the zipcodes for which ``driver`` returned
        nothing.
    """
    # Cumulative results and the zipcodes that produced none.
    all_results = []
    invalid_zips = []

    # Gather data for each zipcode input by the user.
    for z in zipcode_list:
        result = driver(z)

        if result:
            all_results.append(result)
        else:
            invalid_zips.append(z)

    # Without a single listing there is nothing to build or clean; the
    # cleaning step needs the scraped columns to exist.
    has_listings = any(page for zip_pages in all_results for page in zip_pages)
    if not has_listings:
        return [], invalid_zips

    # Build temporary dataframe of gathered data.
    df = df_builder(all_results)
    # Clean gathered data.
    df = data_cleaning(df)
    # Generate summary statistics on cleaned data.
    stats = summary_stats(df)

    # Return summary statistics and invalid zipcodes.
    return stats, invalid_zips

In [10]:
def user_zips():
    """Prompt for zipcodes until an empty line is entered.

    An entry is accepted when it consists of exactly five decimal digits and
    is greater than zero; anything else prints an error and prompts again.

    Returns
    -------
    list of int
        The accepted zipcodes in the order they were first entered, with
        duplicates removed.
    """
    # Initialize empty list of zipcodes.
    zip_list = []

    # Prompt the user for input until no input is given.
    while True:
        print('Enter a zipcode or press enter to get your summary statistics: ')
        zipcode = input('--->')

        # Empty input ends the prompt loop.
        if zipcode == '':
            # dict.fromkeys removes duplicates while keeping entry order.
            return list(dict.fromkeys(zip_list))

        # Input validation.
        if len(zipcode) == 5 and zipcode.isdecimal() and int(zipcode) > 0:
            zip_list.append(int(zipcode))
        else:
            print('Value error. Please try again.')

In [11]:
# Function to drive entire process.
def damage_estimate():
    """Run the whole interactive workflow and print the results.

    Asks the user for zipcodes, gathers and summarises the listings for
    them, then prints the zipcodes that produced no data (if any), the
    zipcodes that were used, and the summary statistics as a DataFrame.
    """
    # Get the user input list of zipcodes.
    requested_zips = user_zips()

    # Get the summary statistics and the zipcodes that yielded nothing.
    stats, invalid_zips = big_driver(requested_zips)

    # Zipcodes that actually contributed to the statistics.
    used_zips = [code for code in requested_zips if code not in invalid_zips]

    # Display invalid zipcodes to user if they exist.
    if invalid_zips:
        print('Invalid or Unknown Zipcodes: ', invalid_zips)
        print()

    # Display zipcodes used to generate summary statistics.
    print('Estimated Damage Summary Statistics Zipcode Areas : ', used_zips)
    print()

    # Display summary statistics as a table.
    print(pd.DataFrame.from_dict(stats))

In [12]:
# Run the interactive workflow: prompts for zipcodes, then prints the summary statistics.
# Zipcodes used while testing: [10011, 10002, 10022, 10001]
damage_estimate()

Enter a zipcode or press enter to get your summary statistics: 
--->10001
Enter a zipcode or press enter to get your summary statistics: 
--->10011
Enter a zipcode or press enter to get your summary statistics: 
--->10002
Enter a zipcode or press enter to get your summary statistics: 
--->1002a
Value error. Please try again.
Enter a zipcode or press enter to get your summary statistics: 
--->12
Value error. Please try again.
Enter a zipcode or press enter to get your summary statistics: 
--->10022
Enter a zipcode or press enter to get your summary statistics: 
--->
Estimated Damage Summary Statistics Zipcode Areas :  [10001, 10011, 10002, 10022]

          max       mean     median      min   name           sum
0 67000000.00 3564583.93 1800000.00 62900.00  price 4972594589.00
1       10.00       2.32       2.00     1.00   bath       3233.00
2        9.00       2.09       2.00     1.00    bed       2912.00
3   967300.00   33014.98    1500.00   250.00   sqft   39551947.00
