## Scrap content from selected propsearch property links

Having scrapped area lists containing all projects/buildings, this nb helps us scrap content for specific projects by selecting areas and project completion status.

<Project, Completion Status> : <Dubai Marina, Under Development>

It returns a csv file where each row is a project and each column contains features of that project.

### Auxiliary functions used in main

In [94]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import re

def get_last_photo_update(soup):
    # get string and link of last project photo update to check progress
    construction_updates = soup.find(id='jump-to-construction-updates')
    gallery = construction_updates.find('div', class_='fotorama ps-gallery-portrait')
    raw_caption = gallery.find('a')

    raw_caption = str(raw_caption)
    caption = re.search('data-caption="(.*) Image &amp;copy', raw_caption)
    caption = caption.group(1).replace('\"', '')
    photo_url = re.search('data-full=(.*) href=', raw_caption)
    photo_url = photo_url.group(1).replace('\"', '')
    return caption, photo_url

def scrap_df_to_dict(scrap_df):
    # input: a dataframe with links to scrap from propsearch
    # output: dictionary with scraped data
    # set bounds for urls to scrap
    url_idx_start = 0
    url_idx_end = len(scrap_df) + 1
    # construct an iterable with the original index and the url
    url_list = zip(list(scrap_df.link.iloc[url_idx_start:url_idx_end].index), 
                   list(scrap_df.link.iloc[url_idx_start:url_idx_end]))
    # instantiate dic to hold scrapped data
    scrapped_dict = {}
    # iterate over a list of urls
    for url_key, url in url_list:
        # cast url_key as str to index the dictionary
        url_key = str(url_key)
        # set request header with appropriate permissions
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
        # request url data
        r = requests.get(url, headers=headers)#, proxies=proxies)
        # get content
        content = r.content
        # parse content
        soup = BeautifulSoup(content, "html.parser")

        # add this entry to dictionary, using the same key as in the original dataframe index
        scrapped_dict[url_key] = {}
        # save the url
        scrapped_dict[url_key]['url'] = url
        # add space to save each table with info
        scrapped_dict[url_key]['tables'] = {}

        # get coordinates
        coordinates_div = soup.find('div', attrs={'class':'ps-guide-sub-subheading'})
        coordinates = coordinates_div.text.strip().replace('place', '').replace('\'', ' ')
        # add coordinates to dict
        scrapped_dict[url_key]['coordinates'] = coordinates

        # get las photo update
        try:
            caption, photo_url = get_last_photo_update(soup)
            scrapped_dict[url_key]['last_photo_update'] = {'caption': caption, 'photo_url': photo_url}
        except:
            print('Failed to get photo update for: {}'.format(url))
            pass

        # get tables with contents
        kv_list = soup.findAll('div', class_='ps-kv-list')
        # iterate through tables
        for k in kv_list:
            # get table title
            # some tables won't have a title, which we ignore
            try:
                title = k.find(class_='ps-kv-list-title').text.lower().replace(' ', '-')
            except AttributeError as a_err:
                print('Error on table for: {}'.format(url))
                continue
            # instantiate dict for each title
            scrapped_dict[url_key]['tables'][title] = {}
            # get table content
            contents = k.findAll('div', class_='grid-x')
            # iterate through table content
            for content in contents:
                entry = content.text.strip().split('\n')
                scrapped_dict[url_key]['tables'][title][entry[0]] = entry[1]
                
    return scrapped_dict

def merge_scrapped_dict_to_df(df, scrapped_dict):
    # input: df: main df with links and project area details
    # input: scrapped_dict: dict with granular details per project
    # output: merged_df: concat main df with df created with scrapped_dict
    
    # create df from scrapped dictionary, transpose to get the numbers as index
    scrapped_df = pd.DataFrame(scrapped_dict).T
    # scrapped_dict dataframe will have an 'object' type index, whereas the df is 'int64'
    # recast scrapped_dict index to match df, otherwise the concatenation operation returns nan values
    scrapped_df.index = scrapped_df.index.astype('int64')
    # merge with df
    merged_df = pd.concat([df, scrapped_df], axis=1)
    # drop duplicate url-link column
    merged_df.drop(columns=['link'], inplace=True)
    return merged_df
    
    
# function that renames the keys in the tables to remove the project name
def rename_table_keys(table):
    # Input: table: contents of each rown on the tables column of the dataframe
    # eg transform: 'mama-shelter-dubai-key-information' -->  #'key-information'
    # extract all available keys
    key_ids = list(table)
    # iterate over keys and rename them to remove the project name from each and make them universal keys
    for key_id in key_ids:
        if 'key-information' in key_id:
            table['key-information'] = table.pop(key_id)
        elif 'companies-associated' in key_id:
            table['companies-associated'] = table.pop(key_id)
        elif  'milestones' in key_id:
            table['milestones'] = table.pop(key_id)
    return table

# lower case column name and use underscore as space separator
def lower_dash_column_rename(column_list):
    # transform to lower case and use undesrcore as word separator
    return [col.strip().lower().replace(' ', '_') for col in column_list]

# shorted the info caption for the last photo taken of the project, to remove the developer name and keep the date
def shorten_photo_caption(caption):
    # input: caption from caption row
    # output: trimmed caption without the tower name
    # e.g.: Waterfall Tower, construction update May 2021. --> update May 2021
    try:
        short_caption = re.search('construction (.*)', caption)
    except:
        print('error reading caption: {}'.format(caption))
        return caption
    # check regex found a match, else return original caption
    if short_caption != None:
        return short_caption.group(1).replace('\"', '').replace('.', '')
    return caption

def flatten_data_df(df):
    # since scrapped data was saved in dictionaries and placed in cells in the df, we wish to/
    # extract those field and convert them into columns, this function accomplishes that
    # input: df with project data and scrapped data which is saved as dictionaries within the df
    # output: a df that contains all data but as a flat df, no more dictionaries within
    
    # For each table with data, extract the data into a dictionary which we will use to create dataframes
    # each dataframe contains all table info for each row, which we will concatenate to the global dataframe
    companies_associated = {}
    key_information = {}
    milestones = {}
    last_photo = {}
    for index, row in df.iterrows():
        companies_associated[index] = row.tables.get('companies-associated', np.nan)
        key_information[index] = row.tables.get('key-information', np.nan)
        milestones[index] = row.tables.get('milestones', np.nan)
        try:
            last_photo[index] = row.last_photo_update
        except AttributeError:
            print('No photos on this project')

    # build dataframes from the dictionaries created
    try:
        companies_associated_df = pd.DataFrame(companies_associated).T
    except ValueError:
        print('No companies associated found')
        # creating empty dataframe instead
        companies_associated_df = pd.DataFrame()
        
    key_information_df = pd.DataFrame(key_information).T
    try:
        milestones_df = pd.DataFrame(milestones).T
    except ValueError:
        print('No milestones found: {}'.format(ValueError))
        # creating empty dataframe instead
        milestones_df = pd.DataFrame()
    last_photo_df = pd.DataFrame(last_photo).T

    # cluster all dataframes into a list for easy access
    compound_df_list = [companies_associated_df, key_information_df, milestones_df, last_photo_df]

    # reaname columns
    for comp_df in compound_df_list:
        comp_df.columns = lower_dash_column_rename(comp_df.columns)

    # since key_information_df has an area column which clashes in name with the original dataframe /
    # we rename this to area_propsearch
    key_information_df_column_rename = {'area': 'area_propsearch'}
    key_information_df.rename(columns=key_information_df_column_rename, inplace=True)

    # apply shorten_photo_caption on the last_photo_df caption column
    try:
        last_photo_df.caption = last_photo_df.caption.apply(shorten_photo_caption);
    except AttributeError:
        print('Found no caption')

    # concat all table dfs 
    compound_info_df = pd.concat(compound_df_list, axis=1)

    return compound_info_df


### main

In [67]:
# main function
def process_scrap_output(project_status, project_area, df):
    
    # get df window by status and area
    df = df[df.status == project_status]
    df = df[df.area == project_area]
    
    # scrap links in df and build a dictionary with all info
    print('scrapping links')
    scrapped_dict = scrap_df_to_dict(df)
    
    # convert the scrapped dict into a dataframe and concat it with main df
    print('merging scrap with main')
    merged_df = merge_scrapped_dict_to_df(df, scrapped_dict)
    
    # apply rename_table_keys function to rename keys in tables column for easy referencing
    # this removes the project name from the key strings in each dictionary
    print('renaming table keys')
    merged_df.tables.apply(rename_table_keys)
    
    # flatten the info contained in merged_df in dictionaries, into columns on a new df
    print('flattening dfs')
    compound_df = flatten_data_df(merged_df)

    # concat concat table df with main df
    flat_df = pd.concat([merged_df, compound_df], axis=1)

    # drop 'tables' and 'last_photo_update' columns since these have been flattened into new columns
    # before, they used to be compressed into dictionaries in these columns
    try:
        flat_df.drop(columns=['tables', 'last_photo_update'], inplace=True)
    except KeyError:
        print('Drop columns: tables, last_photo not found')
    
    print('returning flat df')
    print()
    
    return flat_df

### Define search areas and load data

Load data with links to scrap and specify project status and project area

First, load data and describe it, then run the scrapper to extract desired areas with specified project status.

#### Describe data to see what to extract

In [42]:
# set pandas display options
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [13]:
# set source file name, read data and show head
source_file_path = 'towers_df.csv'
df = pd.read_csv(source_file_path)
df.head(2)

Unnamed: 0,area,sub_area,project,status,link
0,Arjan,Arjan,Dania Building 2,Complete,https://propsearch.ae/dubai/dania-building-2
1,Arjan,Arjan,Joya Blanca,Under development (In progress),https://propsearch.ae/dubai/joya-blanca


In [14]:
# project count by status
df.status.value_counts()

Complete                             934
Under development                    277
Under development (Cancelled)        136
Under development (On hold)          134
Planned                               47
Envisioned                            22
Under development (In progress)       15
Planned (Cancelled)                    5
Under development (Progress slow)      3
Demolished                             2
Complete (Handover underway)           1
Name: status, dtype: int64

In [17]:
# project count by area
df.area.value_counts()

Jumeirah Village Circle      498
Business Bay                 222
Dubai Marina                 187
Downtown                     123
Arjan                         96
Sports City                   95
Jumeirah Lakes Towers         93
Palm Jumeirah                 74
Motor City                    61
Jumeirah Village Triangle     57
Dubai Creek Harbour           27
Dubai Hills Estate            27
Sobha Hartland                16
Name: area, dtype: int64

In [36]:
# project area counts grouped by status, or the other way around
df.groupby('status').area.value_counts()

status                             area                     
Complete                           Jumeirah Village Circle      248
                                   Dubai Marina                 159
                                   Business Bay                 128
                                   Downtown                      87
                                   Jumeirah Lakes Towers         70
                                   Sports City                   56
                                   Motor City                    53
                                   Arjan                         49
                                   Palm Jumeirah                 47
                                   Jumeirah Village Triangle     18
                                   Dubai Hills Estate            11
                                   Sobha Hartland                 5
                                   Dubai Creek Harbour            3
Complete (Handover underway)       Downtown            

In [86]:
mask_df = df[df.status == 'Under development (Cancelled)']
mask_df.area.value_counts()

Jumeirah Village Circle      86
Sports City                  15
Business Bay                 13
Dubai Marina                  7
Jumeirah Village Triangle     5
Jumeirah Lakes Towers         4
Downtown                      4
Motor City                    1
Arjan                         1
Name: area, dtype: int64

In [87]:
mask_df.head(3)

Unnamed: 0,area,sub_area,project,status,link
92,Arjan,Arjan,Untitled Plot 6731103,Under development (Cancelled),https://propsearch.ae/dubai/untitled-plot-6731103
103,Business Bay,Business Bay,Omniyat Sky Palaces,Under development (Cancelled),https://propsearch.ae/dubai/omniyat-sky-palaces
109,Business Bay,Business Bay,La Residence At The Lotus,Under development (Cancelled),https://propsearch.ae/dubai/la-residence-at-th...


In [91]:
mask_df[mask_df.area=='Motor City']

Unnamed: 0,area,sub_area,project,status,link
1348,Motor City,Motor City,The Vertex,Under development (Cancelled),https://propsearch.ae/dubai/the-vertex


In [88]:
mask_df.area.unique()

array(['Arjan', 'Business Bay', 'Downtown', 'Dubai Marina',
       'Jumeirah Lakes Towers', 'Jumeirah Village Circle',
       'Jumeirah Village Triangle', 'Motor City', 'Sports City'],
      dtype=object)

In [57]:
mask_df.loc[540].link

'https://propsearch.ae/dubai/marina-101'

#### Create output files for single area and project status

In [70]:
import os
# set target areas and project status
project_area = 'Dubai Creek Harbour'
project_status = 'Under development (On hold)'

### DO not change variables below this line
# set output file name
output_file_path = project_area.lower().replace(' ', '_') + '_' + project_status.lower().replace(' ', '_') + '.csv'

# set source file name
source_file_path = 'towers_df.csv'
# output directory
out_directory = 'output/on-hold'
# read source file
df = pd.read_csv(source_file_path)

# run main
flat_df = process_scrap_output(project_status, project_area, df)

# save output
flat_df.to_csv(os.path.join(out_directory, output_file_path), index=False)


scrapping links
Failed to get photo update for: https://propsearch.ae/dubai/dubai-creek-tower
merging scrap with main
renaming table keys
flattening dfs
No photos on this project
Found no caption
Drop columns: tables, last_photo not found
returning flat df



#### Create output files for multpiple areas and single project status

In [95]:
import os
# select project areas, by previously filtering areas by project status and extracted existing ones
project_areas = ['Arjan', 'Business Bay', 'Downtown', 'Dubai Marina',
       'Jumeirah Lakes Towers', 'Jumeirah Village Circle',
       'Jumeirah Village Triangle', 'Motor City', 'Sports City']

# select project status
project_status = 'Under development (Cancelled)'
# output directory, change only last argument
out_directory = os.path.join('output', 'under-development-cancelled')

# set source file name
source_file_path = 'towers_df.csv'
# read source file
df = pd.read_csv(source_file_path)

for project_area in project_areas:
    # set output file name
    output_file_path = project_area.lower().replace(' ', '_') + '_' + project_status.lower().replace(' ', '_') + '.csv'
    
    # if file does not exist already, scrap it
    file_exists = os.path.isfile(os.path.join(out_directory, output_file_path))
    if not file_exists:
        # print file name
        print('Creating...{}'.format(os.path.join(out_directory, output_file_path)))
        # run main
        flat_df = process_scrap_output(project_status, project_area, df)
        # save output
        flat_df.to_csv(os.path.join(out_directory, output_file_path), index=False)
    else:
        print("File exists, skipping... {}".format(os.path.join(out_directory, output_file_path)))
    

File exists, skipping... output/under-development-cancelled/arjan_under_development_(cancelled).csv
File exists, skipping... output/under-development-cancelled/business_bay_under_development_(cancelled).csv
File exists, skipping... output/under-development-cancelled/downtown_under_development_(cancelled).csv
File exists, skipping... output/under-development-cancelled/dubai_marina_under_development_(cancelled).csv
File exists, skipping... output/under-development-cancelled/jumeirah_lakes_towers_under_development_(cancelled).csv
File exists, skipping... output/under-development-cancelled/jumeirah_village_circle_under_development_(cancelled).csv
File exists, skipping... output/under-development-cancelled/jumeirah_village_triangle_under_development_(cancelled).csv
Creating...output/under-development-cancelled/motor_city_under_development_(cancelled).csv
scrapping links
Failed to get photo update for: https://propsearch.ae/dubai/the-vertex
merging scrap with main
renaming table keys
flatten