## Merge area lists from Propsearch.com into a single list

Having scraped the area lists from Propsearch (each containing all the projects/buildings in that area) and saved them in separate files, we now merge them into a single list so that projects can be classified by status: completed, under development, on hold or planned. This will help us decide which developers to contact for new and ongoing off-plan sales.

This list will contain the following columns:
* Link to the particular building/community on propsearch.com: link to Marina Gate 2
* Building/project name: e.g. Marina Gate 2
* Area: e.g. Dubai Marina
* Completion stage: e.g. completed

The list is saved as a csv file for further use.

In [241]:
import pandas as pd
import numpy as np
import seaborn as sns

In [187]:
import glob
import os

def preprocess_single_projects(path):
    '''
    Merge every scraped csv file below a directory into a single DataFrame.

    The files are looked up with the pattern ``./<path>/*/*`` relative to the
    current working directory, i.e. one sub-folder per area with the scraped
    files inside it. The name of the sub-folder (e.g. ``business-bay``) is turned
    into the value of the ``area`` column (``Business Bay``).

    Input: path: name of the directory that holds one sub-folder per area
    Output: DataFrame with the columns area, sub_area, project, status and link,
            sorted by area and re-indexed from 0
    Raises: FileNotFoundError when no file matches ``./<path>/*/*``
    '''
    # read in all file paths; sorted so the row order does not depend on the OS
    path_list = sorted(glob.glob(os.path.join('.', path, '*', '*')))
    if not path_list:
        raise FileNotFoundError(
            'No files found under {}'.format(os.path.join('.', path, '*', '*')))

    # process paths into one frame per file
    frames = []
    for file_path in path_list:
        raw_df = pd.read_csv(os.path.abspath(file_path), index_col=False)
        # area name comes from the folder that contains the file
        area_folder = os.path.basename(os.path.dirname(file_path))
        added_location = area_folder.replace('-', ' ').title()
        if 'ps-loc-card-location' not in raw_df.columns:
            # the scrape has no location column: fall back to the area name
            raw_df.insert(loc=0, column='ps-loc-card-location', value=added_location)
        # custom area column based on the name used for each area directory
        raw_df.insert(loc=0, column='area', value=added_location)
        frames.append(raw_df)
    # a single concat avoids re-copying the accumulated frame for every file
    projects_df = pd.concat(frames, axis=0)

    # clean df
    # drop the 'lazy src' column when the scrape produced one
    projects_df = projects_df.drop(columns=['lazy src'], errors='ignore')
    # reorder columns
    new_column_order = ['area', 'ps-loc-card-location', 'ps-loc-card-title',
                        'ps-loc-card-status', 'tablescraper-selected-row href']
    projects_df = projects_df[new_column_order]
    # rename columns to simpler names
    renamed_columns = {'ps-loc-card-location': 'sub_area', 'ps-loc-card-title': 'project',
                       'ps-loc-card-status': 'status', 'tablescraper-selected-row href': 'link'}
    projects_df = projects_df.rename(columns=renamed_columns)
    # sort by area name; a stable sort keeps the file order within each area
    projects_df = projects_df.sort_values(by=['area'], kind='mergesort')
    # reset indices
    projects_df = projects_df.reset_index(drop=True)

    return projects_df

In [188]:
# build the combined table of all towers from the raw per-area scrape folders
directory = 'buildings_raw'
towers_df = preprocess_single_projects(path=directory)

In [189]:
# describe data
towers_df.describe()

Unnamed: 0,area,sub_area,project,status,link
count,1576,1576,1576,1576,1576
unique,13,57,1576,11,1576
top,Jumeirah Village Circle,Business Bay,Lana Tower,Complete,https://propsearch.ae/dubai/golden-wood-view
freq,498,201,1,934,1


In [194]:
towers_df.head(3)

Unnamed: 0,area,sub_area,project,status,link
0,Arjan,Arjan,Dania Building 2,Complete,https://propsearch.ae/dubai/dania-building-2
1,Arjan,Arjan,Joya Blanca,Under development (In progress),https://propsearch.ae/dubai/joya-blanca
2,Arjan,Arjan,La Fontana,Complete,https://propsearch.ae/dubai/la-fontana


In [191]:
# project count by status type
towers_df.status.value_counts()

Complete                             934
Under development                    277
Under development (Cancelled)        136
Under development (On hold)          134
Planned                               47
Envisioned                            22
Under development (In progress)       15
Planned (Cancelled)                    5
Under development (Progress slow)      3
Demolished                             2
Complete (Handover underway)           1
Name: status, dtype: int64

In [192]:
# keep the rows whose status is exactly 'Under development' and count them per area
is_under_dev = towers_df['status'] == 'Under development'
on_dev_df = towers_df.loc[is_under_dev]
on_dev_df['area'].value_counts()

Jumeirah Village Circle      97
Business Bay                 34
Arjan                        32
Jumeirah Village Triangle    22
Downtown                     18
Dubai Creek Harbour          17
Palm Jumeirah                16
Sobha Hartland               11
Dubai Hills Estate           10
Dubai Marina                  9
Sports City                   7
Jumeirah Lakes Towers         2
Motor City                    2
Name: area, dtype: int64

In [548]:
# narrow the under-development projects down to the Business Bay area
in_business_bay = on_dev_df['area'] == 'Business Bay'
bbay_on_dev_df = on_dev_df.loc[in_business_bay]
bbay_on_dev_df

Unnamed: 0,area,sub_area,project,status,link
96,Business Bay,Business Bay,Millennium Binghatti Residences,Under development,https://propsearch.ae/dubai/millennium-binghat...
98,Business Bay,Business Bay,Moon Tower,Under development,https://propsearch.ae/dubai/moon-tower
99,Business Bay,Business Bay,Mövenpick Hotel & Living,Under development,https://propsearch.ae/dubai/movenpick-hotel-li...
108,Business Bay,Business Bay,Nobles Residential Tower,Under development,https://propsearch.ae/dubai/nobles-residential...
110,Business Bay,Business Bay,Marble Arch Tower,Under development,https://propsearch.ae/dubai/marble-arch-tower
118,Business Bay,Business Bay,Mama Shelter Dubai,Under development,https://propsearch.ae/dubai/mama-shelter-dubai
121,Business Bay,Business Bay,Lillian Tower,Under development,https://propsearch.ae/dubai/lillian-tower
133,Business Bay,Business Bay,AG Tower Business Bay,Under development,https://propsearch.ae/dubai/ag-tower-business-bay
134,Business Bay,Business Bay,Ahad Residences,Under development,https://propsearch.ae/dubai/ahad-residences
140,Business Bay,Business Bay,Paramount Tower Hotel & Residences,Under development,https://propsearch.ae/dubai/paramount-tower-ho...


In [199]:
bbay_on_dev_df.link.iloc[0]

'https://propsearch.ae/dubai/millennium-binghatti-residences'

In [178]:
towers_df.sub_area.unique()

array(['Jumeirah Lakes Towers', 'Sports City', 'Canal Residence West',
       'Sobha Hartland', 'Sobha Hartland Greens', 'JVC District 15',
       'JVC District 14', 'JVC District 12', 'JVC District 18',
       'JVC District 11', 'JVC District 17', 'JVC District 10',
       'JVC District 13', 'JVC District 16', 'Jumeirah Village Circle',
       'Downtown Dubai', 'The Opera District', 'Yansoon', 'Zaafaran',
       'Zanzabeel', 'Old Town', 'Kamoon', 'Miska', 'Reehan',
       'Al Murooj Complex', 'Dubai Marina', 'Jumeirah Beach Residence',
       'Bluewaters Island', 'Dubai Marina Mall', 'Bluewaters Residences',
       'The Address Dubai Marina', 'Creek Island', 'Creek Beach',
       'Dubai Creek Harbour', 'Uptown Motor City', 'Motor City',
       'Green Community Motor City', 'Business Bay', 'Al Habtoor City',
       'Bay Square', 'M Hotel Downtown by Millennium Dubai', 'Arjan',
       'Palm Jumeirah', 'The Palm Crescent', 'The Golden Mile',
       'Park Heights', 'Dubai Hills Estate', '

In [179]:
towers_df.area.unique()

array(['Jumeirah Lakes Towers', 'Sports City', 'Sobha Hartland',
       'Jumeirah Village Circle', 'Downtown', 'Dubai Marina',
       'Dubai Creek Harbour', 'Motor City', 'Business Bay', 'Arjan',
       'Palm Jumeirah', 'Dubai Hills Estate', 'Jumeirah Village Triangle'],
      dtype=object)

## Get GPS coordinates and project data for each project

Using BeautifulSoup to scrape the following from each link in the projects df:
* key information
* companies-associated
* milestones
* proximity-to-landmarks
* coordinates

Scraping inspiration from DataCamp [link](https://www.datacamp.com/community/tutorials/amazon-web-scraping-using-beautifulsoup?utm_source=adwords_ppc&utm_campaignid=898687156&utm_adgroupid=48947256715&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=229765585183&utm_targetid=dsa-429603003980&utm_loc_interest_ms=&utm_loc_physical_ms=1000013&gclid=Cj0KCQjw0emHBhC1ARIsAL1QGNdx-wU43XjIPm2mfMWut-ceTSl2j3WZ7X8DiH4tl7mfG4njEnGfGYkaAuCiEALw_wcB) and Real Python [link](https://realpython.com/beautiful-soup-web-scraper-python/)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import re

In [550]:
len(bbay_on_dev_df)

34

In [487]:
## read propsearch link
# Prototype for a single page: fetch one project url and collect its coordinates
# and every titled key/value table into the nested dict kv_dic.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}

url = bbay_on_dev_df.link.iloc[33]
# a timeout keeps the cell from hanging forever on an unresponsive server
r = requests.get(url, headers=headers, timeout=30)#, proxies=proxies)
# fail loudly on 4xx/5xx instead of parsing an error page
r.raise_for_status()
content = r.content
soup = BeautifulSoup(content, "html.parser")

# dictionary to hold urls and table contents.
kv_dic = {}
url_count = 0
# create entry name by url counter
url_key = "url_{}".format(url_count)
kv_dic[url_key] = {}
kv_dic[url_key]['url'] = url
kv_dic[url_key]['tables'] = {}

# get coordinates; None when the page has no coordinates div
coordinates_div = soup.find('div', attrs={'class':'ps-guide-sub-subheading'})
if coordinates_div is None:
    coordinates = None
else:
    # strip the literal text 'place' (presumably an icon label - TODO confirm)
    # and turn apostrophes into spaces
    coordinates = coordinates_div.text.strip().replace('place', '').replace('\'', ' ')
# add coordinates to dic
kv_dic[url_key]['coordinates'] = coordinates

# get tables
kv_list = soup.find_all('div', class_='ps-kv-list')
# iterate through tables
for k in kv_list:
    # get table title; tables without a title are reported and skipped
    title_tag = k.find(class_='ps-kv-list-title')
    if title_tag is None:
        print('Table without title on: {}'.format(url))
        continue
    title = title_tag.text.lower().replace(' ', '-')
    # instantiate dict for each title
    kv_dic[url_key]['tables'][title] = {}
    # get table content
    rows = k.find_all('div', class_='grid-x')
    # iterate through table content (named row so the response body in
    # content is not overwritten)
    for row in rows:
        entry = row.text.strip().split('\n')
        if len(entry) < 2:
            # no key/value pair on separate lines; nothing to store
            continue
        kv_dic[url_key]['tables'][title][entry[0]] = entry[1]
        

In [502]:
url_list

([96, 98],
 ['https://propsearch.ae/dubai/millennium-binghatti-residences',
  'https://propsearch.ae/dubai/moon-tower'])

In [497]:
type(url_list)

pandas.core.series.Series

In [498]:
bbay_on_dev_df.link.iloc[:2].index

Int64Index([96, 98], dtype='int64')

In [554]:
# Scrape every project page linked in scrap_df into scrapped_dict, keyed by the
# (stringified) index of the row in the original dataframe:
#   scrapped_dict[key] = {'url': ..., 'coordinates': ..., 'tables': {title: {field: value}}}

# alias for the dataframe containing the links to scrape
scrap_df = bbay_on_dev_df
# set positional bounds for urls to scrape (end is exclusive, like any slice)
url_idx_start = 0
url_idx_end = len(scrap_df)
# (original index, url) pairs; a list so it can be inspected or re-used after the loop
url_list = list(scrap_df.link.iloc[url_idx_start:url_idx_end].items())
# browser-like request header; identical for every request, so it is built once
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
# seconds to wait for the server before giving up on a page
request_timeout = 30
# instantiate dict to hold scraped data
scrapped_dict = {}
# iterate over a list of urls
for url_key, url in url_list:
    # cast url_key as str to index the dictionary
    url_key = str(url_key)
    # request url data; a failed page is reported and skipped so that one bad
    # url does not abort the whole run
    try:
        r = requests.get(url, headers=headers, timeout=request_timeout)#, proxies=proxies)
        r.raise_for_status()
    except requests.RequestException as req_err:
        print('Request failed for: {} ({})'.format(url, req_err))
        continue
    # parse content
    soup = BeautifulSoup(r.content, "html.parser")

    # add this entry to dictionary, using the same key as in the original dataframe index
    scrapped_dict[url_key] = {}
    # save the url
    scrapped_dict[url_key]['url'] = url
    # add space to save each table with info
    scrapped_dict[url_key]['tables'] = {}

    # get coordinates; None when the page has no coordinates div
    coordinates_div = soup.find('div', attrs={'class':'ps-guide-sub-subheading'})
    if coordinates_div is None:
        coordinates = None
    else:
        # strip the literal text 'place' (presumably an icon label - TODO confirm)
        # and turn apostrophes into spaces
        coordinates = coordinates_div.text.strip().replace('place', '').replace('\'', ' ')
    # add coordinates to dict
    scrapped_dict[url_key]['coordinates'] = coordinates

    # get tables with contents
    kv_list = soup.find_all('div', class_='ps-kv-list')
    # iterate through tables
    for k in kv_list:
        # get table title
        # some tables won't have a title, which we report and ignore
        title_tag = k.find(class_='ps-kv-list-title')
        if title_tag is None:
            print('Error on table for: {}'.format(url))
            continue
        title = title_tag.text.lower().replace(' ', '-')
        # instantiate dict for each title
        scrapped_dict[url_key]['tables'][title] = {}
        # iterate through table content
        for row in k.find_all('div', class_='grid-x'):
            entry = row.text.strip().split('\n')
            if len(entry) < 2:
                # no key/value pair on separate lines; nothing to store
                continue
            scrapped_dict[url_key]['tables'][title][entry[0]] = entry[1]
            

Error on table for: https://propsearch.ae/dubai/mama-shelter-dubai
Error on table for: https://propsearch.ae/dubai/amna-tower
Error on table for: https://propsearch.ae/dubai/dorchester-hotel-dubai
Error on table for: https://propsearch.ae/dubai/zada-tower


In [555]:
scrapped_dict

{'96': {'url': 'https://propsearch.ae/dubai/millennium-binghatti-residences',
  'tables': {'millennium-binghatti-residences-key-information': {'Location type': 'Residential building',
    'Area': 'Business Bay',
    'Phase': 'Under development',
    'Floors': '24',
    'Building type': 'High-rise building',
    'Total units': '230',
    'Unit types': '113 studios58 one-bedroom apartments59 two-bedroom apartments'},
   'companies-associated-with-millennium-binghatti-residences': {'Developer': 'Binghatti Developers',
    'Architectural Consultant': 'Eng. Adnan Saffarini',
    'Contractor': 'Granada Europe Engineering Contracting Co.'},
   'millennium-binghatti-residences-milestones': {'Date Launched': 'March 2018',
    'Estimated Handover Date': 'Q4 2019',
    'Construction Started': '2018'},
   'proximity-to-landmarks': {'Palm Jumeirah': '20 mins drive',
    'Al Maktoum International Airport': '42 mins drive',
    'La Mer by Meraas': '19 mins drive',
    'Mall of the Emirates': '16 mins

In [None]:
import pickle

cache_dir = os.path.join("../cache", "sentiment_web_app")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay