## Merge area lists from Propsearch.com into a single list

Having scraped from propsearch.com one list per area of all the projects/buildings it contains (saved as separate files), we now merge them into a single list and classify each project by status: completed, under development, on hold or planned. This will tell us which developers to contact for new and ongoing off-plan sales.

This list will contain the following columns:
* Link to the particular building/community on propsearch.com: e.g. the link to Marina Gate 2
* Building/project name: e.g. Marina Gate 2
* Area: e.g. Dubai Marina
* Completion stage: e.g. completed

The list is saved as a csv file for further use.

In [200]:
import pandas as pd
import numpy as np
import seaborn as sns
import glob
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [187]:
def preprocess_single_projects(path):
    '''
    Merge all per-area csv files found under a directory into a single DataFrame
    
    Input: path: name of the directory holding one sub-directory per area
                 (e.g. 'buildings_raw'); it is resolved relative to the current
                 working directory, i.e. files are expected at ./<path>/<area>/<file>
    Output: DataFrame with the columns area, sub_area, project, status, link,
            sorted by area and re-indexed from 0
    Raises: FileNotFoundError if nothing matches ./<path>/*/*
    '''
    # read in all file paths
    # assumes they are in the cwd; sorted so the row order is reproducible between runs
    csv_paths = sorted(glob.glob('./{}/*/*'.format(path)))
    if not csv_paths:
        raise FileNotFoundError("no files found matching './{}/*/*'".format(path))
    
    # process paths into a list of frames, concatenated once after the loop
    frames = []
    for csv_path in csv_paths:
        raw_df = pd.read_csv(os.path.abspath(csv_path), index_col=False)
        # area name = name of the directory that contains the file
        # (e.g. 'dubai-marina' -> 'Dubai Marina')
        area_name = os.path.basename(os.path.dirname(csv_path)).replace('-', ' ').title()
        if 'ps-loc-card-location' not in raw_df.columns:
            # file has no location column: fall back to the area name
            raw_df.insert(loc=0, column='ps-loc-card-location', value=area_name)
        # add custom area name column based on the name used for each area directory
        raw_df.insert(loc=0, column='area', value=area_name)
        frames.append(raw_df)
    projects_df = pd.concat(frames, axis=0)
    
    # clean df
    # drop the 'lazy src' column when the scraper exported it
    projects_df = projects_df.drop(columns=['lazy src'], errors='ignore')
    # reorder columns
    new_column_order = ['area', 'ps-loc-card-location', 'ps-loc-card-title', 
                        'ps-loc-card-status', 'tablescraper-selected-row href']
    # rename columns to simpler names
    renamed_columns = {'ps-loc-card-location': 'sub_area', 'ps-loc-card-title': 'project', 
                        'ps-loc-card-status': 'status', 'tablescraper-selected-row href': 'link'}
    projects_df = projects_df[new_column_order].rename(columns=renamed_columns)
    # sort by area name (stable: rows of one area keep their file order) and reset indices
    projects_df = projects_df.sort_values(by=['area'], kind='stable').reset_index(drop=True)
    
    return projects_df

In [188]:
# Read every per-area csv under ./buildings_raw (relative to the notebook's
# working directory) and merge them into one df containing all towers
directory = 'buildings_raw'
towers_df = preprocess_single_projects(directory)

In [189]:
# describe data: count / unique / top / freq for each column of the merged table
towers_df.describe()

Unnamed: 0,area,sub_area,project,status,link
count,1576,1576,1576,1576,1576
unique,13,57,1576,11,1576
top,Jumeirah Village Circle,Business Bay,Lana Tower,Complete,https://propsearch.ae/dubai/golden-wood-view
freq,498,201,1,934,1


In [194]:
# preview the first three rows of the merged table
towers_df.head(3)

Unnamed: 0,area,sub_area,project,status,link
0,Arjan,Arjan,Dania Building 2,Complete,https://propsearch.ae/dubai/dania-building-2
1,Arjan,Arjan,Joya Blanca,Under development (In progress),https://propsearch.ae/dubai/joya-blanca
2,Arjan,Arjan,La Fontana,Complete,https://propsearch.ae/dubai/la-fontana


In [191]:
# number of projects per status value, most frequent first
towers_df.status.value_counts()

Complete                             934
Under development                    277
Under development (Cancelled)        136
Under development (On hold)          134
Planned                               47
Envisioned                            22
Under development (In progress)       15
Planned (Cancelled)                    5
Under development (Progress slow)      3
Demolished                             2
Complete (Handover underway)           1
Name: status, dtype: int64

In [192]:
# projects under development, counted per area
# NOTE: the exact match on 'Under development' leaves out the qualified statuses,
# e.g. 'Under development (In progress)' or 'Under development (On hold)'
on_dev_df = towers_df[towers_df.status == 'Under development']
on_dev_df.area.value_counts()

Jumeirah Village Circle      97
Business Bay                 34
Arjan                        32
Jumeirah Village Triangle    22
Downtown                     18
Dubai Creek Harbour          17
Palm Jumeirah                16
Sobha Hartland               11
Dubai Hills Estate           10
Dubai Marina                  9
Sports City                   7
Jumeirah Lakes Towers         2
Motor City                    2
Name: area, dtype: int64

In [193]:
# projects under development in Business Bay
# (rows of on_dev_df whose area is exactly 'Business Bay')
bbay_on_dev_df = on_dev_df[on_dev_df.area == 'Business Bay']
bbay_on_dev_df

Unnamed: 0,area,sub_area,project,status,link
96,Business Bay,Business Bay,Millennium Binghatti Residences,Under development,https://propsearch.ae/dubai/millennium-binghat...
98,Business Bay,Business Bay,Moon Tower,Under development,https://propsearch.ae/dubai/moon-tower
99,Business Bay,Business Bay,Mövenpick Hotel & Living,Under development,https://propsearch.ae/dubai/movenpick-hotel-li...
108,Business Bay,Business Bay,Nobles Residential Tower,Under development,https://propsearch.ae/dubai/nobles-residential...
110,Business Bay,Business Bay,Marble Arch Tower,Under development,https://propsearch.ae/dubai/marble-arch-tower
118,Business Bay,Business Bay,Mama Shelter Dubai,Under development,https://propsearch.ae/dubai/mama-shelter-dubai
121,Business Bay,Business Bay,Lillian Tower,Under development,https://propsearch.ae/dubai/lillian-tower
133,Business Bay,Business Bay,AG Tower Business Bay,Under development,https://propsearch.ae/dubai/ag-tower-business-bay
134,Business Bay,Business Bay,Ahad Residences,Under development,https://propsearch.ae/dubai/ahad-residences
140,Business Bay,Business Bay,Paramount Tower Hotel & Residences,Under development,https://propsearch.ae/dubai/paramount-tower-ho...


In [199]:
# show the full link of the first Business Bay row (the table display shortens long links)
bbay_on_dev_df.link.iloc[0]

'https://propsearch.ae/dubai/millennium-binghatti-residences'

In [178]:
# distinct sub-area names present in the merged table
towers_df.sub_area.unique()

array(['Jumeirah Lakes Towers', 'Sports City', 'Canal Residence West',
       'Sobha Hartland', 'Sobha Hartland Greens', 'JVC District 15',
       'JVC District 14', 'JVC District 12', 'JVC District 18',
       'JVC District 11', 'JVC District 17', 'JVC District 10',
       'JVC District 13', 'JVC District 16', 'Jumeirah Village Circle',
       'Downtown Dubai', 'The Opera District', 'Yansoon', 'Zaafaran',
       'Zanzabeel', 'Old Town', 'Kamoon', 'Miska', 'Reehan',
       'Al Murooj Complex', 'Dubai Marina', 'Jumeirah Beach Residence',
       'Bluewaters Island', 'Dubai Marina Mall', 'Bluewaters Residences',
       'The Address Dubai Marina', 'Creek Island', 'Creek Beach',
       'Dubai Creek Harbour', 'Uptown Motor City', 'Motor City',
       'Green Community Motor City', 'Business Bay', 'Al Habtoor City',
       'Bay Square', 'M Hotel Downtown by Millennium Dubai', 'Arjan',
       'Palm Jumeirah', 'The Palm Crescent', 'The Golden Mile',
       'Park Heights', 'Dubai Hills Estate', '

In [179]:
# distinct area names (derived from the per-area directory names)
towers_df.area.unique()

array(['Jumeirah Lakes Towers', 'Sports City', 'Sobha Hartland',
       'Jumeirah Village Circle', 'Downtown', 'Dubai Marina',
       'Dubai Creek Harbour', 'Motor City', 'Business Bay', 'Arjan',
       'Palm Jumeirah', 'Dubai Hills Estate', 'Jumeirah Village Triangle'],
      dtype=object)

## Get gps coordinates for each project

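Using BeautifulSoup to scrape each link in the project df.

NOTE: the cells below are still a template that scrapes the Amazon India book bestseller pages; they do not yet visit the propsearch links or extract GPS coordinates.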

In [201]:
no_pages = 2


def _text_or_default(tag, default):
    '''Return the text of a parsed tag, or default when the tag was not found (None)'''
    return tag.text if tag is not None else default


def get_data(pageNo):
    '''
    Scrape one page of the Amazon India book bestseller list
    
    Input: pageNo: page number inserted into the bestseller url
    Output: list with one entry per product card found on the page, each entry a list
            [name, author, rating, users_rated, price]; a field that is missing on
            the page is replaced by a placeholder
            ("unknown-product", '0', '-1', '0' and '0' respectively)
    '''
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    url = ('https://www.amazon.in/gp/bestsellers/books/ref=zg_bs_pg_' + str(pageNo)
           + '?ie=UTF8&pg=' + str(pageNo))

    # timeout so that a stalled connection cannot hang the notebook indefinitely
    r = requests.get(url, headers=headers, timeout=30)
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning)
    soup = BeautifulSoup(r.content, 'html.parser')

    alls = []
    for d in soup.find_all('div', attrs={'class':'a-section a-spacing-none aok-relative'}):
        name = d.find('span', attrs={'class':'zg-text-center-align'})
        # the product name is the alt text of the image inside the name span;
        # only look for it when the span exists, otherwise use the placeholder
        images = name.find_all('img', alt=True) if name is not None else []

        author = d.find('a', attrs={'class':'a-size-small a-link-child'})
        if author is None:
            # authors without a link are rendered in a plain span instead
            author = d.find('span', attrs={'class':'a-size-small a-color-base'})

        rating = d.find('span', attrs={'class':'a-icon-alt'})
        users_rated = d.find('a', attrs={'class':'a-size-small a-link-normal'})
        price = d.find('span', attrs={'class':'p13n-sc-price'})

        alls.append([
            images[0]['alt'] if images else "unknown-product",
            _text_or_default(author, '0'),
            _text_or_default(rating, '-1'),
            _text_or_default(users_rated, '0'),
            _text_or_default(price, '0'),
        ])
    return alls

In [202]:
# scrape pages 1..no_pages; each get_data call returns the list of rows of one page
results = []
for i in range(1, no_pages+1):
    results.append(get_data(i))
# turn the list of per-page lists into one flat list of rows
flatten = lambda l: [item for sublist in l for item in sublist]
df = pd.DataFrame(flatten(results),columns=['Book Name','Author','Rating','Customers_Rated', 'Price'])
# save the scraped table as csv (without the index column)
df.to_csv('amazon_products.csv', index=False, encoding='utf-8')

In [203]:
# reload the scraped table from the saved csv file
df = pd.read_csv("amazon_products.csv")

In [204]:
# display the reloaded table
df

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price
0,Ikigai: The Japanese secret to a long and happ...,Héctor García,4.6 out of 5 stars,19396,₹320.00
1,My First Library: Boxset of 10 Board Books for...,Wonder House Books,4.5 out of 5 stars,31506,₹399.00
2,The Psychology of Money,Morgan Housel,4.6 out of 5 stars,14197,₹277.00
3,My First Book of Pencil Control : Practice Pat...,Wonder House Books,4.4 out of 5 stars,8639,₹89.00
4,Word Power Made Easy,Norman Lewis,4.4 out of 5 stars,24609,₹91.00
...,...,...,...,...,...
95,Metamorphosis,Franz Kafka,4.5 out of 5 stars,4005,₹70.00
96,Spoken English Course (Telugu),Vashista 360,4.3 out of 5 stars,2625,₹369.00
97,How to Prepare for QUANTITATIVE APTITUDE for C...,Arun Sharma,4.5 out of 5 stars,164,₹725.00
98,Amazon Brand - Solimo Board Books for Kids (Se...,Solimo,4.6 out of 5 stars,5333,₹400.00
