# Panama Apartments Pricing | Data Scraping Notebook
This notebook uses the Selenium library to scrape apartment prices in Panama (price in dollars, $\$$) together with their respective features. The features used in this notebook are:
- area or $m^2$ of construction
- number of bedrooms
- number of bathrooms
- number of parkings

The ad-posting website used in this notebook can be found [here](https://www.encuentra24.com/panama-en/classifieds). The extracted data is dumped into a .csv file for preprocessing.

In [39]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Start a Chrome session that drives a locally installed Chrome Beta binary.
# NOTE: both paths below are machine-specific; adjust them before running elsewhere.
options = webdriver.ChromeOptions()
# Raw string so the Windows backslashes can never be read as escape sequences.
options.binary_location = r"C:\Program Files\Google\Chrome Beta\Application\chrome.exe"
# Selenium 4 takes the chromedriver path through a Service object and the
# browser options through `options=`; the older `executable_path=` and
# `chrome_options=` keywords are deprecated.
chromedriver_service = Service(executable_path=r'C:\Users\a-gue\.wdm\drivers\chromedriver\win32\104.0.5112.20\chromedriver.exe')
driver = webdriver.Chrome(service=chromedriver_service, options=options)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
# Entry page of the listing search, restricted to the `prov-panama` path segment.
# Alternative without that segment:
#   "https://www.encuentra24.com/panama-en/real-estate-for-sale-apartments-condos?q=number.50"
page_url = (
    "https://www.encuentra24.com/panama-en/"
    "real-estate-for-sale-apartments-condos/prov-panama"
    "?q=withcat.real-estate-for-sale-apartments-condos|number.50"
)

In [4]:
# Load the listing search page in the Selenium-controlled browser.
driver.get(page_url)

__________________

## Get url by Add
### Start Here | Get link per pages:
First we need to identify how many pages of ads there are. This can be checked online.
With the number of pages, we can then call a function that loops over each page and collects the URL of each ad.

**Run this when data.csv file for individual add is not up to date**



In [41]:
def get_links_of_pages(pages_num):
    '''Build the url of every results page of the encuentra24 apartment listing for Panama.
    - pages_num: how many results pages to generate links for (pages are numbered from 1)
    - return: list of page urls, in page order
    '''
    # the page number is spliced in between these two fixed parts of the url
    prefix = "https://www.encuentra24.com/panama-en/real-estate-for-sale-apartments-condos/prov-panama."
    suffix = "?q=withcat.real-estate-for-sale-apartments-condos|number.50"
    return [prefix + str(page_no) + suffix for page_no in range(1, pages_num + 1)]

In [42]:
# 247 is the number of results pages used for this run; it is a hand-entered
# value, so re-check it against the website before re-scraping.
url_list = get_links_of_pages(247)

### Get Individual Links per Add

In [153]:
def get_url_by_add(driver, url_list):
    '''Collect the link of every individual ad found on the given results pages.
    - driver: browser object used to open each page
    - url_list: urls of the results pages to visit
    - return: list with the href of each ad, in visiting order
    '''
    ad_links = []
    for results_page in url_list:
        driver.get(results_page)
        # fixed pause after navigation before querying the page
        time.sleep(3)
        # every ad box exposes its link as an <a> directly under the details div
        anchors = driver.find_elements(By.XPATH, "//div[@class='ann-box-details']/a")
        ad_links.extend(anchor.get_attribute("href") for anchor in anchors)
    return ad_links

In [154]:
# Visit every results page and gather the individual ad links
# (get_url_by_add pauses 3 seconds per page, so this cell is slow).
url_by_add = get_url_by_add(driver, url_list)

In [43]:
# save to csv all urls
df_by_add = pd.DataFrame(url_by_add, columns=["url_by_add"])
# index=False keeps the positional index out of the file, so reading it back
# yields only the `url_by_add` column instead of an extra "Unnamed: 0" one.
df_by_add.to_csv("data/url_by_add.csv", index=False)

_________________
## Get Attributes 

### Get area per add

In [46]:
def get_area(labels_in_add, values_in_add):
    '''Look up the apartment area in the ad description; 'NaN' when no area label exists
    - labels_in_add: labels of full description
    - values_in_add: values of full description
    '''
    # accepted spellings of the area label, checked in priority order
    for area_label in ('M²', 'M² of construction', 'M²:', 'M² of construction:'):
        if area_label in labels_in_add:
            # the value sits at the same position as its label
            return values_in_add[labels_in_add.index(area_label)]

    # no area label present in the description
    return 'NaN'
    

### Get Price per add

In [6]:
def get_price(labels_in_add, values_in_add, driver):
    '''Return Price of Apartment if present; 'NaN' otherwise
    - labels_in_add: labels of full description
    - values_in_add: values of full description
    - driver: browser object; only used as a fallback to read the headline
      price element when the description has no price label
    - return: the price text as found on the page, or the string 'NaN'
    '''
    # accepted spellings of the price label, checked in priority order
    for price_label in ('Price', 'Precio', 'Price:', 'Precio:'):
        if price_label in labels_in_add:
            # the value sits at the same position as its label
            return values_in_add[labels_in_add.index(price_label)]

    # no price label in the description: fall back to the headline price element
    try:
        return driver.find_element(By.XPATH, "//div/div[@class='d3-property-headline__text-element d3-property-headline__text-element--price']").text
    except Exception:
        # Best effort: any lookup failure means "no price". `Exception` (rather
        # than a bare except) lets KeyboardInterrupt / SystemExit propagate so a
        # long scraping run can still be stopped by the user.
        return 'NaN'

### Get Bathrooms per add

In [47]:
def get_bath(labels_in_add, values_in_add):
    '''Look up the number of bathrooms in the ad description; 'NaN' when no bathroom label exists
    - labels_in_add: labels of full description
    - values_in_add: values of full description
    '''
    # first accepted spelling of the bathroom label found in the description, if any
    matched = next((lbl for lbl in ('Bathrooms', 'Bathrooms:') if lbl in labels_in_add), None)

    if matched is None:
        return 'NaN'

    # the value sits at the same position as its label
    return values_in_add[labels_in_add.index(matched)]

### Get Bedrooms per add

In [48]:
def get_bed(labels_in_add, values_in_add):
    '''Look up the number of bedrooms in the ad description; 'NaN' when no bedroom label exists
    - labels_in_add: labels of full description
    - values_in_add: values of full description
    '''
    # plain label takes precedence over the variant with a trailing colon
    if 'Bedrooms' in labels_in_add:
        return values_in_add[labels_in_add.index('Bedrooms')]

    if 'Bedrooms:' in labels_in_add:
        return values_in_add[labels_in_add.index('Bedrooms:')]

    # neither spelling of the bedroom label is present
    return 'NaN'

### Get Parkings per add

In [49]:
def get_park(labels_in_add, values_in_add):
    '''Look up the number of parkings in the ad description; 'NaN' when no parking label exists
    - labels_in_add: labels of full description
    - values_in_add: values of full description
    '''
    park = 'NaN'
    # accepted spellings of the parking label, checked in priority order
    for parking_label in ('Parking', 'Parking:'):
        if parking_label in labels_in_add:
            # the value sits at the same position as its label
            park = values_in_add[labels_in_add.index(parking_label)]
            break
    return park

# Get full description per add

In [50]:
# Case 3
def get_attributes_case_3(
    driver,
    add_price,
    add_area_val,
    add_bed_val,
    add_bath_val,
    add_park_val,
):
    '''
    Read the description table of the ad currently open in the browser and append
    its price, area, bedrooms, bathrooms and parkings to the given lists.
    Two page layouts are tried in turn; NotImplementedError is raised when neither
    yields any label.
    - driver: browser object
    - remaining params: lists that receive one new value each
    '''
    # primary layout: name/value spans inside the 'col-800' block
    label_elems = driver.find_elements(By.XPATH, "//div[@class='col-800']/ul/li/span[@class='info-name']")
    value_elems = driver.find_elements(By.XPATH, "//div[@class='col-800']/ul/li/span[@class='info-value']")

    if not label_elems:
        print("initial link to all descriptions did not work, Use the alternative full description")

        # alternative layout: <dt>/<dd> pairs of the property-insight list
        label_elems = driver.find_elements(By.XPATH, "//dl/dt[@class='d3-property-insight__attribute-title']")
        value_elems = driver.find_elements(By.XPATH, "//dl/dd[@class='d3-property-insight__attribute-value']")

        if not label_elems:
            print("find a new way to get attributtes")
            raise NotImplementedError

    # turn the elements into two parallel lists of plain text
    labels_in_add = []
    values_in_add = []
    for position, label_elem in enumerate(label_elems):
        labels_in_add.append(label_elem.text)
        values_in_add.append(value_elems[position].text)

    # look up each attribute in the description
    price_val = get_price(labels_in_add, values_in_add, driver)
    area_val = get_area(labels_in_add, values_in_add)
    bath_val = get_bath(labels_in_add, values_in_add)
    bed_val = get_bed(labels_in_add, values_in_add)
    park_val = get_park(labels_in_add, values_in_add)

    # store every value in its own output list
    for target, value in (
        (add_price, price_val),
        (add_area_val, area_val),
        (add_bed_val, bed_val),
        (add_bath_val, bath_val),
        (add_park_val, park_val),
    ):
        target.append(value)

    

_________________________________________________________

# Data Extraction


In [12]:
# Load the ad links from the csv file on disk.
df = pd.read_csv("data/url_by_add.csv")

In [38]:
url_by_add = df['url_by_add']

# one list per output column; they must stay the same length so that they can
# be combined into a single DataFrame afterwards
add_title = []
add_price = []
add_area_val = []
add_bed_val = []
add_bath_val = []
add_park_val = []

# iterate over links
for url in url_by_add[0:2139]: # 2139 for testing
    # get url on driver
    driver.get(url)
    # short fixed pause (0.25 s) after navigation before querying the page
    time.sleep(0.25)

    try:
        title = driver.find_element(By.XPATH, "//div/div/h1").text
        print("Title = ", title)

        get_attributes_case_3(driver, add_price,
                              add_area_val, add_bed_val,
                              add_bath_val, add_park_val)
    except (NoSuchElementException, StaleElementReferenceException, NotImplementedError) as err:
        # Skip ads whose page cannot be parsed, but say so instead of silently
        # dropping them.
        print("Skipping", url, "->", type(err).__name__)
        continue

    # Record the title only once the attributes were stored, so a failure in
    # get_attributes_case_3 can never leave add_title longer than the other lists.
    add_title.append(title)


Title =  Panama Viejo Residences, en Panama Viejo
initial link to all descriptions did not work, Use the alternative full description


# Backup storing

In [35]:
# Combine the scraped lists into one table, one column per list.
df_bckp = pd.DataFrame(
    dict(
        add_title=add_title,
        add_price=add_price,
        add_area_val=add_area_val,
        add_bed_val=add_bed_val,
        add_bath_val=add_bath_val,
        add_park_val=add_park_val,
    )
)

In [36]:
# Display the collected table as a visual check before saving it.
df_bckp

Unnamed: 0,add_title,add_price,add_area_val,add_bed_val,add_bath_val,add_park_val
0,Apartments in Obarrio | Apartamento - Bella Vi...,"B/.259,900.00",210,3,3,2
1,Apartments in Coco del Mar | Venda de Lujoso A...,"B/.485,000.00",224,3,4,2
2,Apartments in Punta Pacífica | Apartamento - P...,"B/.399,999.00",111.46,1,2,1
3,Apartments in Coco del Mar | Alquiler de apart...,"B/.225,000.00",83,2,2,1
4,Apartments in Marbella | Se vende penthouse ju...,"B/.750,000.00\n (Reduced 5%)",361,4,4,4
...,...,...,...,...,...,...
2135,Apartments in Santa María | Edificio Único en ...,"B/.305,000.00",96,2,2.5,2
2136,The Regent Costa del Este venta,"B/.380,000.00",95,2,2,
2137,Apartments in Coco del Mar | Para Inversionist...,"B/.274,000.00",79,2,2,2
2138,Apartments in San Francisco | Sale of reposses...,"B/.200,000.00",124,3,2.5,2


In [37]:
# Write the scraped table to disk; with the default arguments the row index is
# written as the first column of the file.
df_bckp.to_csv("data/data.csv")