In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/hk-centaline-property-buy.csv")

In [3]:
print(df.shape)

(10000, 9)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              10000 non-null  object 
 1   room_numbers      9985 non-null   object 
 2   address           9996 non-null   object 
 3   price             10000 non-null  object 
 4   usable_area       10000 non-null  object 
 5   gross_floor_area  6158 non-null   object 
 6   url               10000 non-null  object 
 7   latitude          8591 non-null   float64
 8   longitude         8591 non-null   float64
dtypes: float64(2), object(7)
memory usage: 703.3+ KB


In [5]:
df.head()

Unnamed: 0,name,room_numbers,address,price,usable_area,gross_floor_area,url,latitude,longitude
0,NOVO LAND・PHASE 2A・CHARLOT TOWER 1B,1 Room・(1 Suite),Siu Hong,3.4M,293ft²,,https://hk.centanet.com/findproperty/en/detail...,22.411753,113.978996
1,BELVEDERE GARDEN・Phase 3・BLOCK 3,High Floor・FLAT A・3 Rooms・(1 Suite),Belvedere,7.2M,728ft²,846ft²,https://hk.centanet.com/findproperty/en/detail...,,
2,PARK ISLAND・PHASE 1・BLOCK 9,2 Rooms,Ma Wan,5.38M,537ft²,706ft²,https://hk.centanet.com/findproperty/en/detail...,22.350075,114.059207
3,LUK YEUNG SUN CHUEN・BLOCK P,FLAT 6・2 Rooms,Luk Yeung,5.6M,445ft²,511ft²,https://hk.centanet.com/findproperty/en/detail...,,
4,Tsuen Wan Centre・Block 15 (Kunming House),Low Floor・FLAT H・2 Rooms,Tsuen King Circuit,3.38M,374ft²,490ft²,https://hk.centanet.com/findproperty/en/detail...,22.376582,114.107233


In [6]:
# Check if there is Billion unit in the price column 
contains_B = df["price"].str.contains("B")
print(df["price"][contains_B])

# Check if every value has "M" in price column 
df["price"].str.contains("M").all()

# Filter and print values without "M"
without_M = df[~df["price"].str.contains("M")]
print(without_M)
print(type(without_M))

Series([], Name: price, dtype: object)
                                                  name    room_numbers  \
5382  CHUK YUEN NORTH ESTATE・WAI YUEN HOUSE (BLOCK 10)  FLAT 5A・Studio   

           address    price usable_area gross_floor_area  \
5382  Wong Tai Sin  950,000      149ft²              NaN   

                                                    url   latitude  longitude  
5382  https://hk.centanet.com/findproperty/en/detail...  22.354812  114.19744  
<class 'pandas.core.frame.DataFrame'>


In [7]:
# Get the url of the hkd 950,000 house page
df.loc[5382,"url"]

'https://hk.centanet.com/findproperty/en/detail/CHUK-YUEN-NORTH-ESTATE-WAI-YUEN-HOUSE-(BLOCK-10)_UKA799?theme=buy'

In [8]:
# Change 950,000 into 0.95M 
df.loc[5382,"price"] = "0.95M"
df.loc[5382,:]

name                 CHUK YUEN NORTH ESTATE・WAI YUEN HOUSE (BLOCK 10)
room_numbers                                           FLAT 5A・Studio
address                                                  Wong Tai Sin
price                                                           0.95M
usable_area                                                    149ft²
gross_floor_area                                                  NaN
url                 https://hk.centanet.com/findproperty/en/detail...
latitude                                                    22.354812
longitude                                                   114.19744
Name: 5382, dtype: object

In [9]:
# Remove "M" and "," in the price column and turn into float 
df["price"] = df["price"].str.replace(",", "").str.replace("M", "").astype(float)

In [10]:
# Convert the price back to millions 
df["price"] = df["price"] * 1e6

In [11]:
# Remove "ft²" from "usable_area" and "gross_floor_area"
df["usable_area"] = (df["usable_area"]
                     .str.replace("ft²", "")
                     .str.replace(",", "")
                     .astype(float))
df["gross_floor_area"] = (df["gross_floor_area"]
                          .str.replace("ft²", "")
                          .str.replace(",", "")
                          .astype(float))

In [12]:
# Create a "price_per_sq_foot" column 
df["price_per_sq_foot_SA"] = df["price"] / df["usable_area"]
df["price_per_sq_foot_GFA"] = df["price"] / df["gross_floor_area"]

In [13]:
df.iloc[5382,:]

name                      CHUK YUEN NORTH ESTATE・WAI YUEN HOUSE (BLOCK 10)
room_numbers                                                FLAT 5A・Studio
address                                                       Wong Tai Sin
price                                                             950000.0
usable_area                                                          149.0
gross_floor_area                                                       NaN
url                      https://hk.centanet.com/findproperty/en/detail...
latitude                                                         22.354812
longitude                                                        114.19744
price_per_sq_foot_SA                                           6375.838926
price_per_sq_foot_GFA                                                  NaN
Name: 5382, dtype: object

In [14]:
# Sort df by price per square foot using S.A. in ascending order, in case of ties, sort by per GFA 
# Reset the row index 
df = df.sort_values(by = ["price_per_sq_foot_SA", "price_per_sq_foot_GFA"], ascending=True).reset_index(drop=True)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   10000 non-null  object 
 1   room_numbers           9985 non-null   object 
 2   address                9996 non-null   object 
 3   price                  10000 non-null  float64
 4   usable_area            10000 non-null  float64
 5   gross_floor_area       6158 non-null   float64
 6   url                    10000 non-null  object 
 7   latitude               8591 non-null   float64
 8   longitude              8591 non-null   float64
 9   price_per_sq_foot_SA   10000 non-null  float64
 10  price_per_sq_foot_GFA  6158 non-null   float64
dtypes: float64(7), object(4)
memory usage: 859.5+ KB


In [16]:
# Most expensive per square foot house url: 
print(f"The most expensive per square foot house url: {df.loc[9999,"url"]}")

# Least expensive per square foot house url:
print(f"The least expensive per square foot house url: {df.loc[0,"url"]}")

The most expensive per square foot house url: https://hk.centanet.com/findproperty/en/detail/BEL-AIR-RISE_TLA343?theme=buy
The least expensive per square foot house url: https://hk.centanet.com/findproperty/en/detail/KING-LAM-ESTATE-KING-LUI-HOUSE-(BLOCK-2)_UDU107?theme=buy


### Convert address into Lat and Long using Google Geocoding 

In [17]:
# ! pip install requests

In [18]:
# import requests
# import pandas as pd
# import os

# # Import the API key
# from dotenv import load_dotenv

# def configure():
#     load_dotenv()

# api_key = os.getenv('api_key')

# # Load address data
# df = pd.read_csv('data/hk-centaline-property-buy.csv')  
# address = df['address']  

# # Function to get lat and long for an address
# def geocode_address(address, api_key):
#     configure() # Call the `configure` function to get my API

#     base_url = 'https://maps.googleapis.com/maps/api/geocode/json'
#     params = {'address': address, 'key': api_key}
#     response = requests.get(base_url, params=params)
#     results = response.json().get('results', [])
#     if results:
#         location = results[0]['geometry']['location']
#         return location['lat'], location['lng']
#     else:
#         return None, None

# # Add lat and long columns to your DataFrame
# df['latitude'], df['longitude'] = zip(*df['address'].apply(lambda x: geocode_address(x, api_key)))

# # Save the updated DataFrame to a new CSV file
# df.to_csv('address_with_lat_long.csv', index=False)


In [19]:
df.tail()

Unnamed: 0,name,room_numbers,address,price,usable_area,gross_floor_area,url,latitude,longitude,price_per_sq_foot_SA,price_per_sq_foot_GFA
9995,The Arch・SUN TOWER (BLOCK 1A),High Floor・FLAT B・4 Rooms・(2 Suites),Kowloon Station,135000000.0,1842.0,2316.0,https://hk.centanet.com/findproperty/en/detail...,22.304306,114.161475,73289.90228,58290.15544
9996,RESIDENCE BEL-AIR・PHASE 1・TOWER 2,High Floor・FLAT B・4 Rooms,Residence Bel-air,175000000.0,2303.0,2748.0,https://hk.centanet.com/findproperty/en/detail...,22.254545,114.132685,75987.841945,63682.678311
9997,Infinity,3 Rooms,Peak,180000000.0,2061.0,2980.0,https://hk.centanet.com/findproperty/en/detail...,,,87336.244541,60402.684564
9998,Regence Royale・TOWER 2,Middle Floor・FLAT B・4 Rooms・(1 Suite),Mid-Levels Central,168000000.0,1805.0,2522.0,https://hk.centanet.com/findproperty/en/detail...,22.283846,114.154969,93074.792244,66613.798573
9999,BEL-AIR RISE,HOUSE 9・4 Rooms,Residence Bel-air,460000000.0,4667.0,6562.0,https://hk.centanet.com/findproperty/en/detail...,22.254545,114.132685,98564.388258,70100.579092


In [20]:
df.head()

Unnamed: 0,name,room_numbers,address,price,usable_area,gross_floor_area,url,latitude,longitude,price_per_sq_foot_SA,price_per_sq_foot_GFA
0,KING LAM ESTATE・KING LUI HOUSE (BLOCK 2),Low Floor・FLAT 21・1 Room,Po Lam,1400000.0,349.0,,https://hk.centanet.com/findproperty/en/detail...,22.323713,114.25488,4011.461318,
1,TAK TIN ESTATE・TAK YEE HOUSE (BLOCK 3),(1 Suite),Lam Tin,1800000.0,443.0,,https://hk.centanet.com/findproperty/en/detail...,22.30802,114.237564,4063.205418,
2,FU SHIN ESTATE・SHIN KWAN HOUSE (BLOCK 1),1 Room,Tai Po Town Centre,1200000.0,294.0,,https://hk.centanet.com/findproperty/en/detail...,22.442322,114.165506,4081.632653,
3,Long Ping Estate・HOR PING HOUSE,Low Floor・FLAT 35・2 Rooms,Long Ping,1480000.0,355.0,,https://hk.centanet.com/findproperty/en/detail...,,,4169.014085,
4,TAI WO ESTATE・OI WO HOUSE (BLOCK 2),Studio,Tai Po Town Centre,1480000.0,349.0,,https://hk.centanet.com/findproperty/en/detail...,22.442322,114.165506,4240.687679,
