In [1]:
import pandas as pd
import os

RAW_CSV_PATH = "../data/raw/Real Estate Property Dataset/CSV/Lamudi Properties.csv"          
CLEAN_CSV_PATH = "../data/processed/Lamudi Properties.csv"     
#LOAD & CLEAN DATASET
print("Loading dataset...")
df = pd.read_csv(RAW_CSV_PATH)

print("Rows before cleaning:", len(df))

# 1) Remove duplicate listings on ID or URL
if "id" in df.columns:
    df.drop_duplicates(subset=["id"], inplace=True)
elif "url" in df.columns:
    df.drop_duplicates(subset=["url"], inplace=True)
else:
    df.drop_duplicates(inplace=True)

# 2) Remove empty rows where price or description is missing
must_have_cols = []
if "price" in df.columns: must_have_cols.append("price")
if "description" in df.columns: must_have_cols.append("description")

df.dropna(subset=must_have_cols, inplace=True)

print("Rows after cleaning:", len(df))

# 3) Save the clean dataset
df.to_csv(CLEAN_CSV_PATH, index=False)
print(f"Saved clean dataset → {CLEAN_CSV_PATH}")


Loading dataset...
Rows before cleaning: 1000
Rows after cleaning: 1000
Saved clean dataset → ../data/processed/Lamudi Properties.csv


In [5]:
import re

# Load CSV
df = pd.read_csv('../data/processed/Lamudi Properties.csv')

# Path to image folder
image_folder = '../data/raw/Real Estate Property Dataset/Images/Lamudi Images/exif/'

# Get all image filenames
all_images = os.listdir(image_folder)

# Function to find images that match the ID
def find_images_for_id(property_id):
    pattern = re.compile(rf"A{property_id}\.\d+\.jpg", re.IGNORECASE)
    matches = [img for img in all_images if pattern.match(img)]
    return matches if matches else []

# Apply to CSV
df["Local_Images"] = df["ID"].apply(find_images_for_id)

# Save updated CSV
output_path = '../data/processed/Lamudi_Properties_with_images.csv'
df.to_csv(output_path, index=False)

print("Saved updated dataset to:", output_path)


Saved updated dataset to: ../data/processed/Lamudi_Properties_with_images.csv


In [6]:
df = pd.read_csv('../data/processed/Lamudi_Properties_with_images.csv')

df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
      .str.replace("-", "_")
)

df.head()

Unnamed: 0,id,title,price,address,num_of_bedrooms,num_of_bathrooms,floor_area,description,list_of_amenities,image_urls,property_url,local_images
0,1,1BEDROOM CONDO UNIT FOR SALE AT GOLD RESIDENCE...,6973985.0,"Ninoy Aquino Avenue, Brgy. Sto. Niño, Parañaqu...",1.0,1.0,25.0,"GOLD RESIDENCES!!! Across NAIA Terminal 1, Par...","CCTV, Utility room, Air conditioning, Alarm Sy...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/projects/gold-reside...,"['A1.1.jpg', 'A1.2.jpg', 'A1.3.jpg', 'A1.4.jpg..."
1,2,"1 Bedroom w/ balcony For Sale Le Pont Tower 2,...",14829797.0,"Bridgetowne East, Eulogio Amang Rodriguez Ave....",1.0,1.0,45.0,LE PONT RESIDENCES TOWER 2 Completion Date: Se...,"CCTV, Air conditioning, Alarm System, Billiard...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/metro-manila/pas...,"['A2.1.jpg', 'A2.2.jpg', 'A2.3.jpg', 'A2.4.jpg..."
2,3,Operational Resort for Sale | OR01 | San Narci...,60000000.0,"Alusiis, San Narciso",,,,Take over this income-generating resort in a 1...,"Air conditioning, Alarm System, CCTV, Driver's...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/zambales/san-nar...,"['A3.1.jpg', 'A3.2.jpg', 'A3.3.jpg', 'A3.4.jpg..."
3,4,1 Bedroom Loft-Type Condo For Sale in Bellagio...,15000000.0,"Fort Bonifacio, Taguig",1.0,2.0,58.0,UNIT DESCRIPTION:\n✅ 1 Bedroom\n✅ 2 Toilet and...,"Gymnasium, Air conditioning, Alarm System, Ele...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/projects/the-bellagi...,"['A4.1.jpg', 'A4.2.jpg', 'A4.3.jpg', 'A4.4.jpg..."
4,5,For Sale 2 Bedroom Rent to Own Condo in Floren...,16920000.0,"Florence Way, 1634 Taguig City, Philippines\n ...",2.0,3.0,79.0,THE FLORENCE RESIDENCES\nREADY FOR OCCUPANCY |...,"Gymnasium, CCTV, Utility room, Indoor Pool, Ai...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/metro-manila/tag...,"['A5.1.jpg', 'A5.2.jpg', 'A5.3.jpg', 'A5.4.jpg..."


In [7]:
num_cols = ["num_of_bedrooms", "num_of_bathrooms", "floor_area", "price"]

for col in num_cols:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.replace("[^0-9.]", "", regex=True)
            .replace("", None)
        )
        df[col] = pd.to_numeric(df[col], errors="coerce")


In [8]:
df = df[df["local_images"].apply(lambda x: len(x) > 0)]
df.shape


(1000, 12)

In [9]:
def extract_city(addr):
    if not isinstance(addr, str): return None
    parts = addr.split(',')
    return parts[-1].strip() if len(parts) > 1 else None

df["city"] = df["address"].apply(extract_city)


In [12]:
# List the real text columns in your dataset
text_cols = ["title", "description", "list_of_amenities", "address"]

# Combine them into one full_text field
df["full_text"] = df[text_cols].astype(str).agg(" ".join, axis=1)

# Clean the text for BERT 
def clean_text(t):
    t = t.lower()
    t = re.sub(r"[^a-z0-9 ]", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()

df["full_text"] = df["full_text"].apply(clean_text)


In [11]:
list(df.columns)


['id',
 'title',
 'price',
 'address',
 'num_of_bedrooms',
 'num_of_bathrooms',
 'floor_area',
 'description',
 'list_of_amenities',
 'image_urls',
 'property_url',
 'local_images',
 'city']

In [13]:
num_cols = ["num_of_bedrooms", "num_of_bathrooms", "floor_area", "price"]

for col in num_cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace("[^0-9.]", "", regex=True)
        .replace("", None)
    )
    df[col] = pd.to_numeric(df[col], errors="coerce")


In [15]:
def extract_city(addr):
    if not isinstance(addr, str): return None
    parts = addr.split(',')
    return parts[-1].strip() if len(parts) > 1 else None

df["city"] = df["address"].apply(extract_city)


In [18]:
import ast

df["local_images"] = df["local_images"].apply(ast.literal_eval)


In [20]:
def clean_amenities(a):
    if not isinstance(a, str):
        return ""
    items = [x.strip().lower() for x in a.split(",")]
    items = list(dict.fromkeys(items))  # remove duplicates, keep order
    return ", ".join(items)

df["list_of_amenities"] = df["list_of_amenities"].apply(clean_amenities)


In [21]:
def improve_text(t):
    t = re.sub(r"#\w+", "", t)  # remove hashtags
    t = re.sub(r"\b\w{1,2}\b", "", t)  # remove tiny useless words
    t = re.sub(r"\d+(\,\d+)*(\.\d+)*", "", t)  # remove big numbers & prices
    t = re.sub(r"[\n\r]", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()

df["full_text"] = df["full_text"].apply(improve_text)

df.head()

Unnamed: 0,id,title,price,address,num_of_bedrooms,num_of_bathrooms,floor_area,description,list_of_amenities,image_urls,property_url,local_images,city,full_text
0,1,1BEDROOM CONDO UNIT FOR SALE AT GOLD RESIDENCE...,6973985.0,"Ninoy Aquino Avenue, Brgy. Sto. Niño, Parañaqu...",1.0,1.0,25.0,"GOLD RESIDENCES!!! Across NAIA Terminal 1, Par...","cctv, utility room, air conditioning, alarm sy...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/projects/gold-reside...,"[A1.1.jpg, A1.2.jpg, A1.3.jpg, A1.4.jpg, A1.5....",Parañaque,bedroom condo unit for sale gold residences ac...
1,2,"1 Bedroom w/ balcony For Sale Le Pont Tower 2,...",14829797.0,"Bridgetowne East, Eulogio Amang Rodriguez Ave....",1.0,1.0,45.0,LE PONT RESIDENCES TOWER 2 Completion Date: Se...,"cctv, air conditioning, alarm system, billiard...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/metro-manila/pas...,"[A2.1.jpg, A2.2.jpg, A2.3.jpg, A2.4.jpg, A2.5....",Pasig,bedroom balcony for sale pont tower bridgetown...
2,3,Operational Resort for Sale | OR01 | San Narci...,60000000.0,"Alusiis, San Narciso",,,,Take over this income-generating resort in a 1...,"air conditioning, alarm system, cctv, driver's...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/zambales/san-nar...,"[A3.1.jpg, A3.2.jpg, A3.3.jpg, A3.4.jpg, A3.5....",San Narciso,operational resort for sale or san narciso zam...
3,4,1 Bedroom Loft-Type Condo For Sale in Bellagio...,15000000.0,"Fort Bonifacio, Taguig",1.0,2.0,58.0,UNIT DESCRIPTION:\n✅ 1 Bedroom\n✅ 2 Toilet and...,"gymnasium, air conditioning, alarm system, ele...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/projects/the-bellagi...,"[A4.1.jpg, A4.2.jpg, A4.3.jpg, A4.4.jpg, A4.5....",Taguig,bedroom loft type condo for sale bellagio tagu...
4,5,For Sale 2 Bedroom Rent to Own Condo in Floren...,16920000.0,"Florence Way, 1634 Taguig City, Philippines\n ...",2.0,3.0,79.0,THE FLORENCE RESIDENCES\nREADY FOR OCCUPANCY |...,"gymnasium, cctv, utility room, indoor pool, ai...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/metro-manila/tag...,"[A5.1.jpg, A5.2.jpg, A5.3.jpg, A5.4.jpg, A5.5....",Taguig,for sale bedroom rent own condo florence mckin...


In [25]:
import re

def remove_emojis(text):
    if not isinstance(text, str):
        return text
    # Remove most emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U00002700-\U000027BF"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

for col in cols_to_clean:
    if col in df.columns:
        df[col] = df[col].apply(remove_emojis)


In [26]:
df.head()

Unnamed: 0,id,title,price,address,num_of_bedrooms,num_of_bathrooms,floor_area,description,list_of_amenities,image_urls,property_url,local_images,city,full_text
0,1,1BEDROOM CONDO UNIT FOR SALE AT GOLD RESIDENCE...,6973985.0,"Ninoy Aquino Avenue, Brgy. Sto. Niño, Parañaqu...",1.0,1.0,25.0,"GOLD RESIDENCES!!! Across NAIA Terminal 1, Par...","cctv, utility room, air conditioning, alarm sy...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/projects/gold-reside...,"[A1.1.jpg, A1.2.jpg, A1.3.jpg, A1.4.jpg, A1.5....",Parañaque,bedroom condo unit for sale gold residences ac...
1,2,"1 Bedroom w/ balcony For Sale Le Pont Tower 2,...",14829797.0,"Bridgetowne East, Eulogio Amang Rodriguez Ave....",1.0,1.0,45.0,LE PONT RESIDENCES TOWER 2 Completion Date: Se...,"cctv, air conditioning, alarm system, billiard...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/metro-manila/pas...,"[A2.1.jpg, A2.2.jpg, A2.3.jpg, A2.4.jpg, A2.5....",Pasig,bedroom balcony for sale pont tower bridgetown...
2,3,Operational Resort for Sale | OR01 | San Narci...,60000000.0,"Alusiis, San Narciso",,,,Take over this income-generating resort in a 1...,"air conditioning, alarm system, cctv, driver's...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/zambales/san-nar...,"[A3.1.jpg, A3.2.jpg, A3.3.jpg, A3.4.jpg, A3.5....",San Narciso,operational resort for sale or san narciso zam...
3,4,1 Bedroom Loft-Type Condo For Sale in Bellagio...,15000000.0,"Fort Bonifacio, Taguig",1.0,2.0,58.0,UNIT DESCRIPTION: 1 Bedroom 2 Toilet and Bath ...,"gymnasium, air conditioning, alarm system, ele...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/projects/the-bellagi...,"[A4.1.jpg, A4.2.jpg, A4.3.jpg, A4.4.jpg, A4.5....",Taguig,bedroom loft type condo for sale bellagio tagu...
4,5,For Sale 2 Bedroom Rent to Own Condo in Floren...,16920000.0,"Florence Way, 1634 Taguig City, Philippines\n ...",2.0,3.0,79.0,THE FLORENCE RESIDENCES READY FOR OCCUPANCY | ...,"gymnasium, cctv, utility room, indoor pool, ai...",https://static-ph.lamudi.com/static/media/bm9u...,https://www.lamudi.com.ph/buy/metro-manila/tag...,"[A5.1.jpg, A5.2.jpg, A5.3.jpg, A5.4.jpg, A5.5....",Taguig,for sale bedroom rent own condo florence mckin...


In [27]:
list(df.columns)

['id',
 'title',
 'price',
 'address',
 'num_of_bedrooms',
 'num_of_bathrooms',
 'floor_area',
 'description',
 'list_of_amenities',
 'image_urls',
 'property_url',
 'local_images',
 'city',
 'full_text']

In [34]:

import numpy as np

# List of text columns
text_cols = ["title", "description", "list_of_amenities", "full_text", "address", "city", "property_url", "image_urls", "local_images"]

# List of numeric columns
num_cols = ["price", "num_of_bedrooms", "num_of_bathrooms", "floor_area"]

# Fill text columns: NaN, empty strings, or only spaces → ""
for col in text_cols:
    if col in df.columns:
        df[col] = df[col].replace(np.nan, "")  # Replace NaN
        df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else x)  # remove spaces
        df[col] = df[col].replace("", "")  # ensure empty strings stay consistent

# Fill numeric columns: NaN or non-numeric → 0
for col in num_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Check for remaining missing values
print(df.isna().sum())


id                   0
title                0
price                0
address              0
num_of_bedrooms      0
num_of_bathrooms     0
floor_area           0
description          0
list_of_amenities    0
image_urls           0
property_url         0
local_images         0
city                 0
full_text            0
dtype: int64


In [35]:
output_path = '../data/processed/lamudi_final_clean.csv'
df.to_csv(output_path, index=False)

print("Saved final clean dataset →", output_path)


Saved final clean dataset → ../data/processed/lamudi_final_clean.csv
