In [1]:
import pandas as pd
import numpy as np
import ast
import os
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer
import torchvision.transforms as T
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the feature-engineered dataset CSV into `df`, report its shape,
# and preview the first rows as the cell output.
dataset_path = "../data/processed/feature_engineered_dataset.csv"
df = pd.read_csv(dataset_path)
print("Loaded dataset:", df.shape)
df.head()

Loaded dataset: (1000, 43)


Unnamed: 0,id,title,price,address,num_of_bedrooms,num_of_bathrooms,floor_area,description,list_of_amenities,image_urls,...,"amenity_['fire_exits',_'basketball_court',_'jogging_path',_'playground',_'secure_parking',_'sports_facilities',_'swimming_pool']","amenity_['cctv',_'bar',_'elevators',_'entertainment_room',_'fire_exits',_'fitness_center',_'function_room',_'lobby',_'lounge',_'reception_area',_'fire_alarm',_'fire_sprinkler_system',_'24-hour_security',_'shops',_'basketball_court',_'shower_rooms',_'swimming_pool']","amenity_['cctv',_'elevators',_'fire_exits',_'fitness_center',_'function_room',_'game_room',_'gym',_'lobby',_'lounge',_'reception_area',_'fire_alarm',_'fire_sprinkler_system',_'24-hour_security',_'basketball_court',_'clubhouse',_'garden',_'jogging_path',_'landscaped_garden',_'playground',_'swimming_pool']","amenity_['cctv',_'balcony',_'elevators',_'fire_exits',_'function_room',_'gym',_'lobby',_'lounge',_'reception_area',_'fire_alarm',_'fire_sprinkler_system',_'24-hour_security',_'clubhouse',_'deck',_'swimming_pool']","amenity_['cctv',_'bar',_'elevators',_'entertainment_room',_'fire_exits',_'fitness_center',_'function_room',_'lobby',_'lounge',_'powder_room',_'reception_area',_'fire_alarm',_'fire_sprinkler_system',_'24-hour_security',_'shops',_'basketball_court',_'shower_rooms',_'swimming_pool']","amenity_['carport',_'fully_fenced',_'swimming_pool',_'24-hour_security',_'basketball_court',_'clubhouse',_'playground']","amenity_['balcony',_'fire_exits',_'basketball_court',_'jogging_path',_'playground',_'secure_parking',_'sports_facilities',_'swimming_pool']","amenity_['cctv',_'balcony',_'elevators',_'entertainment_room',_'fire_exits',_'gym',_'lobby',_'reception_area',_'fire_alarm',_'fire_sprinkler_system',_'open_space',_'24-hour_security',_'basketball_court',_'clubhouse',_'swimming_pool']","amenity_['cctv',_'utility_room',_'balcony',_'elevators',_'fire_exits',_'function_room',_'gym',_'lobby',_'lounge',_'meeting_rooms',_'reception_area',_'fire_alarm',_'fire_sp
rinkler_system',_'courtyard',_'24-hour_security',_'deck',_'playground',_'swimming_pool']","amenity_['garage',_'jogging_path',_'swimming_pool',_'24-hour_security',_'clubhouse',_'playground']"
0,1,1BEDROOM CONDO UNIT FOR SALE AT GOLD RESIDENCE...,6973985.0,"Ninoy Aquino Avenue, Brgy. Sto. Niño, Parañaqu...",1.0,1.0,25.0,"GOLD RESIDENCES!!! Across NAIA Terminal 1, Par...","cctv, utility room, air conditioning, alarm sy...",https://static-ph.lamudi.com/static/media/bm9u...,...,False,False,False,False,False,False,False,False,False,False
1,2,"1 Bedroom w/ balcony For Sale Le Pont Tower 2,...",14829797.0,"Bridgetowne East, Eulogio Amang Rodriguez Ave....",1.0,1.0,45.0,LE PONT RESIDENCES TOWER 2 Completion Date: Se...,"cctv, air conditioning, alarm system, billiard...",https://static-ph.lamudi.com/static/media/bm9u...,...,False,False,False,False,False,False,False,False,False,False
2,3,Operational Resort for Sale | OR01 | San Narci...,60000000.0,"Alusiis, San Narciso",0.0,0.0,0.0,Take over this income-generating resort in a 1...,"air conditioning, alarm system, cctv, driver's...",https://static-ph.lamudi.com/static/media/bm9u...,...,False,False,False,False,False,False,False,False,False,False
3,4,1 Bedroom Loft-Type Condo For Sale in Bellagio...,15000000.0,"Fort Bonifacio, Taguig",1.0,2.0,58.0,UNIT DESCRIPTION: 1 Bedroom 2 Toilet and Bath ...,"gymnasium, air conditioning, alarm system, ele...",https://static-ph.lamudi.com/static/media/bm9u...,...,False,False,False,False,False,False,False,False,False,False
4,5,For Sale 2 Bedroom Rent to Own Condo in Floren...,16920000.0,"Florence Way, 1634 Taguig City, Philippines\n ...",2.0,3.0,79.0,THE FLORENCE RESIDENCES READY FOR OCCUPANCY | ...,"gymnasium, cctv, utility room, indoor pool, ai...",https://static-ph.lamudi.com/static/media/bm9u...,...,False,False,False,False,False,False,False,False,False,False


In [3]:
# Sentence-embedding model; `model_text.encode` is what the text and
# amenity encoding cells call to turn a string into a vector.
text_model_name = 'all-MiniLM-L6-v2'
model_text = SentenceTransformer(text_model_name)

# Encode a single text value, mapping missing / blank input to ""
def encode_text(text):
    """Return the `model_text` embedding of one text value.

    Values that are missing according to `pd.isna`, or that are empty once
    surrounding whitespace is stripped, are encoded as the empty string.
    Anything else is converted with `str()` and encoded as-is.
    """
    is_blank = pd.isna(text) or not str(text).strip()
    cleaned = "" if is_blank else str(text)
    return model_text.encode(cleaned, show_progress_bar=False)

# Embed every `full_text` entry (one encode_text call per row, with a
# progress bar) and stack the resulting vectors into a 2-D array.
text_embeddings = np.array(
    [encode_text(txt) for txt in tqdm(df["full_text"], desc="Encoding text")]
)

print("Text embedding shape:", text_embeddings.shape)


Encoding text: 100%|██████████| 1000/1000 [00:37<00:00, 26.32it/s]

Text embedding shape: (1000, 384)





In [4]:
def _amenities_to_sentence(value):
    """Turn one `amenities_parsed` cell into a comma-separated sentence.

    `df` is loaded with `pd.read_csv`, so a column that held Python lists
    when it was written comes back as the *string repr* of each list
    (e.g. "['cctv', 'gym']"), never as a `list` object. Checking only
    `isinstance(value, list)` therefore sent every row down the empty-string
    branch and produced identical embeddings for all listings.

    Parameters
    ----------
    value : object
        A raw cell: a list/tuple, the string repr of one, or a missing value.

    Returns
    -------
    str
        The items joined with ", ", or "" when the cell is missing or cannot
        be parsed into a list/tuple.
    """
    if isinstance(value, str):
        try:
            # literal_eval only accepts Python literals, so it is safe to run
            # on file contents (unlike eval).
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value)
    # NaN / None / any other scalar: no amenities to describe.
    return ""


amenity_embeddings = []

for row in tqdm(df["amenities_parsed"], desc="Encoding amenities"):
    sentence = _amenities_to_sentence(row)
    emb = model_text.encode(sentence, show_progress_bar=False)
    amenity_embeddings.append(emb)

# Stack the per-row vectors into a single 2-D array (one row per listing).
amenity_embeddings = np.array(amenity_embeddings)

print("Amenity embedding shape:", amenity_embeddings.shape)


Encoding amenities: 100%|██████████| 1000/1000 [00:06<00:00, 158.06it/s]

Amenity embedding shape: (1000, 384)





In [5]:
import torchvision.models as models

# Pretrained EfficientNet-B0 used as a feature extractor: the classifier head
# is replaced by an identity layer (output = 1280 dim) and the module is put
# into eval mode.
model_img = models.efficientnet_b0(weights="IMAGENET1K_V1")
model_img.classifier = torch.nn.Identity()
model_img.eval()

# Per-channel mean / std handed to Normalize (the values commonly used with
# ImageNet-pretrained torchvision models).
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

# Preprocessing applied to each PIL image: resize -> tensor -> normalize.
transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(_channel_mean, _channel_std),
])


In [6]:
def extract_image_feature(img_path):
    """Return the `model_img` feature vector for one image file.

    The image is opened, converted to RGB, run through `transform`, and passed
    through `model_img` as a batch of one with gradients disabled.

    Parameters
    ----------
    img_path : str or path-like
        Location of the image file.

    Returns
    -------
    numpy.ndarray
        The squeezed model output on success. If anything goes wrong (missing
        or unreadable file, decode error, model error) a zero vector of
        length 1280 is returned instead, so callers never see an exception
        for a bad image.

    Notes
    -----
    Only `Exception` subclasses are swallowed. A bare `except:` would also
    trap `KeyboardInterrupt` / `SystemExit`, making it impossible to stop a
    long extraction loop cleanly.
    """
    try:
        # Context manager closes the underlying file handle promptly;
        # convert() returns a fully loaded copy, so it is safe to use the
        # result after the file is closed.
        with Image.open(img_path) as img:
            rgb = img.convert("RGB")
        batch = transform(rgb).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            feat = model_img(batch).squeeze().numpy()
        return feat
    except Exception:
        # Best-effort: a bad image yields an all-zero placeholder vector.
        return np.zeros(1280)


In [7]:
image_base = "../data/raw/Real Estate Property Dataset/Images/Lamudi Images/exif/"

image_features = []

for imgs in tqdm(df["local_images"], desc="Extracting image features"):
    # Each cell is parsed with literal_eval and iterated, i.e. it is expected
    # to hold the repr of a list of file names relative to `image_base`.
    try:
        files = ast.literal_eval(imgs)
        per_image = [
            extract_image_feature(os.path.join(image_base, f)) for f in files
        ]
    except (ValueError, SyntaxError, TypeError):
        # Missing or malformed cell (e.g. NaN, non-list literal): no images.
        per_image = []

    # extract_image_feature returns an all-zero vector when an image cannot be
    # processed. Averaging those placeholders with real features would shrink
    # the pooled vector in proportion to the number of failed files, so they
    # are dropped before pooling.
    usable = [vec for vec in per_image if np.any(vec)]

    if usable:
        # Pool multiple images → mean
        feats = np.mean(np.vstack(usable), axis=0)
    else:
        # No usable image for this listing: fall back to a zero vector.
        feats = np.zeros(1280)

    image_features.append(feats)

image_features = np.array(image_features)
print("Image embedding shape:", image_features.shape)


Extracting image features: 100%|██████████| 1000/1000 [02:40<00:00,  6.21it/s]

Image embedding shape: (1000, 1280)





In [8]:
# Join the three embedding blocks column-wise, in the order
# text -> amenities -> images, to form one feature row per listing.
embedding_blocks = (text_embeddings, amenity_embeddings, image_features)
final_X = np.hstack(embedding_blocks)

print("Final feature matrix shape:", final_X.shape)


Final feature matrix shape: (1000, 2048)


In [9]:
# Make sure the output directory exists before writing into it.
os.makedirs("../data/processed", exist_ok=True)

import joblib

# Width of each embedding block inside X, stored alongside the data so the
# feature matrix can be split back into its parts.
embedding_dims = {
    "text_dim": text_embeddings.shape[1],
    "amenity_dim": amenity_embeddings.shape[1],
    "image_dim": image_features.shape[1],
}

# Bundle saved to disk: feature matrix, `log_price` target, block widths.
model_input = {
    "X": final_X,
    "y": df["log_price"].values,
    "columns": embedding_dims,
}

joblib.dump(model_input, "../data/processed/final_model_input.pkl")

print("Saved → data/processed/final_model_input.pkl")


Saved → data/processed/final_model_input.pkl
