In [None]:
import os
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from sklearn.preprocessing import MinMaxScaler

# 1. Read Excel data
file_path = "merged_output_cleaned.xlsx"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: File not found at {file_path}")

df = pd.read_excel(file_path, engine="openpyxl")

# 2. Clean the 'price' column
# Strip the euro sign, the "/ month" suffix and commas, then keep the first
# numeric token.  expand=False makes str.extract return a Series rather than
# a one-column DataFrame, so the result maps cleanly onto a single column.
# Values with no numeric token become NaN.
df["price"] = (
    df["price"]
    .astype(str)
    .str.replace("€", "", regex=False)
    .str.replace("/ month", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+\.?\d*)", expand=False)
    .astype(float)
)

# Drop rows that cannot be plotted: no parsable price, or no image name at
# all (a missing image name would later break os.path.join).
df = df.dropna(subset=["price", "image"]).copy()

# 3. Fix image paths
# Remove the 'property_images22' folder prefix, written with either path
# separator, so only the part after it remains.
df["image"] = (
    df["image"]
    .str.replace("property_images22\\", "", regex=False)
    .str.replace("property_images22/", "", regex=False)
)

# Define the image folder
image_folder = "merged_images"

def extract_features(img_path):
    """Return a flat feature vector for one image using the global ``model``.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, preprocessed for MobileNetV2 and passed through
    ``model.predict``.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The flattened prediction array, or ``None`` if any step raised
        (the error is printed and the image is skipped by the caller).
    """
    # The module-level ``preprocess_input`` comes from the EfficientNet
    # package, where Keras documents it as a pass-through.  MobileNetV2
    # expects pixels scaled to [-1, 1], so use its own preprocessing here.
    from tensorflow.keras.applications.mobilenet_v2 import (
        preprocess_input as mobilenet_v2_preprocess,
    )

    try:
        img = image.load_img(img_path, target_size=(224, 224))
        batch = np.expand_dims(image.img_to_array(img), axis=0)
        batch = mobilenet_v2_preprocess(batch)
        # verbose=0: one progress bar per image only floods the output.
        features = model.predict(batch, verbose=0)
        return features.flatten()
    except Exception as e:
        # Best-effort: a single unreadable image must not abort the run.
        print(f"Error processing {img_path}: {e}")
        return None

# 4. Load MobileNetV2 for feature extraction
# include_top=False with pooling="avg" yields one pooled feature vector per
# image.  input_shape is given explicitly (matching the 224x224 resize done
# in extract_features) so Keras does not warn about an undefined input shape
# when selecting the ImageNet weights.
base_model = MobileNetV2(
    weights="imagenet",
    include_top=False,
    pooling="avg",
    input_shape=(224, 224, 3),
)
model = Model(inputs=base_model.input, outputs=base_model.output)

image_features = []
image_files = []
# Positional row numbers in df, one per extracted feature vector, so prices
# can be looked up row-for-row instead of re-matched by file name.
kept_positions = []

# 5. Extract image features
for position, img_name in enumerate(df["image"]):
    if not isinstance(img_name, str):
        # A missing/non-text image name cannot be joined into a path.
        print(f"Warning: Skipping row with invalid image name: {img_name!r}")
        continue

    img_path = os.path.join(image_folder, img_name)
    if not os.path.exists(img_path):
        print(f"Warning: Image file {img_path} not found!")
        continue

    features = extract_features(img_path)
    if features is None:
        # extract_features already reported the error.
        continue

    image_features.append(features)
    image_files.append(img_name)
    kept_positions.append(position)

# Convert to NumPy array
image_features = np.array(image_features)
print(f"Extracted image feature shape: {image_features.shape}")

if image_features.shape[0] == 0:
    raise ValueError("Error: No valid image features extracted. Check image paths and extraction process.")

# 6. Perform dimensionality reduction using UMAP
umap_model = umap.UMAP(n_components=2, n_neighbors=30, min_dist=0.3, random_state=42)
X_umap = umap_model.fit_transform(image_features)

# 7. Normalize price data
# Select exactly the rows whose features were extracted, in the same order,
# so price_scaled[i] always belongs to X_umap[i].
df_filtered = df.iloc[kept_positions]
price_data = df_filtered["price"].values.reshape(-1, 1)
scaler = MinMaxScaler()
price_scaled = scaler.fit_transform(price_data).flatten()

# 8. Visualization
# Scatter the 2-D UMAP embedding, colouring each point by its scaled price.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    X_umap[:, 0],
    X_umap[:, 1],
    c=price_scaled,
    cmap="coolwarm",
    alpha=0.7,
)
fig.colorbar(points, ax=ax, label="Price (Normalized)")
ax.set_title("Price Analysis of Airbnb Listings Based on Image Features")
ax.set_xlabel("UMAP Dimension 1")
ax.set_ylabel("UMAP Dimension 2")

# Label the ten most expensive listings with their unscaled price.
# Sorting the negated values puts the highest prices first.
top_10_idx = np.argsort(-price_scaled)[:10]
for idx in top_10_idx:
    label = f"{df_filtered.iloc[idx]['price']:.0f}"
    anchor = (X_umap[idx, 0], X_umap[idx, 1])
    ax.annotate(
        label,
        anchor,
        textcoords="offset points",
        xytext=(5, 5),
        ha='right',
        fontsize=9,
        color='black',
        weight='bold',
    )

plt.show()


  base_model = MobileNetV2(weights="imagenet", include_top=False, pooling="avg")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [