# Satellite Imagery-Based Property Valuation: Preprocessing & EDA

This notebook generates the visualizations required for the Project Report:
1.  **Price Distribution:** Histograms showing why we need Log-Transformation.
2.  **Geospatial Analysis:** Map of property prices (The "Gold Coast" effect).
3.  **Satellite Imagery:** Sample images of expensive vs. cheap homes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os

# Set plot style
sns.set(style="whitegrid")
%matplotlib inline

# Create results directory for images
if not os.path.exists('results_viz'):
    os.makedirs('results_viz')

## 1. Load Data

In [None]:
# Resolve the training CSV: use '../data/train.csv' when that path exists,
# otherwise fall back to 'data/train.csv' (the fallback itself is not checked;
# pd.read_csv will raise if it is missing too).
DATA_PATH = '../data/train.csv' if os.path.exists('../data/train.csv') else 'data/train.csv'

# Read the whole table into memory and report how many rows were read.
df = pd.read_csv(DATA_PATH)
print(f"Loaded {df.shape[0]} rows")

## 2. Price Distribution (Histogram)
We show the *Raw Price* (left) next to the *Log Price* (right, computed as `log1p(price)`) to justify modeling the log of price instead of the raw value.

In [None]:
# One figure, two side-by-side panels: raw price on the left, log1p(price) on the right.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: histogram + KDE of the untransformed price column.
sns.histplot(df['price'], bins=50, kde=True, color='blue', ax=ax_raw)
ax_raw.set_title('Raw Price Distribution (Long Tail)')
ax_raw.set_xlabel('Price ($)')

# Right panel: the same column after np.log1p, i.e. log(1 + price).
sns.histplot(np.log1p(df['price']), bins=50, kde=True, color='green', ax=ax_log)
ax_log.set_title('Log(Price) Distribution (Normal-ish)')
ax_log.set_xlabel('Log Price')

# Tidy the spacing, write the PNG used by the report, then render inline.
fig.tight_layout()
fig.savefig('results_viz/price_distribution.png')
plt.show()

## 3. Geospatial Analysis (The Map)
A scatter plot of longitude vs. latitude, colored by price, shows where the expensive houses are.

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# One point per property at (long, lat), coloured by price.
# Rows are passed in ascending price order so that the most expensive
# properties come last and are not hidden underneath cheaper ones.
scatter = sns.scatterplot(
    data=df.sort_values('price'),
    x='long',
    y='lat',
    hue='price',
    palette='coolwarm',
    alpha=0.6,
    ax=ax,
)

ax.set(
    title='Property Price Map: The "Gold Coast" Effect',
    xlabel='Longitude',
    ylabel='Latitude',
)

# Write the PNG used by the report, then render inline.
fig.savefig('results_viz/location_map.png')
plt.show()

## 4. Sample Satellite Images
We display the two most expensive and the two cheapest properties that have a satellite image on disk, to see the visual difference between High-Value and Low-Value homes.

In [None]:
# Show the 2 most expensive and the 2 cheapest properties that have an image
# on disk, in a 2x2 grid, and save the grid for the report.

# Locate the image folder: use '../data/images' when it exists, otherwise
# fall back to 'data/images'.
IMG_DIR = '../data/images'
if not os.path.exists(IMG_DIR):
    IMG_DIR = 'data/images'

# Filter for IDs that actually have images.
# os.path.splitext strips only the final '.jpg', so a file such as
# '123.v2.jpg' yields '123.v2' and can never be mistaken for property 123
# (whose '123.jpg' would then fail to open below).
existing_images = {
    os.path.splitext(f)[0] for f in os.listdir(IMG_DIR) if f.endswith('.jpg')
}
df_with_img = df[df['id'].astype(str).isin(existing_images)].copy()

expensive = df_with_img.nlargest(2, 'price')
cheap = df_with_img.nsmallest(2, 'price')
samples = pd.concat([expensive, cheap])

# Derive each panel's label from the group the row came from rather than from
# a hard-coded position, so labels stay correct when fewer than two rows are
# available for a group.
labels = ["High Value"] * len(expensive) + ["Low Value"] * len(cheap)

plt.figure(figsize=(12, 10))
for i, (label, (idx, row)) in enumerate(zip(labels, samples.iterrows())):
    # int() guards against the id having been upcast to float by iterrows().
    img_path = os.path.join(IMG_DIR, f"{int(row['id'])}.jpg")

    plt.subplot(2, 2, i + 1)
    # Context manager closes the underlying file handle once the pixels have
    # been handed to matplotlib.
    with Image.open(img_path) as img:
        plt.imshow(img)

    price_fmt = f"${row['price']:,.0f}"
    plt.title(f"{label}: {price_fmt}\n(Sqft: {row['sqft_living']})", fontsize=12)
    plt.axis('off')

plt.suptitle("Visual Contrast: Luxury vs. Economy", fontsize=16)
plt.tight_layout()
plt.savefig('results_viz/sample_images.png')
plt.show()