In [4]:
# Import necessary libraries
import logging

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [5]:
# Only let ERROR-and-above records through the 'tensorflow' Python logger
logging.getLogger(name='tensorflow').setLevel(level=logging.ERROR)

def build_price_awareness_model(input_shape):
    """
    Build and compile the Keras Sequential regression model.

    Architecture (in order):
        Input(input_shape) -> Dense(128, relu) -> Dropout(0.2)
        -> Dense(64, relu) -> Dense(32, relu) -> Dense(1, linear)

    Args:
        input_shape: Number of input features per sample (an int).

    Returns:
        A compiled Sequential model using the Adam optimizer
        (learning rate 0.001) and mean squared error loss.
    """
    model = Sequential([
        # Declare the input width with an explicit Input layer. Passing
        # `input_dim=` to the first Dense layer is deprecated for Sequential
        # models and makes recent Keras versions emit a UserWarning.
        Input(shape=(input_shape,)),

        # First hidden layer; 'relu' activation is standard for hidden layers
        Dense(128, activation='relu'),

        # Dropout layer to reduce overfitting
        Dropout(0.2),

        # Second and third hidden layers
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),

        # Output layer: a single neuron with no activation (linear) for regression
        Dense(1),
    ])

    # Mean Squared Error is the standard regression loss;
    # Adam is a good, robust default optimizer.
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    return model

In [8]:
def main(csv_path='final_with_proficiency.csv'):
    """
    Train and evaluate the home-value regression model end to end.

    Steps: load the CSV, drop rows with missing values in the selected
    columns, one-hot encode the categorical features, split 80/20 into
    train/test, standardize all features, train the network returned by
    ``build_price_awareness_model`` for 50 epochs, then print the test MSE,
    the test MAE, and one example prediction.

    Args:
        csv_path: Path of the CSV file to load. Defaults to
            'final_with_proficiency.csv'.

    Returns:
        None. All results are printed. If the file is missing, required
        columns are absent, or no rows survive NaN removal, an error is
        printed and the function returns early.
    """
    # --- 1. Data Ingestion ---
    try:
        data = pd.read_csv(csv_path)
    except FileNotFoundError:
        # Report the path that was actually attempted
        print(f"Error: '{csv_path}' not found.")
        print("Please create a mock CSV or update the file path.")
        return

    # --- 2. Preprocessing ---

    # This dataset contains rich time-series data (e.g., '1/31/2000'),
    # crime data, and school data. For this PoC, we will predict
    # the 'latest_home_value' based on a mix of demographic,
    # crime, school, and rent features.

    # Define features (X) and target (y)
    # TODO: You can add/remove features here to experiment.

    # Categorical and numerical features are preprocessed differently
    categorical_features = ['STATE', 'Metro_zhvi', 'CountyName_zhvi']
    numerical_features = [
        'SizeRank_zhvi', 'latest_rent', 'crime_rate_per_100000',
        'population', 'CountySchoolScore_y', 'Proficiency'
    ]
    target = 'latest_home_value'

    all_features = categorical_features + numerical_features
    required_columns = all_features + [target]

    # --- 2a. Handle Missing Data ---
    # Ensure every selected feature/target column exists and has data
    try:
        missing_columns = [col for col in required_columns if col not in data.columns]
        if missing_columns:
            print("Error: CSV is missing one or more required columns.")
            print(f"Needed: {required_columns}")
            print(f"Missing: {missing_columns}")
            return

        # Drop rows where any of our selected columns are NaN.
        # A new frame is produced instead of mutating in place.
        print(f"Original data shape: {data.shape}")
        data = data.dropna(subset=required_columns)
        print(f"Data shape after dropping NaNs: {data.shape}")

        if data.shape[0] == 0:
            print("Error: No data remaining after dropping NaNs. Check your columns or data file.")
            return

    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return

    # --- 2b. One-Hot Encode Categorical Features ---
    # This converts text (like 'CA') into numerical format for the model.
    # Indexes are reset so both frames align row-for-row on concatenation.
    X_categorical = pd.get_dummies(
        data[categorical_features], drop_first=True, dtype=int
    ).reset_index(drop=True)

    # --- 2c. Combine Features ---
    # reset_index returns a new frame, avoiding in-place mutation of a
    # slice of `data` (which can trigger SettingWithCopyWarning).
    X_numerical = data[numerical_features].reset_index(drop=True)

    X = pd.concat([X_numerical, X_categorical], axis=1)
    y = data[target].values

    # --- 3. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- 4. Feature Scaling ---
    # This is CRITICAL for neural networks.
    # We scale ALL features (numerical and one-hot encoded); the scaler is
    # fitted on the training split only and then applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- 5. Model Definition ---
    input_shape = X_train_scaled.shape[1]
    model = build_price_awareness_model(input_shape)
    model.summary()

    # --- 6. Model Training ---
    print("\nStarting model training...")
    # NOTE: the test split doubles as validation data here, so val_loss is
    # for monitoring only and must not be used to tune the model.
    history = model.fit(
        X_train_scaled,
        y_train,
        validation_data=(X_test_scaled, y_test),
        epochs=50,       # Number of passes through the data
        batch_size=32,   # Number of samples per update
        verbose=1
    )
    print("Training complete.")

    # --- 7. Model Evaluation ---
    # With no extra metrics compiled, evaluate() returns the scalar loss (MSE)
    test_loss = model.evaluate(X_test_scaled, y_test, verbose=0)
    print("\nModel evaluated on test set.")
    print(f"Test Mean Squared Error (Loss): {test_loss:.2f}")

    # Calculate Mean Absolute Error (MAE) for a more interpretable result
    y_pred = model.predict(X_test_scaled)
    mae = np.mean(np.abs(y_pred.flatten() - y_test))
    print(f"Test Mean Absolute Error (MAE): ${mae:.2f}")

    # --- 8. Example Prediction ---
    print("\nRunning an example prediction...")
    # We use the pandas-based X_test (before scaling) to show readable features
    # Note: Column names will be slightly different after get_dummies
    example_home_features_unscaled = X_test.iloc[0]

    # We use the scaled numpy array for the actual prediction
    example_home_features_scaled = X_test_scaled[0].reshape(1, -1)

    predicted_price = model.predict(example_home_features_scaled)[0][0]
    actual_price = y_test[0]

    print("\nExample Region Features (Unscaled, combined):")
    print(example_home_features_unscaled)
    print(f"\nPredicted Price: ${predicted_price:,.2f}")
    print(f"Actual Price: ${actual_price:,.2f}")


# Run the pipeline only when this code executes as the main module
# (a script run or a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Original data shape: (7827, 493)
Data shape after dropping NaNs: (7186, 493)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Starting model training...
Epoch 1/50
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 398059339776.0000 - val_loss: 367835021312.0000
Epoch 2/50
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 326543998976.0000 - val_loss: 121701425152.0000
Epoch 3/50
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 95643287552.0000 - val_loss: 103062773760.0000
Epoch 4/50
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 91230240768.0000 - val_loss: 94910595072.0000
Epoch 5/50
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 107720392704.0000 - val_loss: 89442598912.0000
Epoch 6/50
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 88208596992.0000 - val_loss: 85834244096.0000
Epoch 7/50
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 78490763264.0000 - val_loss: 829