In [2]:
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.7-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading h2o-3.46.0.7-py2.py3-none-any.whl (265.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 265.9/265.9 MB 5.4 MB/s eta 0:00:00
Installing collected packages: h2o
Successfully installed h2o-3.46.0.7


In [11]:
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Initialize H2O
# Connects to a running H2O instance or starts a local one if none is found.
h2o.init()

# Load the dataset
# NOTE(review): absolute path, presumably the Colab upload location - adjust
# when running elsewhere.
data = pd.read_csv('/content/Housing.csv')

# Preprocess categorical variables
# Each string-valued column is overwritten in place with integer codes. The
# fitted encoder for every column is kept in `le_dict` so later code can encode
# new inputs with exactly the same mapping.
categorical_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
le_dict = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    data[col] = le_dict[col].fit_transform(data[col])

# Convert pandas DataFrame to H2OFrame
h2o_data = h2o.H2OFrame(data)

# Define features and target
# Every column except the target is used as a predictor.
target = 'price'
features = [col for col in h2o_data.columns if col != target]

# Split the data into train and test sets
# 80% of the rows go to `train`, the remainder to `test`; seeded for repeatability.
train, test = h2o_data.split_frame(ratios=[0.8], seed=42)

# Configure and run AutoML
# NOTE(review): H2O's AutoML documentation states that a seeded run is only
# reproducible when it is bounded by max_models; because max_runtime_secs is
# also set, the outcome may depend on machine speed - confirm the time cap is
# really wanted.
aml = H2OAutoML(max_models=10, seed=42, max_runtime_secs=300)
aml.train(x=features, y=target, training_frame=train)

# Get the best model
best_model = aml.leader

# Evaluate the model on the test set
performance = best_model.model_performance(test)
print("Test Set Performance:")
print(performance)

# Print feature importance if available
# Every H2O model object has a varimp() method, so hasattr() cannot tell whether
# importances exist. Models without them (e.g. stacked ensembles) make varimp()
# return None, so the returned value is checked instead of the attribute.
feature_importance = best_model.varimp(use_pandas=True)
if feature_importance is not None:
    print("\nFeature Importance:")
    print(feature_importance)

# Function for manual testing
def predict_house_price(area, bedrooms, bathrooms, stories, mainroad, guestroom, basement,
                      hotwaterheating, airconditioning, parking, prefarea, furnishingstatus):
    """Predict the price of a single house with the AutoML leader model.

    The seven categorical arguments (mainroad, guestroom, basement,
    hotwaterheating, airconditioning, prefarea, furnishingstatus) are given
    as raw labels and are encoded with the fitted encoders stored in
    ``le_dict``; the remaining arguments are passed through unchanged.

    Returns:
        The value of the ``'predict'`` column for the one input row, or
        ``None`` (after printing the encoder's message) when a categorical
        label cannot be encoded.
    """
    # Collect the arguments in the column order used for the input frame.
    record = {
        'area': area,
        'bedrooms': bedrooms,
        'bathrooms': bathrooms,
        'stories': stories,
        'mainroad': mainroad,
        'guestroom': guestroom,
        'basement': basement,
        'hotwaterheating': hotwaterheating,
        'airconditioning': airconditioning,
        'parking': parking,
        'prefarea': prefarea,
        'furnishingstatus': furnishingstatus,
    }
    input_data = pd.DataFrame({name: [value] for name, value in record.items()})

    # Replace each raw label by the integer code its column's encoder assigns.
    for col in categorical_columns:
        encoder = le_dict[col]
        try:
            input_data[col] = encoder.transform(input_data[col])
        except ValueError as e:
            print(f"Error encoding {col}: {e}")
            return None

    # Score the one-row frame and pull out the scalar prediction.
    scored = best_model.predict(h2o.H2OFrame(input_data))
    return scored.as_data_frame()['predict'][0]

# Check if the exact input exists in the dataset
def check_dataset_match(sample_input):
    """Look up rows of the encoded dataset that equal ``sample_input`` exactly.

    Args:
        sample_input: Mapping with the numeric features ('area', 'bedrooms',
            'bathrooms', 'stories', 'parking') and the raw labels of every
            column listed in ``categorical_columns``.

    Returns:
        The rows of ``data`` whose feature values all equal the sample
        (possibly empty). If a categorical label was never seen by its
        encoder, the encoder's message is printed and an empty frame with
        the same columns is returned, since such a row cannot exist.

    Raises:
        KeyError: if ``sample_input`` lacks one of the required features.

    When several rows match, only the price of the first one is printed.
    """
    numeric_columns = ('area', 'bedrooms', 'bathrooms', 'stories', 'parking')

    # Numeric features are compared as given.
    expected = {col: sample_input[col] for col in numeric_columns}

    # `data` holds integer codes for the categorical columns, so the raw labels
    # must go through the same fitted encoders before they can be compared.
    for col in categorical_columns:
        try:
            expected[col] = le_dict[col].transform([sample_input[col]])[0]
        except ValueError as e:
            # A label the encoder never saw cannot occur in any dataset row.
            print(f"Error encoding {col}: {e}")
            print("\nNo exact match found in the dataset.")
            return data.iloc[0:0]

    # AND together one equality test per feature.
    mask = pd.Series(True, index=data.index, dtype=bool)
    for col, value in expected.items():
        mask = mask & (data[col] == value)
    match = data[mask]

    if not match.empty:
        print("\nDataset Match Found:")
        print(f"Actual Price: ${match['price'].iloc[0]:,.2f}")
    else:
        print("\nNo exact match found in the dataset.")
    return match

# Manual testing with sample input
print("\nExample 1 Prediction:")
sample_input_1 = {
    'area': 5500,
    'bedrooms': 3,
    'bathrooms': 2,
    'stories': 4,
    'mainroad': 'yes',
    'guestroom': 'yes',
    'basement': 'no',
    'hotwaterheating': 'no',
    'airconditioning': 'yes',
    'parking': 1,
    'prefarea': 'no',
    'furnishingstatus': 'semi-furnished'
}

# Check if this input exists in the dataset
# The returned rows are kept: they are the only trustworthy source for the
# "actual" price used in the comparison below.
match_1 = check_dataset_match(sample_input_1)

# Predict the price
predicted_price_1 = predict_house_price(**sample_input_1)
if predicted_price_1 is not None:
    print(f"Predicted price for high-end house: ${predicted_price_1:,.2f}")
    # Compare against the dataset only when the sample really is a dataset row;
    # the price comes from the matched row rather than a hard-coded figure.
    if not match_1.empty:
        actual_price_1 = match_1['price'].iloc[0]
        print(f"Actual price from dataset: ${actual_price_1:,.2f}")
        if predicted_price_1 != actual_price_1:
            print(f"Prediction Error: ${abs(predicted_price_1 - actual_price_1):,.2f}")
else:
    print("Prediction failed due to encoding error.")

# Shutdown H2O
# Stops the local H2O cluster; h2o.init() must be called again before any
# further H2O work in this kernel.
h2o.cluster().shutdown()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.27" 2025-04-15; OpenJDK Runtime Environment (build 11.0.27+6-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.27+6-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.11/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpswrp9u_7
  JVM stdout: /tmp/tmpswrp9u_7/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpswrp9u_7/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,3 months and 17 days
H2O_cluster_name:,H2O_from_python_unknownUser_4q25yh
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Test Set Performance:
ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 1039461415351.0946
RMSE: 1019539.8056726842
MAE: 744587.4108550648
RMSLE: 0.22113199611190246
Mean Residual Deviance: 1039461415351.0946

Feature Importance:
            variable  relative_importance  scaled_importance  percentage
0               area         2.750348e+15           1.000000    0.421958
1          bathrooms         1.161259e+15           0.422223    0.178160
2    airconditioning         7.260328e+14           0.263979    0.111388
3            parking         4.583860e+14           0.166665    0.070325
4            stories         2.841782e+14           0.103324    0.043599
5           prefarea         2.604989e+14           0.094715    0.039966
6   furnishingstatus         2.584260e+14           0.09


