In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings

# NOTE(review): with no category/module filter this hides every warning raised
# afterwards, not only noisy ones — consider narrowing it to the
# specific warning categories that are known to be harmless here.
warnings.filterwarnings('ignore')


In [4]:

# ============================================================
# 1. LOAD AND EXPLORE DATASET
# ============================================================
def load_and_explore_dataset(filepath):
    """Read the CSV at ``filepath`` and print a quick exploratory summary.

    The summary covers, in order: shape, per-column missing-value counts, the
    first five rows, descriptive statistics, the ``DataFrame.info()`` report,
    and value counts of the ``state`` and ``furnishing`` columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame exactly as loaded (no cleaning is done here).

    Raises
    ------
    KeyError
        If the file lacks a ``state`` or ``furnishing`` column.

    Any exception raised by ``pd.read_csv`` (e.g. a missing file) propagates
    to the caller unchanged.
    """
    print('=' * 60)
    print("LOAD AND EXPLORE DATASET")
    print('=' * 60)
    df = pd.read_csv(filepath)
    print('Shape of the dataset:')
    print(df.shape)
    print('\nCheck for missing values:')
    print(df.isnull().sum())
    print('\nFirst five rows:')
    print(df.head())
    print('\nDescriptive Stats:')
    print(df.describe())
    print('\nDataset info:')
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would append a stray "None" line to the output.
    df.info()
    print('\nLocation Distribution:')
    print(df['state'].value_counts())
    print('\nFurnishing Status Description:')
    print(df['furnishing'].value_counts())
    return df


# ============================================================
# 2. PREPROCESSING DATA
# ============================================================
def preprocessing_data(df):
    """Drop incomplete rows and label-encode the categorical columns.

    Works on a copy of ``df``; the input frame is not modified. Rows holding
    any missing value are removed, then each of ``state`` and ``furnishing``
    that is present is replaced by integer codes from its own
    ``LabelEncoder``.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Raw dataset.

    Returns
    -------
    tuple
        ``(label_encoders, df_processed)`` where ``label_encoders`` maps each
        encoded column name to its fitted ``LabelEncoder`` and
        ``df_processed`` is the cleaned, encoded frame. When ``df`` is
        ``None`` an error is printed and ``(None, None)`` is returned.
    """
    if df is None:
        print("Error: DataFrame is None.")
        # Return a pair (not a bare None) so callers that unpack the result
        # into two names do not fail with a TypeError on this path.
        return None, None
    print('\n' + '=' * 60)
    print('PREPROCESSING DATA')
    print('=' * 60)

    df_processed = df.copy().dropna()
    categorical_cols = ['state', 'furnishing']

    label_encoders = {}
    for col in categorical_cols:
        if col in df_processed.columns:
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col])
            label_encoders[col] = le
            # A class's code is its position in classes_, so enumerate gives
            # the same mapping as transform(classes_) without a second pass;
            # tolist() keeps NumPy scalar reprs out of the printed dict.
            mapping = {label: code for code, label in enumerate(le.classes_.tolist())}
            print(f"Encoded '{col}': {mapping}")

    print(f"\nPreprocessed dataset shape: {df_processed.shape}")
    return label_encoders, df_processed


# ============================================================
# 3. FEATURE SELECTION
# ============================================================
def features_data(df_processed):
    """Pull the model inputs and the target out of the preprocessed frame.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame containing the columns ``state``, ``furnishing``,
        ``property_size``, ``bedrooms``, ``bathrooms`` and ``price_title``.

    Returns
    -------
    tuple
        ``(X, y, categorical_features, numeric_features)``: the feature frame
        (categorical columns first, then numeric), the ``price_title`` series,
        and the two lists of column names used to build ``X``.
    """
    rule = '=' * 60
    print('\n' + rule)
    print('FEATURE SELECTION')
    print(rule)

    categorical_features = ['state', 'furnishing']
    numeric_features = ['property_size', 'bedrooms', 'bathrooms']

    # Categorical columns come first so the column order of X is stable.
    selected_columns = [*categorical_features, *numeric_features]
    X = df_processed[selected_columns]
    y = df_processed['price_title']

    print(f'Features shape: {X.shape}')
    print(f'Target shape:   {y.shape}')
    return X, y, categorical_features, numeric_features


# ============================================================
# 4. SPLIT DATA
# ============================================================
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Delegates to ``train_test_split`` and then prints the number of rows in
    each partition together with the min/max of the target in each.

    Parameters
    ----------
    X : feature table
    y : target values
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    rule = '=' * 60
    print('\n' + rule)
    print('SPLITTING DATA')
    print(rule)

    partitions = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = partitions

    print('Training set size:', X_train.shape[0])
    print('Testing set size: ', X_test.shape[0])

    train_low, train_high = float(y_train.min()), float(y_train.max())
    print("\nTraining set price range: ₦{:.2f} - ₦{:.2f}".format(train_low, train_high))

    test_low, test_high = float(y_test.min()), float(y_test.max())
    print("Testing set price range:  ₦{:.2f} - ₦{:.2f}".format(test_low, test_high))

    return X_train, X_test, y_train, y_test


# ============================================================
# 5. SCALE FEATURES
# ============================================================
def scale_features(X_train, X_test, num_cols):
    """Standardise the numeric columns of the train and test feature tables.

    A ``StandardScaler`` is fitted on the training rows only and then applied
    to both partitions, so no statistics from the test rows enter the fit.
    Columns not listed in ``num_cols`` are carried over untouched. The inputs
    are copied, not modified.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables produced by the train/test split.
    num_cols : list of str
        Names of the columns to scale.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, scaler)`` where ``scaler`` is the
        fitted ``StandardScaler``.
    """
    print('\n' + '=' * 60)
    print('SCALING FEATURES')
    print('=' * 60)

    scaler = StandardScaler()

    X_train_scaled = X_train.copy()
    X_test_scaled  = X_test.copy()

    X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test_scaled[num_cols]  = scaler.transform(X_test[num_cols])

    print(f"Scaled numeric columns: {num_cols}")
    print(f"\nTraining set mean after scaling:\n{X_train_scaled[num_cols].mean().round(4)}")
    # StandardScaler divides by the population standard deviation (ddof=0),
    # whereas pandas' .std() defaults to the sample estimate (ddof=1). Using
    # ddof=0 makes this sanity check report exactly 1 for non-constant columns.
    print(f"\nTraining set std after scaling:\n{X_train_scaled[num_cols].std(ddof=0).round(4)}")
    return X_train_scaled, X_test_scaled, scaler


# ============================================================
# 6. TRAIN MODEL
# ============================================================
def train_model(X_train_scaled, y_train, n_estimators=1000, max_depth=15,
                min_samples_split=5, min_samples_leaf=2, random_state=42,
                n_jobs=-1):
    """Fit a ``RandomForestRegressor`` on the training data.

    The hyperparameters are exposed as keyword arguments so the forest can be
    tuned (or shrunk for a quick run) without editing this function; the
    defaults reproduce the previously hard-coded configuration.

    Parameters
    ----------
    X_train_scaled : training feature table
    y_train : training target values
    n_estimators : int, default 1000
    max_depth : int or None, default 15
    min_samples_split : int, default 5
    min_samples_leaf : int, default 2
    random_state : int, default 42
    n_jobs : int, default -1
        All of the above are forwarded unchanged to ``RandomForestRegressor``.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    print('\n' + '=' * 60)
    print('TRAINING MODEL')
    print('=' * 60)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
        n_jobs=n_jobs
    )
    model.fit(X_train_scaled, y_train)
    print('Model trained successfully')
    return model


# ============================================================
# 7. EVALUATE MODEL
# ============================================================
def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Report fit quality on both partitions plus 5-fold cross-validation.

    Predictions are made for the train and test features, then R², RMSE and
    MAE are computed and printed for each partition. Afterwards
    ``cross_val_score`` is run on the training data with ``cv=5`` and
    ``scoring='r2'`` and its scores, mean and standard deviation are printed.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train_scaled, X_test_scaled : feature tables
    y_train, y_test : true target values

    Returns
    -------
    dict
        Keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_score``.
    """
    divider = '=' * 60
    print('\n' + divider)
    print('EVALUATING MODEL PERFORMANCE')
    print(divider)

    def _summarise(y_true, y_pred):
        # (R², RMSE, MAE) for a single partition.
        return (
            r2_score(y_true, y_pred),
            np.sqrt(mean_squared_error(y_true, y_pred)),
            mean_absolute_error(y_true, y_pred),
        )

    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_r2, train_rmse, train_mae = _summarise(y_train, y_train_pred)
    test_r2, test_rmse, test_mae = _summarise(y_test, y_test_pred)

    report_rows = (
        ('\nTraining set', train_r2, train_rmse, train_mae),
        ('\nTesting set', test_r2, test_rmse, test_mae),
    )
    for heading, r2, rmse, mae in report_rows:
        print(heading)
        print(f'  R² score : {r2:.4f}')
        print(f'  RMSE     : ₦{rmse:,.2f}')
        print(f'  MAE      : ₦{mae:,.2f}')
    print(divider)

    cv_score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    print('\nCross Validation (5-folds)')
    print(f'  R² scores : {cv_score}')
    print(f'  Mean R²   : {cv_score.mean():.4f}')
    print(f'  Std dev   : {cv_score.std():.4f}')
    print(divider)

    results = {
        "train_r2":     train_r2,
        "test_r2":      test_r2,
        "train_rmse":   train_rmse,
        "test_rmse":    test_rmse,
        "train_mae":    train_mae,
        "test_mae":     test_mae,
        "y_train_pred": y_train_pred,
        "y_test_pred":  y_test_pred,
        "cv_score":     cv_score
    }
    return results


# ============================================================
# 8. SAVE MODEL ARTIFACTS
# ============================================================
def save_model_artifact(model, scaler, label_encoders, cat_cols, num_cols):
    """Persist the trained model and its preprocessing objects with joblib.

    Four files are written using relative paths:

    - ``house_price_model.pkl``     — ``model``
    - ``house_scaler_features.pkl`` — ``scaler``
    - ``house_label_encoders.pkl``  — ``label_encoders``
    - ``house_feature_columns.pkl`` — ``{'categorical': cat_cols, 'numeric': num_cols}``

    A confirmation line is printed after each file. Returns ``None``.
    """
    rule = '=' * 60
    print('\n' + rule)
    print('SAVING MODEL ARTIFACTS')
    print(rule)

    feature_cols = {'categorical': cat_cols, 'numeric': num_cols}

    # (object, destination file, confirmation message) in write order.
    artifacts = (
        (model, 'house_price_model.pkl', 'House price model saved successfully'),
        (scaler, 'house_scaler_features.pkl', 'Scaler saved successfully'),
        (label_encoders, 'house_label_encoders.pkl', 'Label encoders saved successfully'),
        (feature_cols, 'house_feature_columns.pkl', 'Feature columns saved successfully'),
    )
    for payload, filename, confirmation in artifacts:
        joblib.dump(payload, filename)
        print(confirmation)

    print('\n' + rule)
    print('ALL MODEL ARTIFACTS SAVED SUCCESSFULLY')
    print(rule)


# ============================================================
# 9. MAIN
# ============================================================
def main(filepath='jiji_housing_cleaned.csv'):
    """Run the whole pipeline: load, preprocess, split, scale, train, evaluate, save.

    Parameters
    ----------
    filepath : str or path-like, default 'jiji_housing_cleaned.csv'
        CSV file to train on. Exposed as a parameter so the pipeline can be
        pointed at another file without editing the function; the default is
        the path that was previously hard-coded.

    Returns
    -------
    dict or None
        The metrics dictionary produced by ``evaluate_model``, or ``None`` if
        loading yielded no DataFrame.
    """
    # Step 1: Load
    df = load_and_explore_dataset(filepath)
    if df is None:
        return

    # Step 2: Preprocess + encode
    label_encoders, df_processed = preprocessing_data(df)

    # Step 3: Features
    X, y, cat_cols, num_cols = features_data(df_processed)

    # Step 4: Split
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Step 5: Scale (only the numeric columns are passed for scaling)
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test, num_cols)

    # Step 6: Train
    model = train_model(X_train_scaled, y_train)

    # Step 7: Evaluate
    metrics = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)

    # Step 8: Save everything needed to reproduce predictions later
    save_model_artifact(model, scaler, label_encoders, cat_cols, num_cols)

    return metrics


# Run the full pipeline when this code is executed directly. The value
# returned by main() is not captured here.
if __name__ == "__main__":
    main()

LOAD AND EXPLORE DATASET
Shape of the dataset:
(1245, 11)

Check for missing values:
title            0
region           0
region_name      0
price_title      0
property_size    0
bedrooms         0
bathrooms        0
furnishing       0
boosted          0
state            0
price_m2         0
dtype: int64

First five rows:
                                               title  \
0      4bdrm Duplex in Abuja Estate, Owerri for sale   
1  Furnished 5bdrm Bungalow in Prime Property, Be...   
2               2bdrm Block of Flats in Uyo for sale   
3  Furnished 6bdrm Duplex in Port Harcourt, Obio-...   
4       12bdrm Block of Flats in Kapua, FHA for sale   

                     region region_name  price_title  property_size  bedrooms  \
0         Imo State, Owerri      Owerri  170000000.0            600         4   
1     Edo State, Benin City  Benin City   45000000.0           1500         5   
2      Akwa Ibom State, Uyo         Uyo   30000000.0            400         2   
3  Rivers Stat