In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import joblib
import warnings
warnings.filterwarnings('ignore')

In [36]:
def load_and_explore_dataset(filepath):
    """Read a CSV file into a DataFrame and print an exploratory summary.

    The summary covers the shape, per-column missing-value counts, the
    first five rows, descriptive statistics, the ``info()`` report and the
    unique values of the ``state`` column.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.

    Raises
    ------
    KeyError
        If the file has no ``state`` column.
    """
    print('=' * 60)
    print("LOAD AND EXPLORE DATASET")
    print('=' * 60)

    df = pd.read_csv(filepath)

    print('shape of the dataset:')
    print(df.shape)
    print('\nCheck for missing values:')
    print(df.isnull().sum())
    print('\nFirst five rows:')
    print(df.head())
    print('\nDescriptive Stats:')
    print(df.describe())
    print('\nDataset info:')
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print('\nLocation Distribution:')
    print(df['state'].unique())

    return df

In [37]:
def preprocessing_data(df):
    """Drop incomplete rows and label-encode the categorical columns.

    Rows containing any missing value are removed, then the ``state`` and
    ``furnishing`` columns are each encoded with their own ``LabelEncoder``.
    The integer codes are stored in new ``<column>_encoded`` columns; the
    original text columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``state`` and ``furnishing`` columns.
        It is not modified.

    Returns
    -------
    tuple
        ``(df_processed, label_encoder)`` where ``df_processed`` is the
        cleaned frame with the extra encoded columns and ``label_encoder``
        maps each encoded column name to its fitted ``LabelEncoder``.
    """
    print('\n' + '=' * 60)
    print('PREPROCESSING LOAD DATA')
    print('=' * 60)

    # Copy *after* dropping rows. Previously the defensive copy was thrown
    # away by a second assignment, so the new columns below were written
    # onto the dropna() result rather than an explicit independent copy.
    df_processed = df.dropna().copy()

    label_encoder = {}
    categorical_cols = ['state', 'furnishing']
    for col in categorical_cols:
        le = LabelEncoder()
        df_processed[col + '_encoded'] = le.fit_transform(df_processed[col])
        # Keep the fitted encoder so the same mapping can be reused later.
        label_encoder[col] = le
        print(f'\n{col} encoded')
        for i, label in enumerate(le.classes_):
            print(f" {label}: {i}")
    print("preprocessed_dataset shape", df_processed.shape)

    return df_processed, label_encoder

In [38]:
def features_data(df_processed):
    """Split the preprocessed frame into model inputs and the target.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame that already contains the encoded categorical columns.

    Returns
    -------
    tuple
        ``(X, y, feature_cols)``: ``X`` holds the five feature columns,
        ``y`` is a single-column DataFrame with ``price_title``, and
        ``feature_cols`` is the list of feature names in column order.
    """
    banner = '=' * 60
    print('\n' + banner)
    print('FEATURES DATA')
    print(banner)

    feature_cols = [
        "property_size",
        "state_encoded",
        "furnishing_encoded",
        "bathrooms",
        "bedrooms",
    ]
    # A one-element list keeps the target two-dimensional (n_rows, 1).
    target_col = ['price_title']

    X, y = df_processed[feature_cols], df_processed[target_col]

    print('\nFeatures shape', X.shape)
    print('\nTarget shape', y.shape)

    return X, y, feature_cols

In [39]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into training and testing subsets.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values (a Series or a single-column DataFrame both work).
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` exactly as returned by
        ``train_test_split``.
    """
    print('\n' + '=' * 60)
    print('SPLITING  DATA')
    print('=' * 60)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print('\n Training set size', X_train.shape[0])
    print('Testing set size', X_test.shape[0])

    # Reduce over plain arrays: for a single-column DataFrame, y.min() is a
    # one-element Series, and calling float() on a Series is deprecated in
    # pandas. The array reduction yields a scalar for either input type.
    train_prices = np.asarray(y_train, dtype=float)
    test_prices = np.asarray(y_test, dtype=float)
    print("\nTraining set price range ₦{:.2f} - ₦{:.2f}".format(
        train_prices.min(), train_prices.max()
    ))
    print("\nTesting set price range₦{:.2f} - ₦{:.2f}".format(
        test_prices.min(), test_prices.max()
    ))

    return X_train, X_test, y_train, y_test

In [40]:
def scale_features(X_train, X_test):
    """Standardise both feature sets with a scaler fitted on the training set.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and testing feature matrices.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, scaler)`` — the two transformed
        matrices and the fitted ``StandardScaler``.
    """
    divider = '=' * 60
    print('\n' + divider)
    print('SCALE FEATURES  DATA')
    print(divider)

    scaler = StandardScaler()
    # The scaler is fitted on the training split only; the test split is
    # transformed with those same fitted parameters.
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)

    print('\nFeatures scaled successfully')
    reports = (
        ('Trainnig scaled set:', scaled_train),
        ('Testing scaled set:', scaled_test),
    )
    for caption, matrix in reports:
        print(caption, matrix.shape)

    return scaled_train, scaled_test, scaler

In [41]:
def train_model(X_train_scaled, y_train, feature_cols):
    """Fit a linear regression and print its coefficients and intercept.

    Parameters
    ----------
    X_train_scaled : array-like
        Training feature matrix.
    y_train : array-like
        Training target values.
    feature_cols : sequence of str
        Feature names, in the same order as the columns of
        ``X_train_scaled``; used only to label the printed coefficients.

    Returns
    -------
    LinearRegression
        The fitted model.
    """
    print('\n' + '=' * 60)
    print('TRAINING MODEL')
    print('=' * 60)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    print('\nModel trained successfully')
    print('\nModel coefficient')
    # coef_ is 2-D when the target was passed as a column; flatten it so the
    # values pair up one-to-one with the feature names.
    for feature, coef in zip(feature_cols, np.ravel(model.coef_)):
        print(f" {feature} : {coef:.2f}")

    # intercept_ may be a scalar or a one-element array depending on the
    # shape of y. Pull the element out explicitly instead of calling float()
    # on an array, which NumPy deprecates for ndim > 0.
    intercept = np.ravel(model.intercept_)[0]
    print(f'\nModel intercept:{intercept:.2f}')

    return model

In [42]:
def train_random_forest_model(X_train_scaled, y_train, feature_cols):
    """Fit a random forest regressor and print its feature importances.

    Parameters
    ----------
    X_train_scaled : array-like
        Training feature matrix.
    y_train : array-like
        Training target values; a single-column 2-D target is flattened
        before fitting.
    feature_cols : sequence of str
        Feature names, in the same order as the columns of
        ``X_train_scaled``; used to label the printed importances.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    print('\n' + '=' * 60)
    print('TRAINING RANDOM FOREST MODEL')
    print('=' * 60)

    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )

    # The forest expects a 1-D target for single-output regression; a
    # (n_samples, 1) column is flattened. Genuine multi-output targets are
    # left untouched.
    target = np.asarray(y_train)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    rf_model.fit(X_train_scaled, target)
    print('Random forest successfully trained')
    # The number of trees is held in `n_estimators`; the `estimator`
    # attribute printed previously is not the tree count.
    print('Model estimators', rf_model.n_estimators)
    print('Model max depth', rf_model.max_depth)

    # Most influential features first.
    feature_importance = sorted(
        zip(feature_cols, rf_model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for feature, importance in feature_importance:
        print(f"  {feature}: {float(importance):.4f}")
    return rf_model

In [43]:
def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Score a fitted model on the train and test sets and cross-validate it.

    R2, RMSE and MAE are computed on both sets and printed, followed by a
    5-fold cross-validated R2 on the training set.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing ``predict`` (it is re-fitted on each fold
        by ``cross_val_score``).
    X_train_scaled, X_test_scaled : array-like
        Training and testing feature matrices.
    y_train, y_test : array-like
        Matching true target values.

    Returns
    -------
    dict
        Keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_score`` (array of per-fold R2 scores).
    """
    print('\n' + '=' * 60)
    print('EVALUATING  DATA')
    print('=' * 60)

    # make prediction
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # calculate metrics
    # Arguments follow the documented (y_true, y_pred) order throughout.
    # MSE and MAE are symmetric so the values are unchanged, but keeping one
    # order prevents mistakes if an asymmetric metric is added later.
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print('\n' + '=' * 60)
    print('MODEL  PERFORMANCE')
    print('=' * 60)
    print('\nTraining set')
    print(f'R2 score: {train_r2:.4f}')
    print(f'RMSE: {train_rmse:2.2f}')
    print(f'MAE: {train_mae:2.2f}')
    print('\nTestin set')
    print(f'R2 score: {test_r2:.4f}')
    print(f'RMSE: {test_rmse:2.2f}')
    print(f'MAE: {test_mae:2.2f}')
    print('=' * 60)

    # cross validation score (training data only, so the test set stays unseen)
    cv_score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    print('\nCross Validation (5-folds)')
    print(f'R2 Score:{cv_score}')
    print(f'Cross Validation mean: {cv_score.mean():.2f}')
    print(f"Cross Validation STD : {cv_score.std():.4f}")

    metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "y_train_pred": y_train_pred,
        'y_test_pred': y_test_pred,
        "cv_score": cv_score
    }

    return metrics

In [52]:
def evaluate_rf_model(rf_model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Score the random forest on the train and test sets and cross-validate it.

    R2, RMSE and MAE are computed on both sets and printed, followed by a
    5-fold cross-validated R2 on the training set.

    Parameters
    ----------
    rf_model : estimator
        Fitted estimator exposing ``predict`` (it is re-fitted on each fold
        by ``cross_val_score``).
    X_train_scaled, X_test_scaled : array-like
        Training and testing feature matrices.
    y_train, y_test : array-like
        Matching true target values.

    Returns
    -------
    dict
        Keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_score`` (array of per-fold R2 scores).
    """
    print('\n' + '=' * 60)
    print('EVALUATING  DATA')
    print('=' * 60)

    # make prediction
    y_train_pred = rf_model.predict(X_train_scaled)
    y_test_pred = rf_model.predict(X_test_scaled)

    # calculate metrics
    # Arguments follow the documented (y_true, y_pred) order throughout.
    # MSE and MAE are symmetric so the values are unchanged, but keeping one
    # order prevents mistakes if an asymmetric metric is added later.
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print('\n' + '=' * 60)
    print('RANDOM FORESRT MODEL  PERFORMANCE')
    print('=' * 60)
    print('\nTraining set')
    print(f'R2 score: {train_r2:.4f}')
    print(f'RMSE: {train_rmse:2.2f}')
    print(f'MAE: {train_mae:2.2f}')
    print('\nTestin set')
    print(f'R2 score: {test_r2:.4f}')
    print(f'RMSE: {test_rmse:2.2f}')
    print(f'MAE: {test_mae:2.2f}')
    print('=' * 60)

    # cross validation score (training data only, so the test set stays unseen)
    cv_score = cross_val_score(rf_model, X_train_scaled, y_train, cv=5, scoring='r2')
    print('\nCross Validation (5-folds)')
    print(f'R2 Score:{cv_score}')
    print(f'Cross Validation mean: {cv_score.mean():.2f}')
    print(f"Cross Validation STD : {cv_score.std():.4f}")

    rf_metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "y_train_pred": y_train_pred,
        'y_test_pred': y_test_pred,
        "cv_score": cv_score
    }

    return rf_metrics

In [53]:
# saving of activities
def save_model_artifact(model, scaler, label_encoders, feature_cols):
    """Persist the model and its preprocessing objects with joblib.

    Four files are written to the current working directory:
    ``house_price_prediction.pkl``, ``scaler_features.pkl``,
    ``label_encoders.pkl`` and ``feature_columns.pkl``.

    Parameters
    ----------
    model : object
        Trained model to store.
    scaler : object
        Fitted feature scaler.
    label_encoders : dict
        Fitted label encoders keyed by column name.
    feature_cols : list of str
        Feature names in training column order.
    """
    rule = '=' * 60
    print('\n' + rule)
    print('SAVING MODEL ARTIFACT')
    print(rule)

    # (object, destination file, confirmation message) — written in this order.
    artifacts = (
        (model, 'house_price_prediction.pkl',
         'House price prediction model saved successfully'),
        (scaler, 'scaler_features.pkl',
         'Scaler prediction saved successfully'),
        (label_encoders, 'label_encoders.pkl',
         'label encoder saved successfully'),
        (feature_cols, 'feature_columns.pkl',
         'Feature columns saved successfully'),
    )
    for payload, filename, confirmation in artifacts:
        joblib.dump(payload, filename)
        print(confirmation)

    print('\n' + rule)
    print('All  MODEL ARTIFACT SAVED SUCCESSFULLY')
    print(rule)

In [54]:
def save_rf_model_artifact(rf_model):
    """Persist the random forest model to ``rf_model_prediction.pkl``.

    The file is written with joblib into the current working directory.

    Parameters
    ----------
    rf_model : object
        Trained random forest model to store.
    """
    print('\n' + '=' * 60)
    print('SAVING RANDOM FOREST MODEL ARTIFACT')
    print('=' * 60)
    joblib.dump(rf_model, 'rf_model_prediction.pkl')
    # The message used to say "Car price", a leftover from another project.
    print('Random forest house price prediction model saved successfully')

    print('\n' + '=' * 60)
    print('RANDOM FOREST MODEL ARTIFACT SAVED SUCCESSFULLY')
    print('=' * 60)

In [55]:
def predict_house_price(property_size,  state_encoded, furnishing_encoded, bathrooms, bedrooms):
    """Predict a price with the saved linear-regression artifacts.

    The model, scaler, label encoders and feature list are loaded from the
    ``.pkl`` files in the current working directory on every call.

    Parameters
    ----------
    property_size : number
        Property size.
    state_encoded : str
        Despite the name, this is the *raw* state label (e.g. ``'Lagos'``);
        it is encoded here with the saved ``state`` encoder.
    furnishing_encoded : str
        Raw furnishing label, encoded here with the saved ``furnishing``
        encoder.
    bathrooms : number
        Number of bathrooms.
    bedrooms : number
        Number of bedrooms.

    Returns
    -------
    object
        The first element of ``model.predict`` for the single input row, or
        a ``str`` of the form ``"Unknown category - ..."`` when a label was
        not seen by its encoder.
    """
    # NOTE(review): joblib.load unpickles its input; only load artifact files
    # produced by this notebook or another trusted source.
    model = joblib.load('house_price_prediction.pkl')
    scaler = joblib.load('scaler_features.pkl')
    label_encoders = joblib.load('label_encoders.pkl')
    feature_cols = joblib.load('feature_columns.pkl')
    try:
        state_code = label_encoders['state'].transform([state_encoded])[0]
        furnishing_code = label_encoders['furnishing'].transform([furnishing_encoded])[0]
    except ValueError as e:
        return f"Unknown category - {e}"
    features_dict = {
        'property_size': property_size,
        'state_encoded': state_code,
        'furnishing_encoded': furnishing_code,
        'bathrooms': bathrooms,
        'bedrooms': bedrooms
    }
    # Build the row in the saved training column order rather than relying
    # on the dict's insertion order happening to match it.
    features = np.array([[features_dict[col] for col in feature_cols]])
    features_scale = scaler.transform(features)
    predict_price = model.predict(features_scale)[0]
    return predict_price

In [56]:
def rf_model_predict_house_price(property_size,  state_encoded, furnishing_encoded, bathrooms, bedrooms):
    """Predict a price with the saved random forest artifacts.

    The random forest model, scaler, label encoders and feature list are
    loaded from the ``.pkl`` files in the current working directory on
    every call.

    Parameters
    ----------
    property_size : number
        Property size.
    state_encoded : str
        Despite the name, this is the *raw* state label (e.g. ``'Lagos'``);
        it is encoded here with the saved ``state`` encoder.
    furnishing_encoded : str
        Raw furnishing label, encoded here with the saved ``furnishing``
        encoder.
    bathrooms : number
        Number of bathrooms.
    bedrooms : number
        Number of bedrooms.

    Returns
    -------
    object
        The first element of ``rf_model.predict`` for the single input row,
        or a ``str`` of the form ``"Unknown category - ..."`` when a label
        was not seen by its encoder.
    """
    # Load the random forest file. This previously loaded
    # 'house_price_prediction.pkl', i.e. the linear model, so the "random
    # forest" prediction was really the linear one.
    # NOTE(review): joblib.load unpickles its input; only load artifact files
    # produced by this notebook or another trusted source.
    rf_model = joblib.load('rf_model_prediction.pkl')
    scaler = joblib.load('scaler_features.pkl')
    label_encoders = joblib.load('label_encoders.pkl')
    feature_cols = joblib.load('feature_columns.pkl')
    try:
        state_code = label_encoders['state'].transform([state_encoded])[0]
        furnishing_code = label_encoders['furnishing'].transform([furnishing_encoded])[0]
    except ValueError as e:
        return f"Unknown category - {e}"
    features_dict = {
        'property_size': property_size,
        'state_encoded': state_code,
        'furnishing_encoded': furnishing_code,
        'bathrooms': bathrooms,
        'bedrooms': bedrooms
    }
    # Build the row in the saved training column order rather than relying
    # on the dict's insertion order happening to match it.
    features = np.array([[features_dict[col] for col in feature_cols]])
    features_scale = scaler.transform(features)
    predict_price = rf_model.predict(features_scale)[0]
    return predict_price

In [57]:
def test_prediction():
    """Run one sample prediction through the saved linear model and print it.

    Calls ``predict_house_price`` for a furnished Lagos property and prints
    either the formatted price or, when the predictor returns its
    unknown-category message, that message.
    """
    print('\n' + '=' * 60)
    print('TESTING  PREDICTION')
    print('=' * 60)

    # example no.1
    price1 = predict_house_price(1000, 'Lagos', 'Furnished', 5, 5)
    print('\n 1000 ms, location: Lagos,  furnishded_status : Furnished, 5 bedrooms, 5 bathroom')
    if isinstance(price1, str):
        # The predictor signals an unseen category by returning a message
        # string; show it instead of crashing inside float().
        print(f" {price1}")
    else:
        # The prediction may be a scalar or a one-element array depending on
        # how the model was fitted; extract the element before formatting.
        print(f" ₦{float(np.ravel(price1)[0]):,.2f}")

    print('=' * 60)

In [60]:
def test_rf_model_prediction():
    """Run one sample prediction through the saved random forest and print it.

    Calls ``rf_model_predict_house_price`` for a furnished Lagos property
    and prints either the formatted price or, when the predictor returns
    its unknown-category message, that message.
    """
    print('\n' + '=' * 60)
    print('TESTING  PREDICTION')
    print('=' * 60)

    # example no.1
    price1 = rf_model_predict_house_price(1000, 'Lagos', 'Furnished', 5, 5)
    print('\n 1000 ms, location: Lagos,  furnishded_status : Furnished, 5 bedrooms, 5 bathroom')
    if isinstance(price1, str):
        # The predictor signals an unseen category by returning a message
        # string; show it instead of crashing inside float().
        print(f" {price1}")
    else:
        # The prediction may be a scalar or a one-element array depending on
        # how the model was fitted; extract the element before formatting.
        print(f" ₦{float(np.ravel(price1)[0]):,.2f}")

    print('=' * 60)

In [61]:
def main(filepath='jiji_housing_cleaned.csv'):
    """Run the full pipeline: load, preprocess, train, evaluate, save, predict.

    A linear regression is trained, evaluated, saved and smoke-tested
    first; then a random forest goes through the same steps using the same
    scaled split.

    Parameters
    ----------
    filepath : str or path-like, default 'jiji_housing_cleaned.csv'
        CSV file to train on. The default keeps the original behaviour of
        reading the file from the current working directory.
    """
    # --- data preparation -------------------------------------------------
    df = load_and_explore_dataset(filepath)
    df_processed, label_encoders = preprocessing_data(df)
    X, y, feature_cols = features_data(df_processed)
    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

    # --- linear regression ------------------------------------------------
    model = train_model(X_train_scaled, y_train, feature_cols)
    evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    # The artifacts must be on disk before test_prediction() loads them.
    save_model_artifact(model, scaler, label_encoders, feature_cols)

    test_prediction()

    # --- random forest ----------------------------------------------------
    rf_model = train_random_forest_model(X_train_scaled, y_train, feature_cols)

    evaluate_rf_model(rf_model, X_train_scaled, X_test_scaled, y_train, y_test)
    # Likewise, save before test_rf_model_prediction() loads the model file.
    save_rf_model_artifact(rf_model)
    test_rf_model_prediction()

In [62]:
# Run the whole pipeline when this code is executed as the main module
# (which includes running this cell in a notebook kernel).
if __name__ == "__main__" :
    main()

LOAD AND EXPLORE DATASET
shape of the dataset:
(1245, 11)

Check for missing values:
title            0
region           0
region_name      0
price_title      0
property_size    0
bedrooms         0
bathrooms        0
furnishing       0
boosted          0
state            0
price_m2         0
dtype: int64

First five rows:
                                               title  \
0      4bdrm Duplex in Abuja Estate, Owerri for sale   
1  Furnished 5bdrm Bungalow in Prime Property, Be...   
2               2bdrm Block of Flats in Uyo for sale   
3  Furnished 6bdrm Duplex in Port Harcourt, Obio-...   
4       12bdrm Block of Flats in Kapua, FHA for sale   

                     region region_name  price_title  property_size  bedrooms  \
0         Imo State, Owerri      Owerri  170000000.0            600         4   
1     Edo State, Benin City  Benin City   45000000.0           1500         5   
2      Akwa Ibom State, Uyo         Uyo   30000000.0            400         2   
3  Rivers Stat