In [1]:
import numpy as np
import pandas as pd

# Read the prepared training table and remove the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export -- TODO confirm).
cleaned_df = pd.read_excel(
    'clean_data_for_ml_training.xlsx', index_col=None
).drop(columns=['Unnamed: 0'])

# 'price_usd' is the regression target; every other column is a feature.
y = cleaned_df['price_usd']
X = cleaned_df.drop(columns=['price_usd'])

<h1>Bayesian optimization</h1>

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import joblib


def preprocess_data(X, y):
    """Split the data 80/20 and min-max scale the feature splits.

    The scaler is fitted on the training split only, so the min/max of the
    held-out rows never influence training. The fitted scaler is persisted to
    'pkls/scaler.pkl' so the same transformation can be applied at inference.

    Args:
        X: Feature table (DataFrame or array-like), one row per sample.
        y: Target values aligned with the rows of X.

    Returns:
        X_train, X_test, y_train, y_test, where both feature splits are the
        scaled arrays produced by the scaler.
    """
    import os  # local import: only needed to create the artefact directory

    # Split first; the fixed random_state keeps the partition reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # Reuse the training min/max; held-out values may fall outside [0, 1].
    X_test_scaled = scaler.transform(X_test)

    scaler_path = 'pkls/scaler.pkl'
    # Make sure the target directory exists before writing the artefact.
    os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
    joblib.dump(scaler, scaler_path)

    return X_train_scaled, X_test_scaled, y_train, y_test

def calculate_metrics(y_test, y_pred):
    """Compute standard regression metrics for a set of predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted values, in the same order as y_test.

    Returns:
        dict with the keys "RMSE", "MAE", "MSE", "MAPE" (in percent) and "R2".
        MAPE divides by the true values, so it is inf/NaN when y_test
        contains zeros.
    """
    mse = mean_squared_error(y_test, y_pred)
    # Derive RMSE from MSE rather than passing `squared=False`, which newer
    # scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # Use plain arrays so a pandas index on either input cannot trigger
    # label alignment in the element-wise arithmetic below.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100

    r2 = r2_score(y_test, y_pred)
    return {
        "RMSE": rmse,
        "MAE": mae,
        "MSE": mse,
        "MAPE": mape,
        "R2": r2
    }

def random_forest_bayesian_tuning(X, y):
    """Tune a RandomForestRegressor with BayesSearchCV and score the winner.

    The data is split and scaled by preprocess_data, the search runs on the
    training split with 5-fold cross-validation (30 iterations, scored by
    negative RMSE), and the refitted best estimator is evaluated on the
    held-out split with calculate_metrics.

    Args:
        X: Feature table.
        y: Target values.

    Returns:
        (metrics, best_model, best_params): the hold-out metrics dict, the
        best estimator found by the search, and its hyper-parameters.
    """
    print("Preprocessing data...")
    X_train, X_test, y_train, y_test = preprocess_data(X, y)

    # Single-choice Categoricals pin a hyper-parameter to one value while
    # still listing it explicitly in the search space.
    search_spaces = dict(
        n_estimators=Integer(100, 1000),
        max_depth=Integer(3, 30),
        min_samples_split=Integer(2, 20),
        min_samples_leaf=Integer(1, 10),
        max_features=Categorical(['sqrt', 'log2', None]),
        oob_score=Categorical([True, False]),
        criterion=Categorical(['squared_error']),
        max_samples=Real(0.5, 1.0),
        min_weight_fraction_leaf=Real(0.0, 0.5),
        max_leaf_nodes=Integer(10, 100),
        ccp_alpha=Real(0.0, 0.05),
        warm_start=Categorical([False]),
    )

    searcher = BayesSearchCV(
        estimator=RandomForestRegressor(random_state=23),
        search_spaces=search_spaces,
        n_iter=30,
        cv=5,
        scoring='neg_root_mean_squared_error',
        verbose=1,
        n_jobs=-1,
        random_state=42,
    )

    print("Starting Bayesian optimization...")
    searcher.fit(X_train, y_train)
    print("Optimization finished.")

    tuned_params = searcher.best_params_
    print("Best Parameters:", tuned_params)

    # Score the best estimator on the split the search never saw.
    tuned_model = searcher.best_estimator_
    holdout_metrics = calculate_metrics(y_test, tuned_model.predict(X_test))
    return holdout_metrics, tuned_model, tuned_params


<h1>Model implementation</h1>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def random_forest_with_loss(X, y):
    """Fit forests of increasing size, plot train/test MSE, return the last one.

    For every tree count from 1 up to params['n_estimators'] a fresh
    RandomForestRegressor is trained with otherwise fixed hyper-parameters,
    and its MSE on the training and held-out splits is recorded and plotted.

    Args:
        X: Feature table.
        y: Target values.

    Returns:
        (metrics, model, params): the calculate_metrics dict for the largest
        forest on the held-out split, that fitted forest, and the
        hyper-parameters it was built with. The 3-tuple mirrors
        random_forest_bayesian_tuning so callers can unpack both the same way.
    """
    X_train, X_test, y_train, y_test = preprocess_data(X, y)

    # Fixed hyper-parameters; only n_estimators is varied inside the loop.
    params = {
        'n_estimators': 100,
        'max_depth': 30,
        'min_samples_split': 2,
        'min_samples_leaf': 10,
        'max_features': None,
        'oob_score': True,
        'criterion': 'squared_error',
        'max_samples': None,
        'min_weight_fraction_leaf': 0.0,
        'max_leaf_nodes': 100,
        'warm_start': False,
        'ccp_alpha': 0.05,
    }

    # Sweep 1..n_estimators so the final iteration matches `params` exactly.
    n_trees = range(1, params['n_estimators'] + 1)
    train_mse_values = []
    test_mse_values = []

    for i, n in enumerate(n_trees):
        # Override only the tree count; everything else comes from `params`.
        model = RandomForestRegressor(**{**params, 'n_estimators': n},
                                      random_state=42)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_mse_values.append(mean_squared_error(y_train, y_train_pred))
        test_mse_values.append(mean_squared_error(y_test, y_test_pred))

        # Print progress
        print(f"Completed {i + 1}/{len(n_trees)} iterations. {len(n_trees) - (i + 1)} left.")

    plt.figure(figsize=(10, 6))
    plt.plot(n_trees, train_mse_values, label="Train MSE", marker='o')
    plt.plot(n_trees, test_mse_values, label="Test MSE", marker='o')
    plt.title("Loss (MSE) vs Number of Trees")
    plt.xlabel("Number of Trees")
    plt.ylabel("Mean Squared Error")
    plt.legend()
    plt.grid()
    plt.show()

    # `model` / `y_test_pred` now belong to the largest forest of the sweep.
    metrics = calculate_metrics(y_test, y_test_pred)
    return metrics, model, params


In [None]:
# Run the loss-curve experiment on the full feature table.
# NOTE(review): the three-way unpacking only works if random_forest_with_loss
# returns (metrics, model, params) -- confirm before running this cell.
rf_metrics, rf_best_model, rf_best_params = random_forest_with_loss(X, y)

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def preprocess_data_single_entry(X, scaler_path='pkls/scaler.pkl'):
    """Scale inference rows with the scaler that was fitted during training.

    A MinMaxScaler fitted on the incoming rows themselves would rescale them
    against their own min/max (a single row collapses to all zeros), so the
    persisted training scaler is loaded and applied instead.

    Args:
        X: Rows to scale, with the same columns/order used to fit the scaler.
        scaler_path: Location of the scaler saved with joblib; defaults to
            the path preprocess_data writes to.

    Returns:
        The scaled feature array.
    """
    # NOTE: joblib.load unpickles the file -- only load artefacts produced by
    # this project, never files from an untrusted source.
    scaler = joblib.load(scaler_path)
    return scaler.transform(X)

def prepare_input(incoming_data: pd.DataFrame, expected_columns: list) -> pd.DataFrame:
    """One-hot encode raw rows and align them with the expected columns.

    Args:
        incoming_data: Raw rows containing the categorical columns listed
            below plus any other fields; it is not modified.
        expected_columns: Column names, in order, that the result must have.

    Returns:
        A new integer-typed DataFrame with exactly `expected_columns`:
        columns missing after encoding are filled with 0 and columns that are
        not expected are dropped. Every retained value must be convertible
        to int.
    """
    categorical_cols = ['district_name', 'foundation_name', 'layout_name', 'repair_name', 'wc_name']
    encoded = pd.get_dummies(incoming_data, columns=categorical_cols)

    # A single reindex adds the missing columns (as 0), drops unexpected ones
    # and fixes the column order, instead of inserting columns one at a time
    # and then selecting twice.
    aligned = encoded.reindex(columns=list(expected_columns), fill_value=0)

    return aligned.astype(int)

def predict_with_confidence(model, X_row: pd.DataFrame, n_std: float = 1.0) -> dict:
    """Predict with every tree of an ensemble and summarise the spread.

    Args:
        model: Fitted ensemble exposing `estimators_`, each with `predict`.
        X_row: Features for one sample (one row); only the first prediction
            of each tree is used.
        n_std: Half-width of the returned band in standard deviations.
            The default of 1.0 gives mean +/- one standard deviation.

    Returns:
        dict with 'prediction' (mean over trees), 'lower_bound' and
        'upper_bound' (mean -/+ n_std * std), 'std_dev' (sample standard
        deviation over trees; NaN when there is only one tree) and
        'all_pred' (the list of per-tree predictions).
    """
    all_preds = [tree.predict(X_row)[0] for tree in model.estimators_]
    mean_pred = sum(all_preds) / len(all_preds)
    # pandas' std uses ddof=1, i.e. the sample standard deviation.
    std_pred = pd.Series(all_preds).std()
    half_width = n_std * std_pred
    return {
        'prediction': mean_pred,
        'lower_bound': mean_pred - half_width,
        'upper_bound': mean_pred + half_width,
        'std_dev': std_pred,
        'all_pred': all_preds
    }

# Inference demo: encode one raw listing, scale it with the persisted
# training scaler and inspect the spread of the per-tree predictions.
model = rf_best_model
expected_columns = X.columns

incoming_data = pd.DataFrame([{
    "district_name": "Чиланзарский район",
    "number_of_rooms": "2",
    "floors": "4",
    "total_floors": "4",
    "total_area": "20",
    "foundation_name": "Панельный",
    "layout_name": "Смежно-раздельная",
    "wc_name": "2 санузла и более",
    "repair_name": "Евроремонт",
    "year": 2024,
    "month":11,
    "is_primary": "1"
}])

# NOTE: joblib.load unpickles the file -- only load scalers produced by this
# notebook, never files from an untrusted source.
scaler = joblib.load('pkls/scaler.pkl')
processed_input = prepare_input(incoming_data, expected_columns)
scaled_input = scaler.transform(processed_input)  # Apply MinMaxScaler
result = predict_with_confidence(model, scaled_input)

print(f"Prediction: {result['prediction']:.2f}")
# The bounds are mean +/- 1 standard deviation of the per-tree predictions,
# so the label states exactly that instead of claiming a 95% interval.
print(f"Prediction range (mean ± 1 std across trees): {result['lower_bound']:.2f} to {result['upper_bound']:.2f}")
print(f"Standard Deviation: {result['std_dev']:.2f}")
import matplotlib.pyplot as plt

# Distribution of the individual tree predictions for this listing.
data = result['all_pred']

plt.hist(data, bins=10, edgecolor='black') 
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram of result[\'all_pred\']')
plt.show()
