In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib
import locale

In [16]:
# Load the cleaned listings table. `price` is the regression target; it and
# the `source` column are excluded from the feature matrix.
df = pd.read_excel('../../../Data/Cleaned_Combined.xlsx')

X = df.drop(columns=['price', 'source'])
y = df['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below can neither trigger
# pandas' SettingWithCopyWarning nor write through to `df`.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse it for the test rows, so
# no test-set statistics leak into the preprocessing.
numeric_cols = ['area', 'num_of_rooms']
scaler = MinMaxScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Persist the fitted scaler so the identical transformation can be applied to
# new rows at prediction time.
joblib.dump(scaler, 'min_max_scaler.pkl')

['min_max_scaler.pkl']

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
import joblib
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Candidate regressors and the hyper-parameter grid searched for each one.
# Every estimator gets a fixed seed (the same value used for the train/test
# split) because the grids include row/column subsampling, which is stochastic;
# without a seed the tuning results change from run to run.
all_models = {
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            'learning_rate': [0.1, 0.05, 0.01],
            'n_estimators': [300],
            'max_depth': [3, 4, 5],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'subsample': [0.5, 0.8, 1.0]
        }
    },
    "XGBoost": {
        "model": XGBRegressor(random_state=42),
        "params": {
            'learning_rate': [0.1, 0.05, 0.01],
            'n_estimators': [300],
            'max_depth': [3, 4, 5],
            'colsample_bytree': [0.3, 0.7],
            'subsample': [0.5, 0.8],
            'gamma': [0, 0.1, 0.2]
        }
    },
    "CatBoost": {
        # verbose=0 silences CatBoost's per-iteration training log, which would
        # otherwise be printed for every fit performed by the grid search.
        "model": CatBoostRegressor(random_state=42, verbose=0),
        "params": {
            'learning_rate': [0.1, 0.05, 0.01],
            'n_estimators': [300],
            'depth': [3, 4, 5],
            'colsample_bylevel': [0.3, 0.7],
            'subsample': [0.5, 0.8],
            'l2_leaf_reg': [1, 3, 5]
        }
    }
}


# best_models: model name -> best estimator refitted by the grid search.
# results: one dict per model with its test-set metrics and winning parameters.
best_models = {}
results = []

# Tune each candidate with 5-fold cross-validated grid search on the training
# split, then score the refitted best estimator once on the held-out test split.
for name, model_data in all_models.items():
    model = model_data["model"]
    params = model_data["params"]

    # The estimators are searched directly, without a scaling Pipeline around
    # them: the grids use bare estimator parameter names, which would not match
    # a Pipeline's `<step>__<param>` naming.
    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_absolute_error')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    y_pred = best_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r_squared = r2_score(y_test, y_pred)

    best_models[name] = best_model
    results.append({'Model': name, 'MAE': mae, 'R-squared': r_squared, 'Best Params': best_params})

    # Surface the metrics right away so the comparison is visible in the cell output.
    print(f"{name}: MAE={mae:,.2f}, R-squared={r_squared:.4f}, best params={best_params}")

# Persist every tuned estimator to the working directory, one pickle per
# model, named "<model name>_model.pkl".
for name, model in best_models.items():
    output_path = f"{name}_model.pkl"
    joblib.dump(model, output_path)
    print(f"Saved {name} model to {output_path}")

In [5]:
# NOTE(review): the ',' format spec used for the prices below is
# locale-independent, so this call does not appear to affect this cell's
# output — confirm nothing else relies on it before removing.
locale.setlocale(locale.LC_ALL, ('en_US', 'UTF-8'))

# A single hand-built listing used as a smoke test for the saved models.
# NOTE(review): the column names and their order presumably mirror the
# training feature matrix — verify against the columns of X_train.
example_values = {
    'area': [100],
    'num_of_rooms': [2],
    'district_Almazar': [1],
    'district_Bektemir': [0],
    'district_Chilanzar': [0],
    'district_Mirabad': [0],
    'district_Mirzo-Ulugbek': [0],
    'district_Sergerli': [0],
    'district_Shaikhantaur': [0],
    'district_Uchtepa': [0],
    'district_Yakkasaray': [0],
    'district_Yangihayat': [0],
    'district_Yashnabad': [0],
    'district_Yunusabad': [0],
    'renovation_Euro': [0],
    'renovation_Normal': [0],
    'renovation_Required': [1],
    'type_apartment': [1],
    'type_house': [0],
    'building_type_Primary market': [0],
    'building_type_Secondary market': [1]
}

example_df = pd.DataFrame(example_values)

# The models were trained on MinMax-scaled `area` / `num_of_rooms`, so the
# scaler saved at training time must be applied here as well; feeding raw
# values would put these two features outside the range the models saw.
# NOTE: joblib.load unpickles arbitrary objects — only load files this
# notebook produced itself.
scaler = joblib.load('min_max_scaler.pkl')
example_df[['area', 'num_of_rooms']] = scaler.transform(example_df[['area', 'num_of_rooms']])

# Display label -> pickle written by the training cell, which saves each model
# as "<name>_model.pkl" (so Gradient Boosting's file name contains a space).
model_paths = {
    'GradientBoosting': 'Gradient Boosting_model.pkl',
    'XGBoost': 'XGBoost_model.pkl',
    'CatBoost': 'CatBoost_model.pkl'
}
loaded_models = {label: joblib.load(path) for label, path in model_paths.items()}

# Predictions are divided by this constant before printing.
# NOTE(review): presumably a currency conversion rate — TODO confirm and document the units.
PRICE_DIVISOR = 12_500

for name, model in loaded_models.items():
    predictions = model.predict(example_df)

    # Round to whole units and use a space as the thousands separator.
    predictions_formatted = [f'{pred / PRICE_DIVISOR:,.0f}'.replace(',', ' ') for pred in predictions]
    print(f"Predictions using {name} model:")
    for pred in predictions_formatted:
        print(pred)

FileNotFoundError: [Errno 2] No such file or directory: 'Gradient Boosting_model.pkl'