# Importing

In [1]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

# for loading the data pre-processing functions from a file
import importlib.util

In [2]:
# Keep NumPy's default float printing (scientific notation is not suppressed)
np.set_printoptions(suppress=False)

# Run everything from the project root so the relative paths used below resolve.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
work_dir = r'C:\Users\krasavica\Desktop\Projekty - DS\python-project-ApartmentPriceAnalysis'
os.chdir(work_dir)

# Display pandas floats with two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Load the project's pre-processing helpers from a file path.
# importlib is used instead of a plain 'import' statement because the file
# name contains a hyphen and therefore is not a valid Python identifier.
module_name = "pipeline_pre-processing"
module_path = "pipeline_pre-processing.py"

# Build a module spec from the file, create the module object and execute it;
# afterwards its functions are reachable as attributes of 'module'.
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

# Functions for training the final model
Before modeling, the data were processed as follows:
1. categorical features were one-hot encoded - **one_hot_encode_train_encoder**
2. all features were scaled with z-score normalization - **scale_train_scaler**
3. missing data was filled using kNN imputation - **kNN_impute_train_imputer**
4. all the steps above were combined, and the fitted encoder, scaler and imputer were saved - **prepare_final_training_data**
5. additional data modification to improve model predictions for low-cost and high-priced housing groups (additional explanation in model_choosing_analysis file) - **modify_train_data**

### one_hot_encode

In [4]:
def one_hot_encode_train_encoder(X_to_encode, features_to_onehotencode):

    """
    One-hot encodes the requested categorical columns of a training dataset and
    returns the encoded DataFrame together with the fitted encoders.

    A separate sklearn 'OneHotEncoder' is fitted for every listed column:
    - The reference (dropped) category is the first entry of a fixed priority list
      ('nie podano', 'inny', 'wtórny', 'prywatny', 'opolskie') that occurs in the column.
    - If none of them occurs, the encoder falls back to drop='first'.
    - Encoders are created with handle_unknown='ignore' and dense output.
    - The new indicator columns are appended to the right of the existing columns
      and the original categorical columns are removed at the end.

    Parameters:
    -----------
    X_to_encode : pd.DataFrame
        Input DataFrame containing the categorical columns to encode. It is not modified.

    features_to_onehotencode : list of str
        Names of the columns in `X_to_encode` that should be one-hot encoded.

    Returns:
    --------
    X : pd.DataFrame
        Copy of the input where each listed column is replaced by its indicator columns
        (the row index of the input is preserved).

    encoders : dict
        Maps each encoded column name to the OneHotEncoder fitted on that column,
        so the same encoding can be re-applied to new data.
    """

    # Candidate reference categories, in priority order (identical for every column)
    reference_candidates = ('nie podano', 'inny', 'wtórny', 'prywatny', 'opolskie')

    # Work on a copy so the caller's DataFrame stays untouched
    encoded_frame = X_to_encode.copy()

    # Fitted encoder per source column
    fitted_encoders = {}

    for feature in features_to_onehotencode:
        # Distinct values present in this column
        observed_values = encoded_frame[feature].unique()

        # First candidate that actually occurs in the column, if any
        reference = next(
            (candidate for candidate in reference_candidates if candidate in observed_values),
            None,
        )
        if reference is None:
            drop_setting = 'first'
        else:
            drop_setting = [reference]

        # Fit the encoder on this single column and encode it in one step
        one_hot = OneHotEncoder(drop=drop_setting, sparse_output=False, handle_unknown='ignore')
        indicator_values = one_hot.fit_transform(encoded_frame[[feature]])

        # Wrap the indicator matrix in a DataFrame aligned with the input rows
        indicator_frame = pd.DataFrame(
            indicator_values,
            columns=one_hot.get_feature_names_out([feature]),
            index=encoded_frame.index,
        )

        # Append the indicator columns and remember the encoder
        encoded_frame = pd.concat([encoded_frame, indicator_frame], axis=1)
        fitted_encoders[feature] = one_hot

    # Remove the original categorical columns now that they are encoded
    encoded_frame = encoded_frame.drop(columns=features_to_onehotencode)

    return encoded_frame, fitted_encoders

### scale

In [5]:
def scale_train_scaler(X_to_scale):

    """
    Applies standard scaling (zero mean, unit variance) to the input dataset and returns
    both the scaled dataset and the fitted scaler for future use (e.g., for test data).

    The function performs the following steps:
    - Copies the input DataFrame to avoid modifying the original.
    - Fits a 'StandardScaler' from 'sklearn.preprocessing' to the data.
    - Applies the transformation and returns the scaled data along with the scaler.

    Parameters:
    -----------
    X_to_scale : pd.DataFrame
        The DataFrame containing numerical features to be scaled.

    Returns:
    --------
    X : pd.DataFrame
        The scaled version of the input DataFrame, with the same column names
        and the same row index as the input.

    scaler : StandardScaler
        The fitted 'StandardScaler' object, which can be used to transform new data
        (e.g., test set) using the same scaling parameters.
    """

    # Create copy of the original dataset to avoid modifying it in-place
    X = X_to_scale.copy()

    # Create the scaler
    scaler = StandardScaler()

    # fit_transform returns a bare array, so both the column names and the row
    # index are restored explicitly. Without index=..., the result would get a
    # fresh 0..n-1 index and no longer share row labels with the target series.
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    return X, scaler

### kNN_impute

In [6]:
def kNN_impute_train_imputer(X_to_impute, features_for_kNN_impute, features_to_onehotencode):

    """
    Performs K-Nearest Neighbors (KNN) imputation on selected features of a dataset,
    including those that have been one-hot encoded.

    The function:
    - Identifies features that were originally categorical and have been one-hot encoded.
    - Replaces these original variables in the list of features to impute with their
      one-hot encoded column names (columns named '<feature>_<category>').
    - Fits a 'KNNImputer' (default settings) on the specified features.
    - Returns the dataset with imputed values and a dictionary containing the fitted
      imputer and the actual list of features used.

    The resulting feature list is deterministic: features that were not one-hot
    encoded come first (in their original order), followed by the one-hot columns
    of the encoded features, taken in the order given by `features_for_kNN_impute`.

    Parameters:
    -----------
    X_to_impute : pd.DataFrame
        The input dataset (typically already encoded and scaled) with missing values to be imputed.
        It is not modified.

    features_for_kNN_impute : list of str
        List of original features intended for imputation.

    features_to_onehotencode : list of str
        List of features that were one-hot encoded (to help expand into multiple columns).

    Returns:
    --------
    X : pd.DataFrame
        The dataset with imputed values for the specified features.

    imputer_dict : dict
        A dictionary containing:
            - 'imputer' : the fitted 'KNNImputer' object
            - 'features': the actual list of column names used for imputation,
                          including one-hot encoded columns.
    """

    # Create copy of the original dataset to avoid modifying it in-place
    X = X_to_impute.copy()

    # Set for fast membership checks; it is never iterated, so the order of the
    # result does not depend on set ordering.
    onehot_sources = set(features_to_onehotencode)

    plain_features = []      # features used as-is
    expanded_columns = []    # one-hot columns standing in for encoded features

    for feature in features_for_kNN_impute:
        if feature not in onehot_sources:
            plain_features.append(feature)
            continue

        # One-hot columns are named '<feature>_<category>'. Matching on the
        # separator avoids capturing unrelated columns that merely share a
        # prefix (e.g. 'floor' vs 'floors_total'). A column equal to the feature
        # name itself is still accepted, in case it was left unencoded.
        # NOTE: if one encoded feature name is itself '<other feature>_...',
        # its columns still match the shorter feature as well.
        prefix = f"{feature}_"
        expanded_columns.extend(
            col for col in X.columns
            if col == feature or (isinstance(col, str) and col.startswith(prefix))
        )

    features_for_kNN_impute_new = plain_features + expanded_columns

    # Create the imputer
    imputer = KNNImputer()

    # Fit and apply imputation only on the selected columns
    X[features_for_kNN_impute_new] = imputer.fit_transform(X[features_for_kNN_impute_new])

    # Return the transformed data and the fitted imputer with the column names used
    imputer_dict = {'imputer': imputer,
                    'features': features_for_kNN_impute_new}

    return X, imputer_dict

### prepare_final_training_data

In [7]:
def prepare_final_training_data(data_to_train, to_save = True):

    """
    Prepares the final training dataset by applying a complete preprocessing pipeline,
    including one-hot encoding, scaling, and KNN imputation.

    This function:
    - Loads the modeling configuration (feature lists and target name) from
      '1. Data Preparation/model_features_dict.joblib'.
    - Prints a warning if some of the configured features are missing from the input
      (processing continues afterwards).
    - Applies preprocessing steps in the following order:
        1. Cleaning via 'module.cleaning_data(..., train_dataset=True)'
           (described in the original notes as handling missing and outlier values).
        2. One-hot encoding of categorical variables.
        3. Feature standardization (z-score scaling).
        4. Missing data imputation using K-Nearest Neighbors.
    - Optionally saves the fitted preprocessing objects (encoders, scaler, imputer) for future use.

    Parameters:
    -----------
    data_to_train : pd.DataFrame
        The raw input dataset containing both features and the target variable.
        It is not modified.

    to_save : bool, default=True
        If True, saves the fitted encoders, scaler, and imputer to disk under the
        folder 'production_pipeline_objects' (created if it does not exist yet).

    Returns:
    --------
    X_imputed : pd.DataFrame
        The fully preprocessed feature matrix ready for training.

    y : pd.Series
        The target variable extracted from the cleaned data
        (a Series when the configured target is a single column name).
    """

    # Load the list of features and settings used for preprocessing
    model_features_dict = joblib.load("1. Data Preparation/model_features_dict.joblib")

    features_to_use = model_features_dict['features_to_use']
    target = model_features_dict['target']
    features_to_onehotencode = model_features_dict['features_to_onehotencode']
    features_for_kNN_impute = model_features_dict['features_for_kNN_impute']

    # Check whether all required features are present in the data (warning only)
    if not all(x in data_to_train.columns for x in features_to_use):
        print("The dataset does not have all the variables defined for modeling")

    # Copy the dataset to avoid in-place changes
    data = data_to_train.copy()

    # Clean the data with the training-specific settings of the pre-processing module
    data = module.cleaning_data(data, train_dataset = True)

    # Ensure compatibility with np.nan
    data = data.replace({pd.NA: np.nan})

    # Separate input features and target
    X = data[features_to_use]
    y = data[target]

    # One-hot encode categorical features (returns a dict of encoders, one per column)
    X_encoded, encoders = one_hot_encode_train_encoder(X, features_to_onehotencode)

    # Scale features using StandardScaler
    X_scaled, scaler = scale_train_scaler(X_encoded)

    # Impute missing values using KNN imputer
    X_imputed, imputer = kNN_impute_train_imputer(X_scaled, features_for_kNN_impute, features_to_onehotencode)

    # Optionally save preprocessing objects to disk for future use
    if to_save:
        # joblib.dump does not create missing directories
        os.makedirs("production_pipeline_objects", exist_ok=True)
        joblib.dump(encoders, "production_pipeline_objects/encoder.pkl")
        joblib.dump(scaler, "production_pipeline_objects/scaler.pkl")
        joblib.dump(imputer, "production_pipeline_objects/imputer.pkl")

    return X_imputed, y

## modify_train_data

In [8]:
def modify_train_data(X_train, y_train):

    """
    Rescale the training target for low- and high-valued observations using the
    thresholds and multipliers stored in
    '2. Model Building/final_modification_settings.joblib'.

    The settings file provides four sequences:
    - 'lower_thresholds' / 'lower_wages': target values strictly below the i-th
      lower threshold are multiplied by the i-th lower multiplier.
    - 'upper_thresholds' / 'upper_wages': target values strictly above the i-th
      upper threshold are multiplied by the i-th upper multiplier.

    Thresholds are processed one after another on the same working copy, lower
    ones first, so each comparison sees the values as already rescaled by the
    preceding steps.

    Parameters
    ----------
    X_train : pandas.DataFrame or numpy.ndarray
        Feature matrix used for training.

    y_train : pandas.Series or numpy.ndarray
        Target vector used for training. It is not modified.

    Returns
    -------
    X_train : unchanged
        The input features passed through without modification.

    y_train_copy : pandas.Series or numpy.ndarray
        A rescaled copy of `y_train`.

    """
    settings = joblib.load("2. Model Building/final_modification_settings.joblib")

    lower_thresholds = settings["lower_thresholds"]
    lower_wages = settings["lower_wages"]
    upper_thresholds = settings["upper_thresholds"]
    upper_wages = settings["upper_wages"]

    # Work on a copy so the caller's target stays untouched
    adjusted_target = y_train.copy()

    # Lower groups: rescale everything strictly below each cutoff
    for position, cutoff in enumerate(lower_thresholds):
        is_below = adjusted_target < cutoff
        adjusted_target[is_below] = adjusted_target[is_below] * lower_wages[position]

    # Upper groups: rescale everything strictly above each cutoff
    for position, cutoff in enumerate(upper_thresholds):
        is_above = adjusted_target > cutoff
        adjusted_target[is_above] = adjusted_target[is_above] * upper_wages[position]

    return X_train, adjusted_target

# Training final model and saving pipeline elements for production

In [9]:
# Read the raw data; the first CSV column is used as the row index
data_initial = pd.read_csv('data_2024-01.csv', index_col = 0)
# Initial transformation provided by the pre-processing module.
# NOTE(review): the meaning of the positional True flag is defined in
# pipeline_pre-processing.py - confirm there.
data_for_analysis = module.preliminary_transform(data_initial, True)

All the categorizations occurring in the set in multi-vector selection variables were coded.


In [10]:
# Build the preprocessed training set; to_save=True also writes the fitted
# encoders, scaler and imputer to disk for the production pipeline
X, y = prepare_final_training_data(data_for_analysis, to_save = True)

In [12]:
# Training the final model with the parameters chosen in the model_choosing_analysis file.
# Despite its name, 'model_params' is fitted directly below, so the loaded object
# is expected to be an unfitted estimator exposing .fit() - TODO confirm.
model_params = joblib.load("2. Model Building/final_model_params.joblib")

# Rescale the target for the low/high price groups (features are returned unchanged)
X_modified, y_modified = modify_train_data(X, y)

trained_model = model_params.fit(X_modified, y_modified)

In [None]:
# Saving the trained model
joblib.dump(trained_model, "production_pipeline_objects/trained_model.joblib")

['production_pipeline_objects/trained_model.joblib']

In [28]:
# Saving the base value for the SHAP report: the mean of the original target y
# (not the rescaled y_modified used for fitting)
shap_base_value = np.mean(y, axis=None)
shap_base_value_df = pd.DataFrame([shap_base_value],
                                   columns = ["base_value"])
# NOTE: CSV content is written to a file name without an extension
shap_base_value_df.to_csv("production_pipeline_objects/shap_base_value")

In [None]:
# Saving the transformed data for testing the SHAP report
X.to_csv('3. Feature importance and report/feature_imp_data_X.csv')
# NOTE(review): pd.DataFrame(y, columns=['price']) selects by name, so this
# assumes the target series is named 'price' - confirm against the feature config.
pd.DataFrame(y, columns = ['price']).to_csv('3. Feature importance and report/feature_imp_data_y.csv')