Academic Integrity and Learning Statement

By submitting my work, I confirm that:

1. The code, analysis, and documentation in this notebook are my own work and reflect my own understanding.
2. I am prepared to explain all code and analysis included in this submission.

If I used assistance (e.g., AI tools, tutors, or other resources), I have:

- Clearly documented where and how external tools or resources were used in my solution.
- Included a copy of the interaction (e.g., AI conversation or tutoring notes) in an appendix.

I acknowledge that:

- I may be asked to explain any part of my code or analysis during evaluation.
- Misrepresenting assisted work as my own constitutes academic dishonesty and undermines my learning.

In [1]:
import numpy as np
import os
import pandas as pd
import multiprocessing
import subprocess

import comet_ml, mlflow
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import optuna
import tensorflow as tf
import torch
import platform
from joblib import load

In [2]:
# Enable auto-reload extension
%load_ext autoreload
# Automatically reload all modules before executing code
%autoreload 2

In [3]:
import base_utils_logging
import proj_utils_data_loader
import proj_configs
import proj_utils
import proj_utils_feat_engg
import proj_utils_plots
import proj_utils_model

In [4]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

In [5]:
# Check software specs
# Report the interpreter this kernel is actually running. Shelling out to
# `python --version` would instead describe whichever `python` is first on PATH,
# which can be a different installation from the one executing the notebook.
dict_sw_version = {
    'python': f'Python {platform.python_version()}',
    'numpy': np.__version__,
    'pandas': pd.__version__,
    'optuna': optuna.__version__,
    'mlflow': mlflow.__version__,
}

for key, value in dict_sw_version.items():
    print(f'{proj_utils_plots.beautify(key, 1)} version is: {proj_utils_plots.beautify(value)}')


[1m[32mpython[0m version is: [1m[35mPython 3.11.13[0m
[1m[32mnumpy[0m version is: [1m[35m1.26.4[0m
[1m[32mpandas[0m version is: [1m[35m2.2.3[0m
[1m[32moptuna[0m version is: [1m[35m4.4.0[0m
[1m[32mmlflow[0m version is: [1m[35m2.18.0[0m


In [6]:
# Check hardware specs
def get_mac_gpu_info():
    """Return the display/GPU report produced by ``system_profiler``.

    Runs ``system_profiler SPDisplaysDataType`` and returns its standard output.
    This is a best-effort diagnostic: rather than raising, a failure (the tool is
    not installed, it exits with a non-zero status, or it times out) is reported
    as a string starting with ``"Error getting GPU info:"``.

    Returns:
        str: The profiler report on success, otherwise an error description.
    """
    try:
        result = subprocess.run(
            ['system_profiler', 'SPDisplaysDataType'],
            capture_output=True,
            text=True,
            timeout=30,  # do not let a hung profiler block the notebook
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing executable; SubprocessError covers the timeout
        return f"Error getting GPU info: {e}"

    # A non-zero exit status means stdout cannot be trusted; surface stderr instead
    if result.returncode != 0:
        detail = (result.stderr or '').strip() or f"exit status {result.returncode}"
        return f"Error getting GPU info: {detail}"
    return result.stdout

# Check CPU cores
print(f'CPU cores available to use: {proj_utils_plots.beautify(str(multiprocessing.cpu_count()))}')

# Check MPS availability
print("TensorFlow GPU devices:", proj_utils_plots.beautify(tf.config.list_physical_devices('GPU')))
print(f"Processor: {proj_utils_plots.beautify(platform.processor())}")
print(f"Machine: {proj_utils_plots.beautify(platform.machine())}")

print("PyTorch MPS (Metal) Status:")
print(f"MPS available: {proj_utils_plots.beautify(torch.backends.mps.is_available())}")
print(f"MPS built: {proj_utils_plots.beautify(str(torch.backends.mps.is_built()))}")

# Get detailed GPU information
print("\nDetailed GPU Information:")
print(get_mac_gpu_info())

CPU cores available to use: [1m[35m10[0m
TensorFlow GPU devices: [1m[35m[][0m
Processor: [1m[35marm[0m
Machine: [1m[35marm64[0m
PyTorch MPS (Metal) Status:
MPS available: [1m[35mTrue[0m
MPS built: [1m[35mTrue[0m

Detailed GPU Information:
Graphics/Displays:

    Apple M4:

      Chipset Model: Apple M4
      Type: GPU
      Bus: Built-In
      Total Number of Cores: 10
      Vendor: Apple (0x106b)
      Metal Support: Metal 3
      Displays:
        DELL U3425WE:
          Resolution: 3440 x 1440 (UWQHD - Ultra-Wide Quad HD)
          UI Looks like: 3440 x 1440 @ 100.00Hz
          Main Display: Yes
          Mirror: Off
          Online: Yes
          Rotation: Supported




In [7]:
base_utils_logging.setup_logging()

In [8]:
base_utils_logging.logger.info('Starting the application')

In [9]:
df_raw_train = proj_utils_data_loader.load_data(proj_configs.TRAIN_FILE)
df_raw_test = proj_utils_data_loader.load_data(proj_configs.TEST_FILE)
df_raw_train.shape, df_raw_test.shape

((1460, 81), (1459, 80))

In [10]:
df_raw_train.sample(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
826,827,45,RM,50.0,6130,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2008,WD,Normal,109500
528,529,30,RL,58.0,9098,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2007,WD,Normal,86000
557,558,50,C (all),60.0,11040,Pave,,Reg,Low,AllPub,...,0,,,,0,9,2006,COD,Normal,108000


In [11]:
# Collect the object-typed columns of the raw training data that have
# fewer than 10 distinct (non-null) values.
low_cardinality_cols = []
for cname in df_raw_train.columns:
    column = df_raw_train[cname]
    if column.dtype == "object" and column.nunique() < 10:
        low_cardinality_cols.append(cname)

In [13]:
len(low_cardinality_cols)

40

In [14]:
# TODO: uncomment and comment below insignificant col list
# insignificant_cols = ['Order', 'PID']
insignificant_cols = ['Id']
target_col = 'SalePrice'
ignorables_cols = insignificant_cols + [target_col]
ordinal_cols = ['LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
temporal_cols_name_pattern = ['Yr', 'Year']

In [15]:
df_raw_all, df_raw_target = proj_utils_data_loader.merge_train_test_data(df_raw_train, df_raw_test, insignificant_cols, target_col)
df_raw_all.shape, df_raw_target.shape

((2919, 80), (1460,))

In [16]:
# Split the merged data frame back into its train and test records.
# The 'is_train' marker column is dropped by name so the result does not depend on
# that column happening to be the last one in df_raw_all.
df_train = df_raw_all[df_raw_all['is_train'] == 1].drop(columns=['is_train'])
df_test = df_raw_all[df_raw_all['is_train'] == 0].drop(columns=['is_train'])

In [17]:
n_cat_cardinality_threshold = proj_configs.CATEGORICAL_CARDINALITY_THRESHOLD_ABS
threshold_type = 'ABS'
feature_categories = proj_utils_feat_engg.classify_columns(df=df_train, n_cat_threshold=n_cat_cardinality_threshold, threshold_type=threshold_type, cols_to_ignore=ignorables_cols, temporal_cols_name_pattern=temporal_cols_name_pattern, ordinal_cols=ordinal_cols)

In [18]:
# Unpack the per-category column lists and their counts from the classification result
(
    cols_num_continuous, n_num_continuous,
    cols_num_discrete, n_num_discrete,
    cols_cat_nominal, n_cat_nominal,
    cols_cat_ordinal, n_cat_ordinal,
    cols_object, n_object,
    cols_temporal, n_temporal,
    cols_binary, n_binary,
) = proj_utils_feat_engg.get_cols_as_tuple(feature_categories)

# Number of columns that should have been classified: every column of df_train except
# the ignorable ones that are actually present in it (subtracting len(ignorables_cols)
# blindly under-counts when those columns were already removed upstream).
n_total = sum(col not in ignorables_cols for col in df_train.columns)

# Number of columns that were actually assigned to a category
n_classified = (
    n_num_continuous + n_num_discrete + n_cat_nominal + n_cat_ordinal
    + n_object + n_temporal + n_binary
)
# Compare two independently derived numbers; comparing n_total with the expression
# that defined it could never report a problem.
has_inconsistency = n_classified != n_total

print("=" * 80)
print(f"Total raw columns = {proj_utils_plots.beautify(str(len(df_train.columns)))} \nNumerical Continuous = {proj_utils_plots.beautify(n_num_continuous)} \nNumerical Discrete = {proj_utils_plots.beautify(n_num_discrete)} \nCategorical Nominal = {proj_utils_plots.beautify(n_cat_nominal)} \nCategorical Ordinal = {proj_utils_plots.beautify(n_cat_ordinal)} \nObject/String = {proj_utils_plots.beautify(n_object)} \nTemporal = {proj_utils_plots.beautify(n_temporal)} \nBinary = {proj_utils_plots.beautify(n_binary)}")

print("=" * 80)
print(f"Any inconsistencies detected?[True/False] = {proj_utils_plots.beautify('True', 3) if has_inconsistency else proj_utils_plots.beautify('False', 1)}")
print("=" * 80)

Total raw columns = [1m[35m79[0m 
Numerical Continuous = [1m[35m20[0m 
Numerical Discrete = [1m[35m10[0m 
Categorical Nominal = [1m[35m17[0m 
Categorical Ordinal = [1m[35m18[0m 
Object/String = [1m[35m7[0m 
Temporal = [1m[35m4[0m 
Binary = [1m[35m3[0m
Any inconsistencies detected?[True/False] = [1m[32mFalse[0m


In [None]:
# Calculate the number of NaN values for each column
nan_counts = df_train.isna().sum()

# Keep only the columns that contain NaNs, ordered from most to fewest missing values
cols_with_nans = nan_counts[nan_counts > 0].sort_values(ascending=False).index.tolist()
print(f"Columns with NaNs: = {proj_utils_plots.beautify(str(len(cols_with_nans)))}/{proj_utils_plots.beautify(n_total)}")
print(f"And they are: {cols_with_nans}")

In [None]:
df_cardinality = proj_utils_feat_engg.get_cardinality_df(df_train)

In [None]:
df_cardinality

In [None]:
proj_utils_plots.plot_cardinality(df_cardinality, n_cat_cardinality_threshold, threshold_used=threshold_type, type_of_cols='all', figsize=(20, 6))

In [None]:
# Zoom into cols having NaNs only
proj_utils_plots.plot_cardinality(df_cardinality[df_cardinality['col_name'].isin(cols_with_nans)], n_cat_cardinality_threshold, threshold_used=threshold_type, type_of_cols="NaN", figsize=(10, 6))

In [None]:
df_train[cols_num_continuous].isnull().sum().sort_values(ascending=False)

In [None]:
# Creating a copy of the raw data to impute missing values for plotting purposes only (as NaNs are not plotted)
df_imputed_for_plots = df_train.copy()
df_imputed_for_plots[cols_num_continuous] = df_imputed_for_plots[cols_num_continuous].fillna(0)
most_frequent = df_imputed_for_plots[cols_num_discrete].mode().iloc[0]
df_imputed_for_plots[cols_num_discrete] = df_imputed_for_plots[cols_num_discrete].fillna(most_frequent)

In [None]:
df_imputed_for_plots

In [None]:
df_raw_target

In [None]:
df_imputed_for_plots_v2 = pd.concat([df_train[cols_num_continuous], df_raw_target], axis=1)

In [None]:
df_imputed_for_plots_v2.sample(2)

In [None]:
# correlation_plot = plot_correlation_with_demand(df, save_path="correlation_plot.png")
correlation_plot = proj_utils_plots.plot_correlation_with_target(df_imputed_for_plots_v2, target_col)

In [None]:
proj_utils_plots.plot_numerical_distribution(df_imputed_for_plots, cols_num_continuous)

In [None]:
proj_utils_plots.plot_categorical_distribution(df_imputed_for_plots, cols_cat_nominal)

In [None]:
proj_utils_plots.plot_categorical_distribution(df_imputed_for_plots, cols_cat_ordinal)

In [None]:
proj_utils_plots.plot_relationship_to_target(df_imputed_for_plots, cols_num_discrete, target_col)

In [None]:
proj_utils_plots.plot_relationship_to_target(df_imputed_for_plots, cols_num_discrete, target_col, trend_type='median')

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    df_raw_target,
    test_size=proj_configs.VALIDATION_SIZE,
    random_state=proj_configs.RANDOM_STATE
)

In [None]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [None]:
num_columns = cols_num_continuous
cat_columns = cols_cat_nominal + cols_cat_ordinal + cols_num_discrete + cols_binary + cols_object
tempo_columns = cols_temporal

In [None]:
len(num_columns), len(cat_columns), len(tempo_columns)

In [None]:
pproc_pipe = proj_utils_feat_engg.create_pproc_pipeline(num_columns, cat_columns, tempo_columns)

In [None]:
# Login to mlflow
# mlflow.login()
# proj_utils_model.set_mlflow_uri("databricks")
# mlflow_experiment_name = f"/Users/asheesh.ambardar@live.com/{proj_configs.PROJECT_NAME}"
# mlflow_experiment_id = proj_utils_model.get_or_create_experiment(mlflow_experiment_name)
# proj_utils_model.set_mlflow_experiment(mlflow_experiment_name)
# model_uri = mlflow.get_artifact_uri(artefact_path)
# model_uri

In [None]:
comet_experiment = comet_ml.Experiment()
# comet_experiment.set_name(proj_configs.PROJECT_NAME)

In [None]:
# Verify connection
if comet_experiment.api_key:
    print("Successfully connected to Comet ML!")
else:
    print("Failed to connect to Comet ML")


In [None]:
import warnings, logging
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)
logging.getLogger("mlflow").setLevel(logging.ERROR)

In [None]:
X_train_transformed = pproc_pipe.fit_transform(X_train)

In [None]:
X_val_transformed = pproc_pipe.transform(X_val)

In [None]:
y_train_transformed = y_train.to_numpy()
y_val_transformed = y_val.to_numpy()

In [None]:
# Check for both NaN and None
has_nulls_or_nans = pd.isna(X_train_transformed).any()
print(f"Contains null or NaN values: {has_nulls_or_nans}")

In [None]:
type(y_val)

In [None]:
# NOTE(review): `comet_experiment` was already bound to a comet_ml.Experiment() in an
# earlier cell, and no end() call on that first object appears before this rebinding.
# Confirm whether the earlier experiment should be reused here or ended first.
comet_experiment = comet_ml.Experiment()
run_name='xgb-10'
try:
    # Run the project's XGB hyperparameter search on the transformed train/validation
    # arrays; the returned study object is consumed by later cells.
    optimised_study_xgb = proj_utils_model.run_hyperparam_tuning_xgb_exp(X_train_transformed, y_train_transformed, X_val_transformed, y_val_transformed, comet_experiment, run_name, proj_configs.OPTUNA_TRIAL_COUNT)
finally:
    # Always close the Comet experiment, even when the tuning call raises
    comet_experiment.end()

In [None]:
import os
import pickle

def safe_load_model(model_path):
    """Load a pickled model from ``model_path``.

    Args:
        model_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or ``None`` when the file exists but cannot be
        unpickled (a diagnostic message is printed in that case).

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")

    try:
        # SECURITY: pickle.load can execute arbitrary code while unpickling;
        # only load model files that come from a trusted source.
        with open(model_path, 'rb') as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError, KeyError) as e:
        # Truncated/corrupted files surface as UnpicklingError/EOFError/IndexError/KeyError;
        # a library version mismatch as AttributeError or ImportError (which also
        # covers ModuleNotFoundError).
        print(f"Error loading model: {str(e)}")
        print("This might be due to version mismatch or corrupted file")
        return None

# Load the model
model_path = f"{proj_configs.PATH_OUT_MODELS}xgb_model.pkl"
loaded_model = safe_load_model(model_path)

if loaded_model is not None:
    print("Model loaded successfully")

In [None]:
loaded_model

In [None]:
df_test = df_raw_all[df_raw_all['is_train']==0].iloc[:,:-1]

In [None]:
df_test.sample(3)

In [None]:
data_test_transformed = pproc_pipe.transform(df_test)
type(data_test_transformed)

In [None]:
# Check for both NaN and None
has_nulls_or_nans = pd.isna(data_test_transformed).any()
print(f"Contains null or NaN values: {has_nulls_or_nans}")

In [None]:
data_train_transformed = pproc_pipe.transform(df_train)
type(data_train_transformed)

In [None]:
# Check for both NaN and None
has_nulls_or_nans = pd.isna(data_train_transformed).any()
print(f"Contains null or NaN values: {has_nulls_or_nans}")

In [None]:
df_raw_target.shape

In [None]:
data_target = df_raw_target.to_numpy()

In [None]:
loaded_model.fit(data_train_transformed, data_target)

In [None]:
train_preds = loaded_model.predict(data_train_transformed)
train_actuals = data_target

In [None]:
type(train_preds)

In [None]:
# Evaluate the model
train_mse = round(mean_squared_error(train_actuals, train_preds), 5)
train_r2 = round(r2_score(train_actuals, train_preds), 5)

print("=== Model Performance ===")
print(f"Train MSE: {proj_utils_plots.beautify(train_mse)}, Train R2: {proj_utils_plots.beautify(train_r2)}")

In [None]:
test_preds = loaded_model.predict(data_test_transformed)

In [None]:
test_preds

In [None]:
df_my_submission = pd.DataFrame({'Id': df_raw_test.Id, 'SalePrice': test_preds})
my_submission_file = f"{proj_configs.PATH_OUT_SUBMISSIONS}submission.csv"
df_my_submission.to_csv(my_submission_file, index=False)

In [None]:
# this is a Trial object, not the underlying ML object.
best_performing_trial = optimised_study_xgb.best_trial
print(f'Best trial was at number {proj_utils_plots.beautify(str(best_performing_trial.number), 1)} with params as:\n {proj_utils_plots.beautify(str(best_performing_trial.params), 2)}')
print(f'Best score value is: {proj_utils_plots.beautify(str(best_performing_trial.value))}')

In [None]:
best_performing_trial

In [None]:
study_full_metrics = optimised_study_xgb.trials_dataframe()
# save the metrics to a file
proj_utils_model.save_hyperparams(f'full_metrics_{proj_utils.get_current_timestamp()}.csv', proj_configs.PATH_OUT_MODELS, study_full_metrics)

# peek at the full metrics dataframe
study_full_metrics

In [None]:
# Count how many trials Optuna ran for each model type recorded in 'params_model'
trial_model_types = study_full_metrics['params_model']
num_lr_trials = study_full_metrics[trial_model_types == 'lr'].shape[0]
num_lasso_trials = study_full_metrics[trial_model_types == 'lasso'].shape[0]
num_ridge_trials = study_full_metrics[trial_model_types == 'ridge'].shape[0]
num_elasticnet_trials = study_full_metrics[trial_model_types == 'elasticnet'].shape[0]

# Build the report one line at a time, then print it in a single call
num_counted_trials = num_lr_trials + num_lasso_trials + num_ridge_trials + num_elasticnet_trials
trial_summary_lines = [
    f'Total trials = {proj_utils_plots.beautify(str(num_counted_trials), 1)}',
    f'-- LR trials = {proj_utils_plots.beautify(str(num_lr_trials), 1)}',
    f'-- Lasso trials = {proj_utils_plots.beautify(str(num_lasso_trials), 1)}',
    f'-- Ridge trials = {proj_utils_plots.beautify(str(num_ridge_trials), 1)}',
    f'-- ElasticNet trials = {proj_utils_plots.beautify(str(num_elasticnet_trials), 1)}',
]
print('\n'.join(trial_summary_lines))

In [None]:
# retrieve all performance values for each model type studied
grp_by_model_type_val = study_full_metrics.groupby('params_model')['value'].apply(list)
# retrieve the best value per model type (use nsmallest if the Optuna objective was to minimise, nlargest if it was to maximise)
grp_by_model_type_best_val = study_full_metrics.groupby('params_model')['value'].nsmallest(1)
# display the stats
grp_by_model_type_best_val

In [None]:
# group the Optuna metrics by model type and use idxmin (idxmax if the objective was maximised) to find the row with the best model performance (value) for each group
study_best_model_group = study_full_metrics.loc[study_full_metrics.groupby('params_model')['value'].idxmin()]

In [None]:
study_best_model_group

In [None]:
# retrieve the trial number of the best model for each model type - the Optuna metrics dataframe index and trial number are the same.
# Each lookup filters study_best_model_group on 'params_model' and takes the first
# 'number'; `.values[0]` raises IndexError when that model type has no row.
best_lr_trial = study_best_model_group[study_best_model_group['params_model'] == 'lr']['number'].values[0]
best_lasso_trial = study_best_model_group[study_best_model_group['params_model'] == 'lasso']['number'].values[0]
best_ridge_trial = study_best_model_group[study_best_model_group['params_model'] == 'ridge']['number'].values[0]
best_elasticnet_trial = study_best_model_group[study_best_model_group['params_model'] == 'elasticnet']['number'].values[0]

# NOTE(review): `models` is not assigned in any cell visible in this notebook. It is
# indexed by trial number and the result exposes `named_steps`, so it presumably maps
# trial number -> pipeline — confirm where it is meant to come from.
final_pipe_best_lr = models[best_lr_trial]
# The estimator is pulled out of each pipeline via its 'regressor' step
best_model_lr = final_pipe_best_lr.named_steps['regressor']
final_pipe_best_lasso = models[best_lasso_trial]
best_model_lasso = final_pipe_best_lasso.named_steps['regressor']
final_pipe_best_ridge = models[best_ridge_trial]
best_model_ridge = final_pipe_best_ridge.named_steps['regressor']
final_pipe_best_elasticnet = models[best_elasticnet_trial]
best_model_elasticnet = final_pipe_best_elasticnet.named_steps['regressor']

# retrieve the best model object (amongst all model types evaluated)
# best_performing_trial is the study's best trial, taken from optimised_study_xgb.best_trial
final_pipe_best = models[best_performing_trial.number]
best_model = final_pipe_best.named_steps['regressor']

In [None]:
final_pipe_best

In [None]:
final_pipe_best.fit(X_train, y_train)

In [None]:
cols_final_inputs, cols_final_output_features = proj_utils_feat_engg.get_final_features(final_pipe_best, X_train)

In [None]:
proj_utils_model.save_features(f'pproc_final_input_cols_{len(cols_final_inputs)}_{proj_utils.get_current_timestamp()}.csv', proj_configs.PATH_OUT_FEATURES, pd.DataFrame(cols_final_inputs))
proj_utils_model.save_features(f'pproc_final_output_features_{len(cols_final_output_features)}_{proj_utils.get_current_timestamp()}.csv', proj_configs.PATH_OUT_FEATURES, pd.DataFrame(cols_final_output_features))

In [None]:
proj_utils_model.save_model(f'final_pipe_{proj_utils.get_current_timestamp()}.pkl', proj_configs.PATH_OUT_MODELS, final_pipe_best)

In [None]:
y_train_preds = final_pipe_best.predict(X_train)
y_val_preds = final_pipe_best.predict(X_val)

In [None]:
# Evaluate the model
# Use the built-in round(), as the earlier training-set evaluation cell does: it works
# whether the metric comes back as a plain Python float or a NumPy scalar, whereas the
# `.round()` method only exists on the latter.
train_mse = round(mean_squared_error(y_train, y_train_preds), 5)
val_mse = round(mean_squared_error(y_val, y_val_preds), 5)
train_r2 = round(r2_score(y_train, y_train_preds), 5)
val_r2 = round(r2_score(y_val, y_val_preds), 5)

print("=== Model Performance ===")
print(f"Train MSE: {proj_utils_plots.beautify(train_mse)}, Train R2: {proj_utils_plots.beautify(train_r2)}")
print(f"Validation MSE: {proj_utils_plots.beautify(val_mse)}, Validation R2: {proj_utils_plots.beautify(val_r2)}")

In [None]:
string_to_log = f'=== Model Performance === \n Train MSE: {train_mse}, Train R2: {train_r2} \n Validation MSE: {val_mse}, Validation R2: {val_r2}'
proj_utils.save_file('metrics', 'validation_metrics.txt', proj_configs.PATH_OUT_MODELS, string_to_log)