In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

In [30]:
# Load data house_price dataset
train_data = '../artifacts/train.csv'
test_data = '../artifacts/test.csv'

df_train = pd.read_csv(train_data)
df_test = pd.read_csv(test_data)

# df_house = pd.concat((df_train.loc[:,'MSSubClass':],
#                       df_test.loc[:,'MSSubClass':])).reset_index(drop=True)

In [31]:
# Check for missing values in both datasets
missing_train = df_train.isnull().sum()
missing_test = df_test.isnull().sum()

# Filter columns with missing values
missing_train = missing_train[missing_train > 0].sort_values(ascending=False)
missing_test = missing_test[missing_test > 0].sort_values(ascending=False)

# Display the missing values
missing_train, missing_test

(PoolQC          1453
 MiscFeature     1406
 Alley           1369
 Fence           1179
 MasVnrType       872
 FireplaceQu      690
 LotFrontage      259
 GarageType        81
 GarageYrBlt       81
 GarageFinish      81
 GarageQual        81
 GarageCond        81
 BsmtExposure      38
 BsmtFinType2      38
 BsmtQual          37
 BsmtCond          37
 BsmtFinType1      37
 MasVnrArea         8
 Electrical         1
 dtype: int64,
 PoolQC          1456
 MiscFeature     1408
 Alley           1352
 Fence           1169
 MasVnrType       894
 FireplaceQu      730
 LotFrontage      227
 GarageQual        78
 GarageCond        78
 GarageYrBlt       78
 GarageFinish      78
 GarageType        76
 BsmtCond          45
 BsmtQual          44
 BsmtExposure      44
 BsmtFinType1      42
 BsmtFinType2      42
 MasVnrArea        15
 MSZoning           4
 Functional         2
 BsmtFullBath       2
 Utilities          2
 BsmtHalfBath       2
 Exterior1st        1
 Exterior2nd        1
 TotalBsmtSF     

In [32]:
# Drop columns with more than 50% missing values in both datasets
columns_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
df_train.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

# Impute missing values for categorical columns with mode
categorical_columns = df_train.select_dtypes(include='object').columns
for col in categorical_columns:
    df_train[col].fillna(df_train[col].mode()[0], inplace=True)
    if col in df_test.columns:
        df_test[col].fillna(df_test[col].mode()[0], inplace=True)

# Impute missing values for numerical columns with median
numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_columns:
    df_train[col].fillna(df_train[col].median(), inplace=True)
    if col in df_test.columns:
        df_test[col].fillna(df_test[col].median(), inplace=True)

# Verify if all missing values are handled
missing_train_final = df_train.isnull().sum().sum()
missing_test_final = df_test.isnull().sum().sum()

missing_train_final, missing_test_final

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(df_test[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

(np.int64(0), np.int64(0))

In [33]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [34]:
corr_with_saleprice = df_train[numerical_columns].corr()["SalePrice"]
important_num_cols = list(corr_with_saleprice[(corr_with_saleprice > 0.50) | (corr_with_saleprice < -0.50)].index)
cat_cols = ["MSZoning", "Utilities","BldgType","Heating","KitchenQual","SaleCondition","LandSlope"]
important_cols = important_num_cols + cat_cols

df_train = df_train[important_cols]

In [35]:
df_train

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,SalePrice,MSZoning,Utilities,BldgType,Heating,KitchenQual,SaleCondition,LandSlope
0,7,2003,2003,856,856,1710,2,8,2,548,208500,RL,AllPub,1Fam,GasA,Gd,Normal,Gtl
1,6,1976,1976,1262,1262,1262,2,6,2,460,181500,RL,AllPub,1Fam,GasA,TA,Normal,Gtl
2,7,2001,2002,920,920,1786,2,6,2,608,223500,RL,AllPub,1Fam,GasA,Gd,Normal,Gtl
3,7,1915,1970,756,961,1717,1,7,3,642,140000,RL,AllPub,1Fam,GasA,Gd,Abnorml,Gtl
4,8,2000,2000,1145,1145,2198,2,9,3,836,250000,RL,AllPub,1Fam,GasA,Gd,Normal,Gtl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,1999,2000,953,953,1647,2,7,2,460,175000,RL,AllPub,1Fam,GasA,TA,Normal,Gtl
1456,6,1978,1988,1542,2073,2073,2,7,2,500,210000,RL,AllPub,1Fam,GasA,TA,Normal,Gtl
1457,7,1941,2006,1152,1188,2340,2,9,1,252,266500,RL,AllPub,1Fam,GasA,Gd,Normal,Gtl
1458,5,1950,1996,1078,1078,1078,1,5,1,240,142125,RL,AllPub,1Fam,GasA,Gd,Normal,Gtl


In [36]:
# Encode categorical variables using one-hot encoding
train_data_encoded = pd.get_dummies(df_train, drop_first=True)
test_data_encoded = pd.get_dummies(df_test, drop_first=True)

# Align the train and test datasets to ensure they have the same columns
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1)

# Fill any new NaN values in test data with 0 (caused by missing columns)
test_data_encoded.fillna(0, inplace=True)

# Normalize numerical columns
scaler = MinMaxScaler()
numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns.drop('SalePrice')
train_data_encoded[numerical_columns] = scaler.fit_transform(train_data_encoded[numerical_columns])
test_data_encoded[numerical_columns] = scaler.transform(test_data_encoded[numerical_columns])

train_data_encoded.shape, test_data_encoded.shape

((1460, 35), (1459, 35))

In [37]:
X = train_data_encoded.drop(columns='SalePrice')
y = train_data_encoded['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Model Configurations
models = {
    "linear_regression": LinearRegression(),
    "random_forest_regressor": RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=2, random_state=42),
    "gradient_boosting_regressor": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
}

In [39]:
# Set up MLflow experiment
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment("house_price_prediction")

# Train and Evaluate Models
results = {}
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        rmse = root_mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Log parameters, metrics, and model
        mlflow.log_param("model_name", name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        mlflow.sklearn.log_model(model, "model")
        
        print(f"Model: {name} | RMSE: {rmse:.2f} | MAE: {mae:.2f} | R²: {r2:.4f}")

# results



Model: linear_regression | RMSE: 37615.57 | MAE: 23567.89 | R²: 0.8155




Model: random_forest_regressor | RMSE: 28373.75 | MAE: 17846.55 | R²: 0.8950




Model: gradient_boosting_regressor | RMSE: 27599.47 | MAE: 17855.54 | R²: 0.9007


In [40]:
# Select the best model based on R²
best_model_name = max(models, key=lambda x: r2_score(y_test, models[x].predict(X_test)))
best_model = models[best_model_name]

best_model_name

'gradient_boosting_regressor'