In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 4) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Observed target values.
        y_pred: Predicted target values, paired element-wise with ``y_test``.
        precision: Number of decimal places kept in the returned score.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)


In [2]:
import os
# Filesystem sanity check: confirm where the kernel is running and that the
# relative data paths used by the loading cell resolve from here.
data_dir = "../data"
train_csv = "../data/train.csv"
print("Notebook working dir:", os.getcwd())
print("Parent folder list:", os.listdir(".."))
print("Is ../data a dir:", os.path.isdir(data_dir))
print("Does ../data/train.csv exist:", os.path.exists(train_csv))


Notebook working dir: C:\Users\hp\dsp-bilal-razaghouru\notebooks
Parent folder list: ['.git', '.gitignore', '.ipynb_checkpoints', 'ave outputs\uf022', 'data', 'dsp-bilal-razaghouru', 'ershpdata-science-env', 'house-prices-modeling.ipynb', 'house_prices', 'model-industrialization-final.ipynb', 'model-industrialization-final.ipynb.ipynb', 'model-industrialization-final.ipynb.py', 'models', 'notebook.ipynb', 'notebooks', 'README.md', 'requirements.txt', 'requirments.txt', 'Untitled.ipynb']
Is ../data a dir: True
Does ../data/train.csv exist: True


In [7]:
import pandas as pd


# Load the raw train/test CSVs (paths are relative to the notebook's working
# directory) and report their dimensions as a quick sanity check.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")


Train shape: (1460, 81)
Test shape: (1459, 80)


In [8]:
# List columns by storage dtype. "Continuous" here simply means int64/float64,
# so the first list also contains the Id column and the SalePrice target.
numeric_dtypes = ['int64', 'float64']
print("Continuous features (numerical):")
print(list(train.select_dtypes(include=numeric_dtypes).columns))

# Columns stored as Python objects (strings) are treated as categorical.
print("\nCategorical features (object):")
print(list(train.select_dtypes(include=['object']).columns))


Continuous features (numerical):
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']

Categorical features (object):
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',

In [9]:
# Feature subset used for the model in this notebook:
# two numeric columns and two categorical columns.
cont_features = ["LotArea", "GrLivArea"]
cat_features = ["Neighborhood", "HouseStyle"]


In [10]:
# Count nulls per selected column to decide whether imputation is needed.
print("Missing values in continuous features:")
print(train[cont_features].isna().sum())

print("\nMissing values in categorical features:")
print(train[cat_features].isna().sum())


Missing values in continuous features:
LotArea      0
GrLivArea    0
dtype: int64

Missing values in categorical features:
Neighborhood    0
HouseStyle      0
dtype: int64


In [11]:
# Impute: per-column training median for numeric features, and an explicit
# 'MISSING' level for categorical ones.
cont_medians = train[cont_features].median()
train_cont = train[cont_features].fillna(cont_medians)
train_cat = train[cat_features].fillna('MISSING')



In [12]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns. fit_transform learns the scaling
# statistics from the training frame and applies them in one call; the fitted
# `scaler` stays available afterwards.
scaler = StandardScaler()
train_cont_scaled = pd.DataFrame(
    scaler.fit_transform(train_cont),
    index=train.index,
    columns=cont_features,
)

train_cont_scaled.head()


Unnamed: 0,LotArea,GrLivArea
0,-0.207142,0.370333
1,-0.091886,-0.482512
2,0.07348,0.515013
3,-0.096897,0.383659
4,0.375148,1.299326


In [13]:
# One-hot encode the categorical columns, keeping every level
# (drop_first is left at its default of False).
# NOTE(review): the indicator columns are derived from the training frame only;
# any other frame encoded the same way would need its columns aligned to these.
train_cat_encoded = pd.get_dummies(train_cat)

train_cat_encoded.head()


Unnamed: 0,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,...,Neighborhood_Timber,Neighborhood_Veenker,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [14]:
import pandas as pd

# Assemble the model matrix: scaled numeric columns first, then the one-hot
# indicator columns, placed side by side on the shared row index.
feature_blocks = [train_cont_scaled, train_cat_encoded]
X_train_processed = pd.concat(feature_blocks, axis="columns")
X_train_processed.head()


Unnamed: 0,LotArea,GrLivArea,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,...,Neighborhood_Timber,Neighborhood_Veenker,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,-0.207142,0.370333,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
1,-0.091886,-0.482512,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False
2,0.07348,0.515013,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,-0.096897,0.383659,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
4,0.375148,1.299326,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [15]:
# Regression target as a plain NumPy array.
y_train = train["SalePrice"].to_numpy()


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Train a 100-tree random forest on the full processed training set.
# random_state is fixed for reproducibility; n_jobs=-1 uses all available cores.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_processed, y_train)
rf


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
# In-sample predictions, floored at zero so no value passed on for scoring is negative.
y_pred = rf.predict(X_train_processed).clip(min=0)


In [18]:
from sklearn.metrics import mean_squared_log_error
import numpy as np

# NOTE(review): this re-defines compute_rmsle from the first cell (same logic,
# first parameter named y_true instead of y_test) and shadows it from here on.
def compute_rmsle(y_true, y_pred, precision=4):
    """Root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    The score is rounded to ``precision`` decimal places before being returned.
    """
    return round(np.sqrt(mean_squared_log_error(y_true, y_pred)), precision)

# Score the model on the same rows it was fitted on. This is an in-sample
# figure, so it is an optimistic estimate of generalisation error.
rmsle_score = compute_rmsle(y_train, y_pred)
print(f"RMSLE on training data: {rmsle_score}")


RMSLE on training data: 0.0797


In [19]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Hold out 20% of the processed training rows to estimate out-of-sample error.
# NOTE(review): the scaler and the one-hot columns were fitted on the full
# training frame before this split, so the validation rows influenced the
# preprocessing; move the split ahead of preprocessing for a strict estimate.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_processed, y_train, test_size=0.2, random_state=42
)

# Fit an unfitted copy with the same hyperparameters instead of re-fitting `rf`
# in place: `rf` was trained on the full training set in an earlier cell, and
# overwriting it here would silently change what every other cell using `rf`
# sees, depending on execution order.
rf_val = clone(rf)
rf_val.fit(X_tr, y_tr)

# Floor predictions at zero before computing the log-based metric.
y_val_pred = np.maximum(0, rf_val.predict(X_val))
compute_rmsle(y_val, y_val_pred)


np.float64(0.1957)