In [1]:
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split        # Split data
from sklearn.ensemble import RandomForestRegressor          # First model that will be tested (random forest)
from xgboost import XGBRegressor                            # Second model to test (Gradient Boosting Regressor)
%matplotlib inline

# Default size (in inches) for every figure drawn in this notebook
plt.rcParams['figure.figsize'] = (10, 7)

In [2]:
# Read the labelled data (train.csv, still containing SalePrice) and the
# unlabelled competition data (test.csv) from the working directory
X_train, X_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Hold out 10% of the labelled rows as a validation set; the fixed seed keeps
# the split reproducible between runs
train_part, val_part = train_test_split(
    X_train,
    test_size=0.1,
    random_state=42,
)
X_train, X_val = train_part, val_part

In [4]:
# Preview the first rows of the training split (SalePrice is still a column here)
X_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
907,908,50,RL,86.0,11500,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,250000
782,783,20,RL,67.0,16285,Pave,,IR2,Lvl,AllPub,...,0,,,,0,6,2009,WD,Normal,187100
952,953,85,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,4,2009,WD,Normal,133900
620,621,30,RL,45.0,8248,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,67000
669,670,30,RL,80.0,11600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,137500


In [5]:
# Preview the last rows of the training split
X_train.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1095,1096,20,RL,78.0,9317,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2007,WD,Normal,176432
1130,1131,50,RL,65.0,7804,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,12,2009,WD,Normal,135000
1294,1295,20,RL,60.0,8172,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Normal,115000
860,861,50,RL,55.0,7642,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,6,2007,WD,Normal,189950
1126,1127,120,RL,53.0,3684,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2009,WD,Normal,174000


In [6]:
# Separate the training target (SalePrice) from the training features.
# The guard makes this cell safe to re-run: once SalePrice has been removed,
# a second execution keeps the existing X_train_target instead of raising
# KeyError. The frame is rebound rather than mutated with inplace=True.
if 'SalePrice' in X_train.columns:
    X_train_target = X_train['SalePrice']
    X_train = X_train.drop(columns='SalePrice')

In [7]:
# Separate the validation target (SalePrice) from the validation features.
# The guard makes this cell safe to re-run: once SalePrice has been removed,
# a second execution keeps the existing X_val_target instead of raising
# KeyError. The frame is rebound rather than mutated with inplace=True.
if 'SalePrice' in X_val.columns:
    X_val_target = X_val['SalePrice']
    X_val = X_val.drop(columns='SalePrice')

In [8]:
# Inspect the training target Series (the SalePrice values)
X_train_target

907     250000
782     187100
952     133900
620      67000
669     137500
         ...  
1095    176432
1130    135000
1294    115000
860     189950
1126    174000
Name: SalePrice, Length: 1314, dtype: int64

In [9]:
# Display the training features after the SalePrice column has been removed
X_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
907,908,50,RL,86.0,11500,Pave,,IR1,Lvl,AllPub,...,322,0,,,,0,6,2006,WD,Normal
782,783,20,RL,67.0,16285,Pave,,IR2,Lvl,AllPub,...,0,0,,,,0,6,2009,WD,Normal
952,953,85,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2009,WD,Normal
620,621,30,RL,45.0,8248,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
669,670,30,RL,80.0,11600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,7,2006,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1096,20,RL,78.0,9317,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2007,WD,Normal
1130,1131,50,RL,65.0,7804,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,12,2009,WD,Normal
1294,1295,20,RL,60.0,8172,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Normal
860,861,50,RL,55.0,7642,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,,0,6,2007,WD,Normal


In [10]:
# Display the validation features after the SalePrice column has been removed
X_val

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
892,893,20,RL,70.0,8414,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2006,WD,Normal
1105,1106,60,RL,98.0,12256,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
413,414,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,,,,0,3,2010,WD,Normal
522,523,50,RM,50.0,5000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,10,2006,WD,Normal
1036,1037,20,RL,89.0,12898,Pave,,IR1,HLS,AllPub,...,0,0,,,,0,9,2009,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,527,20,RL,70.0,13300,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2007,WD,Normal
101,102,60,RL,77.0,9206,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
1092,1093,50,RL,60.0,8400,Pave,,Reg,Bnk,AllPub,...,0,0,,,,0,6,2008,WD,Normal
411,412,190,RL,100.0,34650,Pave,,Reg,Bnk,AllPub,...,0,0,,,,0,1,2006,WD,Normal


In [11]:
# Display the test features loaded from test.csv
X_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [12]:
# Count missing values per column in the training features
X_train.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      237
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [13]:
# Count missing values per column in the validation features
X_val.isna().sum()

Id                0
MSSubClass        0
MSZoning          0
LotFrontage      22
LotArea           0
                 ..
MiscVal           0
MoSold            0
YrSold            0
SaleType          0
SaleCondition     0
Length: 80, dtype: int64

In [14]:
# Count missing values per column in the test features
X_test.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [15]:
# Count missing values in the training target
X_train_target.isna().sum()

np.int64(0)

In [16]:
# Count missing values in the validation target
X_val_target.isna().sum()

np.int64(0)

In [17]:
# Partition the feature columns by dtype: numeric columns are imputed later,
# object (string) columns are one-hot encoded later.
# NOTE(review): 'Id' is numeric, so it lands in numeric_cols and is passed to
# the model as a feature; the submission cell also reads it back from X_test.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

In [18]:
# Show which columns were classified as numeric
numeric_cols

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

In [19]:
# Show which columns were classified as categorical (object dtype)
categorical_cols

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [20]:
from sklearn.impute import SimpleImputer

In [21]:
# Imputer for the numeric columns; 'mean' is SimpleImputer's default strategy,
# spelled out here so the fill rule is visible in the code
imputer = SimpleImputer(strategy='mean', copy=False)

In [22]:
# Fit the imputer on the numeric columns of the training split only, so the
# fill values are learned without looking at validation/test rows (no data leakage)
imputer.fit(X_train[numeric_cols])

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,False
,add_indicator,False
,keep_empty_features,False


In [23]:
# Fill the missing numeric values of every split with the statistics learned
# from the training split; each frame is updated in place
for split_df in (X_train, X_val, X_test):
    split_df[numeric_cols] = imputer.transform(split_df[numeric_cols])

In [24]:
# Check the per-column NaN counts of the numeric training columns after imputation
X_train[numeric_cols].isna().sum()

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [25]:
from sklearn.preprocessing import OneHotEncoder

In [26]:
# One-hot encoder for the categorical columns: dense output, no category
# dropped, unknown categories handled with the 'ignore' policy
enc = OneHotEncoder(
    sparse_output=False,
    drop=None,
    handle_unknown='ignore',
)

In [27]:
# Fit the encoder on the categorical columns of the training split only
enc.fit(X_train[categorical_cols])

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [28]:
# One-hot encode the categorical columns of each split with the fitted encoder
X_train_encoded, X_val_encoded, X_test_encoded = (
    enc.transform(split_df[categorical_cols])
    for split_df in (X_train, X_val, X_test)
)

# Names of the generated one-hot columns
encoded_col_names = enc.get_feature_names_out(categorical_cols)

# Wrap each encoded array in a DataFrame that carries its split's row index
X_train_encoded_df, X_val_encoded_df, X_test_encoded_df = (
    pd.DataFrame(encoded_arr, columns=encoded_col_names, index=split_df.index)
    for encoded_arr, split_df in (
        (X_train_encoded, X_train),
        (X_val_encoded, X_val),
        (X_test_encoded, X_test),
    )
)

# Rebuild every split as [numeric columns | one-hot columns]. Both halves get
# a fresh 0..n-1 index first, so the column-wise concat pairs rows by position.
X_train, X_val, X_test = (
    pd.concat(
        [
            split_df[numeric_cols].reset_index(drop=True),
            onehot_df.reset_index(drop=True),
        ],
        axis=1,
    )
    for split_df, onehot_df in (
        (X_train, X_train_encoded_df),
        (X_val, X_val_encoded_df),
        (X_test, X_test_encoded_df),
    )
)

In [29]:
# Display the training matrix after encoding (numeric columns followed by one-hot columns)
X_train

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,908.0,50.0,86.0,11500.0,7.0,7.0,1936.0,1987.0,0.0,223.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,783.0,20.0,67.0,16285.0,7.0,5.0,2001.0,2002.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,953.0,85.0,60.0,7200.0,5.0,8.0,1972.0,2003.0,0.0,660.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,621.0,30.0,45.0,8248.0,3.0,3.0,1914.0,1950.0,0.0,41.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,670.0,30.0,80.0,11600.0,4.0,5.0,1922.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,1096.0,20.0,78.0,9317.0,6.0,5.0,2006.0,2006.0,0.0,24.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1310,1131.0,50.0,65.0,7804.0,4.0,3.0,1928.0,1950.0,0.0,622.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1311,1295.0,20.0,60.0,8172.0,5.0,7.0,1955.0,1990.0,0.0,167.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1312,861.0,50.0,55.0,7642.0,7.0,8.0,1918.0,1998.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [30]:
# Display the validation matrix after encoding (numeric columns followed by one-hot columns)
X_val

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,893.0,20.0,70.0,8414.0,6.0,8.0,1963.0,2003.0,0.0,663.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1106.0,60.0,98.0,12256.0,8.0,5.0,1994.0,1995.0,362.0,1032.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,414.0,30.0,56.0,8960.0,5.0,6.0,1927.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,523.0,50.0,50.0,5000.0,6.0,7.0,1947.0,1950.0,0.0,399.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1037.0,20.0,89.0,12898.0,9.0,5.0,2007.0,2008.0,70.0,1022.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,527.0,20.0,70.0,13300.0,5.0,7.0,1956.0,2000.0,0.0,377.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
142,102.0,60.0,77.0,9206.0,6.0,5.0,1985.0,1985.0,336.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
143,1093.0,50.0,60.0,8400.0,6.0,5.0,1925.0,1950.0,0.0,423.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
144,412.0,190.0,100.0,34650.0,5.0,5.0,1955.0,1955.0,0.0,1056.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [31]:
from sklearn.metrics import mean_squared_error

# Loss function
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, passed to ``mean_squared_error`` with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


In [32]:
# Central tendency of the training target, as a yardstick for the RMSE values below
mean_price = X_train_target.mean()
median_price = X_train_target.median()
print("Mean house price:", mean_price)
print("Median house price:", median_price)

Mean house price: 180704.73439878234
Median house price: 164500.0


In [33]:
def try_rf_params(**params):
    """Train a random forest with the given hyperparameters and score it.

    The model is fitted on the notebook-level ``X_train`` / ``X_train_target``
    and evaluated with ``rmse`` on both the training split and the
    ``X_val`` / ``X_val_target`` validation split.

    Args:
        **params: Keyword arguments forwarded to ``RandomForestRegressor``.
            ``n_jobs`` defaults to -1 and ``random_state`` to 42; either may
            be overridden by passing it explicitly.

    Returns:
        dict with keys:
            "Model": the fitted ``RandomForestRegressor``.
            "Params": the keyword arguments exactly as passed by the caller.
            "Train RMSE" / "Val RMSE": RMSE on the training / validation split
                (lower is better).
            "Train Accuracy" / "Val Accuracy": the same RMSE values under
                their original key names, kept for backward compatibility.
                Despite the name, these are errors, not accuracies.
    """
    # Defaults go first so a caller-supplied n_jobs / random_state overrides
    # them instead of raising "got multiple values for keyword argument".
    rf_kwargs = {"n_jobs": -1, "random_state": 42, **params}

    rf = RandomForestRegressor(**rf_kwargs)  # Creates model
    rf.fit(X_train, X_train_target)          # Trains model

    train_preds = rf.predict(X_train)  # Training predictions
    val_preds = rf.predict(X_val)      # Validation predictions

    train_rmse = rmse(y_true=X_train_target, y_pred=train_preds)
    val_rmse = rmse(y_true=X_val_target, y_pred=val_preds)

    # Results
    return {
        "Model": rf,
        "Params": params,
        "Train Accuracy": train_rmse,
        "Val Accuracy": val_rmse,
        "Train RMSE": train_rmse,
        "Val RMSE": val_rmse,
    }


In [34]:
# Fit a 500-tree forest capped at depth 15, keep the fitted model for the
# submission step, and print its training and validation RMSE
rf_settings = {"n_estimators": 500, "max_depth": 15}
result = try_rf_params(**rf_settings)
model = result["Model"]
for score_key in ("Train Accuracy", "Val Accuracy"):
    print(result[score_key])

10944.123014857778
31177.80734100393


In [35]:
# Build and save the submission file
# Step 1: predict sale prices for the preprocessed test features
test_preds = model.predict(X_test)

# Step 2: pair every test Id with its prediction. Id became float during
# imputation, so it is cast back to int for the submission.
test_ids = X_test["Id"].astype(int)
submission = pd.DataFrame({"Id": test_ids, "SalePrice": test_preds})

# Step 3: write the CSV without the DataFrame index
submission.to_csv("submission.csv", index=False)

print("✅ Submission file 'submission.csv' is ready to go!")


✅ Submission file 'submission.csv' is ready to go!
