In [121]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import set_config

In [122]:
# Constants
data_file_path = "/kaggle/input/home-data-for-ml-course/train.csv"
test_size = 0.2
val_size = 0.2
random_state = 0
set_config(transform_output="pandas")

In [108]:
# Load data
df = pd.read_csv(data_file_path)

In [145]:
# Unique data types of every column
print("Unique data types:")
print([str(x) for x in np.unique(df.dtypes.values)])

# Dataframe of all numeric types
df_num = df.select_dtypes(include=["number"])
numerical_columns = df_num.columns

# Dataframe of non-numerics
df_obj = df.select_dtypes(exclude=["number"])
categorical_columns = df_obj.columns

Unique data types:
['int64', 'float64', 'object']


In [110]:
# View sample data
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [111]:
# Basic EDA
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [112]:
# View null counts and data type per column
def get_df_info(df):
    """
    Function to get information about the dataframe
    :param df: DataFrame
    :return: DataFrame with column names, non-null counts, and data types
    """
    return pd.DataFrame({
        "column": df.columns,
        "non_null_count": df.notnull().sum(),
        "data_type": df.dtypes
    }).reset_index(drop=True).sort_values(by=["non_null_count"])
print(get_df_info(df).to_string())

# # View data type and counts of nulls
# df.info()

           column  non_null_count data_type
72         PoolQC               7    object
74    MiscFeature              54    object
6           Alley              91    object
73          Fence             281    object
57    FireplaceQu             770    object
3     LotFrontage            1201   float64
59    GarageYrBlt            1379   float64
58     GarageType            1379    object
60   GarageFinish            1379    object
63     GarageQual            1379    object
64     GarageCond            1379    object
32   BsmtExposure            1422    object
35   BsmtFinType2            1422    object
33   BsmtFinType1            1423    object
31       BsmtCond            1423    object
30       BsmtQual            1423    object
26     MasVnrArea            1452   float64
25     MasVnrType            1452    object
42     Electrical            1459    object
53    KitchenQual            1460    object
51   BedroomAbvGr            1460     int64
50       HalfBath            146

In [152]:
# Number of rows with missing values
def get_rows_missing(df):
    """
    Function to get the number of rows with missing values
    :param df: DataFrame
    :return: Series with counts of missing values per row
    """
    return df.isnull().sum(axis=1).sort_values(ascending=False)
print(get_rows_missing(df).to_string())


39      15
1011    15
1218    15
533     15
1179    14
705     14
520     14
1143    11
1035    11
1030    11
342     11
1321    11
375     11
287     11
386     10
646     10
102     10
1219    10
1323    10
1326    10
1232    10
125     10
1283    10
108     10
1137    10
582     10
1123    10
1257    10
464     10
749     10
1216    10
738     10
165     10
613     10
614     10
736     10
78      10
48      10
156     10
155     10
441     10
434     10
148     10
710     10
89      10
90      10
1096    10
1234    10
1407    10
826     10
970     10
1412    10
976     10
1090    10
984     10
868     10
259     10
960     10
528     10
1000    10
1009    10
843     10
241     10
535     10
1049    10
942     10
1450    10
894     10
897     10
954     10
1449    10
1453    10
1045    10
210     10
649      9
635      9
431      9
638      9
88       9
968      9
99       9
495      9
636      9
17       9
140      9
921      9
532      9
620      9
198      9
371      9
778      9

In [147]:
# Target and features
target = "SalePrice"
y = df["SalePrice"]

# Create X
features = list(set(df.columns) - set(target))

# Select columns corresponding to features, and preview the data
X = df[features]

In [148]:
# Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=random_state)
print(X_train.shape)

(934, 81)


In [149]:
# Handle missing values
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 934 entries, 21 to 1306
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PoolArea       934 non-null    int64  
 1   OverallQual    934 non-null    int64  
 2   BsmtExposure   912 non-null    object 
 3   GarageYrBlt    888 non-null    float64
 4   YearRemodAdd   934 non-null    int64  
 5   HalfBath       934 non-null    int64  
 6   ExterCond      934 non-null    object 
 7   Utilities      934 non-null    object 
 8   ScreenPorch    934 non-null    int64  
 9   SalePrice      934 non-null    int64  
 10  BsmtFinSF2     934 non-null    int64  
 11  BsmtHalfBath   934 non-null    int64  
 12  BsmtFullBath   934 non-null    int64  
 13  TotRmsAbvGrd   934 non-null    int64  
 14  YearBuilt      934 non-null    int64  
 15  1stFlrSF       934 non-null    int64  
 16  GrLivArea      934 non-null    int64  
 17  OverallCond    934 non-null    int64  
 18  Function

In [117]:
# Handle categorical variables


In [135]:
# Standardization/normalization
X_train_standardized = StandardScaler().fit_transform(X_train)
X_train_normalized = MinMaxScaler().fit_transform(X_train)

In [137]:
# Model definition and training
mdl = RandomForestRegressor(random_state=random_state)
mdl.fit(X_train_standardized, y_train)
mae = mean_absolute_error(y_val, mdl.predict(X_val))
print(mae)

193478.07264957263


In [138]:
# Hyperparameter tuning
maes = []
n_trees_search = (2 ** np.arange(15))
for n_trees in n_trees_search:
    print(f"Training for n_trees = {n_trees}")
    mdl = RandomForestRegressor(n_estimators=n_trees, random_state=random_state)
    mdl.fit(X_train, y_train)
    mae = mean_absolute_error(y_val, mdl.predict(X_val))
    maes.append(mae)

Training for n_trees = 1
Training for n_trees = 2
Training for n_trees = 4
Training for n_trees = 8
Training for n_trees = 16
Training for n_trees = 32
Training for n_trees = 64
Training for n_trees = 128
Training for n_trees = 256
Training for n_trees = 512
Training for n_trees = 1024
Training for n_trees = 2048
Training for n_trees = 4096
Training for n_trees = 8192
Training for n_trees = 16384


In [144]:
# Hyperparameter selection
for n, mae in enumerate(maes):
    print(f"n_tree = {n_trees_search[n]}, mae = {mae}")
mae_min = min(maes)
n_trees = n_trees_search[maes.index(mae_min)]
print(f"best = {n_trees}, mae = {mae_min}")

n_tree = 1, mae = 37192.4358974359
n_tree = 2, mae = 33837.166666666664
n_tree = 4, mae = 30528.865740740745
n_tree = 8, mae = 28741.17361111111
n_tree = 16, mae = 28206.12419871795
n_tree = 32, mae = 27855.69662571225
n_tree = 64, mae = 27092.072213955023
n_tree = 128, mae = 26997.49609454492
n_tree = 256, mae = 26999.50752990499
n_tree = 512, mae = 27031.098184608603
n_tree = 1024, mae = 26966.88907054076
n_tree = 2048, mae = 26940.89864895545
n_tree = 4096, mae = 26927.116361152293
n_tree = 8192, mae = 26913.70188750545
n_tree = 16384, mae = 26890.195217578545
best = 16384, mae = 26890.195217578545


In [140]:
# Retraining with best hyperparameter, and using the validation set as well
X_train2 = pd.concat((X_train, X_val))
y_train2 = pd.concat((y_train, y_val))
mdl = RandomForestRegressor(n_estimators=n_trees, random_state=random_state)
mdl.fit(X_train2, y_train2)

In [None]:
# Estimate accuracy on data set not used for training
mae = mean_absolute_error(y_test, mdl.predict(X_test))
print(mae)

23734.890281865366


In [None]:
# Retrain on all data to prepare for submission
mdl.fit(X, y)

In [None]:
# Load test data and fit
test_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")
X_test = test_data[features]
test_preds = mdl.predict(X_test)

In [None]:
submission = pd.DataFrame({"Id": test_data.Id, "SalePrice": test_preds})
submission.to_csv("submission.csv", index=False)