In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from functools import partial

In [None]:
# Constants
# Location of the training CSV inside the Kaggle input directory
data_file_path = "/kaggle/input/home-data-for-ml-course/train.csv"
# Fraction of all rows held out as the test split
test_size = 0.2
# Fraction of the remaining (non-test) rows held out as the validation split
val_size = 0.2
# Seed shared by both train/test splits and by the random forests
random_state = 0
# Columns whose fraction of missing values is >= this value are dropped in preprocessing
missing_columns_drop_threshold = 0.5
# Ask scikit-learn transformers to return pandas DataFrames rather than NumPy arrays
set_config(transform_output="pandas")

In [None]:
# Load data
# Read the training CSV at `data_file_path` into the DataFrame `df`
df = pd.read_csv(data_file_path)

In [None]:
# Distinct dtypes present across the columns of df
print("Unique data types:")
unique_dtype_names = list(map(str, np.unique(df.dtypes.values)))
print(unique_dtype_names)

# Numeric columns only
df_num = df.select_dtypes(include=["number"])

# Everything that is not numeric
df_obj = df.select_dtypes(exclude=["number"])

In [None]:
# View sample data: the first rows of the training frame
df.head()

In [None]:
# Basic EDA: summary statistics as produced by DataFrame.describe()
df.describe()

In [None]:
# View null counts and data type per column
def get_df_info(df, missing_only=False):
    """
    Summarise completeness and dtype for every column of a DataFrame
    :param df: DataFrame to inspect
    :param missing_only: when True, keep only the columns that contain at least one null
    :return: DataFrame with one row per column of `df` and the columns "column",
        "non_null_count", "pct_missing" (fraction of rows that are null) and
        "data_type", sorted ascending by "non_null_count"
    """
    n_rows = df.shape[0]
    null_mask = df.isnull()
    summary = pd.DataFrame({
        "column": df.columns,
        "non_null_count": df.notnull().sum(),
        "pct_missing": null_mask.sum() / n_rows,
        "data_type": df.dtypes,
    })
    # Replace the column-name index with positions, then put the emptiest columns first
    summary = summary.reset_index(drop=True)
    summary = summary.sort_values(by=["non_null_count"])
    if not missing_only:
        return summary
    has_missing = summary["non_null_count"] < n_rows
    return summary[has_missing]

# Print the full (untruncated) summary of the columns that contain at least one null
print(get_df_info(df, missing_only=True).to_string())

# # View data type and counts of nulls
# df.info()

In [None]:
# Number of rows with missing values
def get_rows_missing(df):
    """
    Count the missing values in every row of a DataFrame
    :param df: DataFrame
    :return: Series indexed by the row labels of `df` holding the number of null
        cells in each row, sorted from most to fewest missing
    """
    missing_per_row = df.isnull().sum(axis=1)
    return missing_per_row.sort_values(ascending=False)
# Print the per-row missing-value counts for every row of the training frame
print(get_rows_missing(df).to_string())


In [None]:
# Target and features
target = "SalePrice"
y = df[target]

# Create X
# Every column except the target, in the original column order.
# NOTE: `set(df.columns) - set(target)` does not work here: set("SalePrice") is
# the set of its characters, so the target column would stay in the feature list
# (target leakage), and set ordering would make the column order differ between
# kernel sessions.
features = [column for column in df.columns if column != target]

# Select columns corresponding to features
X = df[features]

# Columns treated as continuous/count values; everything else is encoded as categorical.
# The bedroom/kitchen counts are named "BedroomAbvGr"/"KitchenAbvGr" in the CSV.
numerical_features = [
    "LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd", "MasVnrArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath",
    "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
    "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF",
    "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea",
    "MiscVal", "MoSold", "YrSold",
]

# Guard against typos: a name that is not a real column would otherwise only
# surface later as a KeyError when the columns are selected for scaling.
unknown_numerical_features = [column for column in numerical_features if column not in features]
if unknown_numerical_features:
    print(f"Ignoring numerical features not present in the data: {unknown_numerical_features}")
    numerical_features = [column for column in numerical_features if column in features]

# Remaining features, kept in column order so the result is reproducible
categorical_features = [column for column in features if column not in set(numerical_features)]

In [None]:
# Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=random_state)
print(X_train.shape)

In [None]:
# Handle missing values
def drop_columns_with_missing_values(df: pd.DataFrame, threshold=None, fit=False):
    """
    Function to drop columns with missing values above a certain threshold
    :param df: DataFrame
    :param threshold: float, fraction of missing values (0-1) at or above which a
        column is dropped; None drops every column that has any missing value
    :param fit: when True, decide the columns to drop from `df` and remember them
        in `drop_columns_with_missing_values.dropped_columns`. When False and a
        previous fit exists, the remembered columns are dropped (and `threshold`
        is ignored) so that every split ends up with the same columns; when False
        and nothing was fitted yet, the columns are decided from `df` without
        being remembered.
    :return: DataFrame with columns dropped
    """
    fitted_columns = getattr(drop_columns_with_missing_values, "dropped_columns", None)
    if not fit and fitted_columns is not None:
        # Transform mode: reuse the decision made on the fitting data. Deciding
        # again from this frame could drop a different set of columns.
        present = [column for column in fitted_columns if column in df.columns]
        return df.drop(columns=present)

    # Fraction of missing values per column
    pct_missing = df.isnull().mean()
    if threshold is None:
        to_drop = pct_missing > 0
    else:
        to_drop = pct_missing >= threshold
    dropped_columns = pct_missing.index[to_drop].tolist()

    if fit:
        drop_columns_with_missing_values.dropped_columns = dropped_columns
    return df.drop(columns=dropped_columns)

def impute_fill(df: pd.DataFrame):
    """
    Replace missing values with the constant 0
    :param df: DataFrame
    :return: output of SimpleImputer.fit_transform on `df`; the imputer that was
        used is kept in `impute_fill.imputer`
    """
    # A fresh imputer is created (and fitted) on every call
    impute_fill.imputer = SimpleImputer(strategy="constant", fill_value=0)
    return impute_fill.imputer.fit_transform(df)

def encode_ordinal(df: pd.DataFrame, fit: bool = False, categorical_features=None):
    """
    Ordinal-encode the categorical columns of a DataFrame
    :param df: DataFrame
    :param fit: when True, fit a new encoder on `df` and remember it in
        `encode_ordinal.encoder` (with the encoded column names in
        `encode_ordinal.encoded_columns`); when False, reuse the remembered encoder
    :param categorical_features: iterable of candidate column names; only those
        present in `df` and not already numeric are encoded. None means every
        non-numeric column of `df`. Only used when fitting.
    :return: output of the fitted ColumnTransformer: encoded columns followed by
        the remaining columns passed through, all under their original names
    :raises RuntimeError: if called with fit=False before any fit
    """
    df = df.copy()
    if fit:
        candidates = df.columns if categorical_features is None else categorical_features
        wanted = set(candidates)
        # Safeguard: keep only requested columns that exist, as a list in column
        # order (pandas does not accept a set as an indexer)
        present = [column for column in df.columns if column in wanted]
        # Exclude already-encoded (numeric) columns
        encoded_columns = df[present].select_dtypes(exclude=["number"]).columns.tolist()
    else:
        if not hasattr(encode_ordinal, "encoder"):
            raise RuntimeError("encode_ordinal must be called with fit=True before it can transform")
        # Encode exactly the columns chosen at fit time
        encoded_columns = encode_ordinal.encoded_columns

    # Uniform string type so mixed str/number categories can be ordered
    if encoded_columns:
        df[encoded_columns] = df[encoded_columns].astype(str)

    if fit:
        # Categories unseen during fitting are mapped to -1 instead of raising
        encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        ct = ColumnTransformer(
            transformers=[("encoder", encoder, encoded_columns)],
            remainder="passthrough",
            # Keep the original column names (no "encoder__"/"remainder__"
            # prefixes) so later cells can still select columns by name
            verbose_feature_names_out=False,
        )
        ct.fit(df)
        encode_ordinal.encoder = ct
        encode_ordinal.encoded_columns = encoded_columns
    else:
        ct = encode_ordinal.encoder
    return ct.transform(df)

def preproc(df: pd.DataFrame, fit: bool = False):
    """
    Run the preprocessing steps on a DataFrame, in order:
    drop_columns_with_missing_values (with missing_columns_drop_threshold),
    impute_fill, then encode_ordinal (with categorical_features)
    :param df: DataFrame of raw features
    :param fit: forwarded to the drop and encode steps; pass True for the data
        the preprocessing should be fitted on and False for any other split
    :return: the output of the last step
    """
    steps = [
        # Drop columns with more missing values than missing_columns_drop_threshold
        partial(drop_columns_with_missing_values, threshold=missing_columns_drop_threshold, fit=fit),

        # Impute missing values (takes no fit flag)
        impute_fill,

        # Encode categorical features
        partial(encode_ordinal, categorical_features=categorical_features, fit=fit),
    ]

    for step in steps:
        df = step(df)
    return df

# Fit the preprocessing steps on the training split and preview the result
X_train_preproc = preproc(X_train, fit=True)
X_train_preproc.head()


In [None]:
# Standardization/normalization
# Scale only the numerical columns that still exist after preprocessing: a column
# dropped for having too many missing values would otherwise raise a KeyError.
scale_columns = [column for column in numerical_features if column in X_train_preproc.columns]
# The scaler is fitted on the training split only (use MinMaxScaler() here for
# min-max normalization instead).
scaler = StandardScaler().fit(X_train_preproc[scale_columns])
# NOTE: this overwrites X_train_preproc in place; re-run the preprocessing cell
# before re-running this one, otherwise the scaler is refitted on scaled data.
X_train_preproc[scale_columns] = scaler.transform(X_train_preproc[scale_columns])
X_train_preproc.columns


In [None]:
# Model definition and training
mdl = RandomForestRegressor(random_state=random_state)
mdl.fit(X_train_preproc, y_train)

# Score the baseline on the validation split. The validation features go through
# the preprocessing and scaler that were fitted on the training split.
# (`preproc_funcs` is a local variable of preproc() and cannot be looped over here.)
X_val_preproc = preproc(X_val)
X_val_preproc[scale_columns] = scaler.transform(X_val_preproc[scale_columns])
mae = mean_absolute_error(y_val, mdl.predict(X_val_preproc))
print(mae)

In [None]:
# Hyperparameter tuning
# The raw splits still contain strings and missing values, so the forests are
# trained and scored on the preprocessed features instead.
X_val_preproc = preproc(X_val)
X_val_preproc[scale_columns] = scaler.transform(X_val_preproc[scale_columns])

maes = []
# Candidate forest sizes: 1, 2, 4, ..., 2**14
n_trees_search = (2 ** np.arange(15))
for n_trees in n_trees_search:
    print(f"Training for n_trees = {n_trees}")
    mdl = RandomForestRegressor(n_estimators=int(n_trees), random_state=random_state)
    mdl.fit(X_train_preproc, y_train)
    mae = mean_absolute_error(y_val, mdl.predict(X_val_preproc))
    maes.append(mae)

In [None]:
# Hyperparameter selection
# Report the validation MAE of every candidate, then keep the best one
for candidate, mae in zip(n_trees_search, maes):
    print(f"n_tree = {candidate}, mae = {mae}")
mae_min = min(maes)
best_position = maes.index(mae_min)
n_trees = n_trees_search[best_position]
print(f"best = {n_trees}, mae = {mae_min}")

In [None]:
# Retraining with best hyperparameter, and using the validation set as well
# Both splits must be preprocessed (the raw frames contain strings and missing
# values); the validation split reuses the transformations fitted on training.
X_val_preproc = preproc(X_val)
X_val_preproc[scale_columns] = scaler.transform(X_val_preproc[scale_columns])
X_train2 = pd.concat((X_train_preproc, X_val_preproc))
y_train2 = pd.concat((y_train, y_val))
mdl = RandomForestRegressor(n_estimators=n_trees, random_state=random_state)
mdl.fit(X_train2, y_train2)

In [None]:
# Estimate accuracy on data set not used for training
# The hold-out features go through the same fitted preprocessing and scaler as
# the data the model was trained on.
X_test_preproc = preproc(X_test)
X_test_preproc[scale_columns] = scaler.transform(X_test_preproc[scale_columns])
mae = mean_absolute_error(y_test, mdl.predict(X_test_preproc))
print(mae)

In [None]:
# Retrain on all data to prepare for submission
# The model expects preprocessed features, so the full feature frame is passed
# through the already-fitted preprocessing and scaler before refitting.
X_all_preproc = preproc(X)
X_all_preproc[scale_columns] = scaler.transform(X_all_preproc[scale_columns])
mdl.fit(X_all_preproc, y)

In [None]:
# Load the competition test data and predict
test_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")
# Use a separate name so the hold-out split `X_test` is not overwritten, and apply
# the same fitted preprocessing and scaler the model was trained with.
X_submission = preproc(test_data[features])
X_submission[scale_columns] = scaler.transform(X_submission[scale_columns])
test_preds = mdl.predict(X_submission)

In [None]:
# Pair every test Id with its predicted price and write the submission file
submission_columns = {"Id": test_data["Id"], "SalePrice": test_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)