In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import set_config

In [None]:
# Configuration shared by the cells below
# Location of the training CSV.
data_file_path = "/kaggle/input/home-data-for-ml-course/train.csv"

# Seed handed to every train/test split and every model in this notebook.
random_state = 0

# Fractions passed to train_test_split: test_size for the first split,
# val_size for the second split of what remains after the first.
test_size = 0.2
val_size = 0.2

# Threshold on a column's non-null share used when selecting columns to drop.
missing_columns_drop_threshold = 0.5

# Ask scikit-learn transformers for pandas output.
set_config(transform_output="pandas")

In [None]:
# Load data
# Read the training CSV named by data_file_path into the working DataFrame `df`,
# which every later cell derives its features and target from.
df = pd.read_csv(data_file_path)

In [None]:
# Unique data types of every column, reported by name.
# De-duplicating the dtype *names* (plain strings) avoids handing raw dtype
# objects to np.unique, which has to sort them; ordering between dtype objects
# is not well-defined in general (e.g. for pandas extension dtypes).
print("Unique data types:")
print(sorted({str(dtype) for dtype in df.dtypes}))

# Dataframe of all numeric types
df_num = df.select_dtypes(include=["number"])

# Dataframe of non-numerics
df_obj = df.select_dtypes(exclude=["number"])

In [None]:
# View sample data
# The bare expression on the cell's last line is what the notebook displays.
df.head()

In [None]:
# Basic EDA
# Summary statistics of the raw DataFrame, displayed as the cell's output.
df.describe()

In [None]:
# View null counts and data type per column
def get_df_info(df, missing_only=False):
    """
    Summarise each column of a dataframe: name, non-null count and dtype.

    :param df: DataFrame to inspect
    :param missing_only: when True, keep only the columns that have at least
        one missing value (non-null count below the number of rows)
    :return: DataFrame with columns "column", "non_null_count" and
        "data_type", ordered by ascending non-null count; the index holds
        each column's original position in ``df``
    """
    non_null_per_column = df.notna().sum()
    summary = pd.DataFrame(
        {
            "column": df.columns,
            "non_null_count": non_null_per_column,
            "data_type": df.dtypes,
        }
    )
    # Swap the column-name index for positions before ordering by completeness.
    summary = summary.reset_index(drop=True)
    summary = summary.sort_values(by=["non_null_count"])

    if not missing_only:
        return summary

    has_missing = summary["non_null_count"] < len(df)
    return summary[has_missing]

# Print the summary restricted to columns that contain missing values,
# rendered as one plain-text table.
print(get_df_info(df, missing_only=True).to_string())

# Built-in alternative that lists dtypes and non-null counts for all columns:
# df.info()

In [None]:
# Number of rows with missing values
def get_rows_missing(df):
    """
    Count the missing values in every row of a dataframe.

    :param df: DataFrame to inspect
    :return: Series indexed like ``df`` holding the number of missing values
        per row, ordered from the most to the least incomplete row
    """
    missing_mask = df.isna()
    missing_per_row = missing_mask.sum(axis="columns")
    return missing_per_row.sort_values(ascending=False)
# Print the missing-value count of every row, most incomplete rows first.
# NOTE(review): to_string() emits one line per row of df; consider limiting
# the output (e.g. with .head(...)) if the full listing is not needed.
print(get_rows_missing(df).to_string())


In [None]:
# Target and features
target = "SalePrice"
y = df[target]

# Every column except the target, kept in the original column order.
# Do not build this with `set(df.columns) - set(target)`: `set(target)` is the
# set of the *characters* of the string, so nothing is removed and the target
# leaks into the features; a set would also leave the column order unspecified.
features = [column for column in df.columns if column != target]

# Feature matrix restricted to the selected columns
X = df[features]

In [None]:
# Splitting
# First split: hold out `test_size` of all rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Second split: take `val_size` of the remaining rows as the validation set;
# X_train / y_train are narrowed to what is left for fitting.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=val_size,
    random_state=random_state,
)

print(X_train.shape)

In [None]:
# Handle missing values
# Summary (column, non_null_count, data_type) of the training columns that
# contain at least one missing value.
df_info = get_df_info(df=X_train, missing_only=True)

# Columns to drop: those whose share of non-null values in the training split
# is below missing_columns_drop_threshold. The counts come from X_train, so the
# denominator must be the number of rows in X_train, not in the full df.
non_null_fraction = df_info["non_null_count"] / X_train.shape[0]
columns_to_drop = df_info.loc[
    non_null_fraction < missing_columns_drop_threshold, "column"
].tolist()
# NOTE(review): columns_to_drop is only computed here; this cell does not
# remove the columns from any of the feature frames.


In [None]:
# Handle categorical variables
# TODO: not implemented yet; the non-numeric columns are still present in the feature frames.

In [None]:
# Standardization/normalization
# The scalers only accept numeric input, so restrict them to the numeric
# columns of the training split.
numeric_columns = X_train.select_dtypes(include=["number"]).columns

# Fit on training data only and keep the fitted scalers, so the identical
# transformation can be applied to validation/test data later.
standard_scaler = StandardScaler().fit(X_train[numeric_columns])
X_train_standardized = standard_scaler.transform(X_train[numeric_columns])

minmax_scaler = MinMaxScaler().fit(X_train[numeric_columns])
X_train_normalized = minmax_scaler.transform(X_train[numeric_columns])

In [None]:
# Model definition and training
mdl = RandomForestRegressor(random_state=random_state)
mdl.fit(X_train_standardized, y_train)

# The model was fitted on standardized features, so the validation features
# must go through the same transformation before predicting. Fitting a
# StandardScaler on the same training columns reproduces the statistics that
# produced X_train_standardized (relies on pandas output being enabled via
# set_config, so that X_train_standardized exposes `.columns`).
scaled_columns = X_train_standardized.columns
val_scaler = StandardScaler().fit(X_train[scaled_columns])
X_val_standardized = val_scaler.transform(X_val[scaled_columns])

mae = mean_absolute_error(y_val, mdl.predict(X_val_standardized))
print(mae)

In [None]:
# Hyperparameter tuning
# Candidate forest sizes: the powers of two 2**0 through 2**14.
n_trees_search = np.power(2, np.arange(15))

# Validation MAE for each candidate, in the same order as n_trees_search.
maes = []
for n_trees in n_trees_search:
    print(f"Training for n_trees = {n_trees}")
    mdl = RandomForestRegressor(
        random_state=random_state,
        n_estimators=n_trees,
    ).fit(X_train, y_train)
    mae = mean_absolute_error(y_val, mdl.predict(X_val))
    maes.append(mae)

In [None]:
# Hyperparameter selection
# Pick the candidate with the lowest validation MAE (first one on ties);
# n_trees is left holding the winning forest size for the cells that follow.
mae_min = min(maes)
n_trees = n_trees_search[maes.index(mae_min)]

# Report every candidate, then the winner.
for n, mae in enumerate(maes):
    print(f"n_tree = {n_trees_search[n]}, mae = {mae}")
print(f"best = {n_trees}, mae = {mae_min}")

In [None]:
# Retraining with the selected n_trees on training + validation rows
# Stack the two splits row-wise so the validation rows also contribute to the fit.
X_train2 = pd.concat([X_train, X_val], axis=0)
y_train2 = pd.concat([y_train, y_val], axis=0)

mdl = RandomForestRegressor(random_state=random_state, n_estimators=n_trees)
mdl.fit(X=X_train2, y=y_train2)

In [None]:
# Estimate the error on the hold-out split, which took no part in training
# or in choosing n_trees.
mae = mean_absolute_error(y_true=y_test, y_pred=mdl.predict(X_test))
print(mae)

In [None]:
# Retrain on all data to prepare for submission
# Refits the existing `mdl` object (its constructor arguments are not changed
# here) on the complete feature matrix X and target y.
mdl.fit(X, y)

In [None]:
# Load the competition test data and predict (no fitting happens in this cell)
test_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# Use a dedicated name for the competition features so the hold-out split
# X_test (which pairs with y_test) is not overwritten; re-running the hold-out
# evaluation cell afterwards then still uses the right data.
X_submission = test_data[features]
test_preds = mdl.predict(X_submission)

In [None]:
# Assemble the submission table (Id alongside the predicted SalePrice) and
# write it to submission.csv without the index column.
submission = test_data[["Id"]].assign(SalePrice=test_preds)
submission.to_csv("submission.csv", index=False)