In [None]:
!pip install git+https://github.com/remykarem/pandas-lightning#egg=pandas-lightning

In [None]:
import os

import numpy as np
import pandas as pd
import pandas_lightning

from sklearn.model_selection import train_test_split
import seaborn as sns

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Apply seaborn's default theme and enlarge the default figure size in a single call.
sns.set_theme(rc={'figure.figsize': (14.7, 8.27)})

In [None]:
# Raw training data; kept untouched in `df_` so later cells can always start again from it.
train_csv_path = "/kaggle/input/hdb-resale-price-prediction-full-train-set/train_edited.csv"
df_ = pd.read_csv(train_csv_path)

In [None]:
def get_average_storey(storey):
    """Return the midpoint of each storey range.

    Parameters
    ----------
    storey : pandas.Series of str
        Storey ranges written as "<low> to <high>", e.g. "01 to 03".

    Returns
    -------
    pandas.Series of float
        (low + high) / 2 for every row.
    """
    # Iterating / unpacking the `.str` accessor (``a, b = s.str.split(...).str``) is not
    # supported by current pandas; pick the two list elements explicitly instead.
    bounds = storey.str.split(" to ")
    low = bounds.str.get(0).astype(int)
    high = bounds.str.get(1).astype(int)
    return (high + low) / 2

def get_flat_age(year, lease_commence_date):
    """Return the number of years between `lease_commence_date` and `year`.

    Both arguments may be scalars or aligned pandas Series; the result is
    simply `year - lease_commence_date`.
    """
    age = year - lease_commence_date
    return age

def is_inauspicious(block):
    """Return a boolean Series marking block numbers "13", "4", "44" or "444".

    `block` is expected to hold block numbers as strings; any other value
    (including e.g. "4A") maps to False.
    """
    unlucky_blocks = ["13", "4", "44", "444"]
    return block.isin(unlucky_blocks)

def is_before_covid(date):
    """Return True where `date` falls strictly before 1 February 2020.

    The cut-off is given as the string "20200201" and compared directly
    against `date` (a datetime Series in this notebook).
    """
    cutoff = "20200201"
    return date < cutoff

def planning_area_to_district(planning_area):
    """Map each planning-area name to a district number (1-28), returned as a categorical.

    `planning_area` must expose `map_categorical_binning`, which is not a core pandas
    method — presumably supplied by pandas_lightning; confirm against its docs. The
    mapping below is keyed by district number, each value listing the lower-case area
    names binned into that district.

    NOTE(review): "braddell" is listed under both 13 and 20; which district it ends up in
    depends on how `map_categorical_binning` resolves duplicates — confirm.
    """
    # https://www.ura.gov.sg/realEstateIIWeb/resources/misc/list_of_postal_districts.html
    # https://www.harrylau.com/basic-real-estate-knowledge-you-must-know/singapore-district-and-planning-area/
    
    # Note that ['clementi', 'kallang', 'bukit merah', 'outram', 'downtown core'] span multiple districts;
    # each is assigned to a single district below (see the inline comments).
    
    DISTRICT_MAPPING = {
        1: ["raffles place", "cecil", "marina", "people’s park"],
        2: ["anson", "tanjong pagar"],
        3: ["queenstown", "tiong bahru", "alexandra", "outram"], # outram is 3 based on wiki
        4: ["telok blangah", "harbourfront", "sentosa", "keppel", "mount faber", "bukit merah"], # bukit merah is 2, 3, 4, 5, 9, 10
        5: ["pasir panjang", "buona vista", "dover", "west coast", "clementi new town", "clementi"], # we place clementi together with clementi new town
        6: ["high street", "beach road", "city hall"],
        7: ["middle road", "golden mile", "bugis", "rochor", "downtown core"], # downtown core is 1, 6, 7
        8: ["little india", "farrer park", "serangoon road"],
        9: ["orchard", "cairnhill", "river valley"],
        10: ["ardmore", "bukit timah", "holland road", "tanglin"],
        11: ["watten estate", "newton", "novena", "thomson"],
        12: ["balestier", "toa payoh", "serangoon"],
        13: ["macpherson", "braddell", "potong pasir"],
        14: ["geylang", "paya lebar", "eunos", "kembangan", "kallang"], # kallang is in 1, 7, 12, 13, 14, 15
        15: ["katong", "joo chiat", "amber road", "marine parade", "tanjong rhu", "meyer"],
        16: ["bedok", "upper east coast", "eastwood", "kew drive", "chai chee", "siglap"],
        17: ["loyang", "changi"],
        18: ["tampines", "pasir ris", "simei"],
        19: ["serangoon garden", "hougang", "punggol", "sengkang"],
        20: ["bishan", "ang mo kio", "braddell"],
        21: ["upper bukit timah", "ulu pandan"],
        22: ["jurong", "boon lay", "tuas", "lakeside", "jurong west", "jurong east"],
        23: ["hillview", "dairy farm", "bukit panjang", "choa chu kang", "bukit batok"],
        24: ["lim chu kang"],
        25: ["kranji", "woodgrove", "woodlands", "sungei kadut"],
        26: ["upper thomson", "springleaf"],
        27: ["yishun", "sembawang", "admiralty"],
        28: ["seletar", "yio chu kang"]
    }
    # Bin the area names into district numbers, then store the result as a category dtype.
    return planning_area.map_categorical_binning(DISTRICT_MAPPING).astype("category")

def is_prime_district(district):
    """Return a boolean Series that is True for districts 9, 10 and 11."""
    # https://www.propertyguru.com.sg/property-guides/ccr-ocr-rcr-region-singapore-ura-map-21045
    prime_districts = [9, 10, 11]
    return district.isin(prime_districts)

def is_core(prime_district, planning_area):
    """Return True where a row is in a prime district or in one of the listed core areas.

    `prime_district` is a boolean Series; `planning_area` holds lower-case area names.
    The result is the element-wise OR of the two conditions.
    """
    # https://www.propertyguru.com.sg/property-guides/ccr-ocr-rcr-region-singapore-ura-map-21045
    core_areas = [
        "bugis", "city hall", "sentosa", "shenton way", "tanjong pagar",
        "boat quay", "raffles place", "marina downtown", "suntec city",
    ]
    in_core_area = planning_area.isin(core_areas)
    return prime_district | in_core_area

def can_use_cpf(remaining_lease):
    """Return True where `remaining_lease` is strictly below 30.

    NOTE(review): in this notebook the value passed in is computed as
    `year - lease_commence_date` (years since the lease started), so the
    test is effectively "flat is younger than 30 years" — confirm that this
    is the intended rule.
    """
    threshold = 30
    return remaining_lease < threshold

In [None]:
# Feature-engineering pipeline: builds `df` from a copy of the raw frame `df_`.
# NOTE(review): `.lambdas(...)`, `.sapply(...)` and the keyword form of `.astype(...)` are not core
# pandas — presumably provided by the pandas_lightning accessor; the step descriptions below are
# read off the call sites here and should be confirmed against that library's documentation.
df = df_.copy(

).rename(
    # The raw "month" column is treated as the transaction date from here on.
    columns={"month": "date"}
    
).lambdas(inplace=True).sapply(
    # Normalise flat_type spelling: hyphens become spaces.
    flat_type=lambda s: s.str.replace("-", " "),
    
).lambdas(inplace=True).astype(
    planning_area="category",
    flat_model="category",
    subzone="category",
    region="category",
    town="category",
    date="datetime",
    # A list of values is given for flat_type — presumably an ordered category in this order; confirm.
    flat_type=['1 room', '2 room', '3 room', '4 room', '5 room', 'executive', 'multi generation']

).lambdas(inplace=True).sapply(
    # Each keyword appears to name an output column built as (input column(s), function).
    # Later entries use columns created by earlier ones (e.g. "year", "district"), so order matters.
    year=("date", lambda date: date.dt.year),
    avg_storey=("storey_range", get_average_storey),
    # NOTE(review): get_flat_age returns year - lease_commence_date, i.e. years since the lease
    # started, yet the column is named remaining_lease — confirm the intended meaning.
    remaining_lease=(["year", "lease_commence_date"], get_flat_age),
    inauspicious=("block", is_inauspicious),
    before_covid=("date", is_before_covid),
    district=("planning_area", planning_area_to_district),
    prime_district=("district", is_prime_district),
    core_central_region=(["prime_district", "planning_area"], is_core),
    can_use_cpf=("remaining_lease", can_use_cpf),
    
).drop(
    # Remove source columns that have been replaced by derived features, plus unused ones.
    columns=["storey_range", "eco_category", "elevation", "block", "lease_commence_date",
             "latitude", "longitude"],
#              "markets_1km", "malls_1km", "comm_1km", "prisch_1km", "secsch_1km", "mrt_1km"]
    
)

# Display the engineered frame.
df

In [None]:
# Show the column labels of the engineered frame.
df.columns

In [None]:
# Per-column summary sorted by dtype.
# NOTE(review): `.tests.info` is not core pandas — presumably a pandas_lightning accessor; confirm what `pctg=False` toggles.
df.tests.info(pctg=False).sort_values(by="dtype")

In [None]:
# Ridge plot built from floor_area_sqm (numerical) and flat_model (categorical).
# NOTE(review): `.quickplot(...).ridgeplot()` is not core pandas — presumably from pandas_lightning; confirm.
df.quickplot(
    numerical=["floor_area_sqm"],
    categorical=["flat_model"]

).ridgeplot()

# Modelling

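Three ways of handling nominal (unordered categorical) features are compared:

* Drop nominal features
* One-hot encode nominal features
* Keep nominal features, label-encoded, for models that handle categories natively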

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting #for HistGBR # noqa
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error

def rmse(clf, X_test, y_true):
    """Root-mean-squared error of a fitted estimator on held-out data.

    Parameters
    ----------
    clf : fitted estimator exposing `predict`
    X_test : features to predict on
    y_true : ground-truth targets for `X_test`

    Returns
    -------
    The square root of `mean_squared_error(y_true, clf.predict(X_test))`.
    """
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

## Models with nominal features dropped

* Linear Regression
* Decision Tree
* Random Forest
* XGBRegressor
* GradientBoostingRegressor

RMSE: 50k-55k

In [None]:
# Build the feature matrix / target for the "drop nominal features" experiment.
# NOTE(review): `.dataset.to_X_y` is not core pandas — presumably from pandas_lightning; the exact
# effect of `nans="remove"` and `nominal_max_cardinality=100` should be confirmed in its docs.
X, y, metadata = df.dataset.to_X_y(
    target="resale_price",
    nominal="drop",
    nans="remove",
    nominal_max_cardinality=100
)

# Display the resulting feature matrix.
X

In [None]:
# Hold-out split with a fixed seed; no test_size is passed, so scikit-learn's default split size applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5228)

Linear Regression

In [None]:
# Ordinary least-squares baseline: fit on the training split, report hold-out RMSE.
model = LinearRegression().fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# One line per feature: name padded to 20 characters, then its fitted coefficient.
for name, weight in zip(X.columns, model.coef_):
    print(f"{name:20}: {weight}")

Decision Tree

In [None]:
# Single regression tree with default hyperparameters; report hold-out RMSE.
# random_state is fixed (same seed as the train/test split) so that re-running the
# notebook reproduces the same tree and the same score.
model = DecisionTreeRegressor(random_state=5228)
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# Importance of each feature as reported by the fitted tree, in column order.
for feat, imp in zip(X.columns, model.feature_importances_):
    print(f"{feat:20}: {imp}")

Random Forest

In [None]:
# Random forest with default hyperparameters; report hold-out RMSE.
# random_state is fixed (same seed as the train/test split) because bootstrapping and
# feature sampling are random; without it every run gives a slightly different score.
model = RandomForestRegressor(random_state=5228)
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# Importance of each feature as reported by the fitted forest, in column order.
for feat, imp in zip(X.columns, model.feature_importances_):
    print(f"{feat:20}: {imp}")

XGBRegressor

In [None]:
# Gradient-boosted trees (XGBoost, default hyperparameters); report hold-out RMSE.
model = XGBRegressor()
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# One line per feature: name padded to 20 characters, then its importance score.
for name, importance in zip(X.columns, model.feature_importances_):
    print(f"{name:20}: {importance}")

## Models for one-hot encoded nominal features

* Linear Regression
* Decision Tree
* Random Forest
* XGBRegressor
* XGBRFRegressor
* GradientBoostingRegressor

RMSE: 26k-28k

In [None]:
# Build the feature matrix / target for the one-hot experiment; "subzone" is dropped first.
# NOTE(review): presumably subzone is removed to limit the number of one-hot columns — confirm.
# `.dataset.to_X_y` is not core pandas (presumably pandas_lightning); confirm parameter semantics.
X, y, metadata = df.drop(
    columns=["subzone"]
).dataset.to_X_y(
    target="resale_price",
    nominal="one-hot",
    nans="remove",
    nominal_max_cardinality=100
)

# Display the resulting feature matrix.
X

In [None]:
# Hold-out split of the one-hot feature matrix, using the same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5228)

Linear Regression

In [None]:
# Ordinary least squares on the one-hot features; report hold-out RMSE.
model = LinearRegression().fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# One line per feature: name padded to 20 characters, then its fitted coefficient.
for name, weight in zip(X.columns, model.coef_):
    print(f"{name:20}: {weight}")

Decision Tree

In [None]:
# Single regression tree on the one-hot features; report hold-out RMSE.
# random_state is fixed (same seed as the train/test split) so that re-running the
# notebook reproduces the same tree and the same score.
model = DecisionTreeRegressor(random_state=5228)
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# Importance of each feature as reported by the fitted tree, in column order.
for feat, imp in zip(X.columns, model.feature_importances_):
    print(f"{feat:20}: {imp}")

Random Forest

In [None]:
# Random forest on the one-hot features; report hold-out RMSE.
# random_state is fixed (same seed as the train/test split) because bootstrapping and
# feature sampling are random; without it every run gives a slightly different score.
model = RandomForestRegressor(random_state=5228)
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# Importance of each feature as reported by the fitted forest, in column order.
for feat, imp in zip(X.columns, model.feature_importances_):
    print(f"{feat:20}: {imp}")

XGBRegressor

In [None]:
# Gradient-boosted trees (XGBoost, default hyperparameters) on the one-hot features.
model = XGBRegressor()
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# One line per feature: name padded to 20 characters, then its importance score.
for name, importance in zip(X.columns, model.feature_importances_):
    print(f"{name:20}: {importance}")

XGBRFRegressor

In [None]:
# XGBoost's random-forest regressor (default hyperparameters) on the one-hot features.
model = XGBRFRegressor()
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# One line per feature: name padded to 20 characters, then its importance score.
for name, importance in zip(X.columns, model.feature_importances_):
    print(f"{name:20}: {importance}")

## Nominal-feature-aware (tree-based) models

* HistGradientBoostingRegressor
* CatBoost
* LGBMRegressor

RMSE: 19k-21k

In [None]:
# Build the feature matrix / target with nominal columns kept and label-encoded.
# `metadata` is read by later cells for the nominal columns' names and indices.
# NOTE(review): `.dataset.to_X_y` is not core pandas (presumably pandas_lightning); confirm parameter semantics.
X, y, metadata = df.dataset.to_X_y(
    target="resale_price",
    nominal="label",
    nans="remove",
    nominal_max_cardinality=100
)

# Display the resulting feature matrix.
X

In [None]:
# Hold-out split of the label-encoded feature matrix, using the same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5228)

In [None]:
# Inspect the names of the columns recorded as nominal in `metadata`.
metadata["nominal"]["col_names"]

In [None]:
# Histogram-based gradient boosting with native categorical support; report hold-out RMSE.
# The categorical columns are passed as integer positions (the same `col_indices` used by the
# CatBoost and LightGBM cells): positions are accepted by every scikit-learn release that has
# `categorical_features`, whereas string column names are only accepted by newer releases.
# random_state is fixed (same seed as the train/test split) because the automatic
# early-stopping validation split is drawn at random.
nominal_idx = metadata["nominal"]["col_indices"]
model = HistGradientBoostingRegressor(max_iter=1000,
                                      categorical_features=nominal_idx,
                                      random_state=5228)
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

In [None]:
# CatBoost with the nominal columns declared as categorical (by position); report hold-out RMSE.
cat_idx = metadata["nominal"]["col_indices"]
model = CatBoostRegressor(n_estimators=1000, cat_features=cat_idx, verbose=0)
model.fit(X_train, y_train)
score = rmse(model, X_test, y_test)
print(score)

# One line per feature: name padded to 20 characters, then its importance score.
for name, importance in zip(X.columns, model.feature_importances_):
    print(f"{name:20}: {importance}")

In [None]:
# LightGBM with the nominal columns declared as categorical (by position); report hold-out RMSE.
nominal_idx = metadata["nominal"]["col_indices"]
model = LGBMRegressor(n_estimators=1000)
model.fit(X_train, y_train, categorical_feature=nominal_idx)
score = rmse(model, X_test, y_test)
print(score)

# One line per feature: name padded to 20 characters, then its importance score.
for name, importance in zip(X.columns, model.feature_importances_):
    print(f"{name:20}: {importance}")

# Next steps

Feature engineering
* Incorporate latlongs (train_edited.csv)

Modelling
* Random Forest
* Hyperparameter tuning with graph
* Ensemble prediction

In [None]:
# Candidate values for a hyperparameter search. The keys match tree-ensemble parameters
# (presumably for the random forest mentioned under "Next steps" — confirm).
# The four lists below were referenced but never defined, which raised NameError on a
# fresh kernel; the values are starting points and are meant to be tuned.
max_features = ["sqrt", "log2", None]
max_depth = [None, 10, 20, 40]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

grid_search_grid = {
    'n_estimators': [10, 100, 1000, 10000],
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}