In [343]:
import os
import math
import pandas as pd
import numpy as np
import pprint
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [144]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

In [145]:
train_df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0


In [146]:
test_df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region


##### Preprocessing

check nan

In [147]:
train_df.isnull().sum().sort_index()/len(train_df)

block                  0.0
eco_category           0.0
elevation              0.0
flat_model             0.0
flat_type              0.0
floor_area_sqm         0.0
latitude               0.0
lease_commence_date    0.0
longitude              0.0
month                  0.0
planning_area          0.0
region                 0.0
resale_price           0.0
storey_range           0.0
street_name            0.0
subzone                0.0
town                   0.0
dtype: float64

Select relevant columns

In [271]:
rel_cols = ["month", "flat_type", "lease_commence_date", "storey_range", "floor_area_sqm", "planning_area"]

train_features_df = train_df[rel_cols]
train_labels = train_df["resale_price"].ravel()
test_features_df = test_df[rel_cols]
train_features_df

Unnamed: 0,month,flat_type,lease_commence_date,storey_range,floor_area_sqm,planning_area
0,2001-08,4 room,1989,01 to 03,118.0,pasir ris
1,2014-10,5-room,2003,10 to 12,110.0,punggol
2,2020-09,5 room,2004,01 to 03,112.0,sengkang
3,2000-10,3 room,1980,07 to 09,67.0,clementi
4,2013-01,3-room,1985,07 to 09,73.0,bukit batok
...,...,...,...,...,...,...
431727,2005-03,4 room,2000,01 to 03,101.0,woodlands
431728,2016-04,4 room,2012,13 to 15,95.0,sengkang
431729,2011-01,3-room,1986,01 to 03,67.0,tampines
431730,2013-05,5-room,1999,16 to 18,123.0,sengkang


In [272]:
def generate_new_features(features_df):
    # new features
    features_df_copy = features_df.copy()
    features_df_copy["month"] = features_df_copy["month"].astype('datetime64[ns]')
    features_df_copy["year"] = features_df_copy["month"].dt.year
    features_df_copy["flat_type"] = features_df_copy["flat_type"].apply(lambda x: x.replace("-", " "))
    features_df_copy["min_storey"] = features_df_copy["storey_range"].apply(lambda x: int(x.split("to")[0].strip().replace("0", ""))).astype(int)
    features_df_copy["max_storey"] = features_df_copy["storey_range"].apply(lambda x: int(x.split("to")[1].strip().replace("0", ""))).astype(int)
    
    # remove unwanted columns
    features_df_copy = features_df_copy.drop(["month", "storey_range"], axis=1)

    return features_df_copy

In [273]:
def cat_transform(train_features_df, test_features_df):
    mapping = {
        "flat_type": {
            "1 room": 1, "2 room": 2,
            "3 room": 3, "4 room": 4,
            "5 room": 5, "executive": 6,
            "multi generation": 7
        }
    }
    train_features_df = train_features_df.replace(mapping)
    test_features_df = test_features_df.replace(mapping)
    
    train_features_df = pd.get_dummies(train_features_df, columns=["planning_area"])
    test_features_df = pd.get_dummies(test_features_df, columns=["planning_area"])

    
    return train_features_df, test_features_df

In [274]:
def transform(train_df, test_df):
    
    train_df_with_new_f = generate_new_features(train_df)
    test_df_with_new_f = generate_new_features(test_df)
    
    transformed_train_df, transformed_test_df = cat_transform(train_df_with_new_f, test_df_with_new_f)
    
    return transformed_train_df, transformed_test_df

In [275]:
final_train_features_df, final_test_features_df = transform(train_features_df, test_features_df)
final_train_features_df

Unnamed: 0,flat_type,lease_commence_date,floor_area_sqm,year,min_storey,max_storey,planning_area_ang mo kio,planning_area_bedok,planning_area_bishan,planning_area_bukit batok,...,planning_area_rochor,planning_area_sembawang,planning_area_sengkang,planning_area_serangoon,planning_area_sungei kadut,planning_area_tampines,planning_area_tanglin,planning_area_toa payoh,planning_area_woodlands,planning_area_yishun
0,4,1989,118.0,2001,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,2003,110.0,2014,1,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,2004,112.0,2020,1,3,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,1980,67.0,2000,7,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,1985,73.0,2013,7,9,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,4,2000,101.0,2005,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
431728,4,2012,95.0,2016,13,15,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
431729,3,1986,67.0,2011,1,3,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
431730,5,1999,123.0,2013,16,18,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [276]:
random_state = 20

In [344]:
def root_mean_squared_error(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = math.sqrt(mse)
    
    return rmse

def calculate_rmse(features_df, labels):
    preds = rf.predict(features_df)
    rmse = root_mean_squared_error(labels, preds)

    return rmse

##### RandomSearch

In [345]:
def run_random_search_cv(model, grid, train_features_df, train_labels, test_features_df, random_state=random_state):

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    m = model(random_state=random_state)

    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    random_search = RandomizedSearchCV(
        estimator=m, param_distributions=grid, n_iter=1, scoring=make_scorer(root_mean_squared_error),
        cv=3, verbose=2, random_state=random_state, n_jobs = -1, error_score="raise"
    )

    # Fit the random search model
    random_search.fit(train_features_df, train_labels)
    test_labels = random_search.predict(test_features_df)

    return random_search, test_labels

##### GridSearch

In [346]:
def run_grid_search_cv(model, grid, train_features_df, train_labels, test_features_df, random_state=random_state):
    
    # Create a based model
    m = model(random_state=random_state, error_score="raise")
    # Instantiate the grid search model
    grid_search = GridSearchCV(
        estimator=m, param_grid=grid, scoring=make_scorer(root_mean_squared_error),
        cv=3, n_jobs=-1, verbose=2, error_score="raise"
    )

    # Fit the grid search to the data
    grid_search.fit(train_features_df, train_labels)
    test_labels = grid_search.predict(test_features_df)
    
    return grid_search, test_labels

#### Run GridSearch or RandomSearch on multiple models

In [347]:
def run_search_on_multiple_models(models, model_names, grids, train_features_df, train_labels, test_features_df, search_type="random_search"):
    search_results = {}

    for model_name in model_names:
        grid = grids[model_name]
        model = models[model_name]
        
        if search_type == "random_search":
            search_result = run_random_search_cv(model, grid, train_features_df, train_labels, test_features_df)
        elif search_type == "grid_search":
            search_result = run_grid_search_cv(model, grid, train_features_df, train_labels, test_features_df)
        search_results[model_name] = search_result
    
    return search_results

In [348]:
models = {
    "RandomForestRegressor": RandomForestRegressor,
    "GradientBoostingRegressor": GradientBoostingRegressor
}

model_names = ["RandomForestRegressor", "GradientBoostingRegressor"]



RandomSearch

In [356]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=20)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20, 30, 40]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 5, 10, 20, 30, 40]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}


random_grids = {
    "RandomForestRegressor": random_grid,
    "GradientBoostingRegressor": random_grid
}

In [357]:
random_search_results = run_search_on_multiple_models(models, model_names, random_grids, final_train_features_df, train_labels, final_test_features_df, search_type="random_search")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [358]:
for model_name in model_names:
    print(model_name)
    print("Best Params")
    print(random_search_results[model_name][0].best_params_)
    print("Best Score")
    print(random_search_results[model_name][0].best_score_)
    print("Feature importance Params")
    raw_feature_importances = random_search_results[model_name][0].best_estimator_.feature_importances_
    feature_importances = sorted(list(zip(train_features.columns, raw_feature_importances)), key=lambda x: x[1], reverse=True)
    print(feature_importances)
    print("Best Params Predictions")
    print(random_search_results[model_name][1])
    print("\n")

RandomForestRegressor
Best Params
{'n_estimators': 396, 'min_samples_split': 40, 'min_samples_leaf': 30, 'max_features': 'sqrt', 'max_depth': 110}
Best Score
34378.62329390945
Feature importance Params
[('year', 0.3868921866261467), ('floor_area_sqm', 0.18605929010309522), ('flat_type', 0.1742507522446687), ('lease_commence_date', 0.09331517844579455), ('min_storey', 0.027721843502944314), ('max_storey', 0.023670608545456474), ('planning_area_bukit merah', 0.017748425120840107), ('planning_area_queenstown', 0.012072600600297922), ('planning_area_woodlands', 0.009882136131544343), ('planning_area_bishan', 0.009735026628515346), ('planning_area_toa payoh', 0.0063335367292179255), ('planning_area_jurong west', 0.005764359236502465), ('planning_area_yishun', 0.004281780254612921), ('planning_area_choa chu kang', 0.004001061136129973), ('planning_area_outram', 0.003642658547374772), ('planning_area_sengkang', 0.003615887044979625), ('planning_area_kallang', 0.003595254475248254), ('planning

GridSearch

In [25]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=1)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=1)]
# max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
grid_search_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}


grid_search_grids = {
    "RandomForestRegressor": grid_search_grid,
    "GradientBoostingRegressor": grid_search_grid
}

In [30]:
grid_search_results = run_search_on_multiple_models(models, model_names, grid_search_grids, search_type="grid_search")

Fitting 3 folds for each of 18 candidates, totalling 54 fits




Fitting 3 folds for each of 18 candidates, totalling 54 fits




In [37]:
for model_name in model_names:
    print(model_name)
    print("Best Params")
    print(grid_search_results[model_name][0].best_params_)
    print("Feature importance Params")
    raw_feature_importances = grid_search_results[model_name][0].best_estimator_.feature_importances_
    feature_importances = sorted(list(zip(train_features.columns, raw_feature_importances)), key=lambda x: x[1], reverse=True)
    print(feature_importances)
    print("Best Params Predictions")
    print(grid_search_results[model_name][1])
    print("\n")

RandomForestRegressor
Best Params
{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
Feature importance Params
[('year', 0.4912812137024781), ('flat_type', 0.3960111784559628), ('lease_commence_date', 0.06138216666719886), ('flat_model', 0.03135902546417194), ('street_name', 0.01996641571018848)]
Best Params Predictions
[198801.25395392 314721.71070902 135534.29998854 ... 140282.43006408
 296896.27767416 356328.08300168]


GradientBoostingRegressor
Best Params
{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
Feature importance Params
[('year', 0.4851073509962419), ('flat_type', 0.3909132924945865), ('lease_commence_date', 0.06293420378194045), ('flat_model', 0.03190577069509537), ('street_name', 0.029139382032135695)]
Best Params Predictions
[232735.73435862 312865.04802935 194405.51148072 ... 194523.28212693
 294871.56309534 337416.05166995]




#### Feature importance

Ensemble predictions

In [300]:
predictions = [r[1] for r in random_search_results.values()]
average_predictions = np.mean(predictions, axis=0)
average_predictions

array([195196.98758854, 333442.18242566, 124791.26537335, ...,
       157740.71207976, 258242.96313784, 326673.85277017])

In [301]:
submission = pd.DataFrame()
submission["Id"] = range(average_predictions.shape[0])
submission["Predicted"] = average_predictions

In [302]:
submission.to_csv("submission.csv", index=False)