https://towardsdatascience.com/predicting-hdb-housing-prices-using-neural-networks-94ab708cccf8

In [11]:
import pandas as pd

# Read the raw test-set CSV into `data`. Note that the path variable is named
# `train_set_path` even though it points at test.csv.
train_set_path = './Datasets/test.csv'
data = pd.read_csv(filepath_or_buffer=train_set_path)

# Preview the first five rows to see the column layout
data.head(n=5)

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,yes,1984,1.358411,103.891722,0.0,lorong ah soo,hougang,north-east region
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,yes,1999,1.446343,103.820817,0.0,sembawang central,sembawang,north region
2,2023-07,clementi,708,Clementi West Street 2,4-room,new generation,91.0,yes,1980,1.305719,103.762168,0.0,clementi west,clementi,west region
3,2021-08,jurong east,351,Jurong East Street 31,3 room,model a,74.0,yes,1986,1.344832,103.730778,0.0,yuhua west,jurong east,west region
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,yes,1983,1.345437,103.735241,0.0,yuhua west,jurong east,west region


In [12]:
# Derive integer 'year' and 'month' columns from the 'YYYY-MM' strings in
# 'rent_approval_date', normalise 'flat_type' spellings such as '3 room' to
# '3-room', and discard the original date column.
data = (
    data
    .assign(
        year=lambda frame: frame['rent_approval_date'].str.split('-').str[0].astype(int),
        month=lambda frame: frame['rent_approval_date'].str.split('-').str[1].astype(int),
        flat_type=lambda frame: frame['flat_type'].str.replace(' ', '-', regex=False),
    )
    .drop(columns='rent_approval_date')
)

# Count the distinct values in 'furnished' and 'elevation'
furnished_unique_values = data['furnished'].nunique()
elevation_unique_values = data['elevation'].nunique()

# A column holding exactly one distinct value carries no information, so drop it
data = data.drop(columns=[
    column
    for column, distinct_count in (
        ('furnished', furnished_unique_values),
        ('elevation', elevation_unique_values),
    )
    if distinct_count == 1
])

# Show the transformed frame together with both distinct-value counts
(data.head(), furnished_unique_values, elevation_unique_values)

(          town block             street_name flat_type      flat_model  \
 0      hougang   245       hougang street 22    5-room        improved   
 1    sembawang   316         sembawang vista    4-room         model a   
 2     clementi   708  Clementi West Street 2    4-room  new generation   
 3  jurong east   351   Jurong East Street 31    3-room         model a   
 4  jurong east   305   jurong east street 32    5-room        improved   
 
    floor_area_sqm  lease_commence_date  latitude   longitude  \
 0           121.0                 1984  1.358411  103.891722   
 1           100.0                 1999  1.446343  103.820817   
 2            91.0                 1980  1.305719  103.762168   
 3            74.0                 1986  1.344832  103.730778   
 4           121.0                 1983  1.345437  103.735241   
 
              subzone planning_area             region  year  month  
 0      lorong ah soo       hougang  north-east region  2023      1  
 1  sembawang ce

In [13]:
# Remove location columns that are not used as features:
#   - planning_area: redundant with the slightly more general 'town' column
#   - subzone: too granular to generalise well
#   - block and street_name: dropped together with subzone
data = data.drop(columns=['planning_area', 'subzone', 'block', 'street_name'])

data.head()

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,region,year,month
0,hougang,5-room,improved,121.0,1984,1.358411,103.891722,north-east region,2023,1
1,sembawang,4-room,model a,100.0,1999,1.446343,103.820817,north region,2022,9
2,clementi,4-room,new generation,91.0,1980,1.305719,103.762168,west region,2023,7
3,jurong east,3-room,model a,74.0,1986,1.344832,103.730778,west region,2021,8
4,jurong east,5-room,improved,121.0,1983,1.345437,103.735241,west region,2022,3


In [14]:
# Define the ordinal mapping for flat_type (labels must already be in the
# hyphenated form, e.g. '3-room')
flat_type_mapping = {
    '1-room': 0,
    '2-room': 1,
    '3-room': 2,
    '4-room': 3,
    '5-room': 4,
    'executive': 5
}

# Encode into a separate Series first so the check below can still see the
# original labels.
encoded_flat_type = data['flat_type'].map(flat_type_mapping)

# `.map` turns every label that is not a key of the mapping into NaN without
# any warning (this also happens when the cell is re-run on values that were
# already encoded). Fail loudly instead; values that were missing to begin
# with are not reported.
unmapped_flat_types = data.loc[
    encoded_flat_type.isna() & data['flat_type'].notna(), 'flat_type'
].unique()
if len(unmapped_flat_types) > 0:
    raise ValueError(
        f"flat_type values missing from flat_type_mapping: {sorted(map(str, unmapped_flat_types))}"
    )

# Apply the ordinal encoding to flat_type
data['flat_type'] = encoded_flat_type

# Display the first few rows to confirm the change
data[['flat_type']].head()

Unnamed: 0,flat_type
0,4
1,3
2,3
3,2
4,4


In [15]:
# Resale price index keyed by (year, quarter), where the quarter is a Roman
# numeral 'I'-'IV'. Each tuple below lists one year's values in quarter order;
# 2023 has only three values, so there is no (2023, 'IV') entry.
resale_price_index_data = {
    (year, quarter): index_value
    for year, quarterly_values in (
        (2023, (173.6, 176.2, 178.5)),
        (2022, (159.5, 163.9, 168.1, 171.9)),
        (2021, (142.2, 146.4, 150.6, 155.7)),
    )
    for quarter, index_value in zip(('I', 'II', 'III', 'IV'), quarterly_values)
}


# Function to determine the quarter based on the month
def get_quarter(month):
    """Return the quarter label for a calendar month.

    Args:
        month: Month number, 1 (January) through 12 (December). Any numeric
            type is accepted as long as the value is a whole number.

    Returns:
        'I' for months 1-3, 'II' for 4-6, 'III' for 7-9, 'IV' for 10-12.

    Raises:
        ValueError: If ``month`` is outside 1-12, is NaN, or has a fractional
            part. Such values previously fell through to 'IV'.
    """
    # NaN fails the range comparison, so it is rejected here as well.
    if not (1 <= month <= 12) or month % 1 != 0:
        raise ValueError(f"month must be an integer from 1 to 12, got {month!r}")
    # Months 1-3 -> index 0, 4-6 -> 1, 7-9 -> 2, 10-12 -> 3
    return ('I', 'II', 'III', 'IV')[(int(month) - 1) // 3]


# Look up the index for each row's (year, quarter) pair; pairs that are not
# in resale_price_index_data yield a missing value.
data['resale_price_index'] = [
    resale_price_index_data.get((year, get_quarter(month)))
    for year, month in zip(data['year'], data['month'])
]

# Preview the lookup result next to the columns it was derived from
data.loc[:, ['year', 'month', 'resale_price_index']].head()

Unnamed: 0,year,month,resale_price_index
0,2023,1,173.6
1,2022,9,168.1
2,2023,7,178.5
3,2021,8,150.6
4,2022,3,159.5


# Determine whether to drop the 'flat_model' and 'town' columns

In [16]:
# Remove two more categorical columns in one step:
#   - flat_model: 'flat_type' is regarded as the more correlated feature
#   - region: regarded as not useful
# 'town' is kept for now, even though it is described as less correlated,
# because it is used to tag school information onto each row.
data = data.drop(columns=['flat_model', 'region'])

data.head()

Unnamed: 0,town,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,year,month,resale_price_index
0,hougang,4,121.0,1984,1.358411,103.891722,2023,1,173.6
1,sembawang,3,100.0,1999,1.446343,103.820817,2022,9,168.1
2,clementi,3,91.0,1980,1.305719,103.762168,2023,7,178.5
3,jurong east,2,74.0,1986,1.344832,103.730778,2021,8,150.6
4,jurong east,4,121.0,1983,1.345437,103.735241,2022,3,159.5


In [17]:
# Preview the first five rows of the current frame
data.head(n=5)

Unnamed: 0,town,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,year,month,resale_price_index
0,hougang,4,121.0,1984,1.358411,103.891722,2023,1,173.6
1,sembawang,3,100.0,1999,1.446343,103.820817,2022,9,168.1
2,clementi,3,91.0,1980,1.305719,103.762168,2023,7,178.5
3,jurong east,2,74.0,1986,1.344832,103.730778,2021,8,150.6
4,jurong east,4,121.0,1983,1.345437,103.735241,2022,3,159.5


In [19]:
average_school_rank_by_town = pd.read_csv('./Datasets/auxiliary-data/average_school_rank_by_town.csv')
# Merge the average school rank into the data set, matching 'town' against the
# lookup's 'nearest_town'. how='left' keeps every existing row; towns with no
# match in the lookup get a missing average_school_rank.
# validate='many_to_one' makes pandas raise a MergeError if a town appears
# more than once in the lookup, instead of silently duplicating rows.
data = data.merge(average_school_rank_by_town,
                  left_on='town',
                  right_on='nearest_town',
                  how='left',
                  validate='many_to_one')

# Drop the extra 'nearest_town' column as it duplicates 'town'
data = data.drop(columns='nearest_town')

# Display the merged rows alongside the first few rows of the lookup table
data[['town', 'average_school_rank']].head(), average_school_rank_by_town.head()

(          town  average_school_rank
 0      hougang            86.111111
 1    sembawang            97.833333
 2     clementi            75.250000
 3  jurong east           129.500000
 4  jurong east           129.500000,
   nearest_town  average_school_rank
 0   ang mo kio           108.625000
 1        bedok            96.454545
 2       bishan            52.500000
 3  bukit batok            97.666667
 4  bukit merah           108.375000)

In [21]:
# Collect the names of every column that is still object-typed (non-numeric)
categorical_columns = list(data.select_dtypes(include=['object']).columns)
# Remove all of them; this is how 'town' is dropped now that it has been
# encoded. Then write the cleaned frame to disk without the index.
data = data.drop(columns=categorical_columns)
data.to_csv(path_or_buf='./Datasets/test_cleaned_110523.csv', index=False)

In [28]:
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# Load the cleaned training set; 'monthly_rent' is the regression target and
# every other column is a feature.
tpot_data = pd.read_csv('./Datasets/train_cleaned_110523.csv', sep=',', dtype=np.float64)
features = tpot_data.drop('monthly_rent', axis=1)
targets = tpot_data['monthly_rent']

# Load the test data and select its columns in the training feature order, so
# a column-order difference between the two CSVs cannot feed values into the
# wrong feature slots. A training feature missing from the test file raises a
# KeyError here, before any model fitting is done.
test_data = pd.read_csv('./Datasets/test_cleaned_110523.csv', sep=',', dtype=np.float64)
test_features = test_data[features.columns]

# Average CV score on the training set was: -240613.66858386635
exported_pipeline = make_pipeline(
    ZeroCount(),
    RandomForestRegressor(bootstrap=True, max_features=0.4, min_samples_leaf=16, min_samples_split=16, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(features, targets)
results = exported_pipeline.predict(test_features)

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['predictions'])

# Get model hyperparameters and the current time. The clock is read once so
# the timestamp column and the file name always describe the same instant.
hyperparameters = exported_pipeline.get_params()
run_time = datetime.datetime.now()
current_time = run_time.strftime("%Y-%m-%d %H:%M:%S")
current_time_for_filename = run_time.strftime("%Y-%m-%d_%H-%M-%S")

# Append hyperparameters and time to the DataFrame (same value on every row)
results_df['model_hyperparameters'] = str(hyperparameters)
results_df['timestamp'] = current_time

# Save the results to a CSV file, creating the results directory first so the
# write cannot fail on a missing folder after the model has been fitted.
output_filepath = f'./Datasets/results/predictions {current_time_for_filename}.csv'
Path(output_filepath).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_filepath, index=False)