In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv("./Datasets/train.csv")

# Display the first few rows of the dataset
data.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100


In [2]:
# Convert textual columns to lowercase for normalization
textual_columns = ['town', 'block', 'street_name', 'flat_type', 'flat_model', 'furnished', 'subzone', 'planning_area', 'region']

data[textual_columns] = data[textual_columns].apply(lambda x: x.str.lower())

# Display the first few rows to verify changes
data.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,jurong east street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100


In [3]:
# Extract year and month from the 'rent_approval_date' column
data['rent_approval_year'] = pd.to_datetime(data['rent_approval_date']).dt.year
data['rent_approval_month'] = pd.to_datetime(data['rent_approval_date']).dt.month

# Drop the original 'rent_approval_date' column
data = data.drop(columns=['rent_approval_date'])

# Display the modified dataset
data.head()

Unnamed: 0,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month
0,jurong east,257,jurong east street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600,2021,9
1,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250,2022,5
2,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900,2022,10
3,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850,2021,8
4,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100,2022,11


In [4]:
# Standardize the flat_type values by replacing " room" with "-room"
data['flat_type'] = data['flat_type'].str.replace(" room", "-room")

# Check the updated unique values in the 'flat_type' column
updated_flat_types = data['flat_type'].unique()

updated_flat_types

array(['3-room', '4-room', 'executive', '5-room', '2-room'], dtype=object)

In [5]:
# Identify columns with only one unique value
columns_to_drop = [col for col in data.columns if data[col].nunique() == 1]

# Drop these columns from the dataset
data = data.drop(columns=columns_to_drop)

# Display the updated dataset columns
remaining_columns = data.columns
remaining_columns

Index(['town', 'block', 'street_name', 'flat_type', 'flat_model',
       'floor_area_sqm', 'lease_commence_date', 'latitude', 'longitude',
       'subzone', 'planning_area', 'region', 'monthly_rent',
       'rent_approval_year', 'rent_approval_month'],
      dtype='object')

In [6]:
# Drop the 'block', 'street names' columns as it is too granular and may not generalize well
data = data.drop(columns=['block', 'street_name'])

In [7]:
# Load the first dataset
sg_coe_prices = pd.read_csv("./Datasets/auxiliary-data/sg-coe-prices.csv")
sg_coe_prices.head()

Unnamed: 0,year,category,month,bidding,price,quota,bids
0,2023,a,july,2,95202,581,728
1,2023,a,july,1,97000,588,756
2,2023,a,june,2,96206,586,751
3,2023,a,june,1,98001,581,881
4,2023,a,may,2,92000,596,936


In [8]:
# Convert month names in sg_coe_prices to numerical values for merging
month_map = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
    'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
}
sg_coe_prices['month_num'] = sg_coe_prices['month'].map(month_map)
sg_coe_prices.rename(columns={'price': 'coe_price'}, inplace=True)

# Aggregate COE prices by taking the average for each month and year across all categories
average_coe_prices = sg_coe_prices.groupby(['year', 'month_num'])['coe_price'].mean().reset_index()

# Merge the aggregated COE prices back into the rental data
data = pd.merge(data, average_coe_prices,
                           left_on=['rent_approval_year', 'rent_approval_month'],
                           right_on=['year', 'month_num'],
                           how='left').drop(columns=['year', 'month_num'])

data.head()

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,coe_price
0,jurong east,3-room,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region,1600,2021,9,54951.625
1,bedok,4-room,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region,2250,2022,5,77035.625
2,toa payoh,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region,1900,2022,10,89580.25
3,pasir ris,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region,2850,2021,8,51997.375
4,kallang/whampoa,3-room,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region,2100,2022,11,99472.875


In [9]:
# Loading the datasets
sg_stock_prices = pd.read_csv('./Datasets/auxiliary-data/INDEX_SG_XSES_STI.csv')

# Displaying the first few rows of each dataset
sg_stock_prices.head()

Unnamed: 0,Date,Open,High,Low,Close
0,10/20/2023,3076.69,3100.44,3074.07,3076.69
1,10/19/2023,3111.78,3099.6,3099.6,3099.6
2,10/18/2023,3167.63,3136.62,3136.62,3136.62
3,10/17/2023,3185.79,3171.83,3171.83,3171.83
4,10/16/2023,3163.89,3179.39,3157.63,3163.89


In [10]:
# Convert the Date column in the index_data to datetime format
sg_stock_prices['Date'] = pd.to_datetime(sg_stock_prices['Date'])

# Extract year and month from the Date column in index_data for merging
sg_stock_prices['Year'] = sg_stock_prices['Date'].dt.year
sg_stock_prices['Month'] = sg_stock_prices['Date'].dt.month

# Aggregate the stock index data using mean, min, and max for each month and year
for col in ['Open', 'Low', 'Close', 'High']:
    sg_stock_prices[col] = sg_stock_prices[col].str.replace(',','').astype(float)

# Re-run the aggregation
mean_aggregated = sg_stock_prices.groupby(['Year', 'Month']).mean().reset_index()
min_aggregated = sg_stock_prices.groupby(['Year', 'Month']).min().reset_index()
max_aggregated = sg_stock_prices.groupby(['Year', 'Month']).max().reset_index()

# Merge with train_data and calculate correlations for each aggregation method
correlations_updated = {}

for method, aggregated_data in [('mean', mean_aggregated), ('min', min_aggregated), ('max', max_aggregated)]:
    merged = pd.merge(data, aggregated_data, left_on=['rent_approval_year', 'rent_approval_month'],
                      right_on=['Year', 'Month'], how='left')

    # Calculate the correlation with monthly_rent
    correlation = merged[['Open', 'Low', 'Close', 'High', 'monthly_rent']].corr()
    correlations_updated[method] = correlation['monthly_rent']

correlations_updated

{'mean': Open            0.266463
 Low             0.262047
 Close           0.264990
 High            0.268321
 monthly_rent    1.000000
 Name: monthly_rent, dtype: float64,
 'min': Open            0.271586
 Low             0.293945
 Close           0.297813
 High            0.284350
 monthly_rent    1.000000
 Name: monthly_rent, dtype: float64,
 'max': Open            0.263099
 Low             0.265938
 Close           0.264243
 High            0.268788
 monthly_rent    1.000000
 Name: monthly_rent, dtype: float64}

In [11]:
# From the above correlations, the Close value with the Min aggregation method has the highest correlation with monthly_rent. Therefore, if we were to choose a single aggregated stock index value as a feature, the Close price with the Min aggregation would be the most suitable based on its correlation with the target variable.
# Merge the train_data with min_aggregated data using only the 'Close' column
data = pd.merge(data, min_aggregated[['Year', 'Month', 'Close']],
                            left_on=['rent_approval_year', 'rent_approval_month'],
                            right_on=['Year', 'Month'], how='left')

# Rename the 'Close' column for clarity
data.rename(columns={'Close': 'Stock_Price'}, inplace=True)

data.head()

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,coe_price,Year,Month,Stock_Price
0,jurong east,3-room,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region,1600,2021,9,54951.625,2021,9,3041.73
1,bedok,4-room,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region,2250,2022,5,77035.625,2022,5,3165.18
2,toa payoh,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region,1900,2022,10,89580.25,2022,10,2969.95
3,pasir ris,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region,2850,2021,8,51997.375,2021,8,3055.05
4,kallang/whampoa,3-room,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region,2100,2022,11,99472.875,2022,11,3102.51


In [12]:
import numpy as np

# 直线距离
def haversine_distances(df_lats, df_longs, dst_lats, dst_longs):
    R = 6371000  # Earth's average radius in meters

    lat = np.radians(dst_lats) - np.radians(df_lats[:, np.newaxis])
    lon = np.radians(dst_longs) - np.radians(df_longs[:, np.newaxis])

    a = np.sin(lat / 2.0) ** 2 + np.cos(np.radians(df_lats[:, np.newaxis])) * np.cos(np.radians(dst_lats)) * np.sin(lon / 2.0) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

def add_distance_features(train_df, poi_df, prefix):
    distances = haversine_distances(train_df['latitude'].values, train_df['longitude'].values, poi_df['latitude'].values, poi_df['longitude'].values)

    train_df[prefix + '_nearest_distance'] = np.min(distances, axis=1).round()
    nearest_index = np.argmin(distances, axis=1)
    train_df[prefix + '_nearest'] = poi_df['name'].iloc[nearest_index].values
    train_df[prefix + '_count_within_1km'] = np.sum(distances <= 1000, axis=1)

# Load data
mrt_file = pd.read_csv("./Datasets/auxiliary-data/sg-mrt-existing-stations.csv")
shoppingmall_file = pd.read_csv("./Datasets/auxiliary-data/sg-shopping-malls.csv")

# Add distance features for MRT stations and shopping malls
add_distance_features(data, mrt_file, 'mrt')
add_distance_features(data, shoppingmall_file, 'mall')

data.head(10), data.shape

(              town  flat_type         flat_model  floor_area_sqm  \
 0      jurong east     3-room     new generation            67.0   
 1            bedok     4-room     new generation            92.0   
 2        toa payoh     3-room           improved            67.0   
 3        pasir ris  executive          apartment           149.0   
 4  kallang/whampoa     3-room           improved            68.0   
 5    bukit panjang  executive  premium apartment           130.0   
 6         sengkang     5-room  premium apartment           110.0   
 7       ang mo kio     3-room     new generation            67.0   
 8           bishan     4-room         simplified            84.0   
 9          punggol     5-room  premium apartment           112.0   
 
    lease_commence_date  latitude   longitude            subzone  \
 0                 1983  1.344518  103.738630         yuhua east   
 1                 1978  1.330186  103.938717        bedok north   
 2                 1971  1.332242  

In [13]:
# Load datasets
planned_mrt_file = pd.read_csv("./Datasets/auxiliary-data/sg-mrt-planned-stations.csv")

# Filter planned MRT stations that will open within the next year
upcoming_mrts = planned_mrt_file[planned_mrt_file['opening_year'] == "2024"]

# Combine the existing MRT data with the upcoming MRT data
combined_mrt_file = pd.concat([mrt_file, upcoming_mrts], ignore_index=True)

add_distance_features(data, combined_mrt_file, 'mrt_planned')
data.head(20)

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,...,Stock_Price,mrt_nearest_distance,mrt_nearest,mrt_count_within_1km,mall_nearest_distance,mall_nearest,mall_count_within_1km,mrt_planned_nearest_distance,mrt_planned_nearest,mrt_planned_count_within_1km
0,jurong east,3-room,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region,...,3041.73,699.0,Chinese Garden,1,1203.0,Westgate,0,699.0,Chinese Garden,1
1,bedok,4-room,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region,...,3165.18,899.0,Tanah Merah,1,1114.0,Djitsun Mall Bedok,0,899.0,Tanah Merah,1
2,toa payoh,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region,...,2969.95,219.0,Toa Payoh,4,468.0,HDB Hub,3,219.0,Toa Payoh,4
3,pasir ris,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region,...,3055.05,1546.0,Pasir Ris,0,402.0,Loyang Point,1,1546.0,Pasir Ris,0
4,kallang/whampoa,3-room,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region,...,3102.51,188.0,Boon Keng,3,1073.0,Aperia,0,188.0,Boon Keng,3
5,bukit panjang,executive,premium apartment,130.0,2001,1.387847,103.764249,saujana,bukit panjang,west region,...,3270.51,982.0,Bukit Panjang,1,355.0,Greenridge Shopping Centre,4,982.0,Bukit Panjang,1
6,sengkang,5-room,premium apartment,110.0,2005,1.388997,103.875148,fernvale,sengkang,north-east region,...,2858.9,2076.0,Buangkok,0,296.0,The Seletar Mall,3,2076.0,Buangkok,0
7,ang mo kio,3-room,new generation,67.0,1978,1.366048,103.838123,shangri-la,ang mo kio,north-east region,...,3092.8,626.0,Mayflower,2,1089.0,Broadway Plaza,0,626.0,Mayflower,2
8,bishan,4-room,simplified,84.0,1987,1.344279,103.855556,bishan east,bishan,central region,...,3051.11,967.0,Bishan,2,1033.0,Junction 8,0,967.0,Bishan,2
9,punggol,5-room,premium apartment,112.0,2003,1.392832,103.91062,punggol field,punggol,north-east region,...,3155.06,1601.0,Punggol,0,315.0,Punggol Plaza,2,1601.0,Punggol,0


In [14]:
# Get the number of rows for each dataset
num_rows_train = data.shape[0]
num_rows_index = data.shape[0]

num_rows_train, num_rows_index

(60000, 60000)

In [15]:
updated_ordinal_mapping = {
    '2-room': 1,
    '3-room': 2,
    '4-room': 3,
    '5-room': 4,
    'executive': 5
}

# Re-encode the flat_type column using the updated ordinal mapping
data['flat_type_ordinal'] = data['flat_type'].map(updated_ordinal_mapping)

# Display the original and updated ordinal encoded 'flat_type' columns for comparison
data[['flat_type', 'flat_type_ordinal']].head()

Unnamed: 0,flat_type,flat_type_ordinal
0,3-room,2
1,4-room,3
2,3-room,2
3,executive,5
4,3-room,2


In [16]:
# Drop the 'flat_type' column
data = data.drop(columns=['flat_type'])

# Display the first few rows of the dataset to confirm
data.head()

Unnamed: 0,town,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,monthly_rent,...,mrt_nearest_distance,mrt_nearest,mrt_count_within_1km,mall_nearest_distance,mall_nearest,mall_count_within_1km,mrt_planned_nearest_distance,mrt_planned_nearest,mrt_planned_count_within_1km,flat_type_ordinal
0,jurong east,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region,1600,...,699.0,Chinese Garden,1,1203.0,Westgate,0,699.0,Chinese Garden,1,2
1,bedok,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region,2250,...,899.0,Tanah Merah,1,1114.0,Djitsun Mall Bedok,0,899.0,Tanah Merah,1,3
2,toa payoh,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region,1900,...,219.0,Toa Payoh,4,468.0,HDB Hub,3,219.0,Toa Payoh,4,2
3,pasir ris,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region,2850,...,1546.0,Pasir Ris,0,402.0,Loyang Point,1,1546.0,Pasir Ris,0,5
4,kallang/whampoa,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region,2100,...,188.0,Boon Keng,3,1073.0,Aperia,0,188.0,Boon Keng,3,2


In [17]:
# Compute centroid of all properties(marina bay centroid)
marina_centroid_lat = 1.287515
marina_centroid_lon = 103.866641

# Convert the latitudes and longitudes to numpy arrays
df_lats = data['latitude'].values
df_longs = data['longitude'].values
dst_lats = np.array([marina_centroid_lat])
dst_longs = np.array([marina_centroid_lon])

# Calculate distance of each property to marina bay
distances = haversine_distances(df_lats, df_longs, dst_lats, dst_longs)
data['distance_to_centroid_marina_bay'] = distances[:, 0] / 1000

# Lease Duration
current_year = 2023
data['lease_duration'] = current_year - data['lease_commence_date']

# # 2. Categorical Features
# 
# # 2.1 High Cardinality
# # Drop the 'subzone' column
# data = data.drop(columns=['subzone'])
# 
# # 2.2 Drop Redundant Columns
# # Drop 'mrt_nearest' and other high cardinality columns
# data = data.drop(columns=['mrt_nearest', 'mall_nearest', 'mrt_plus_nearest'])

In [18]:
train_with_schoolscore = pd.read_csv("./Datasets/auxiliary-data/train_with_schoolscore.csv")

# Merge based on latitude and longitude
are_latitudes_equal = (data['latitude'].values == train_with_schoolscore['latitude'].values).all()
are_longitudes_equal = (data['longitude'].values == train_with_schoolscore['longitude'].values).all()

if are_latitudes_equal == are_longitudes_equal == False:
    raise Exception('The latitude and longitude values are not equal between the two datasets')
data['school_score'] = train_with_schoolscore['school_score']

# Display the first few rows of the updated train dataset
data.head()

Unnamed: 0,town,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,monthly_rent,...,mall_nearest_distance,mall_nearest,mall_count_within_1km,mrt_planned_nearest_distance,mrt_planned_nearest,mrt_planned_count_within_1km,flat_type_ordinal,distance_to_centroid_marina_bay,lease_duration,school_score
0,jurong east,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region,1600,...,1203.0,Westgate,0,699.0,Chinese Garden,1,2,15.57818,40,280.1
1,bedok,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region,2250,...,1114.0,Djitsun Mall Bedok,0,899.0,Tanah Merah,1,3,9.311841,45,329.3
2,toa payoh,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region,1900,...,468.0,HDB Hub,3,219.0,Toa Payoh,4,2,5.493976,52,368.1
3,pasir ris,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region,2850,...,402.0,Loyang Point,1,1546.0,Pasir Ris,0,5,14.11032,30,254.6
4,kallang/whampoa,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region,2100,...,1073.0,Aperia,0,188.0,Boon Keng,3,2,3.68625,51,239.5


In [20]:
# Save the cleaned and analyzed dataset to a new CSV file
output_filepath = "./Datasets/cleaned_train.csv"
data.to_csv(output_filepath, index=False)

In [22]:
# Load the dataset
cleaned_train = pd.read_csv('./Datasets/cleaned_train.csv')

from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets (80% train, 20% test)
train_set, test_set = train_test_split(cleaned_train, test_size=0.2, random_state=42)

# Save the training and testing sets to separate CSV files
train_set_path = "./Datasets/final-data/train_set.csv"
test_set_path = "./Datasets/final-data/test_set.csv"
train_set.to_csv(train_set_path, index=False)
test_set.to_csv(test_set_path, index=False)