In [1]:
import pandas as pd
# Load the augmented training set.
# `block` is read as text so that identifiers such as '257' are always strings
# (never a mix of int and str); the (town, block, street_name) lookup keys used
# for coordinate imputation only match when both sides have the same type.
# low_memory=False makes pandas infer the remaining dtypes from the whole file
# at once instead of chunk by chunk, which avoids mixed-type columns.
data = pd.read_csv(
    './Datasets/train_added.csv',
    dtype={'block': str},
    low_memory=False,
)

# Display the first few rows of the new dataset
data.head()

  data = pd.read_csv('./Datasets/train_added.csv')


Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,jurong east street 24,3 room,new generation,67.0,yes,1983.0,1.344518,103.73863,0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978.0,1.330186,103.938717,0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971.0,1.332242,103.845643,0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993.0,1.370239,103.962894,0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972.0,1.320502,103.863341,0,bendemeer,kallang,central region,2100


In [2]:
# Rows that already carry both coordinates form the reference set for imputation.
location_keys = ['town', 'block', 'street_name']
unique_locations = data.dropna(subset=['latitude', 'longitude'])

# Keep one row per distinct (town, block, street_name, latitude, longitude) tuple.
location_coordinates = unique_locations.loc[
    :, [*location_keys, 'latitude', 'longitude']
].drop_duplicates()

# Lookup table: (town, block, street_name) -> {'latitude': ..., 'longitude': ...}
coordinates_map = (
    location_coordinates
    .set_index(location_keys)
    .to_dict(orient='index')
)

# Number of distinct address keys, shown together with the first five entries.
unique_location_count = len(coordinates_map)
unique_location_count, list(coordinates_map.items())[:5]

(8614,
 [(('jurong east', '257', 'jurong east street 24'),
   {'latitude': 1.344518043, 'longitude': 103.7386303}),
  (('bedok', '119', 'bedok north road'),
   {'latitude': 1.33018552, 'longitude': 103.9387166}),
  (('toa payoh', '157', 'lorong 1 toa payoh'),
   {'latitude': 1.332242149, 'longitude': 103.8456431}),
  (('pasir ris', '250', 'pasir ris street 21'),
   {'latitude': 1.370238552, 'longitude': 103.9628942}),
  (('kallang/whampoa', '34', 'whampoa west'),
   {'latitude': 1.320501696, 'longitude': 103.8633413})])

In [3]:
# Function to apply the mapping for filling missing values
def fill_coordinates(row, coord_map):
    """Fill a row's missing coordinates from an exact-address lookup table.

    Parameters
    ----------
    row : pandas.Series
        One record exposing 'town', 'block', 'street_name', 'latitude' and
        'longitude'.
    coord_map : dict
        Maps (town, block, street_name) tuples to dicts holding 'latitude'
        and 'longitude'.

    Returns
    -------
    pandas.Series
        `row` itself when neither coordinate is missing or the address is not
        in `coord_map`; otherwise a copy of `row` in which BOTH coordinates
        are replaced by the mapped pair (even if only one was missing).
    """
    if not (pd.isna(row['latitude']) or pd.isna(row['longitude'])):
        return row

    key = (row['town'], row['block'], row['street_name'])
    if key not in coord_map:
        return row

    # Work on a copy: pandas does not support apply() functions that mutate
    # the object they are handed.
    filled = row.copy()
    filled['latitude'] = coord_map[key]['latitude']
    filled['longitude'] = coord_map[key]['longitude']
    return filled

# Row-wise pass: every record lacking a coordinate is looked up by its exact address.
data_filled = data.apply(fill_coordinates, axis=1, args=(coordinates_map,))

# Count the coordinates that the exact-address lookup could not supply.
remaining_missing_lat_long = data_filled.loc[:, ['latitude', 'longitude']].isna().sum()
remaining_missing_lat_long

latitude     94748
longitude    94748
dtype: int64

In [4]:
# Redefine the function to fill coordinates without block
def fill_coordinates_without_block(row, coord_map):
    """Fill a row's missing coordinates from a (town, street_name) lookup table.

    Parameters
    ----------
    row : pandas.Series
        One record exposing 'town', 'street_name', 'latitude' and 'longitude'.
    coord_map : dict
        Maps (town, street_name) tuples to dicts holding 'latitude' and
        'longitude'.

    Returns
    -------
    pandas.Series
        `row` itself when neither coordinate is missing or the street is not
        in `coord_map`; otherwise a copy of `row` in which BOTH coordinates
        are replaced by the mapped pair (even if only one was missing).
    """
    if not (pd.isna(row['latitude']) or pd.isna(row['longitude'])):
        return row

    key = (row['town'], row['street_name'])
    if key not in coord_map:
        return row

    # Work on a copy: pandas does not support apply() functions that mutate
    # the object they are handed.
    filled = row.copy()
    filled['latitude'] = coord_map[key]['latitude']
    filled['longitude'] = coord_map[key]['longitude']
    return filled

# Street-level fallback: average the known coordinates of every (town, street_name) pair.
# `unique_locations` holds every row with known coordinates (it is not de-duplicated),
# so each mean is weighted by how often an address occurs in the data.
location_coordinates_mean = unique_locations.groupby(['town', 'street_name']).agg({
    'latitude': 'mean',
    'longitude': 'mean'
}).reset_index()

# Create a dictionary to map from (town, street_name) to (mean_latitude, mean_longitude)
coordinates_mean_map = location_coordinates_mean.set_index(['town', 'street_name']).to_dict('index')

# Continue from `data_filled` (the exact-address pass) rather than from the raw `data`:
# rows that already received their exact block coordinates keep them, and only the
# rows that are still missing fall back to the coarser street-level mean.
data_filled_mean = data_filled.apply(lambda row: fill_coordinates_without_block(row, coordinates_mean_map), axis=1)

# Check how many are still missing after this operation
remaining_missing_lat_long_mean = data_filled_mean[['latitude', 'longitude']].isnull().sum()
remaining_missing_lat_long_mean

latitude     94654
longitude    94654
dtype: int64

In [5]:
# Coarsest fallback level: one average coordinate pair per town.
location_coordinates_town_only = (
    unique_locations
    .groupby('town')[['latitude', 'longitude']]
    .mean()
    .reset_index()
)

# Lookup table: town -> {'latitude': mean latitude, 'longitude': mean longitude}
coordinates_town_map = (
    location_coordinates_town_only
    .set_index('town')
    .to_dict(orient='index')
)

# Function to apply the town-based mapping for filling missing values
def fill_coordinates_town_only(row, coord_map):
    """Fill a row's missing coordinates from a per-town lookup table.

    Parameters
    ----------
    row : pandas.Series
        One record exposing 'town', 'latitude' and 'longitude'.
    coord_map : dict
        Maps a town to a dict holding 'latitude' and 'longitude'.

    Returns
    -------
    pandas.Series
        `row` itself when neither coordinate is missing or the town is not in
        `coord_map`; otherwise a copy of `row` in which BOTH coordinates are
        replaced by the mapped pair (even if only one was missing).
    """
    if not (pd.isna(row['latitude']) or pd.isna(row['longitude'])):
        return row

    key = row['town']
    if key not in coord_map:
        return row

    # Work on a copy: pandas does not support apply() functions that mutate
    # the object they are handed.
    filled = row.copy()
    filled['latitude'] = coord_map[key]['latitude']
    filled['longitude'] = coord_map[key]['longitude']
    return filled

# Last fallback level: fill whatever is still missing with the town-wide mean.
# Continue from `data_filled_mean` rather than from the raw `data`, so coordinates
# recovered at street level are kept and only the rows that are still missing
# receive the much coarser town average.
data_filled_town = data_filled_mean.apply(lambda row: fill_coordinates_town_only(row, coordinates_town_map), axis=1)

# Check how many are still missing after this operation
remaining_missing_lat_long_town = data_filled_town[['latitude', 'longitude']].isnull().sum()
remaining_missing_lat_long_town

latitude     0
longitude    0
dtype: int64

In [7]:
# Persist the frame with imputed coordinates; the row index is not written.
data_filled_town.to_csv(
    path_or_buf='./Datasets/train_added_long_lat.csv',
    index=False,
)

In [8]:
# Function to fill in the region, planning_area, and subzone based on the most common value for each town
def fill_location_details(row, details_map):
    """Fill a row's missing subzone / planning_area / region from a per-town table.

    Parameters
    ----------
    row : pandas.Series
        One record exposing 'town', 'subzone', 'planning_area' and 'region'.
    details_map : dict
        Maps a town to a dict holding 'subzone', 'planning_area' and 'region'.

    Returns
    -------
    pandas.Series
        `row` itself when none of the three fields is missing or the town is
        not in `details_map`; otherwise a copy of `row` in which ALL three
        fields are replaced by the mapped values (even if only one was
        missing), so the triple stays mutually consistent.
    """
    detail_fields = ('subzone', 'planning_area', 'region')
    if not any(pd.isna(row[field]) for field in detail_fields):
        return row

    key = row['town']
    if key not in details_map:
        return row

    # Work on a copy: pandas does not support apply() functions that mutate
    # the object they are handed.
    filled = row.copy()
    for field in detail_fields:
        filled[field] = details_map[key][field]
    return filled

def _most_common(values):
    """Return the first mode of a column within one group, or None when it has no mode."""
    modes = values.mode()
    return None if modes.empty else modes[0]


# Most frequent value of every column per town (computed on the rows in `unique_locations`).
location_details_mode = unique_locations.groupby('town').agg(_most_common).reset_index()

# Lookup table: town -> {'subzone': ..., 'planning_area': ..., 'region': ...}
location_details_map = (
    location_details_mode
    .set_index('town')
    .loc[:, ['subzone', 'planning_area', 'region']]
    .to_dict(orient='index')
)

# Row-wise pass over the coordinate-imputed frame to fill the three location fields.
data_filled_details = data_filled_town.apply(
    fill_location_details, axis=1, args=(location_details_map,)
)

# Count what is still missing in subzone, planning_area, and region.
remaining_missing_details = data_filled_details.loc[
    :, ['subzone', 'planning_area', 'region']
].isna().sum()
remaining_missing_details

subzone          0
planning_area    0
region           0
dtype: int64

In [10]:
# Persist the frame with imputed subzone / planning_area / region; the row index is not written.
data_filled_details.to_csv(
    path_or_buf='./Datasets/train_added_subzone_planning_region.csv',
    index=False,
)