### Imports
- pandas: for holding the data in a pandas.DataFrame object
- wandb: for logging findings
- matplotlib.pyplot: for plotting

In [109]:
import pandas as pd
import wandb
import matplotlib.pyplot as plt

### Load data

In [110]:
# Read the training set into a DataFrame (path is relative to the notebook).
data = pd.read_csv(filepath_or_buffer="data/train.csv")

### Exploratory Data Analysis
1. Check number of rows of data
2. Check for any null values
3. Check for any na values
4. Check for empty strings
5. Check datatypes of all columns
6. Find unique values for each column

In [111]:
# Number of rows (data points) in the frame.
data.shape[0]

60000

In [112]:
# Per column: does it contain at least one null value?
pd.isnull(data).any()

rent_approval_date     False
town                   False
block                  False
street_name            False
flat_type              False
flat_model             False
floor_area_sqm         False
furnished              False
lease_commence_date    False
latitude               False
longitude              False
elevation              False
subzone                False
planning_area          False
region                 False
monthly_rent           False
dtype: bool

In [113]:
# Per column: does it contain at least one NA value?
# (pandas documents isna() as an alias of isnull(), so this repeats the null check.)
data.isna().any(axis=0)



rent_approval_date     False
town                   False
block                  False
street_name            False
flat_type              False
flat_model             False
floor_area_sqm         False
furnished              False
lease_commence_date    False
latitude               False
longitude              False
elevation              False
subzone                False
planning_area          False
region                 False
monthly_rent           False
dtype: bool

In [114]:
# Per column: is any cell exactly the empty string?
(data == '').any()

rent_approval_date     False
town                   False
block                  False
street_name            False
flat_type              False
flat_model             False
floor_area_sqm         False
furnished              False
lease_commence_date    False
latitude               False
longitude              False
elevation              False
subzone                False
planning_area          False
region                 False
monthly_rent           False
dtype: bool

In [115]:
# Show the dtype pandas inferred for every column; the `object` columns are the
# ones treated as strings by the cleaning steps further down.
data.dtypes

rent_approval_date      object
town                    object
block                   object
street_name             object
flat_type               object
flat_model              object
floor_area_sqm         float64
furnished               object
lease_commence_date      int64
latitude               float64
longitude              float64
elevation              float64
subzone                 object
planning_area           object
region                  object
monthly_rent             int64
dtype: object

In [116]:
# For every column print its name, its distinct values and how many there are.
for c in data.columns:
    distinct_values = data[c].unique()
    print(c, distinct_values, len(distinct_values))

rent_approval_date ['2021-09' '2022-05' '2022-10' '2021-08' '2022-11' '2023-04' '2021-01'
 '2022-06' '2021-10' '2021-04' '2021-03' '2022-02' '2021-07' '2022-12'
 '2023-01' '2022-01' '2023-07' '2021-06' '2023-05' '2023-03' '2021-05'
 '2021-02' '2022-07' '2023-06' '2022-03' '2023-02' '2021-12' '2022-04'
 '2021-11' '2022-08' '2022-09'] 31
town ['jurong east' 'bedok' 'toa payoh' 'pasir ris' 'kallang/whampoa'
 'bukit panjang' 'sengkang' 'ang mo kio' 'bishan' 'punggol'
 'choa chu kang' 'clementi' 'bukit batok' 'sembawang' 'jurong west'
 'woodlands' 'queenstown' 'yishun' 'bukit timah' 'marine parade'
 'bukit merah' 'geylang' 'hougang' 'tampines' 'serangoon' 'central'] 26
block ['257' '119' '157' ... '509c' '218a' '880a'] 2553
street_name ['Jurong East Street 24' 'bedok north road' 'lorong 1 toa payoh' ...
 'seng poh road' 'Jurong West Street 51' 'Zion Road'] 1083
flat_type ['3 room' '4-room' '3-room' 'executive' '5 room' '4 room' '5-room'
 '2-room' '2 room'] 9
flat_model ['new generation' 'im

### Data Cleaning


##### 1. Convert all strings to lowercase

In [117]:
# Lowercase every object-dtype (string) column so labels that differ only by
# letter case collapse into one, then print the distinct values that remain.
for col in data.columns:
    if data[col].dtype != "object":
        continue  # numeric column, nothing to normalize
    lowered = data[col].str.lower()
    data[col] = lowered
    print(lowered.unique())

['2021-09' '2022-05' '2022-10' '2021-08' '2022-11' '2023-04' '2021-01'
 '2022-06' '2021-10' '2021-04' '2021-03' '2022-02' '2021-07' '2022-12'
 '2023-01' '2022-01' '2023-07' '2021-06' '2023-05' '2023-03' '2021-05'
 '2021-02' '2022-07' '2023-06' '2022-03' '2023-02' '2021-12' '2022-04'
 '2021-11' '2022-08' '2022-09']
['jurong east' 'bedok' 'toa payoh' 'pasir ris' 'kallang/whampoa'
 'bukit panjang' 'sengkang' 'ang mo kio' 'bishan' 'punggol'
 'choa chu kang' 'clementi' 'bukit batok' 'sembawang' 'jurong west'
 'woodlands' 'queenstown' 'yishun' 'bukit timah' 'marine parade'
 'bukit merah' 'geylang' 'hougang' 'tampines' 'serangoon' 'central']
['257' '119' '157' ... '509c' '218a' '880a']
['jurong east street 24' 'bedok north road' 'lorong 1 toa payoh'
 'pasir ris street 21' 'whampoa west' 'senja road' 'fernvale road'
 'ang mo kio avenue 1' 'bishan street 11' 'edgedale plains'
 'choa chu kang avenue 4' 'pasir ris drive 1' 'bedok sth road'
 'fernvale link' 'choa chu kang avenue 3' 'clementi west 

##### 2. Split date into a year and a month column and convert to integer

In [118]:
# rent_approval_date is a "YYYY-MM" string: the first four characters are the
# year, everything after the dash (position 5 onward) is the month.
approval_date = data["rent_approval_date"]
data["rent_approval_year"] = approval_date.str[:4].astype(int)
data["rent_approval_month"] = approval_date.str[5:].astype(int)

In [119]:
# Display the frame to confirm rent_approval_year / rent_approval_month were added.
data

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month
0,2021-09,jurong east,257,jurong east street 24,3 room,new generation,67.0,yes,1983,1.344518,103.738630,0.0,yuhua east,jurong east,west region,1600,2021,9
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250,2022,5
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900,2022,10
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850,2021,8
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100,2022,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2021-09,ang mo kio,441,ang mo kio avenue 10,3 room,new generation,67.0,yes,1979,1.366050,103.854168,0.0,chong boon,ang mo kio,north-east region,2200,2021,9
59996,2023-04,bukit merah,95a,henderson road,4-room,model a,83.0,yes,2019,1.286493,103.821434,0.0,henderson hill,bukit merah,central region,4100,2023,4
59997,2022-06,tampines,862a,tampines street 83,5-room,improved,122.0,yes,1988,1.355064,103.936507,0.0,tampines west,tampines,east region,2250,2022,6
59998,2023-01,bedok,67,bedok sth avenue 3,5-room,standard,123.0,yes,1977,1.318974,103.944076,0.0,bedok south,bedok,east region,4700,2023,1


##### 3. Clean Up Flat Type Labels (Merge 'x-room' with 'x room' labels)

In [120]:
# Merge the "x room" spelling into "x-room" so both variants share one label.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: the in-place form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x and leaves
# `data` unchanged once Copy-on-Write is enabled.
data['flat_type'] = data['flat_type'].replace({'2 room': '2-room',
                                               '3 room': '3-room',
                                               '4 room': '4-room',
                                               '5 room': '5-room'})
print(data['flat_type'].unique())

['3-room' '4-room' 'executive' '5-room' '2-room']


##### 4. Remove columns that have no meaningful data - furnished and elevation

In [121]:
# Drop the furnished and elevation columns, then preview the result.
uninformative_columns = ['furnished', 'elevation']
data = data.drop(columns=uninformative_columns)
data.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month
0,2021-09,jurong east,257,jurong east street 24,3-room,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region,1600,2021,9
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region,2250,2022,5
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region,1900,2022,10
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region,2850,2021,8
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region,2100,2022,11


##### 5. Remove columns where there is too much variation in data
- block (latitude and longitude data can provide more information)
- street name (latitude and longitude data can provide more information)
- town (using planning area is better as it has 3 more classes)
- subzone (planning area + latitude and longitude may give better results)

In [122]:
# Drop the location columns listed above (block, street_name, town, subzone);
# latitude/longitude and planning_area stay in the frame. Then preview the result.
high_variation_columns = ['block', 'street_name', 'town', 'subzone']
data = data.drop(columns=high_variation_columns)
data.head()

Unnamed: 0,rent_approval_date,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month
0,2021-09,3-room,new generation,67.0,1983,1.344518,103.73863,jurong east,west region,1600,2021,9
1,2022-05,4-room,new generation,92.0,1978,1.330186,103.938717,bedok,east region,2250,2022,5
2,2022-10,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh,central region,1900,2022,10
3,2021-08,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris,east region,2850,2021,8
4,2022-11,3-room,improved,68.0,1972,1.320502,103.863341,kallang,central region,2100,2022,11


##### 6. Check distribution of flat_model to determine any further steps

In [123]:
# Count the rows for each flat_model value to see how the classes are distributed.
data['flat_model'].value_counts()

flat_model
model a                   17122
improved                  17010
new generation            11292
premium apartment          5109
simplified                 2798
standard                   2268
apartment                  2080
maisonette                  956
model a2                    663
dbss                        371
type s1                     108
model a-maisonette           56
adjoined flat                49
type s2                      46
2-room                       34
premium apartment loft       25
premium maisonette            6
terrace                       4
3gen                          3
Name: count, dtype: int64

#### Sanity Check for Data Again

In [124]:
# Re-inspect every remaining column: number of distinct values, then the values.
for col in data.columns:
    distinct_values = data[col].unique()
    print(len(distinct_values), distinct_values)

31 ['2021-09' '2022-05' '2022-10' '2021-08' '2022-11' '2023-04' '2021-01'
 '2022-06' '2021-10' '2021-04' '2021-03' '2022-02' '2021-07' '2022-12'
 '2023-01' '2022-01' '2023-07' '2021-06' '2023-05' '2023-03' '2021-05'
 '2021-02' '2022-07' '2023-06' '2022-03' '2023-02' '2021-12' '2022-04'
 '2021-11' '2022-08' '2022-09']
5 ['3-room' '4-room' 'executive' '5-room' '2-room']
19 ['new generation' 'improved' 'apartment' 'premium apartment' 'simplified'
 'model a' 'standard' 'dbss' 'model a2' 'maisonette' 'model a-maisonette'
 'type s1' 'adjoined flat' 'type s2' 'premium apartment loft' '2-room'
 'premium maisonette' 'terrace' '3gen']
146 [ 67.   92.  149.   68.  130.  110.   84.  112.  104.  123.   60.   93.
  91.  113.   69.   81.   73.  121.   97.   64.   65.  100.   98.  141.
  85.  114.   90.   59.  120.  105.  115.  107.  144.  142.  147.  146.
 111.  125.   99.   47.  102.  116.   76.  145.   45.   74.   75.  122.
 106.   96.  117.  103.  132.   82.   70.  124.  119.  127.  101.   88.
 15

### Data Preprocessing

#### Convert categorical data into numerical data

##### 1. One-Hot Encoding for Flat Type
Only 5 classes, so it is relatively easy to one-hot encode

In [125]:
# Replace flat_type with one 0/1 integer indicator column per class
# (flat_type_2-room, flat_type_3-room, ...).
data = pd.get_dummies(
    data=data,
    columns=['flat_type'],
    dtype="int64",
)
data

Unnamed: 0,rent_approval_date,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,flat_type_2-room,flat_type_3-room,flat_type_4-room,flat_type_5-room,flat_type_executive
0,2021-09,new generation,67.0,1983,1.344518,103.738630,jurong east,west region,1600,2021,9,0,1,0,0,0
1,2022-05,new generation,92.0,1978,1.330186,103.938717,bedok,east region,2250,2022,5,0,0,1,0,0
2,2022-10,improved,67.0,1971,1.332242,103.845643,toa payoh,central region,1900,2022,10,0,1,0,0,0
3,2021-08,apartment,149.0,1993,1.370239,103.962894,pasir ris,east region,2850,2021,8,0,0,0,0,1
4,2022-11,improved,68.0,1972,1.320502,103.863341,kallang,central region,2100,2022,11,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2021-09,new generation,67.0,1979,1.366050,103.854168,ang mo kio,north-east region,2200,2021,9,0,1,0,0,0
59996,2023-04,model a,83.0,2019,1.286493,103.821434,bukit merah,central region,4100,2023,4,0,0,1,0,0
59997,2022-06,improved,122.0,1988,1.355064,103.936507,tampines,east region,2250,2022,6,0,0,0,1,0
59998,2023-01,standard,123.0,1977,1.318974,103.944076,bedok,east region,4700,2023,1,0,0,0,1,0


##### 2. Categorical Encoding for Flat Model

In [127]:
# Label-encode flat_model: each distinct model gets an integer code, numbered
# in the order the models first appear in the column.
flat_models = data['flat_model'].unique()
categories_flat_model = dict(zip(flat_models, range(len(flat_models))))

# Every value in the column is a key of the mapping, so the lookup is total.
data["flat_model_cat"] = data["flat_model"].map(categories_flat_model)
data

Unnamed: 0,rent_approval_date,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,flat_type_2-room,flat_type_3-room,flat_type_4-room,flat_type_5-room,flat_type_executive,flat_model_cat
0,2021-09,new generation,67.0,1983,1.344518,103.738630,jurong east,west region,1600,2021,9,0,1,0,0,0,0
1,2022-05,new generation,92.0,1978,1.330186,103.938717,bedok,east region,2250,2022,5,0,0,1,0,0,0
2,2022-10,improved,67.0,1971,1.332242,103.845643,toa payoh,central region,1900,2022,10,0,1,0,0,0,1
3,2021-08,apartment,149.0,1993,1.370239,103.962894,pasir ris,east region,2850,2021,8,0,0,0,0,1,2
4,2022-11,improved,68.0,1972,1.320502,103.863341,kallang,central region,2100,2022,11,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2021-09,new generation,67.0,1979,1.366050,103.854168,ang mo kio,north-east region,2200,2021,9,0,1,0,0,0,0
59996,2023-04,model a,83.0,2019,1.286493,103.821434,bukit merah,central region,4100,2023,4,0,0,1,0,0,5
59997,2022-06,improved,122.0,1988,1.355064,103.936507,tampines,east region,2250,2022,6,0,0,0,1,0,1
59998,2023-01,standard,123.0,1977,1.318974,103.944076,bedok,east region,4700,2023,1,0,0,0,1,0,6


### Logging
Log the results of the session in Weights & Biases

#### Initialization

In [None]:
# Start a Weights & Biases run named "eda" in the "hdb_rental" project.
wandb.init(project="hdb_rental", name="eda")

#### Log the data table

In [9]:
# Wrap the processed DataFrame in a W&B Table, passing the frame's own column
# names as the table columns.
table = wandb.Table(data=data, columns=data.columns)

In [10]:
# Log the table to the active run.
# NOTE(review): the key looks like a typo for "RentingOutofFlats" - confirm
# before renaming, since changing it alters the key shown in W&B.
wandb.log({"ReningOutofFlats Table": table})