In [None]:
import numpy as np
import pandas as pd

# 5. Data Cleaning 
[Back to top](#Contents)<br>
<br>

The acquired data contains the following features: 
1. Address 
2. Building Name 
3. Built On - Year that the building was built 
4. Land Area - Size of the house 
5. No. of Bathroom 
6. No. of Bedroom 
7. Price 
8. Price per sqft 
9. Tenure 
10. Type of property 
11. Latitude
12. Longitude


The data cleaning was conducted in the following process: 
1. Eliminate missing values and values that contain unknown data (e.g. unknown tenure).
2. Remove uncommon house types like shophouses and conservation houses from the dataset (as there is insufficient data to train the model for such types of houses).
3. Remove string symbols like 'S$', 'psf', 'sqft' 
4. Extract numeric value from the 'Land_Area' variable 

In [2]:
# Load the raw property listings; the first CSV column is used as the row index.
raw_csv_path = '../data/property_loc.csv'
data = pd.read_csv(raw_csv_path, sep=',', index_col=0)
data.head()

Unnamed: 0,Address,Building Name,Built On,Land Area,Number of Bathroom,Number of Bedroom,Price,Price per sqft,Tenure,Type,latitude,longitude
0,190 Bukit Batok West Avenue 6,190 Bukit Batok West Avenue 6,1995,1270 sqft,2,3,"S$ 546,000",S$ 429.92 psf,99-year Leasehold,HDB Apartment,1.345383,103.746046
1,Sims Drive,Sims Urban Oasis,99-year Leasehold,409 sqft,1,1,"S$ 720,000","S$ 1,760.39 psf",Condominium,New Project: 2020,1.318261,103.879391
2,57 Choa Chu Kang Loop,The Warren,2004,1066 sqft,2,2,"S$ 980,000",S$ 919.32 psf,99-year Leasehold,Condominium,1.386702,103.743679
3,21 Fernvale Road,High Park Residences,99-year Leasehold,680 sqft,2,2,"S$ 880,000","S$ 1,294.12 psf",Condominium,New Project: 2019,1.39644,103.875251
4,21 Nathan Road,Nathan Suites,2014,926 sqft,2,2,"S$ 2,500,000","S$ 2,699.78 psf",Freehold,Condominium,1.295316,103.827096


<p> First, we dropped the features that are not useful, and a few of the data points with missing values. We dropped the feature <b>"Price"</b> too as it is just equal to the product of area and price per sqft. We also renamed some of the features. </p>

In [3]:
# Columns the analysis does not use.
unused_columns = ['Address', 'Building Name', 'Tenure', 'Price']

# Shorter, code-friendly names for the remaining features.
short_names = {
    "Built On": "Built",
    "Land Area": "area",
    "Number of Bathroom": "bathroom",
    "Number of Bedroom": "bedroom",
    "Price per sqft": "price_sqft",
}

# Reassign rather than mutate in place. errors='ignore' (and rename's default
# tolerance of missing keys) lets this cell be re-run after the columns are
# already gone, instead of failing with a KeyError on the second execution.
data = data.drop(columns=unused_columns, errors='ignore').rename(columns=short_names)

# Drop rows with missing values; the explicit copy makes `house` an independent
# frame so that later column assignments act on it directly.
house = data.dropna().copy()
house.head()

Unnamed: 0,Built,area,bathroom,bedroom,price_sqft,Type,latitude,longitude
0,1995,1270 sqft,2,3,S$ 429.92 psf,HDB Apartment,1.345383,103.746046
1,99-year Leasehold,409 sqft,1,1,"S$ 1,760.39 psf",New Project: 2020,1.318261,103.879391
2,2004,1066 sqft,2,2,S$ 919.32 psf,Condominium,1.386702,103.743679
3,99-year Leasehold,680 sqft,2,2,"S$ 1,294.12 psf",New Project: 2019,1.39644,103.875251
4,2014,926 sqft,2,2,"S$ 2,699.78 psf",Condominium,1.295316,103.827096


In [4]:
# Report how many rows and columns remain after dropping missing values.
n_points, n_features = house.shape
print('Number of data points: ', n_points)
print("Number of features: ", n_features)

Number of data points:  8546
Number of features:  8


Next, we removed non-numeric values from the 'Built' variable as we need the numeric value to obtain a new variable (Building Age) 

In [5]:
# Flag rows whose 'Built' entry consists only of digits. Casting to str first
# keeps the check well-defined even if some entries were parsed as numbers.
built_is_year = house['Built'].astype(str).str.isnumeric()

# Keep only those rows (the preview shows entries such as '99-year Leasehold'
# in this column, which cannot be used as a construction year).
house = house.loc[built_is_year].copy()

# astype returns a new Series, so the result must be assigned back; otherwise
# 'Built' stays a string column and the conversion is lost.
house['Built'] = house['Built'].astype('int64')
house['Built']

0       1995
2       2004
4       2014
8       2019
9       2008
        ... 
9158    2006
9160    1986
9161    1999
9162    1999
9163    2010
Name: Built, Length: 7339, dtype: int64

Since we are only looking at apartments, we removed house types that are not <b>HDB </b> and <b>Condo</b>.

In [6]:
# Step 1: substring replacement, e.g. 'Executive Condominium' -> 'Executive Condo'.
type_labels = house['Type'].str.replace('Condominium', 'Condo')

# Step 2: whole-value replacement (Series.replace, not .str.replace), so
# 'HDB Apartment' is NOT touched here; only exact matches become 'Condo'.
type_labels = type_labels.replace(
    {'Apartment': 'Condo', 'Executive Condo': 'Condo', 'Walk-up': 'Condo'}
)

# Step 3: substring replacement for the public-housing label.
house['Type'] = type_labels.str.replace('HDB Apartment', 'HDB')

# Discard every row whose type is neither of the two categories we keep.
mask = ~house['Type'].isin(['HDB', 'Condo'])
house.drop(house.index[mask], inplace=True)

We stripped the 'S$' prefix, the 'psf' suffix and the thousands separator ',' from the price feature, then converted it to a float.

In [8]:
# Strip the 'S' prefix, the 'psf' suffix and thousands separators,
# e.g. 'S$ 1,760.39 psf' -> ' $ 1760.39  '.
price_text = house['price_sqft'].replace({'S':' ', 'psf':' ',',': ''}, regex=True)

# '$' is a regex end-of-string anchor, so ask for a literal match explicitly
# rather than relying on the default of `regex`, which differs between pandas
# versions.
price_text = price_text.str.replace('$', '', regex=False)

# Float parsing tolerates the padding spaces left by the substitutions above.
house['price_sqft'] = price_text.astype(float)

We dropped 'sqft', 'land' and the parentheses from the area feature so that it can be converted to a float.

In [9]:
# Remove the unit and the 'land' marker from the area strings.
area_text = house['area'].replace({'sqft':' ', 'land':' '}, regex=True)

# '(' and ')' are regex metacharacters (and invalid patterns on their own), so
# match them literally instead of depending on the default of `regex`, which
# differs between pandas versions.
for bracket in ('(', ')'):
    area_text = area_text.str.replace(bracket, '', regex=False)

# Float parsing tolerates the padding spaces left by the substitutions above.
house['area'] = area_text.astype(float)

In [10]:
# Renumber the rows 0..n-1 now that filtering is done; the old labels are discarded.
house = house.reset_index(drop=True)

# Report the final size of the cleaned dataset.
n_points, n_features = house.shape
print('Number of data points: ', n_points)
print("Number of features: ", n_features)

house.head()

Number of data points:  6771
Number of features:  8


Unnamed: 0,Built,area,bathroom,bedroom,price_sqft,Type,latitude,longitude
0,1995,1270.0,2,3,429.92,HDB,1.345383,103.746046
1,2004,1066.0,2,2,919.32,Condo,1.386702,103.743679
2,2014,926.0,2,2,2699.78,Condo,1.295316,103.827096
3,2008,668.0,1,1,1871.26,Condo,1.280772,103.85266
4,2000,1959.0,4,4,2118.43,Condo,1.313388,103.827361


In [11]:
# Persist the cleaned listings; the positional index is not written to the file.
cleaned_csv_path = '../data/data_cleaned.csv'
house.to_csv(cleaned_csv_path, index=False)