In [2]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
from geopy.distance import distance

# Data Cleaning 

Step 1: Drop the columns that are not useful and rename the remaining variables  
Step 2: Remove rows with missing values and check the dimensions of the dataset 

In [3]:
# Read the raw listings, discard the columns that are not useful, and shorten
# the remaining column names -- all in one chained expression.
data = (
    pd.read_csv('data/property_loc.csv', delimiter=',')
    .drop(['Unnamed: 0', 'Address', 'Building Name'], axis=1)
    .rename(columns={
        "Built On": "Built",
        "Land Area": "Land_Area",
        "Number of Bathroom": "No_of_bathrm",
        "Number of Bedroom": "No_of_bedrm",
        "Price per sqft": "Price_per_sqft",
    })
)

# Keep only the rows that have a value in every column.
house = data.dropna()
print("The dimension of the data is : ", house.shape)
house.head()

The dimension of the data is :  (8546, 10)


Unnamed: 0,Built,Land_Area,No_of_bathrm,No_of_bedrm,Price,Price_per_sqft,Tenure,Type,latitude,longitude
0,1995,1270 sqft,2,3,"S$ 546,000",S$ 429.92 psf,99-year Leasehold,HDB Apartment,1.345383,103.746046
1,99-year Leasehold,409 sqft,1,1,"S$ 720,000","S$ 1,760.39 psf",Condominium,New Project: 2020,1.318261,103.879391
2,2004,1066 sqft,2,2,"S$ 980,000",S$ 919.32 psf,99-year Leasehold,Condominium,1.386702,103.743679
3,99-year Leasehold,680 sqft,2,2,"S$ 880,000","S$ 1,294.12 psf",Condominium,New Project: 2019,1.39644,103.875251
4,2014,926 sqft,2,2,"S$ 2,500,000","S$ 2,699.78 psf",Freehold,Condominium,1.295316,103.827096


Step 3: Remove rows that do not have a numeric value for the *Built* variable, as the build year is needed to determine the age of the building 

In [31]:
# True where the 'Built' entry consists only of digits (i.e. looks like a year).
a = house['Built'].str.isnumeric()

# Collect the index labels of the rows flagged False and drop them by label.
rows_without_year = house.index[a.eq(False)]
house = house.drop(index=rows_without_year)
print("The dimension of the data is : ", house.shape)

(7339, 10)


Step 4: Remove uncommon house types such as shophouses and conservation houses  
These types are removed because each has only one or two records, which is not sufficient to train the model 

In [32]:
# Remove shophouses and other uncommon house types.
#
# A vectorised membership test replaces the original row-by-row loop, which
# filled a float array with True/False and then compared it to False.
# The index is still reset first so the surviving rows carry the same labels
# as before; .copy() gives an independent frame so later column assignments
# do not act on a view of the previous one.
house = house.reset_index(drop=True)
rare_types = ['Shophouse', 'Conservation House']
house = house.loc[~house['Type'].isin(rare_types)].copy()
print(house.shape)

(7336, 10)


Step 5: Strip the 'S', '$', 'psf' and the thousands separators (',') so that *Price* and *Price_per_sqft* can be converted to float values

In [33]:
# Convert the price columns from strings such as "S$ 1,760.39 psf" to floats.
cols_to_check = ['Price', 'Price_per_sqft']
for col in cols_to_check:
    house[col] = (
        house[col]
        # One explicit-regex pass removes 'S', '$', ',' and 'psf'. Inside the
        # character class '$' is a literal; a bare '$' pattern would be an
        # end-of-string anchor if interpreted as a regex, so the regex mode is
        # stated explicitly instead of relying on the library default.
        .str.replace(r'[S$,]|psf', '', regex=True)
        .str.strip()  # remove the spaces left around the number
        .astype(float)
    )
house.head()


Unnamed: 0,Built,Land_Area,No_of_bathrm,No_of_bedrm,Price,Price_per_sqft,Tenure,Type,latitude,longitude
0,1995,1270 sqft,2,3,546000.0,429.92,99-year Leasehold,HDB Apartment,1.345383,103.746046
1,2004,1066 sqft,2,2,980000.0,919.32,99-year Leasehold,Condominium,1.386702,103.743679
2,2014,926 sqft,2,2,2500000.0,2699.78,Freehold,Condominium,1.295316,103.827096
3,2019,"4425 sqft, 1940 sqft (land)",4,4,3600000.0,1855.67,999-year Leasehold,Terraced House,1.33236,103.94614
4,2008,668 sqft,1,1,1250000.0,1871.26,99-year Leasehold,Apartment,1.280772,103.85266


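Step 6: Some *Land_Area* entries contain two values (the size of the house followed by the size of the land). For those entries we remove the first value (size of the house) and keep the land size; entries with a single value are left unchanged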

In [34]:
# Give the rows a fresh 0..n-1 index (later cells look rows up by position).
house = house.reset_index(drop=True)

# Keep only the text after the last comma, e.g.
# "4425 sqft, 1940 sqft (land)" -> " 1940 sqft (land)"; entries without a
# comma are unchanged. The vectorised .str accessor replaces the original
# per-row .loc assignment loop.
house['Land_Area'] = house['Land_Area'].str.split(',').str[-1]
house.head()

Unnamed: 0,Built,Land_Area,No_of_bathrm,No_of_bedrm,Price,Price_per_sqft,Tenure,Type,latitude,longitude
0,1995,1270 sqft,2,3,546000.0,429.92,99-year Leasehold,HDB Apartment,1.345383,103.746046
1,2004,1066 sqft,2,2,980000.0,919.32,99-year Leasehold,Condominium,1.386702,103.743679
2,2014,926 sqft,2,2,2500000.0,2699.78,Freehold,Condominium,1.295316,103.827096
3,2019,1940 sqft (land),4,4,3600000.0,1855.67,999-year Leasehold,Terraced House,1.33236,103.94614
4,2008,668 sqft,1,1,1250000.0,1871.26,99-year Leasehold,Apartment,1.280772,103.85266


Step 7: Drop 'sqft', 'land' and the parentheses so that *Land_Area* can be converted to a float value

In [35]:
# Convert Land_Area from strings such as " 1940 sqft (land)" to floats.
house['Land_Area'] = (
    house['Land_Area']
    # One explicit-regex pass removes 'sqft', 'land' and both parentheses.
    # The parentheses sit inside a character class so they are literals; a
    # bare '(' pattern is not a valid regular expression, so the regex mode
    # is stated explicitly instead of relying on the library default.
    .str.replace(r'sqft|land|[()]', '', regex=True)
    .str.strip()  # remove the spaces left around the number
    .astype(float)
)
house.head()

Unnamed: 0,Built,Land_Area,No_of_bathrm,No_of_bedrm,Price,Price_per_sqft,Tenure,Type,latitude,longitude
0,1995,1270.0,2,3,546000.0,429.92,99-year Leasehold,HDB Apartment,1.345383,103.746046
1,2004,1066.0,2,2,980000.0,919.32,99-year Leasehold,Condominium,1.386702,103.743679
2,2014,926.0,2,2,2500000.0,2699.78,Freehold,Condominium,1.295316,103.827096
3,2019,1940.0,4,4,3600000.0,1855.67,999-year Leasehold,Terraced House,1.33236,103.94614
4,2008,668.0,1,1,1250000.0,1871.26,99-year Leasehold,Apartment,1.280772,103.85266


Step 8: Remove the rows with an unknown tenure 

In [36]:
# Keep every row whose tenure is known.
#
# The original loop read house.Tenure[i] for i in range(len(house)), which
# only works while the index labels happen to be exactly 0..n-1. A boolean
# filter gives the same result without depending on the index, and .copy()
# returns an independent frame for the assignments that follow.
house = house.loc[house['Tenure'] != 'Unknown Tenure'].copy()

Step 9: Relabel the house types as Condo or HDB for easier classification 

In [38]:
# Collapse the detailed property types into 'Condo' / 'HDB'.
# The order and the kind of each replacement matter:
#   1. substring replace: 'Condominium' -> 'Condo' wherever it occurs
#      (so 'Executive Condominium' becomes 'Executive Condo');
#   2. whole-value replace (Series.replace): only cells that are exactly
#      'Apartment', 'Executive Condo' or 'Walk-up' become 'Condo', which
#      leaves 'HDB Apartment' untouched;
#   3. substring replace: 'HDB Apartment' -> 'HDB'.
house['Type'] = (
    house['Type']
    .str.replace('Condominium', 'Condo')
    .replace({
        'Apartment': 'Condo',
        'Executive Condo': 'Condo',
        'Walk-up': 'Condo',
    })
    .str.replace('HDB Apartment', 'HDB')
)

house.head()

Unnamed: 0,Built,Land_Area,No_of_bathrm,No_of_bedrm,Price,Price_per_sqft,Tenure,Type,latitude,longitude
0,1995,1270.0,2,3,546000.0,429.92,99-year Leasehold,HDB,1.345383,103.746046
1,2004,1066.0,2,2,980000.0,919.32,99-year Leasehold,Condo,1.386702,103.743679
2,2014,926.0,2,2,2500000.0,2699.78,Freehold,Condo,1.295316,103.827096
3,2019,1940.0,4,4,3600000.0,1855.67,999-year Leasehold,Terraced House,1.33236,103.94614
4,2008,668.0,1,1,1250000.0,1871.26,99-year Leasehold,Condo,1.280772,103.85266


Step 10: We decided to work only with the Condo and HDB housing types, hence we dropped the landed properties 

In [39]:
# Keep only the Condo and HDB listings; everything else (landed property) is
# dropped. The vectorised isin() test replaces the original row-by-row loop.
# The index is reset first so Final carries the same labels as before, and
# .copy() makes Final independent of `house` for later modification.
house = house.reset_index(drop=True)
Final = house.loc[house['Type'].isin(['HDB', 'Condo'])].copy()

In [28]:
# Drop the Tenure column and write the cleaned data to disk.
# Re-binding with errors='ignore' (instead of an in-place drop) lets this cell
# be re-run without a KeyError once 'Tenure' has already been removed.
Final = Final.drop(columns=['Tenure'], errors='ignore')

# NOTE: to_csv returns None when it is given a file path, so `whole_data` is
# None after this line; the name is kept only so existing references resolve.
# The index is still written to the file, as before.
whole_data = Final.to_csv('data/cleaned_data.csv')