In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

In [29]:
from sklearn.model_selection import train_test_split
train_file=pd.read_csv("train.csv")
train, valid=train_test_split(train_file, train_size=0.8, shuffle=False)
print(train.shape, valid.shape)

(48000, 16) (12000, 16)


Drop the *date, furnished, elevation* data 

In [3]:
target=train['monthly_rent']
train=train.drop(['furnished','monthly_rent','elevation','rent_approval_date'],axis=1)
train.head()

Unnamed: 0,town,block,street_name,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region
0,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region
1,bedok,119,bedok north road,4-room,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region
2,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region
3,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region
4,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region


Reduce the flat_type data

In [4]:
train.flat_type=train.flat_type.str.replace(' ', '-')
train.flat_type.unique()

array(['3-room', '4-room', 'executive', '5-room', '2-room'], dtype=object)

Identify the numeric attributes and categorical attributes

In [6]:
dtypes = train.dtypes
cate_cols = []
num_cols = []
for col, dtype in dtypes.items():
    if dtype=='object':
        cate_cols.append(col)
    else:
        num_cols.append(col)
print(cate_cols)
print(num_cols)

['town', 'block', 'street_name', 'flat_type', 'flat_model', 'subzone', 'planning_area', 'region']
['floor_area_sqm', 'lease_commence_date', 'latitude', 'longitude']


Normalize the numeric attributes

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train.loc[:,num_cols])
X_numeric=scaler.transform(train.loc[:,num_cols])
X_numeric=pd.DataFrame(X_numeric, columns=num_cols)
X_numeric

Unnamed: 0,floor_area_sqm,lease_commence_date,latitude,longitude
0,0.182320,0.307692,0.397117,0.190935
1,0.320442,0.211538,0.320346,0.906328
2,0.182320,0.076923,0.331363,0.573551
3,0.635359,0.500000,0.534887,0.992773
4,0.187845,0.096154,0.268476,0.636829
...,...,...,...,...
47995,0.419890,0.653846,0.381059,0.048513
47996,0.309392,0.653846,0.640210,0.721853
47997,0.419890,0.730769,0.683927,0.823152
47998,0.138122,0.076923,0.094996,0.514189


For categorical attributes, if the number of unique values is less than max_n_cat, then use one_hot method, else encoding.

In [8]:
max_n_cat=7
onehot_cols=[]
orders={}
df_cat=train.loc[:,cate_cols]
print(df_cat.shape)
for n, c in df_cat.items():
    df_cat[n] = c.astype('category').cat.as_ordered()
    if n in orders:
        df_cat[n].cat.set_categories(self.orders[n], ordered=True, inplace=True)
    cats_count = len(df_cat[n].cat.categories)
    if cats_count > max_n_cat:
        if n in onehot_cols:
            onehot_cols.remove(n)
    elif n not in onehot_cols:
        onehot_cols.append(n)
print(onehot_cols)

(48000, 8)
['flat_type', 'region']


In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [10]:
class CatEncoder(BaseEstimator, TransformerMixin):
    def __init__(self,cols, max_n_cat=7, onehot_cols=[], orders={}):
        self.cols = cols
        self.onehot_cols=onehot_cols
        self.cats = {}
        self.max_n_cat = max_n_cat
        self.orders = orders
        
    def fit(self, X, y=None):
        df_cat =  X.loc[:, self.cols]
        for n,c in df_cat.items():
            df_cat[n] = c.astype('category').cat.as_ordered()
            if n in self.orders:
                df_cat[n].cat.set_categories(self.orders[n], ordered=True, inplace=True)
            cats_count = len(df_cat[n].cat.categories)
            if cats_count<=2 or cats_count>self.max_n_cat:
                self.cats[n] = df_cat[n].cat.categories
                if n in self.onehot_cols:
                    self.onehot_cols.remove(n)
            elif n not in self.onehot_cols:
                self.onehot_cols.append(n)

        print(self.onehot_cols)
        return self
    
    def transform(self, df, y=None):
        X = df.loc[:, self.cols]
        for col in self.cats:
            X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
            X.loc[:,col] = X[col].cat.codes
        if len(self.onehot_cols):
            df_1h = pd.get_dummies(X[self.onehot_cols], dummy_na=True)
            df_drop=X.drop(self.onehot_cols,axis=1)
            return pd.concat([df_drop, df_1h], axis=1)

        return X
    
cat_pipeline = Pipeline([
    ('cat_encoder', CatEncoder(cols=cate_cols))
])
X_cate = cat_pipeline.fit_transform(train)
X_cate = X_cate.drop(['flat_type_nan','region_nan'],axis=1)
X_cate

['flat_type', 'region']


  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes


Unnamed: 0,town,block,street_name,flat_model,subzone,planning_area,flat_type_2-room,flat_type_3-room,flat_type_4-room,flat_type_5-room,flat_type_executive,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,12,539,238,10,149,12,0,1,0,0,0,0,0,0,0,1
1,1,86,572,10,8,1,0,0,1,0,0,0,1,0,0,0
2,23,184,822,5,128,26,0,1,0,0,0,1,0,0,0,0
3,16,524,324,3,91,18,0,0,0,0,1,0,1,0,0,0
4,14,929,471,5,12,14,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,13,1698,251,5,61,13,0,0,0,1,0,0,0,0,0,1
47996,20,787,544,7,4,23,0,0,1,0,0,0,0,0,1,0
47997,17,1758,879,11,137,19,0,0,0,1,0,0,0,0,1,0
47998,4,362,759,15,24,4,0,1,0,0,0,1,0,0,0,0


In [11]:
X_cate=(X_cate-X_cate[:].min())/(X_cate[:].max()-X_cate[:].min())

In [12]:
train=pd.concat([X_numeric, X_cate],axis=1)
print(train.shape)

(48000, 20)


In [13]:
# target=(target-target.min())/(target.max()-target.min())

In [14]:
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(oob_score=True, random_state=3, n_jobs=-1)
params ={
    'n_estimators': 200, # [200, 300, 400, 500],
    'min_samples_leaf': 3, #[1, 2, 3, 5, 10, 25],
    'max_features': 0.5, #[0.5, 'sqrt', 'log2'],
    'max_depth': 10, # [5, 6, 7, 8, 10, 15, 20],
    'min_samples_split': 3, #[2, 3, 4]
}

model.set_params(**params)

In [15]:
from sklearn.metrics import mean_squared_error
model.fit(train,target)

In [16]:
predict = model.predict(train)
MSE=mean_squared_error(target.to_numpy(),predict)
print(MSE)

365204.67478277633


Valid

In [30]:
valid_target=valid["monthly_rent"]
valid=valid.drop(['furnished','elevation','rent_approval_date','monthly_rent'],axis=1)
valid.flat_type=valid.flat_type.str.replace(' ', '-')

In [31]:
valid_numeric=scaler.transform(valid.loc[:,num_cols])
valid_numeric=pd.DataFrame(valid_numeric, columns=num_cols)
valid_numeric

Unnamed: 0,floor_area_sqm,lease_commence_date,latitude,longitude
0,0.198895,0.865385,0.355300,0.572942
1,0.464088,0.576923,0.887409,0.388899
2,0.182320,0.346154,0.318658,0.800867
3,0.265193,0.173077,0.382414,0.610723
4,0.182320,0.250000,0.252740,0.295855
...,...,...,...,...
11995,0.182320,0.230769,0.512453,0.604032
11996,0.270718,1.000000,0.086308,0.486991
11997,0.486188,0.403846,0.453605,0.898429
11998,0.491713,0.192308,0.260293,0.925491


In [32]:
valid_cate = cat_pipeline.transform(valid)
valid_cate = valid_cate.drop(['flat_type_nan','region_nan'],axis=1)
valid_cate=(valid_cate-valid_cate[:].min())/(valid_cate[:].max()-valid_cate[:].min())
valid_cate=valid_cate.reset_index(drop=True)
valid_cate

  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes


Unnamed: 0,town,block,street_name,flat_model,subzone,planning_area,flat_type_2-room,flat_type_3-room,flat_type_4-room,flat_type_5-room,flat_type_executive,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,0.92,0.061557,0.264624,0.222222,0.847682,0.928571,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.96,0.000000,0.953575,0.277778,0.562914,0.964286,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.04,0.012708,0.537604,0.555556,0.410596,0.035714,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.92,0.181890,0.772516,0.277778,0.503311,0.928571,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.36,0.407069,0.623955,0.555556,0.225166,0.285714,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.00,0.469420,0.012071,0.555556,0.205298,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11996,0.16,0.973392,0.671309,0.388889,0.344371,0.142857,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
11997,0.88,0.906275,0.905292,0.277778,0.781457,0.892857,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
11998,0.04,0.737490,0.043640,0.833333,0.066225,0.035714,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
valid=pd.concat([valid_numeric, valid_cate],axis=1)
print(valid.shape)

(12000, 20)


In [34]:
predict = model.predict(valid)
MSE=mean_squared_error(valid_target.to_numpy(),predict)
print(MSE)

388665.4093917575


Test

In [35]:
test_file=pd.read_csv('./test.csv')
test_file.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,yes,1984,1.358411,103.891722,0.0,lorong ah soo,hougang,north-east region
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,yes,1999,1.446343,103.820817,0.0,sembawang central,sembawang,north region
2,2023-07,clementi,708,Clementi West Street 2,4-room,new generation,91.0,yes,1980,1.305719,103.762168,0.0,clementi west,clementi,west region
3,2021-08,jurong east,351,Jurong East Street 31,3 room,model a,74.0,yes,1986,1.344832,103.730778,0.0,yuhua west,jurong east,west region
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,yes,1983,1.345437,103.735241,0.0,yuhua west,jurong east,west region


In [36]:
test_file.shape

(30000, 15)

In [37]:
# target=test_file['monthly_rent']
test=test_file.drop(['furnished','elevation','rent_approval_date'],axis=1)
test.head()

Unnamed: 0,town,block,street_name,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region
0,hougang,245,hougang street 22,5-room,improved,121.0,1984,1.358411,103.891722,lorong ah soo,hougang,north-east region
1,sembawang,316,sembawang vista,4-room,model a,100.0,1999,1.446343,103.820817,sembawang central,sembawang,north region
2,clementi,708,Clementi West Street 2,4-room,new generation,91.0,1980,1.305719,103.762168,clementi west,clementi,west region
3,jurong east,351,Jurong East Street 31,3 room,model a,74.0,1986,1.344832,103.730778,yuhua west,jurong east,west region
4,jurong east,305,jurong east street 32,5-room,improved,121.0,1983,1.345437,103.735241,yuhua west,jurong east,west region


In [38]:
test.flat_type=test.flat_type.str.replace(' ', '-')
test.flat_type.unique()

array(['5-room', '4-room', '3-room', 'executive', '2-room'], dtype=object)

In [39]:
test_X_numeric=scaler.transform(test.loc[:,num_cols])
test_X_numeric=pd.DataFrame(test_X_numeric, columns=num_cols)
test_X_numeric

Unnamed: 0,floor_area_sqm,lease_commence_date,latitude,longitude
0,0.480663,0.326923,0.471535,0.738302
1,0.364641,0.615385,0.942534,0.484785
2,0.314917,0.250000,0.189292,0.275091
3,0.220994,0.365385,0.398801,0.162860
4,0.480663,0.307692,0.402037,0.178815
...,...,...,...,...
29995,0.320442,0.865385,0.714367,0.803235
29996,0.309392,0.711538,0.994866,0.473702
29997,0.187845,0.250000,0.551705,0.600007
29998,0.607735,0.596154,0.892415,0.407203


In [40]:
X_test_cate = cat_pipeline.transform(test)
X_test_cate = X_test_cate.drop(['flat_type_nan','region_nan'],axis=1)
X_test_cate

  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes
  X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
  X.loc[:,col] = X[col].cat.codes


Unnamed: 0,town,block,street_name,flat_model,subzone,planning_area,flat_type_2-room,flat_type_3-room,flat_type_4-room,flat_type_5-room,flat_type_executive,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,11,518,740,5,77,11,0,0,0,1,0,0,0,0,1,0
1,19,827,906,7,102,22,0,0,1,0,0,0,0,1,0,0
2,9,2001,141,10,35,8,0,0,1,0,0,0,0,0,0,1
3,12,956,239,7,150,12,0,1,0,0,0,0,0,0,0,1
4,12,774,777,5,150,12,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,17,1663,701,7,137,19,0,0,1,0,0,0,0,0,1,0
29996,19,1337,536,7,104,22,0,0,1,0,0,0,0,1,0,0
29997,0,1484,551,10,26,0,0,1,0,0,0,0,0,0,1,0
29998,24,1790,1036,6,140,27,0,0,0,0,1,0,0,1,0,0


In [41]:
X_test_cate=(X_test_cate-X_test_cate[:].min())/(X_test_cate[:].max()-X_test_cate[:].min())
X_test_cate

Unnamed: 0,town,block,street_name,flat_model,subzone,planning_area,flat_type_2-room,flat_type_3-room,flat_type_4-room,flat_type_5-room,flat_type_executive,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,0.44,0.206116,0.688662,0.277778,0.509934,0.392857,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.76,0.328832,0.842937,0.388889,0.675497,0.785714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.36,0.795075,0.131970,0.555556,0.231788,0.285714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.48,0.380064,0.223048,0.388889,0.993377,0.428571,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.48,0.307784,0.723048,0.277778,0.993377,0.428571,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.68,0.660842,0.652416,0.388889,0.907285,0.678571,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
29996,0.76,0.531374,0.499071,0.388889,0.688742,0.785714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
29997,0.00,0.589754,0.513011,0.555556,0.172185,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
29998,0.96,0.711279,0.963755,0.333333,0.927152,0.964286,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [42]:
test=pd.concat([test_X_numeric,X_test_cate],axis=1)
test.shape

(30000, 20)

In [43]:
result=model.predict