In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [2]:
# Read the KL Property Listing dataset csv file downloaded from kaggle

df = pd.read_csv('data_kaggle.csv')
df.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,"KLCC, Kuala Lumpur","RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,"Damansara Heights, Kuala Lumpur","RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,"Dutamas, Kuala Lumpur","RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,"Cheras, Kuala Lumpur",,,,,,,
4,"Bukit Jalil, Kuala Lumpur","RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished


In [3]:
# Check the total numbers of records and features that we have

df.shape

(53883, 8)

In [4]:
# Check the total of null in each features

df.isnull().sum()

Location             0
Price              248
Rooms             1706
Bathrooms         2013
Car Parks        17567
Property Type       25
Size              1063
Furnishing        6930
dtype: int64

In [5]:
# Drop the records that have null in features Price, Property Type, Size and Furnishing

df1 = df.dropna(subset = ['Price', 'Property Type', 'Size', 'Furnishing'])
df1.shape

(46044, 8)

In [6]:
df1.isnull().sum()

Location             0
Price                0
Rooms              595
Bathrooms          748
Car Parks        13858
Property Type        0
Size                 0
Furnishing           0
dtype: int64

In [7]:
pd.set_option('display.max_rows', None)

In [8]:
type(df1.Size[1])

str

In [9]:
df1['Size'][0].split(":")

['Built-up ', ' 1,335 sq. ft.']

In [10]:
# As there is two types of area in the size columns, extract the types of area and put it in a new columns

df1['Type of Area'] = df1['Size'].apply(lambda x : x.split(':')[0])
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,Type of Area
0,"KLCC, Kuala Lumpur","RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,Built-up
1,"Damansara Heights, Kuala Lumpur","RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,Land area
2,"Dutamas, Kuala Lumpur","RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,Built-up
4,"Bukit Jalil, Kuala Lumpur","RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,Built-up
5,"Taman Tun Dr Ismail, Kuala Lumpur","RM 5,350,000",4+2,5.0,4.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished,Land area


In [11]:
# Extract the area values from the Size column

df1['Area'] = df1['Size'].apply(lambda x : x.split(':')[1])
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,Type of Area,Area
0,"KLCC, Kuala Lumpur","RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,Built-up,"1,335 sq. ft."
1,"Damansara Heights, Kuala Lumpur","RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,Land area,6900 sq. ft.
2,"Dutamas, Kuala Lumpur","RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,Built-up,"1,875 sq. ft."
4,"Bukit Jalil, Kuala Lumpur","RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,Built-up,"1,513 sq. ft."
5,"Taman Tun Dr Ismail, Kuala Lumpur","RM 5,350,000",4+2,5.0,4.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished,Land area,7200 sq. ft.


In [12]:
df1['Area'] = df1['Area'].apply(lambda x :x.split(" ")[-3])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
df2 = df1.copy()

In [14]:
df2.groupby('Area')['Area'].count()

Area
                             20
(1500)                        1
(40x80)                       1
(50X80)                       1
-                             8
0                            87
1,000                       175
1,001                        54
1,002                         9
1,003                        73
1,004                         6
1,005                        10
1,006                        17
1,007                        53
1,008                        20
1,009                        58
1,010                        80
1,011                        24
1,012                        47
1,013                         5
1,014                         1
1,015                        37
1,016                        13
1,017                         9
1,018                        80
1,019                        79
1,020                        44
1,021                        27
1,022                        15
1,023                        66
1,024                        24
1,0

In [15]:
# Create afunction to remove punctuations and letters from the data in Area column

import re

def remove_special_char(area):
    spec_char =re.compile(r'[-,.~'"'"'";#&$*()`+;/]')
    letters = re.compile(r'[a-wyzA-WYZ]')
    area = spec_char.sub('', area)
    area = letters.sub('', area)
    
    return area

In [16]:
df2["Area"] = df2['Area'].apply(remove_special_char)

In [17]:
df2.groupby('Area')['Area'].count()

Area
                    87
0                   87
100                 23
1000               187
10000               72
100000               1
1001                58
1002                 9
1003                80
1004                 7
1005                10
10059                1
1006                17
10060                2
10062                1
1007                54
1008                21
10085                1
1009                65
1010                84
10100                2
10103                1
1011                28
1012                49
1013                 6
1014                 2
1015                40
10150                2
10157                7
1016                13
10160                1
10161                1
1017                10
1018                87
10183                2
1019                83
10197                2
102                  1
1020                48
10200                3
10204                1
1021                27
1022                17
10220 

In [18]:
# Create a function to convert the data from string into number

def convert_to_float(x):
    tokens = x.lower().split('x')
    if len(tokens) == 2:
        try:
            return float(tokens[0]) * float(tokens[1])
        except:
            return None
    try:
        return float(x)
    except:
        return None

In [19]:
df2["Area"] = df2['Area'].apply(convert_to_float)

In [20]:
df2.isnull().sum()

Location             0
Price                0
Rooms              595
Bathrooms          748
Car Parks        13858
Property Type        0
Size                 0
Furnishing           0
Type of Area         0
Area               110
dtype: int64

In [21]:
# Drop the null values that is a byprouct of the previous 

df3 = df2.dropna(subset = ['Area'])
df3.shape

(45934, 10)

In [22]:
df3.groupby('Area')['Area'].count()

Area
0.000000e+00     88
1.100000e+01      1
1.700000e+01      1
1.900000e+01      1
2.200000e+01      1
3.500000e+01      1
4.000000e+01      1
4.500000e+01      3
5.000000e+01      4
5.400000e+01      4
5.500000e+01      9
6.000000e+01     13
6.100000e+01      1
6.500000e+01     40
6.600000e+01      3
7.000000e+01     57
7.200000e+01      2
7.500000e+01    110
7.600000e+01      1
7.700000e+01      1
7.800000e+01      2
8.000000e+01     91
8.200000e+01      4
8.500000e+01     41
8.600000e+01      5
8.700000e+01      8
8.800000e+01      1
9.000000e+01     25
9.100000e+01      1
9.200000e+01      1
9.300000e+01      2
9.500000e+01      8
1.000000e+02     23
1.020000e+02      1
1.030000e+02      3
1.040000e+02      2
1.110000e+02      1
1.150000e+02      2
1.200000e+02      1
1.280000e+02      1
1.300000e+02      2
1.430000e+02      1
1.540000e+02      1
1.550000e+02      1
1.890000e+02      1
2.000000e+02      1
2.250000e+02      1
2.370000e+02      1
2.500000e+02     13
2.560000e+02   

In [23]:
# Remove the outliers in the value of area assuming that typical area for housing in KL ranging between 50 sq ft to 20000 sq ft

df4 = df3[df3.Area <= 20000]
df5 = df4[df3.Area >= 50]
df5.groupby('Area')['Area'].count()

  after removing the cwd from sys.path.


Area
50.0         4
54.0         4
55.0         9
60.0        13
61.0         1
65.0        40
66.0         3
70.0        57
72.0         2
75.0       110
76.0         1
77.0         1
78.0         2
80.0        91
82.0         4
85.0        41
86.0         5
87.0         8
88.0         1
90.0        25
91.0         1
92.0         1
93.0         2
95.0         8
100.0       23
102.0        1
103.0        3
104.0        2
111.0        1
115.0        2
120.0        1
128.0        1
130.0        2
143.0        1
154.0        1
155.0        1
189.0        1
200.0        1
225.0        1
237.0        1
250.0       13
256.0        1
279.0        1
300.0        1
304.0        1
322.0        1
325.0        2
330.0        1
350.0        1
351.0        5
361.0        1
375.0        1
380.0        6
398.0        1
400.0        1
406.0        3
409.0        1
410.0       15
411.0       12
413.0        1
417.0        1
418.0        2
420.0       11
430.0       12
431.0        1
434.0        1
435.0

In [24]:
df5.groupby('Car Parks')['Car Parks'].count()

Car Parks
1.0     12119
2.0     13393
3.0      3385
4.0      1782
5.0       459
6.0       541
7.0        53
8.0        81
9.0         4
10.0       58
11.0        3
12.0        4
13.0        6
15.0        6
16.0        1
18.0        1
20.0        2
24.0        1
28.0        4
30.0        1
Name: Car Parks, dtype: int64

In [25]:
# Drop the car parks column as it is hard to determine exact car park numbers for landed house and the data is unreliable

df6 = df5.drop(['Car Parks'], axis = 'columns')
df6.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Property Type,Size,Furnishing,Type of Area,Area
0,"KLCC, Kuala Lumpur","RM 1,250,000",2+1,3.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,Built-up,1335.0
1,"Damansara Heights, Kuala Lumpur","RM 6,800,000",6,7.0,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,Land area,6900.0
2,"Dutamas, Kuala Lumpur","RM 1,030,000",3,4.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,Built-up,1875.0
4,"Bukit Jalil, Kuala Lumpur","RM 900,000",4+1,3.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,Built-up,1513.0
5,"Taman Tun Dr Ismail, Kuala Lumpur","RM 5,350,000",4+2,5.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished,Land area,7200.0


In [26]:
df6.groupby('Rooms')['Rooms'].count()

Rooms
1            2076
1+1          1054
1+2             5
10             20
10+             6
11              3
11+1            1
12              7
12+             3
13              1
13+             3
14              1
15+             1
16              1
18              1
2            4688
2+1          1733
2+2           123
20 Above        4
3           11952
3+1          7162
3+2           423
4            4253
4+1          3857
4+2           389
5            1764
5+1          2105
5+2           352
6             792
6+             68
6+1           685
7             309
7+             25
7+1           201
8             100
8+             11
8+1            55
9              33
9+              2
9+1             5
Studio        760
Name: Rooms, dtype: int64

In [27]:
# Convert the Studio into 0 value as studio apartment indicates that the hous don't have a bedroom

df6['Rooms'] = df6["Rooms"].apply(lambda x : 0 if x == 'Studio' else x)

In [28]:
df6.groupby('Rooms')['Rooms'].count()

Rooms
0             760
1            2076
1+1          1054
1+2             5
10             20
10+             6
11              3
11+1            1
12              7
12+             3
13              1
13+             3
14              1
15+             1
16              1
18              1
2            4688
2+1          1733
2+2           123
20 Above        4
3           11952
3+1          7162
3+2           423
4            4253
4+1          3857
4+2           389
5            1764
5+1          2105
5+2           352
6             792
6+             68
6+1           685
7             309
7+             25
7+1           201
8             100
8+             11
8+1            55
9              33
9+              2
9+1             5
Name: Rooms, dtype: int64

In [29]:
# COnvert all data in the Rooms column to string to avoid errors for the next processing function

df6["Rooms"] = df6["Rooms"].apply(lambda x : str(x))

In [30]:
# Create a function to convert the plus room and add it to the total no. of rooms
# As the additional room is typically smaller than other rooms, we made the value as half of the normal room

def convert_plus_to_total(x):
    if "+" in x:
        tokens = x.split('+')
        if tokens[1] != "":
            return float(tokens[0]) + float(tokens[1])/2
    try:
        return float(x)
    except:
        return None

In [31]:
df6["Rooms"] = df6["Rooms"].apply(convert_plus_to_total)

In [32]:
df6.groupby('Rooms')['Rooms'].count()

Rooms
0.0       760
1.0      2076
1.5      1054
2.0      4693
2.5      1733
3.0     12075
3.5      7162
4.0      4676
4.5      3857
5.0      2153
5.5      2105
6.0      1144
6.5       685
7.0       309
7.5       201
8.0       100
8.5        55
9.0        33
9.5         5
10.0       20
11.0        3
11.5        1
12.0        7
13.0        1
14.0        1
16.0        1
18.0        1
Name: Rooms, dtype: int64

In [33]:
df7 = df6[df6.Rooms <= 10]

In [34]:
df7.groupby('Bathrooms')['Bathrooms'].count()

Bathrooms
1.0      3957
2.0     18196
3.0      8238
4.0      5844
5.0      4202
6.0      2835
7.0       875
8.0       335
9.0       116
10.0       40
11.0        5
12.0       10
15.0        1
20.0        5
Name: Bathrooms, dtype: int64

In [35]:
df8 = df7[df7.Bathrooms <= 10]

In [36]:
df8.isnull().sum()

Location         0
Price            0
Rooms            0
Bathrooms        0
Property Type    0
Size             0
Furnishing       0
Type of Area     0
Area             0
dtype: int64

In [37]:
df8.shape

(44638, 9)

In [38]:
len(df8['Property Type'].unique())

93

In [39]:
# Create a list that contains all the locations that are under 25 

location_total = df8.groupby('Location')['Location'].count().sort_values(ascending = False)
location_total_less_than_25 = location_total[location_total <= 25]
len(location_total_less_than_25)

39

In [40]:
# Reduce the data by transforming all the location that has less than 25 count into "Other"

df8['Location'] = df8['Location'].apply(lambda x : 'Other' if x in location_total_less_than_25 else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [41]:
df8['Location'] = df8['Location'].apply(lambda x : x.split(',')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
len(df8['Location'].unique())

57

In [43]:
len(df8['Furnishing'].unique())

4

In [44]:
# Create dummy variable for these categorical features for machine learning purposes

dummies_location = pd.get_dummies(df8.Location)
dummies_type_pro = pd.get_dummies(df8['Property Type'])
dummies_furnish = pd.get_dummies(df8.Furnishing)

In [45]:
dummies_location.head()

Unnamed: 0,Ampang,Ampang Hilir,Bandar Damai Perdana,Bandar Menjalara,Bandar Tasik Selatan,Bangsar,Bangsar South,Batu Caves,Brickfields,Bukit Bintang,...,Setiawangsa,Sri Hartamas,Sri Petaling,Sungai Besi,Sunway SPK,Taman Desa,Taman Melawati,Taman Tun Dr Ismail,Titiwangsa,Wangsa Maju
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [46]:
dummies_furnish.head()

Unnamed: 0,Fully Furnished,Partly Furnished,Unfurnished,Unknown
0,1,0,0,0
1,0,1,0,0
2,0,1,0,0
4,0,1,0,0
5,0,1,0,0


In [47]:
dummies_type_pro.head()

Unnamed: 0,1-sty Terrace/Link House,1-sty Terrace/Link House (Corner),1-sty Terrace/Link House (EndLot),1-sty Terrace/Link House (Intermediate),1.5-sty Terrace/Link House,1.5-sty Terrace/Link House (Corner),1.5-sty Terrace/Link House (EndLot),1.5-sty Terrace/Link House (Intermediate),2-sty Terrace/Link House,2-sty Terrace/Link House (Corner),...,Serviced Residence (Intermediate),Serviced Residence (Penthouse),Serviced Residence (SOHO),Serviced Residence (Studio),Serviced Residence (Triplex),Townhouse,Townhouse (Corner),Townhouse (Duplex),Townhouse (EndLot),Townhouse (Intermediate)
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
df8.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Property Type,Size,Furnishing,Type of Area,Area
0,KLCC,"RM 1,250,000",2.5,3.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,Built-up,1335.0
1,Damansara Heights,"RM 6,800,000",6.0,7.0,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,Land area,6900.0
2,Dutamas,"RM 1,030,000",3.0,4.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,Built-up,1875.0
4,Bukit Jalil,"RM 900,000",4.5,3.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,Built-up,1513.0
5,Taman Tun Dr Ismail,"RM 5,350,000",5.0,5.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished,Land area,7200.0


In [49]:
# Combine the dummy variable to the original dataframe and drop one column from each dummy variable to avoid dummy variale trap

df9 = pd.concat([df8.drop(['Size', 'Type of Area'], axis= 'columns'), dummies_location.drop(['Other'], axis = 'columns'),
                 dummies_type_pro.drop(['Townhouse'], axis = 'columns'), dummies_furnish.drop(['Unknown'], axis = 'columns')], axis = 'columns')
df9.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Property Type,Furnishing,Area,Ampang,Ampang Hilir,Bandar Damai Perdana,...,Serviced Residence (SOHO),Serviced Residence (Studio),Serviced Residence (Triplex),Townhouse (Corner),Townhouse (Duplex),Townhouse (EndLot),Townhouse (Intermediate),Fully Furnished,Partly Furnished,Unfurnished
0,KLCC,"RM 1,250,000",2.5,3.0,Serviced Residence,Fully Furnished,1335.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Damansara Heights,"RM 6,800,000",6.0,7.0,Bungalow,Partly Furnished,6900.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Dutamas,"RM 1,030,000",3.0,4.0,Condominium (Corner),Partly Furnished,1875.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Bukit Jalil,"RM 900,000",4.5,3.0,Condominium (Corner),Partly Furnished,1513.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,Taman Tun Dr Ismail,"RM 5,350,000",5.0,5.0,Bungalow,Partly Furnished,7200.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [50]:
# Process the target data which is Price so that it will be converted to numerical

df9["Price"] = df9["Price"].apply(lambda x : int(x.strip(" ")[2:].replace(',','')))

In [51]:
df9.shape

(44638, 158)

In [52]:
# Remove the outliers in the Price Data

df10 = df9[df9.Price > 100000]
df11 = df10[df10.Price <= 10000000]
df11.shape

(43824, 158)

In [53]:
# Set features and targets as a preparation for the modelling

x = df11.drop(['Location', 'Price', 'Property Type', 'Furnishing'], axis = 'columns')
y = df11.Price

In [54]:
x.head()

Unnamed: 0,Rooms,Bathrooms,Area,Ampang,Ampang Hilir,Bandar Damai Perdana,Bandar Menjalara,Bandar Tasik Selatan,Bangsar,Bangsar South,...,Serviced Residence (SOHO),Serviced Residence (Studio),Serviced Residence (Triplex),Townhouse (Corner),Townhouse (Duplex),Townhouse (EndLot),Townhouse (Intermediate),Fully Furnished,Partly Furnished,Unfurnished
0,2.5,3.0,1335.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,6.0,7.0,6900.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3.0,4.0,1875.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4.5,3.0,1513.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,5.0,5.0,7200.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [55]:
y.head()

0    1250000
1    6800000
2    1030000
4     900000
5    5350000
Name: Price, dtype: int64

In [56]:
# Using GridSearchCV, a function is created to compare which ML model and parameters are the best for these prediction model

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

def find_best_model_using_gs(x,y):
    algos = {
        'linear_regression': {
            'model' : LinearRegression(),
            'params': {
                'normalize' : [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha' : [1,2],
                'selection': ['random','cyclic'],
                'tol' : [1]
            }
        },
        'decision_tree' : {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse', 'friedman_mse'],
                'splitter' : ['best', 'random']
            }
        },
        'random_forest' : {
            'model': RandomForestRegressor(),
            'params' : {
                'n_estimators' : [10, 50],
                'criterion' : ['mse', 'friedman_mse']
            }
        }
    }
    scores=[]
    cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 1)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score = False)
        gs.fit(x,y)
        scores.append({
            'model' : algo_name,
            'best_score' : gs.best_score_,
            'best_params': gs.best_params_
        })
    
    return pd.DataFrame(scores, columns = ['model', 'best_score', 'best_params'])



In [57]:
find_best_model_using_gs(x,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.788836,{'normalize': False}
1,lasso,0.748079,"{'alpha': 2, 'selection': 'cyclic', 'tol': 1}"
2,decision_tree,0.861965,"{'criterion': 'mse', 'splitter': 'best'}"
3,random_forest,0.913884,"{'criterion': 'friedman_mse', 'n_estimators': 50}"


In [58]:
# Based on the previous function, RandomForestRegressor gives out the higest accuracy so we will be using it in our model

rf_model = RandomForestRegressor(criterion = 'mse', n_estimators = 50)
rf_model.fit(x,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [59]:
# Create a function to predict the price using RandomForestRegressor 
# Includes also the condition where the previously dropped columns from the dummy variable can also be 

def predict_price(location, bedrooms, bathrooms, area, property_type, furnishing):
    a = np.zeros(len(x.columns))
    a[0] = bedrooms
    a[1] = bathrooms
    a[2] = area
    
    if location == 'Other':
        pass
    else:
        loc_index = np.where(x.columns == location)[0][0]
        a[loc_index] = 1
    
    if property_type == 'Townhouse':
        pass
    else:
        pro_index = np.where(x.columns == property_type)[0][0]
        a[pro_index] = 1
        
    if furnishing == 'Unknown':
        pass
    else:
        fur_index = np.where(x.columns == furnishing)[0][0]
        a[fur_index] = 1
        
    return rf_model.predict([a])[0]

In [60]:
predict_price('Titiwangsa', 3, 2, 850, 'Apartment', 'Fully Furnished')

324047.5238095238