## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

## Importing Dataset

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')
# Preview the first rows to sanity-check the columns that were parsed.
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [3]:
# Summarise column dtypes, non-null counts and memory usage of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29451 entries, 0 to 29450
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   POSTED_BY              29451 non-null  object 
 1   UNDER_CONSTRUCTION     29451 non-null  int64  
 2   RERA                   29451 non-null  int64  
 3   BHK_NO.                29451 non-null  int64  
 4   BHK_OR_RK              29451 non-null  object 
 5   SQUARE_FT              29451 non-null  float64
 6   READY_TO_MOVE          29451 non-null  int64  
 7   RESALE                 29451 non-null  int64  
 8   ADDRESS                29451 non-null  object 
 9   LONGITUDE              29451 non-null  float64
 10  LATITUDE               29451 non-null  float64
 11  TARGET(PRICE_IN_LACS)  29451 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 2.7+ MB


In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

POSTED_BY                0
UNDER_CONSTRUCTION       0
RERA                     0
BHK_NO.                  0
BHK_OR_RK                0
SQUARE_FT                0
READY_TO_MOVE            0
RESALE                   0
ADDRESS                  0
LONGITUDE                0
LATITUDE                 0
TARGET(PRICE_IN_LACS)    0
dtype: int64

## Split into Features and Target Variable

In [5]:
# Feature columns, in the order later cells rely on positionally
# (index 0 = POSTED_BY, index 4 = BHK_OR_RK). ADDRESS and the target column
# are not part of the feature matrix.
# A list (not a tuple) is used as the column indexer: tuples are the key type
# for MultiIndex lookups and only behave like a list on a flat column index.
feature_columns = [
    'POSTED_BY',
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_NO.',
    'BHK_OR_RK',
    'SQUARE_FT',
    'READY_TO_MOVE',
    'RESALE',
    'LONGITUDE',
    'LATITUDE',
]
# Mixed string/numeric columns collapse to a single object-dtype ndarray.
X = df.loc[:, feature_columns].to_numpy()
X

array([['Owner', 0, 0, ..., 1, 12.96991, 77.59796],
       ['Dealer', 0, 0, ..., 1, 12.274538, 76.644605],
       ['Owner', 0, 0, ..., 1, 12.778033, 77.632191],
       ...,
       ['Dealer', 0, 0, ..., 1, 26.928785, 75.828002],
       ['Owner', 0, 0, ..., 1, 12.90015, 80.22791],
       ['Dealer', 0, 1, ..., 1, 26.832353, 75.841749]], dtype=object)

In [6]:
# Select the target by name rather than by position, matching how the feature
# columns are selected, so a change in column order cannot silently pick the
# wrong column.
y = df['TARGET(PRICE_IN_LACS)'].to_numpy()
y

array([55. , 51. , 43. , ..., 27.1, 67. , 27.8])

## Encoding Categorical Columns

In [7]:
# Preview the two string-valued columns that need encoding.
# Columns are selected with a list; a tuple key is meant for MultiIndex lookups.
df[['POSTED_BY', 'BHK_OR_RK']].head()

Unnamed: 0,POSTED_BY,BHK_OR_RK
0,Owner,BHK
1,Dealer,BHK
2,Owner,BHK
3,Owner,BHK
4,Dealer,BHK


In [8]:
# List the distinct POSTED_BY categories present in the training data.
df['POSTED_BY'].unique()

array(['Owner', 'Dealer', 'Builder'], dtype=object)

In [9]:
# List the distinct BHK_OR_RK categories present in the training data.
df['BHK_OR_RK'].unique()

array(['BHK', 'RK'], dtype=object)

### Encode Column BHK_OR_RK

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode BHK_OR_RK (feature column 4) as integer codes.
# The encoder is fitted on the untouched DataFrame column rather than on
# X[:, 4], which this very cell overwrites: that way `le` always learns the
# original string classes, regardless of the current contents of X.
# `le` keeps the fitted mapping so the identical encoding can be applied to
# other data with le.transform(...).
# NOTE(review): LabelEncoder is documented for target labels; OrdinalEncoder
# is the feature-column equivalent if this is ever refactored.
le = LabelEncoder()
X[:, 4] = le.fit_transform(df['BHK_OR_RK'])
X

array([['Owner', 0, 0, ..., 1, 12.96991, 77.59796],
       ['Dealer', 0, 0, ..., 1, 12.274538, 76.644605],
       ['Owner', 0, 0, ..., 1, 12.778033, 77.632191],
       ...,
       ['Dealer', 0, 0, ..., 1, 26.928785, 75.828002],
       ['Owner', 0, 0, ..., 1, 12.90015, 80.22791],
       ['Dealer', 0, 1, ..., 1, 26.832353, 75.841749]], dtype=object)

### Encode Column POSTED_BY

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (POSTED_BY); all remaining columns pass through
# unchanged and are appended after the encoded columns.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, ..., 1, 12.96991, 77.59796],
       [0.0, 1.0, 0.0, ..., 1, 12.274538, 76.644605],
       [0.0, 0.0, 1.0, ..., 1, 12.778033, 77.632191],
       ...,
       [0.0, 1.0, 0.0, ..., 1, 26.928785, 75.828002],
       [0.0, 0.0, 1.0, ..., 1, 12.90015, 80.22791],
       [0.0, 1.0, 0.0, ..., 1, 26.832353, 75.841749]], dtype=object)

## Split into Training Set and Test Set

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training Model

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Build and fit in one expression (fit returns the estimator itself); the
# fixed seed keeps the fitted forest reproducible.
regressor = RandomForestRegressor(n_estimators=11, random_state=0).fit(X_train, y_train)
regressor

RandomForestRegressor(n_estimators=11, random_state=0)

## Predict

In [14]:
y_pred = regressor.predict(X_test)
# Applies to every array printed for the rest of the session.
np.set_printoptions(precision=2)
# Side-by-side view: column 0 = predicted price, column 1 = actual price.
print(np.column_stack((y_pred, y_test)))

[[399.44 100.  ]
 [ 41.    36.5 ]
 [ 51.32  80.  ]
 ...
 [ 30.45  32.  ]
 [ 31.91  15.  ]
 [ 66.83  67.5 ]]


## Evaluate Model

In [15]:
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error

# Hold-out metrics, printed in this order: RMSLE, RMSE, R^2.
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
for metric in (rmsle, rmse, r2):
    print(metric)

0.3747109944057838
136.32124846865315
0.9595159247913908


## Predicting test.csv

### Importing test.csv

In [16]:
# Load the unlabeled data to predict on; the path is relative to the notebook's working directory.
df_test = pd.read_csv('test.csv')
# Preview the first rows to sanity-check the columns that were parsed.
df_test.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE
0,Owner,0,0,1,BHK,545.17134,1,1,"Kamrej,Surat",21.262,73.0477
1,Dealer,1,1,2,BHK,800.0,0,0,"Panvel,Lalitpur",18.966114,73.148278
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.5922,88.484911
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.9883,75.5846
4,Owner,0,0,1,BHK,430.47783,1,1,"Mai Mandir,Nadiad",22.7,72.87


In [17]:
# Summarise column dtypes, non-null counts and memory usage of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68720 entries, 0 to 68719
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   POSTED_BY           68720 non-null  object 
 1   UNDER_CONSTRUCTION  68720 non-null  int64  
 2   RERA                68720 non-null  int64  
 3   BHK_NO.             68720 non-null  int64  
 4   BHK_OR_RK           68720 non-null  object 
 5   SQUARE_FT           68720 non-null  float64
 6   READY_TO_MOVE       68720 non-null  int64  
 7   RESALE              68720 non-null  int64  
 8   ADDRESS             68720 non-null  object 
 9   LONGITUDE           68720 non-null  float64
 10  LATITUDE            68720 non-null  float64
dtypes: float64(3), int64(5), object(3)
memory usage: 5.8+ MB


In [18]:
# Count missing values per column (isna is the canonical spelling of isnull).
df_test.isna().sum()

POSTED_BY             0
UNDER_CONSTRUCTION    0
RERA                  0
BHK_NO.               0
BHK_OR_RK             0
SQUARE_FT             0
READY_TO_MOVE         0
RESALE                0
ADDRESS               0
LONGITUDE             0
LATITUDE              0
dtype: int64

### Build the Feature Matrix

In [19]:
# Same feature columns, in the same order, as the training feature matrix so
# that positional encoding steps (column 0 = POSTED_BY, column 4 = BHK_OR_RK)
# line up with what the model was trained on.
# A list (not a tuple) is used as the column indexer: tuples are the key type
# for MultiIndex lookups and only behave like a list on a flat column index.
test_feature_columns = [
    'POSTED_BY',
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_NO.',
    'BHK_OR_RK',
    'SQUARE_FT',
    'READY_TO_MOVE',
    'RESALE',
    'LONGITUDE',
    'LATITUDE',
]
# Mixed string/numeric columns collapse to a single object-dtype ndarray.
test = df_test.loc[:, test_feature_columns].to_numpy()
test

array([['Owner', 0, 0, ..., 1, 21.262, 73.0477],
       ['Dealer', 1, 1, ..., 0, 18.966114, 73.148278],
       ['Dealer', 0, 0, ..., 1, 22.5922, 88.484911],
       ...,
       ['Dealer', 1, 1, ..., 0, 19.222101, 72.988231],
       ['Dealer', 0, 0, ..., 1, 18.49667, 73.94167],
       ['Dealer', 0, 0, ..., 1, 19.124896, 72.89350300000001]],
      dtype=object)

### Encoding Categorical Columns

#### Encode Column BHK_OR_RK

In [20]:
# Reuse the LabelEncoder already fitted on the training data (`le`) and only
# transform here. Fitting a fresh encoder on the test set would derive the
# integer codes from whatever categories happen to appear in test.csv, which
# can disagree with the codes the model was trained on.
# Reading from df_test (rather than test[:, 4]) keeps this cell safe to re-run.
# transform() raises ValueError if test.csv contains a category never seen
# during training, instead of silently mis-encoding it.
test[:, 4] = le.transform(df_test['BHK_OR_RK'])
test

array([['Owner', 0, 0, ..., 1, 21.262, 73.0477],
       ['Dealer', 1, 1, ..., 0, 18.966114, 73.148278],
       ['Dealer', 0, 0, ..., 1, 22.5922, 88.484911],
       ...,
       ['Dealer', 1, 1, ..., 0, 19.222101, 72.988231],
       ['Dealer', 0, 0, ..., 1, 18.49667, 73.94167],
       ['Dealer', 0, 0, ..., 1, 19.124896, 72.89350300000001]],
      dtype=object)

#### Encode Column POSTED_BY

In [21]:
# Reuse the ColumnTransformer already fitted on the training features (`ct`)
# and only transform here. Re-fitting a new OneHotEncoder on the test set
# would derive the one-hot columns from the categories present in test.csv;
# if any category were missing (or extra) the column count/order would no
# longer match what the model was trained on.
# With the encoder's default settings, transform() raises on a POSTED_BY value
# that was not present during training.
test = np.array(ct.transform(test))
test

array([[0.0, 0.0, 1.0, ..., 1, 21.262, 73.0477],
       [0.0, 1.0, 0.0, ..., 0, 18.966114, 73.148278],
       [0.0, 1.0, 0.0, ..., 1, 22.5922, 88.484911],
       ...,
       [0.0, 1.0, 0.0, ..., 0, 19.222101, 72.988231],
       [0.0, 1.0, 0.0, ..., 1, 18.49667, 73.94167],
       [0.0, 1.0, 0.0, ..., 1, 19.124896, 72.89350300000001]],
      dtype=object)

### Predict

In [22]:
# Predict prices for the encoded test.csv features using the fitted forest.
y_result = regressor.predict(test)
y_result

array([  23.84,   80.  ,   66.36, ..., 8103.64,   85.81,  622.73])

### Output Predictions File

In [23]:
# Single-column submission file; the row index is omitted from the CSV.
submission = pd.DataFrame({'TARGET(PRICE_IN_LACS)': y_result})
submission.to_csv('submission.csv', index=False)