## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

## Importing Dataset

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29451 entries, 0 to 29450
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   POSTED_BY              29451 non-null  object 
 1   UNDER_CONSTRUCTION     29451 non-null  int64  
 2   RERA                   29451 non-null  int64  
 3   BHK_NO.                29451 non-null  int64  
 4   BHK_OR_RK              29451 non-null  object 
 5   SQUARE_FT              29451 non-null  float64
 6   READY_TO_MOVE          29451 non-null  int64  
 7   RESALE                 29451 non-null  int64  
 8   ADDRESS                29451 non-null  object 
 9   LONGITUDE              29451 non-null  float64
 10  LATITUDE               29451 non-null  float64
 11  TARGET(PRICE_IN_LACS)  29451 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 2.7+ MB


In [4]:
df.isnull().sum()

POSTED_BY                0
UNDER_CONSTRUCTION       0
RERA                     0
BHK_NO.                  0
BHK_OR_RK                0
SQUARE_FT                0
READY_TO_MOVE            0
RESALE                   0
ADDRESS                  0
LONGITUDE                0
LATITUDE                 0
TARGET(PRICE_IN_LACS)    0
dtype: int64

## Add Column CITY

In [5]:
df['ADDRESS'].str.split(',').str[-1]

0        Bangalore
1           Mysore
2        Bangalore
3        Ghaziabad
4          Kolkata
           ...    
29446         Agra
29447         Vapi
29448       Jaipur
29449      Chennai
29450       Jaipur
Name: ADDRESS, Length: 29451, dtype: object

In [6]:
df['CITY'] = df['ADDRESS'].str.split(',').str[-1]
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS),CITY
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0,Bangalore
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0,Mysore
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0,Bangalore
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5,Ghaziabad
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5,Kolkata


## Split into Features and Target Variable

In [7]:
X = df.loc[:, ('POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK', 'SQUARE_FT', 'READY_TO_MOVE', 'RESALE', 'LONGITUDE', 'LATITUDE', 'CITY')].values
X

array([['Owner', 0, 0, ..., 12.96991, 77.59796, 'Bangalore'],
       ['Dealer', 0, 0, ..., 12.274538, 76.644605, 'Mysore'],
       ['Owner', 0, 0, ..., 12.778033, 77.632191, 'Bangalore'],
       ...,
       ['Dealer', 0, 0, ..., 26.928785, 75.828002, 'Jaipur'],
       ['Owner', 0, 0, ..., 12.90015, 80.22791, 'Chennai'],
       ['Dealer', 0, 1, ..., 26.832353, 75.841749, 'Jaipur']],
      dtype=object)

In [8]:
y = df.iloc[:, -2].values
y

array([55. , 51. , 43. , ..., 27.1, 67. , 27.8])

## Encoding Column Categorical Data

In [9]:
df.loc[:, ('POSTED_BY', 'BHK_OR_RK', 'CITY')].head()

Unnamed: 0,POSTED_BY,BHK_OR_RK,CITY
0,Owner,BHK,Bangalore
1,Dealer,BHK,Mysore
2,Owner,BHK,Bangalore
3,Owner,BHK,Ghaziabad
4,Dealer,BHK,Kolkata


In [10]:
df['POSTED_BY'].unique()

array(['Owner', 'Dealer', 'Builder'], dtype=object)

In [11]:
df['BHK_OR_RK'].unique()

array(['BHK', 'RK'], dtype=object)

In [12]:
df['CITY'].unique()

array(['Bangalore', 'Mysore', 'Ghaziabad', 'Kolkata', 'Kochi', 'Jaipur',
       'Mohali', 'Chennai', 'Siliguri', 'Noida', 'Raigad', 'Bhubaneswar',
       'Wardha', 'Pune', 'Mumbai', 'Nagpur', 'Deoghar', 'Bhiwadi',
       'Faridabad', 'Lalitpur', 'Maharashtra', 'Vadodara',
       'Visakhapatnam', 'Vapi', 'Mangalore', 'Aurangabad', 'Ottapalam',
       'Vijayawada', 'Belgaum', 'Bhopal', 'Lucknow', 'Kanpur',
       'Gandhinagar', 'Pondicherry', 'Agra', 'Ranchi', 'Gurgaon', 'Udupi',
       'Indore', 'Jodhpur', 'Coimbatore', 'Valsad', 'Palghar', 'Surat',
       'Varanasi', 'Guwahati', 'Amravati', 'Anand', 'Tirupati',
       'Secunderabad', 'Raipur', 'Vizianagaram', 'Thrissur', 'Satna',
       'Madurai', 'Chandigarh', 'Shimla', 'Gwalior', 'Rajkot', 'Sonipat',
       'Allahabad', 'Berhampur', 'Roorkee', 'Dharuhera', 'Latur',
       'Durgapur', 'Panchkula', 'Solapur', 'Durg', 'Goa', 'Jamshedpur',
       'Hazaribagh', 'Jabalpur', 'Hosur', 'Morbi', 'Hubli', 'Karnal',
       'Patna', 'Bilaspur', '

### Encode Column CITY

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 10] = le.fit_transform(X[:, 10])
X

array([['Owner', 0, 0, ..., 12.96991, 77.59796, 21],
       ['Dealer', 0, 0, ..., 12.274538, 76.644605, 160],
       ['Owner', 0, 0, ..., 12.778033, 77.632191, 21],
       ...,
       ['Dealer', 0, 0, ..., 26.928785, 75.828002, 105],
       ['Owner', 0, 0, ..., 12.90015, 80.22791, 49],
       ['Dealer', 0, 1, ..., 26.832353, 75.841749, 105]], dtype=object)

### Encode Column BHK_OR_RK

In [14]:
le = LabelEncoder()
X[:, 4] = le.fit_transform(X[:, 4])
X

array([['Owner', 0, 0, ..., 12.96991, 77.59796, 21],
       ['Dealer', 0, 0, ..., 12.274538, 76.644605, 160],
       ['Owner', 0, 0, ..., 12.778033, 77.632191, 21],
       ...,
       ['Dealer', 0, 0, ..., 26.928785, 75.828002, 105],
       ['Owner', 0, 0, ..., 12.90015, 80.22791, 49],
       ['Dealer', 0, 1, ..., 26.832353, 75.841749, 105]], dtype=object)

### Encode Column POSTED_BY

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, ..., 12.96991, 77.59796, 21],
       [0.0, 1.0, 0.0, ..., 12.274538, 76.644605, 160],
       [0.0, 0.0, 1.0, ..., 12.778033, 77.632191, 21],
       ...,
       [0.0, 1.0, 0.0, ..., 26.928785, 75.828002, 105],
       [0.0, 0.0, 1.0, ..., 12.90015, 80.22791, 49],
       [0.0, 1.0, 0.0, ..., 26.832353, 75.841749, 105]], dtype=object)

## Split into Training Set and Test Set

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training Model

In [17]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=11, random_state=0)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=11, random_state=0)

## Predict

In [18]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[359.68 100.  ]
 [ 42.64  36.5 ]
 [ 40.99  80.  ]
 ...
 [ 35.09  32.  ]
 [ 32.83  15.  ]
 [ 66.83  67.5 ]]


## Evaluate Model

In [19]:
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
print(np.sqrt(mean_squared_log_error(y_test, y_pred)))
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(r2_score(y_test, y_pred))

0.37004644978513235
134.36035898381462
0.960672220670468


## Predicting test.csv

### Importing test.csv

In [20]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE
0,Owner,0,0,1,BHK,545.17134,1,1,"Kamrej,Surat",21.262,73.0477
1,Dealer,1,1,2,BHK,800.0,0,0,"Panvel,Lalitpur",18.966114,73.148278
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.5922,88.484911
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.9883,75.5846
4,Owner,0,0,1,BHK,430.47783,1,1,"Mai Mandir,Nadiad",22.7,72.87


In [21]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68720 entries, 0 to 68719
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   POSTED_BY           68720 non-null  object 
 1   UNDER_CONSTRUCTION  68720 non-null  int64  
 2   RERA                68720 non-null  int64  
 3   BHK_NO.             68720 non-null  int64  
 4   BHK_OR_RK           68720 non-null  object 
 5   SQUARE_FT           68720 non-null  float64
 6   READY_TO_MOVE       68720 non-null  int64  
 7   RESALE              68720 non-null  int64  
 8   ADDRESS             68720 non-null  object 
 9   LONGITUDE           68720 non-null  float64
 10  LATITUDE            68720 non-null  float64
dtypes: float64(3), int64(5), object(3)
memory usage: 5.8+ MB


In [22]:
df_test.isnull().sum()

POSTED_BY             0
UNDER_CONSTRUCTION    0
RERA                  0
BHK_NO.               0
BHK_OR_RK             0
SQUARE_FT             0
READY_TO_MOVE         0
RESALE                0
ADDRESS               0
LONGITUDE             0
LATITUDE              0
dtype: int64

### Add Column CITY

In [23]:
df_test['CITY'] = df_test['ADDRESS'].str.split(',').str[-1]
df_test.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,CITY
0,Owner,0,0,1,BHK,545.17134,1,1,"Kamrej,Surat",21.262,73.0477,Surat
1,Dealer,1,1,2,BHK,800.0,0,0,"Panvel,Lalitpur",18.966114,73.148278,Lalitpur
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.5922,88.484911,Kolkata
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.9883,75.5846,Jaipur
4,Owner,0,0,1,BHK,430.47783,1,1,"Mai Mandir,Nadiad",22.7,72.87,Nadiad


### Make Features Variable 

In [24]:
test = df_test.loc[:, ('POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK', 'SQUARE_FT', 'READY_TO_MOVE', 'RESALE', 'LONGITUDE', 'LATITUDE', 'CITY')].values
test

array([['Owner', 0, 0, ..., 21.262, 73.0477, 'Surat'],
       ['Dealer', 1, 1, ..., 18.966114, 73.148278, 'Lalitpur'],
       ['Dealer', 0, 0, ..., 22.5922, 88.484911, 'Kolkata'],
       ...,
       ['Dealer', 1, 1, ..., 19.222101, 72.988231, 'Maharashtra'],
       ['Dealer', 0, 0, ..., 18.49667, 73.94167, 'Pune'],
       ['Dealer', 0, 0, ..., 19.124896, 72.89350300000001, 'Mumbai']],
      dtype=object)

### Encoding Column Categorical Data

#### Encode Column CITY

In [25]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
test[:, 10] = le.fit_transform(test[:, 10])
test

array([['Owner', 0, 0, ..., 21.262, 73.0477, 264],
       ['Dealer', 1, 1, ..., 18.966114, 73.148278, 164],
       ['Dealer', 0, 0, ..., 22.5922, 88.484911, 157],
       ...,
       ['Dealer', 1, 1, ..., 19.222101, 72.988231, 170],
       ['Dealer', 0, 0, ..., 18.49667, 73.94167, 219],
       ['Dealer', 0, 0, ..., 19.124896, 72.89350300000001, 184]],
      dtype=object)

#### Encode Column BHK_OR_RK

In [26]:
le = LabelEncoder()
test[:, 4] = le.fit_transform(test[:, 4])
test

array([['Owner', 0, 0, ..., 21.262, 73.0477, 264],
       ['Dealer', 1, 1, ..., 18.966114, 73.148278, 164],
       ['Dealer', 0, 0, ..., 22.5922, 88.484911, 157],
       ...,
       ['Dealer', 1, 1, ..., 19.222101, 72.988231, 170],
       ['Dealer', 0, 0, ..., 18.49667, 73.94167, 219],
       ['Dealer', 0, 0, ..., 19.124896, 72.89350300000001, 184]],
      dtype=object)

#### Encode Column POSTED_BY

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [0])], remainder='passthrough')
test = np.array(ct.fit_transform(test))
test

array([[0.0, 0.0, 1.0, ..., 21.262, 73.0477, 264],
       [0.0, 1.0, 0.0, ..., 18.966114, 73.148278, 164],
       [0.0, 1.0, 0.0, ..., 22.5922, 88.484911, 157],
       ...,
       [0.0, 1.0, 0.0, ..., 19.222101, 72.988231, 170],
       [0.0, 1.0, 0.0, ..., 18.49667, 73.94167, 219],
       [0.0, 1.0, 0.0, ..., 19.124896, 72.89350300000001, 184]],
      dtype=object)

### Predict

In [28]:
y_result = regressor.predict(test)
y_result

array([  22.49,   47.56,  143.47, ..., 7702.73,   60.  ,  626.36])

### Output Predictions File

In [29]:
pd.DataFrame({'TARGET(PRICE_IN_LACS)': y_result}).to_csv('submission.csv', index=False)