# House Rent Prediction

## Objective :

### import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Read the dataset

In [2]:
dataset = pd.read_csv("House_rent_unclean.csv")

### Data Pre-Processing

In [3]:
# Delete the unnamed column which is unnecessary is there in database
dataset.drop("Unnamed: 0",inplace=True,axis=1)

### Exploratory Data Analysis

In [4]:
# checking the data in dataset
dataset.head()

Unnamed: 0,seller_type,bedroom,layout_type,property_type,locality,price,area,furnish_type,bathroom,city
0,OWNER,2.0,BHK,Apartment,Bodakdev,20000.0,1450.0,Furnished,2.0,Ahmedabad
1,OWNER,1.0,RK,Studio Apartment,CG Road,7350.0,210.0,Semi-Furnished,1.0,Ahmedabad
2,OWNER,3.0,BHK,Apartment,Jodhpur,22000.0,1900.0,Unfurnished,3.0,Ahmedabad
3,OWNER,2.0,BHK,Independent House,Sanand,13000.0,1285.0,Semi-Furnished,2.0,Ahmedabad
4,OWNER,2.0,BHK,Independent House,Navrangpura,18000.0,1600.0,Furnished,2.0,Ahmedabad


In [5]:
# Checking the count of the types in every dimension
dataset.count()

seller_type      192688
bedroom          192809
layout_type      192997
property_type    192500
locality         193011
price            192264
area             192012
furnish_type     192933
bathroom         192827
city             192665
dtype: int64

#### Cheking for unique values

In [6]:
# cheking seller type categories
pd.unique(dataset["seller_type"])

array(['OWNER', 'AGENT', nan, 'BUILDER'], dtype=object)

In [7]:
# Checking for bedroom categories
pd.unique(dataset["bedroom"])

array([ 2.,  1.,  3.,  4.,  5., nan,  6., 10.,  7.,  8., 15.,  9., 12.,
       14., 11.])

In [8]:
# checking for layout_type
pd.unique(dataset["layout_type"])

array(['BHK', 'RK', nan], dtype=object)

In [9]:
# cheking for property type
pd.unique(dataset["property_type"])

array(['Apartment', 'Studio Apartment', 'Independent House',
       'Independent Floor', 'Villa', nan, 'Penthouse'], dtype=object)

In [10]:
# checking for locality type
pd.unique(dataset["locality"])

array(['Bodakdev', 'CG Road', 'Jodhpur', ..., 'Renuka Nagar',
       'Gananjay Society', 'Aundh Gaon'], dtype=object)

In [11]:
# checking for the furnish type
pd.unique(dataset["furnish_type"])

array(['Furnished', 'Semi-Furnished', 'Unfurnished', nan], dtype=object)

In [12]:
# checking for the bathroom
pd.unique(dataset["bathroom"])

array([ 2.,  1.,  3.,  4.,  5., nan,  6., 12.,  7.,  8., 15.,  9., 18.,
       19., 10., 16., 14.])

In [13]:
# checking for the city
pd.unique(dataset["city"])

array(['Ahmedabad', nan, 'Bangalore', 'Chennai', 'Delhi', 'Hyderabad',
       'Kolkata', 'Mumbai', 'Pune'], dtype=object)

#### Checking for Null Values

In [14]:
dataset.isna().sum()

seller_type      323
bedroom          202
layout_type       14
property_type    511
locality           0
price            747
area             999
furnish_type      78
bathroom         184
city             346
dtype: int64

#### Handling NaN Values

In [15]:
# Calculating with mode values
dataset.mode()

Unnamed: 0,seller_type,bedroom,layout_type,property_type,locality,price,area,furnish_type,bathroom,city
0,AGENT,2.0,BHK,Apartment,Thane West,15000.0,1200.0,Semi-Furnished,2.0,Mumbai


In [16]:
# Replace all the values with mode values of every dimension
values = {"seller_type": "AGENT", "bedroom": 2.0, "layout_type": "BHK", "property_type":"Apartment","locality":"Thane West","price":1200.0,"area":1200.0,"furnish_type":"Semi-Furnished","bathroom":2.0,"city":"Mumbai"}
dataset.fillna(inplace=True,value=values)
dataset.isna().sum()

seller_type      0
bedroom          0
layout_type      0
property_type    0
locality         0
price            0
area             0
furnish_type     0
bathroom         0
city             0
dtype: int64

#### Data Balance Checking

In [17]:
# data balance check for seller type
dataset["seller_type"].value_counts()

AGENT      153014
OWNER       38651
BUILDER      1346
Name: seller_type, dtype: int64

In [18]:
# data balance check for bedroom
dataset["bedroom"].value_counts()

2.0     77221
1.0     58166
3.0     44293
4.0     11562
5.0      1389
6.0       147
10.0       76
8.0        50
7.0        48
15.0       23
9.0        23
12.0        6
14.0        4
11.0        3
Name: bedroom, dtype: int64

In [19]:
# data balance check for layout type
dataset["layout_type"].value_counts()

BHK    183749
RK       9262
Name: layout_type, dtype: int64

In [20]:
# data balace check for property type
dataset["property_type"].value_counts()

Apartment            140481
Independent Floor     27221
Independent House     12805
Studio Apartment       9237
Villa                  3111
Penthouse               156
Name: property_type, dtype: int64

In [21]:
# data balance check for locality
dataset["locality"].value_counts()

Thane West                 4478
Chembur                    3549
Andheri East               3415
Bopal                      2933
Kharghar                   2614
                           ... 
Kuthambakkam                  1
Agaramthen                    1
Daighar Gaon                  1
Mettukuppam Kanchipuram       1
Aundh Gaon                    1
Name: locality, Length: 4146, dtype: int64

In [22]:
# data balance check for area
dataset["area"].value_counts()

1200.0    9409
600.0     7378
1000.0    7231
650.0     5773
900.0     5442
          ... 
9004.0       1
6825.0       1
2523.0       1
2312.0       1
258.0        1
Name: area, Length: 2761, dtype: int64

In [23]:
# data balance check for furnish type
dataset["furnish_type"].value_counts()

Semi-Furnished    95769
Unfurnished       58668
Furnished         38574
Name: furnish_type, dtype: int64

In [24]:
# data balance check for bathroom
dataset["bathroom"].value_counts()

2.0     90155
1.0     54218
3.0     35418
4.0     10664
5.0      1926
6.0       381
7.0        80
9.0        55
8.0        52
10.0       34
15.0        8
16.0        6
14.0        6
12.0        5
19.0        2
18.0        1
Name: bathroom, dtype: int64

In [25]:
# data balancing check for city
dataset["city"].value_counts()

Mumbai       67654
Delhi        32542
Bangalore    23028
Pune         22558
Ahmedabad    18413
Hyderabad    10527
Kolkata       9645
Chennai       8644
Name: city, dtype: int64

#### Data Types and Dimension Checking

In [26]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193011 entries, 0 to 193010
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   seller_type    193011 non-null  object 
 1   bedroom        193011 non-null  float64
 2   layout_type    193011 non-null  object 
 3   property_type  193011 non-null  object 
 4   locality       193011 non-null  object 
 5   price          193011 non-null  float64
 6   area           193011 non-null  float64
 7   furnish_type   193011 non-null  object 
 8   bathroom       193011 non-null  float64
 9   city           193011 non-null  object 
dtypes: float64(4), object(6)
memory usage: 14.7+ MB


## Encoding

In [27]:
# Lable Encoding for all categorical columns
# importing LabelEncoder
from sklearn.preprocessing import LabelEncoder

#making encoder object
encoder = LabelEncoder()

#Start Encoding
dataset["seller_type"] = encoder.fit_transform(dataset["seller_type"])
dataset["bedroom"]     = encoder.fit_transform(dataset["bedroom"])
dataset["layout_type"] = encoder.fit_transform(dataset["layout_type"])
dataset["property_type"] = encoder.fit_transform(dataset["property_type"])
dataset["locality"] = encoder.fit_transform(dataset["locality"])
dataset["furnish_type"] = encoder.fit_transform(dataset["furnish_type"])
dataset["bathroom"] = encoder.fit_transform(dataset["bathroom"])
dataset["city"] = encoder.fit_transform(dataset["city"])

In [28]:
# all the values in int & float
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193011 entries, 0 to 193010
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   seller_type    193011 non-null  int64  
 1   bedroom        193011 non-null  int64  
 2   layout_type    193011 non-null  int64  
 3   property_type  193011 non-null  int64  
 4   locality       193011 non-null  int64  
 5   price          193011 non-null  float64
 6   area           193011 non-null  float64
 7   furnish_type   193011 non-null  int64  
 8   bathroom       193011 non-null  int64  
 9   city           193011 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 14.7 MB


### Data Processing

In [29]:
# price is dependant columns

X = dataset.drop("price",axis = 1)
Y = dataset["price"]


In [30]:
# Split the data into Train and Test
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.7,random_state=12345)

## Model Building

In [55]:
def linear_regression_model():
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(X_train,Y_train)
    return model

In [70]:
def decision_tree_model():
    from sklearn.tree import DecisionTreeRegressor
    model = DecisionTreeRegressor()
    model.fit(X_train,Y_train)
    return model

In [71]:
def random_forest_model():
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor()
    model.fit(X_train,Y_train)
    return model

[sudo] password for sunbeam: 

In [83]:
def catBoost_model():
    from catboost import CatBoost
    model = CatBoostRegressor()
    model.fit(X_train,Y_train)
    return model

In [81]:
def xgBoost_model():
    from xgboost import XGBRegressor
    model = XGBRegressor()
    model.fit(X_train,Y_train)
    return model

In [82]:
# reference variable
linear_regression = linear_regression_model()
decision_tree = decision_tree_model()
catboost = catBoost_model()
xgboost = xgBoost_model()

ModuleNotFoundError: No module named 'catboost'

## Model Evaluation

In [75]:
def model_evaluation(model):
    from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
    
    y_true = Y_test
    y_pred = model.predict(X_test)
    print(f"Mean Absolute Error = {mean_absolute_error(y_true,y_pred):.2f}")
    print(f"Mean Squared Error = {mean_squared_error(y_true,y_pred):.2f}")
    print(f"R2 Square = {r2_score(y_true,y_pred)*100:.2f}%")
    

In [76]:
model_evaluation(linear_regression)

Mean Absolute Error = 27258.99
Mean Squared Error = 3370918559.50
R2 Square = 60.78%
