In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_squared_error, r2_score

In [2]:
df=pd.read_csv('House_Rent_Dataset.csv')

In [3]:
df.shape

(4746, 12)

In [4]:
df.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [6]:
pd.options.display.float_format = '{:,.0f}'.format
df.describe()

Unnamed: 0,BHK,Rent,Size,Bathroom
count,4746,4746,4746,4746
mean,2,34993,967,2
std,1,78106,634,1
min,1,1200,10,1
25%,2,10000,550,1
50%,2,16000,850,2
75%,3,33000,1200,2
max,6,3500000,8000,10


*Findings from describe():*

*Unrealistic minimum 'size' value (needs to be removed)*
*Extremely high maximum 'rent' value (needs to be removed)*

Data Cleaning

In [7]:
df = df[(df['Size'] >= 100) & (df['Rent'] <= 200000)]

In [8]:
df=df.drop(columns=['Point of Contact','Area Locality','Posted On'])  # Point of Contact - unnecessary to the model
                                                        # Area Locality - High cardinality and of less value to the model
                                                        # Posted On - contains data from year 2022 only. So not relevant for this model.

In [9]:
df.shape

(4556, 9)

In [10]:
df.isna().sum()

BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
dtype: int64

In [11]:
df.duplicated().sum()

np.int64(41)

In [12]:
df.head()

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom
0,2,10000,1100,Ground out of 2,Super Area,Kolkata,Unfurnished,Bachelors/Family,2
1,2,20000,800,1 out of 3,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
2,2,17000,1000,1 out of 3,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
3,2,10000,800,1 out of 2,Super Area,Kolkata,Unfurnished,Bachelors/Family,1
4,2,7500,850,1 out of 2,Carpet Area,Kolkata,Unfurnished,Bachelors,1


Encoding Categorical Features

*Floor - Split the columns into two (current & Total floors) and provide the appropriate numerical values.*

*Area Type - Ordinal Encoding.*

*City - Target-based ordinal Encoding(Encoding cities based on average rent).*

*Furnishing Status - Ordinal Encoding since furnished > semi-furnished > unfurnished in rental value.*

*Tenant Preferred -	Multi-label binary encoding. We'll split the feature into two and then binary encode them   after which we drop the original column.*

In [13]:
# Target-based ordinal Encoding(Encoding cities based on average rent)

city_rent_order=df.groupby('City')['Rent'].mean().sort_values()
city_mapping={city:index for index, city in enumerate(city_rent_order.index)}
df['City']=df['City'].map(city_mapping)


In [46]:
city_mapping

{'Kolkata': 0,
 'Hyderabad': 1,
 'Chennai': 2,
 'Bangalore': 3,
 'Delhi': 4,
 'Mumbai': 5}

In [14]:
Furnishing_ordinal=OrdinalEncoder(categories=[['Unfurnished', 'Semi-Furnished', 'Furnished']])
df['Furnishing Status']=Furnishing_ordinal.fit_transform(df[['Furnishing Status']])

In [15]:
Area_type_ordinal=OrdinalEncoder(categories=[['Carpet Area', 'Built Area','Super Area']])
df['Area Type']=Area_type_ordinal.fit_transform(df[['Area Type']])

In [16]:
df['Tenant_Bachelors']=df['Tenant Preferred'].str.contains('Bachelors').astype(int)
df['Tenant_Family']=df['Tenant Preferred'].str.contains('Family').astype(int)
df=df.drop(columns='Tenant Preferred')

In [17]:
def extract_floors(value):
    if pd.isna(value):
        return np.nan, np.nan
    value=value.strip()

    if 'Ground' in value:
        current=0
    elif 'Upper Basement' in value:
        current=-1
    elif 'Lower Basement' in value:
        current=-2
    elif value.isdigit():
        current=int(value)
        return current,np.nan
    else:
        try:
            current=int(value.split(' out of ')[0])
        except:
            current=np.nan
    try:
        total=int(value.split(' out of ')[-1])
    except:
        total=np.nan
    return current,total    

# Applying function
df[['Current_Floor','Total_Floors']]=df['Floor'].apply(lambda x: pd.Series(extract_floors(x)))

In [18]:
df.head()

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,City,Furnishing Status,Bathroom,Tenant_Bachelors,Tenant_Family,Current_Floor,Total_Floors
0,2,10000,1100,Ground out of 2,2,0,0,2,1,1,0,2
1,2,20000,800,1 out of 3,2,0,1,1,1,1,1,3
2,2,17000,1000,1 out of 3,2,0,1,1,1,1,1,3
3,2,10000,800,1 out of 2,2,0,0,1,1,1,1,2
4,2,7500,850,1 out of 2,0,0,0,1,1,0,1,2


In [19]:
df.isna().sum()

BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
City                 0
Furnishing Status    0
Bathroom             0
Tenant_Bachelors     0
Tenant_Family        0
Current_Floor        0
Total_Floors         4
dtype: int64

In [20]:
df[df['Total_Floors'].isna()]['Floor']

2553         3
2883    Ground
4490         1
4560         1
Name: Floor, dtype: object

In [21]:
df = df[df['Total_Floors'].notna()] # Removing null values from Total_Floors

In [22]:
df.isna().sum()

BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
City                 0
Furnishing Status    0
Bathroom             0
Tenant_Bachelors     0
Tenant_Family        0
Current_Floor        0
Total_Floors         0
dtype: int64

In [23]:
df=df.drop(columns='Floor', axis=1)

In [24]:
df.head()

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Bathroom,Tenant_Bachelors,Tenant_Family,Current_Floor,Total_Floors
0,2,10000,1100,2,0,0,2,1,1,0,2
1,2,20000,800,2,0,1,1,1,1,1,3
2,2,17000,1000,2,0,1,1,1,1,1,3
3,2,10000,800,2,0,0,1,1,1,1,2
4,2,7500,850,0,0,0,1,1,0,1,2


Splitting

In [25]:
X=df.drop(columns='Rent')
y=df[['Rent']]

In [26]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

Scaling

*Both features and label needs to be scaled in this case since the values of 'rent' are large numbers. Besides Linear regression benefits from scaled data.*

In [27]:
x_scaler=StandardScaler()
y_scaler=StandardScaler()

In [28]:
X_train_scaled=x_scaler.fit_transform(X_train)
X_test_scaled=x_scaler.fit_transform(X_test)

y_train_scaled=y_scaler.fit_transform(y_train)
y_test_scaled=y_scaler.fit_transform(y_test)


Model Selection

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error,root_mean_squared_error ,r2_score

model=LinearRegression()

In [30]:
model.fit(X_train_scaled,y_train_scaled)

In [31]:
y_pred=model.predict(X_test_scaled)

In [32]:
print("R Squared Value:", r2_score(y_test_scaled,y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test_scaled,y_pred))
print("Root Mean Squared Error:", root_mean_squared_error(y_test_scaled,y_pred))
print("Mean Squared Error:", mean_squared_error(y_test_scaled,y_pred))

R Squared Value: 0.6315850896131685
Mean Absolute Error: 0.4041319458669695
Root Mean Squared Error: 0.6069719189442222
Mean Squared Error: 0.36841491038683144


Other Regression Models

In [33]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

models=[KNeighborsRegressor(),SVR(), DecisionTreeRegressor(), RandomForestRegressor(), GradientBoostingRegressor()]

for model in models:
    model.fit(X_train_scaled, y_train_scaled)
    y_pred=model.predict(X_test_scaled)
    print(model)
    print("_"*27)
    print("R Squared Value:", r2_score(y_test_scaled,y_pred))
    print("Mean Absolute Error:", mean_absolute_error(y_test_scaled,y_pred))
    print("Root Mean Squared Error:", root_mean_squared_error(y_test_scaled,y_pred))   
    print("Mean Squared Error:", mean_squared_error(y_test_scaled,y_pred))
    print('\n')
    

KNeighborsRegressor()
___________________________
R Squared Value: 0.6978635557088679
Mean Absolute Error: 0.3091013335189459
Root Mean Squared Error: 0.5496693954470561
Mean Squared Error: 0.3021364442911321




  y = column_or_1d(y, warn=True)


SVR()
___________________________
R Squared Value: 0.7186662167883501
Mean Absolute Error: 0.2801620013938119
Root Mean Squared Error: 0.5304090715774475
Mean Squared Error: 0.2813337832116499


DecisionTreeRegressor()
___________________________
R Squared Value: 0.5859875328113783
Mean Absolute Error: 0.3466023809217593
Root Mean Squared Error: 0.6434380057073267
Mean Squared Error: 0.4140124671886217




  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor()
___________________________
R Squared Value: 0.7447576575801087
Mean Absolute Error: 0.2806461337096387
Root Mean Squared Error: 0.5052151446857975
Mean Squared Error: 0.2552423424198913


GradientBoostingRegressor()
___________________________
R Squared Value: 0.7586613153467623
Mean Absolute Error: 0.28537996743045135
Root Mean Squared Error: 0.49126233791451757
Mean Squared Error: 0.24133868465323766




  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Cross-Validation

In [34]:
best_model=GradientBoostingRegressor()
scores = cross_val_score(best_model, X_train_scaled, y_train_scaled.ravel(), cv=5, scoring='r2')

In [35]:
print("Cross-Validation R² Scores:", scores)
print("Mean R² Score:", np.mean(scores))
print("Standard Deviation:", np.std(scores))

Cross-Validation R² Scores: [0.72011891 0.68169383 0.7329395  0.81869241 0.78320923]
Mean R² Score: 0.7473307776815522
Standard Deviation: 0.04824012904008472


Hyperparameter Tuning

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'min_samples_split': [2, 5]
}


In [38]:
gbr = GradientBoostingRegressor()

grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1
)


In [39]:
grid_search.fit(X_train_scaled, y_train_scaled.ravel())

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [40]:
print("Best R² Score:", grid_search.best_score_)
print("Best Hyperparameters:", grid_search.best_params_)


Best R² Score: 0.7532194647936055
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 100, 'subsample': 0.8}


In [41]:
final_model = grid_search.best_estimator_
final_model.fit(X_train_scaled, y_train_scaled.ravel())


Deployment

In [42]:
data={'model':final_model, 'x_scaler': x_scaler,'y_scaler': y_scaler, 'features':list(X.columns)}

In [43]:
import pickle
with open('rent_prediction.pkl','wb') as obj1:
    pickle.dump(data,obj1)

In [44]:
with open('rent_prediction.pkl','rb') as obj1:
    a=pickle.load(obj1)

In [45]:
a

{'model': GradientBoostingRegressor(max_depth=4, min_samples_split=5, subsample=0.8),
 'x_scaler': StandardScaler(),
 'y_scaler': StandardScaler(),
 'features': ['BHK',
  'Size',
  'Area Type',
  'City',
  'Furnishing Status',
  'Bathroom',
  'Tenant_Bachelors',
  'Tenant_Family',
  'Current_Floor',
  'Total_Floors']}