In [115]:
import pandas as pd
import numpy as np

In [116]:
data = pd.read_csv("Bengaluru_House_Data.csv")

In [117]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [118]:
data.shape

(13320, 9)

In [119]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [120]:
for column in data.columns:
  print(data[column].value_counts())
  print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

In [121]:
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [122]:
data.drop(columns = ["area_type", "availability", "society", "balcony"], inplace = True)

In [123]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [124]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [125]:
data["location"].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

## Handling null values

In [126]:
data["location"] = data["location"].fillna("Sarjapur Road")

In [127]:
data["size"].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

In [128]:
data["size"] = data["size"].fillna("2 BHK")

In [129]:
data["bath"] = data["bath"].fillna(data["bath"].median())

In [130]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


## Extracting BHK from size and creating a new column called bhk to store the value

In [131]:
data["bhk"] = data["size"].str.split().str.get(0).astype(int)

In [132]:
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


## Handling range value in total_sqft

In [133]:
data["total_sqft"].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [134]:
def convertRange(x):
  temp = x.split("-")
  if len(temp) == 2:
    return (float(temp[0]) + float(temp[1])) / 2
  try:
    return float(x)
  except:
    return None

In [135]:
data["total_sqft"] = data["total_sqft"].apply(convertRange)

In [136]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


## Price per square feet

In [137]:
data["price_per_sqft"] = data["price"] * 100000 / data["total_sqft"]

In [138]:
data["price_per_sqft"]

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [139]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


## Handling location column

In [140]:
data["location"].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: count, Length: 1306, dtype: int64

In [141]:
data["location"] = data["location"].apply(lambda x: x.strip())
location_count = data["location"].value_counts()

In [142]:
location_count_less_10 = location_count[location_count <= 10]
location_count_less_10

location
BTM 1st Stage                         10
Nagadevanahalli                       10
Basapura                              10
Sector 1 HSR Layout                   10
Dairy Circle                          10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: count, Length: 1054, dtype: int64

In [143]:
data["location"] = data["location"].apply(lambda x: "other" if x in location_count_less_10 else x)

In [144]:
data["location"].value_counts()

location
other                 2886
Whitefield             541
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

## Outlier detection and removal

In [145]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [146]:
(data["total_sqft"] / data["bhk"]).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [147]:
data = data[((data["total_sqft"]/data["bhk"]) >= 300)]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [148]:
data.shape

(12530, 7)

In [149]:
data.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [150]:
def remove_outliers_sqft(df):
  df_output = pd.DataFrame()
  for key, subdf in df.groupby("location"):
    m = np.mean(subdf.price_per_sqft)
    st = np.std(subdf.price_per_sqft)
    gen_df = subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))]
    df_output = pd.concat([df_output, gen_df], ignore_index = True)
  return df_output

data = remove_outliers_sqft(data)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [151]:
def bhk_outlier_remover(df):
  exclude_indices = np.array([])
  for location, location_df in df.groupby("location"):
    bhk_stats = {}
    for bhk, bhk_df in location_df.groupby("bhk"):
      bhk_stats = {
          "mean": np.mean(bhk_df.price_per_sqft),
          "std": np.std(bhk_df.price_per_sqft),
          "count": bhk_df.shape[0]
      }
      for bhk, bhk_df in location_df.groupby("bhk"):
        stats = bhk_stats.get(bhk-1)
        if stats and stats["count"] > 5:
          exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < (stats["mean"])].index.values)
  return df.drop(exclude_indices, axis = "index")

In [152]:
data = bhk_outlier_remover(data)

In [153]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


In [154]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [155]:
data.shape

(10301, 7)

In [156]:
data.drop(columns = ["size", "price_per_sqft"], inplace = True)

## Cleaned Data

In [157]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [None]:
data.to_csv("Cleaned_data.csv")

## Processing,scaling,encoding the data and training the model

In [158]:
X = data.drop(columns = ["price"])
y = data["price"]

In [159]:
X

Unnamed: 0,location,total_sqft,bath,bhk
0,1st Block Jayanagar,2850.0,4.0,4
1,1st Block Jayanagar,1630.0,3.0,3
2,1st Block Jayanagar,1875.0,2.0,3
3,1st Block Jayanagar,1200.0,2.0,3
4,1st Block Jayanagar,1235.0,2.0,2
...,...,...,...,...
10296,other,1353.0,2.0,2
10297,other,812.0,1.0,1
10298,other,1440.0,2.0,3
10299,other,1075.0,2.0,2


In [160]:
y

0        428.00
1        194.00
2        235.00
3        130.00
4        148.00
          ...  
10296    110.00
10297     26.00
10298     63.93
10299     48.00
10300    400.00
Name: price, Length: 10301, dtype: float64

In [183]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics  import r2_score

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

In [163]:
X_train

Unnamed: 0,location,total_sqft,bath,bhk
8586,other,1460.0,2.0,3
8748,other,1540.0,3.0,3
6756,Thanisandra,760.0,1.0,1
1417,Brookefield,1594.0,3.0,3
103,7th Phase JP Nagar,1420.0,2.0,3
...,...,...,...,...
9225,other,1800.0,4.0,5
4859,Lingadheeranahalli,1506.0,3.0,3
3264,Horamavu Banaswadi,1357.0,2.0,2
9845,other,1194.0,2.0,2


In [164]:
X_test

Unnamed: 0,location,total_sqft,bath,bhk
6122,Sarjapur Road,4395.0,4.0,4
7629,Whitefield,1170.0,2.0,2
4277,Kasavanhalli,3260.0,4.0,4
9363,other,800.0,2.0,2
7436,Whitefield,2856.0,5.0,4
...,...,...,...,...
8685,other,1390.0,2.0,3
388,Ambedkar Nagar,1850.0,4.0,3
4144,Kanakpura Road,1340.0,2.0,2
6413,Sector 7 HSR Layout,1760.0,3.0,3


In [165]:
y_train

8586     73.0
8748     68.0
6756     50.4
1417    140.0
103      96.0
        ...  
9225     68.0
4859     95.0
3264     54.0
9845     51.0
2732     56.0
Name: price, Length: 8240, dtype: float64

In [166]:
y_test

6122    242.0
7629     56.0
4277    240.0
9363     85.0
7436    154.5
        ...  
8685     80.0
388     121.0
4144     60.0
6413    160.0
9824     92.0
Name: price, Length: 2061, dtype: float64

### Pipeline that include column transformer(for one hot encoding), standard scaler for sclaing and linear regression as ML model

In [167]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ["location"]),
                                       remainder="passthrough")

In [168]:
scaler = StandardScaler()

In [169]:
lr = LinearRegression()

In [170]:
pipe = make_pipeline(column_trans, scaler, lr)

In [171]:
pipe.fit(X_train, y_train)



In [172]:
y_pred_lr = pipe.predict(X_test)

In [173]:
r2_score(y_test, y_pred_lr)

0.8284621985687635

### Pipeline that include column transformer(for one hot encoding), standard scaler for sclaing and lasso regression as ML model

In [174]:
lasso = Lasso()

In [175]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [176]:
pipe.fit(X_train, y_train)



In [177]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

0.821316543187225

### Pipeline that include column transformer(for one hot encoding), standard scaler for sclaing and ridge regression as ML model

In [204]:
ridge = Ridge()

In [205]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [206]:
pipe.fit(X_train, y_train)



In [207]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test, y_pred_ridge)

0.8284595107915464

In [208]:
import pickle

In [209]:
pickle.dump(pipe, open("RidgeModel.pkl","wb"))