In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
# Print the frequency table of every column, followed by a row of asterisks
for column in data:
    print(data[column].value_counts(), "*" * 20, sep="\n")

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
14-Nov               1
15-Jun               1
16-Nov               1
17-Jan               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield                            540
Sarjapur  Road                        399
Electronic City                       302
Kanakpura Road                        273
Thanisandra                           234
                                     ... 
Neelasandra                             1
Prakruthi Township                      1
Banashankari 6th stage , 2nd block      1
Mukkutam Nagar                          1
BEL Layout                              1
Name: location, Length: 1305, dtype: int64
*********

In [7]:
# check null values
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
# These four features are not used by the model, so drop them
data = data.drop(columns=['area_type', 'availability', 'society', 'balcony'])

In [9]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [11]:
# missing one value
data['location'].value_counts()

Whitefield                            540
Sarjapur  Road                        399
Electronic City                       302
Kanakpura Road                        273
Thanisandra                           234
                                     ... 
Neelasandra                             1
Prakruthi Township                      1
Banashankari 6th stage , 2nd block      1
Mukkutam Nagar                          1
BEL Layout                              1
Name: location, Length: 1305, dtype: int64

In [12]:
# Exactly one location is missing; fill it with 'Sarjapur  Road', one of the most
# frequent locations (the double space matches the spelling in the raw data).
data['location'] = data['location'].fillna('Sarjapur  Road')

In [13]:
data['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
19 BHK           1
13 BHK           1
16 BHK           1
18 Bedroom       1
43 Bedroom       1
12 Bedroom       1
27 BHK           1
Name: size, dtype: int64

In [14]:
# Fill the 16 missing sizes with '2 BHK', the most common value in the column
data['size'] = data['size'].fillna('2 BHK')

In [15]:
# Fill the 73 missing bathroom counts with the median of the column
data['bath'] = data['bath'].fillna(data['bath'].median())

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [17]:
# The first whitespace-separated token of `size` (the "2" in "2 BHK") is the
# room count; store it as the integer column `bhk`
data['bhk'] = data['size'].str.split().str[0].astype(int)

In [18]:
# these are basically outliers in data
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [19]:
# total_sqft should be numeric, but some entries are ranges such as '1133 - 1384';
# those are replaced by the average of the two bounds in the next cells
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [20]:
# fix the - in total_sqft
def ConvertRange(x):
    """Convert a raw ``total_sqft`` entry to a float.

    A plain number such as ``'1056'`` is parsed directly. A range written as
    ``'low - high'`` (e.g. ``'1133 - 1384'``) is replaced by the midpoint of
    its two bounds. Values that are already numeric are passed through
    ``float`` so the conversion can safely be applied more than once.

    Returns
    -------
    float or None
        The parsed area, or ``None`` when the entry cannot be parsed (a value
        with a unit suffix, a malformed range, ``None``).
    """
    try:
        if isinstance(x, str):
            bounds = x.split('-')
            if len(bounds) == 2:
                # Keep the range parsing inside the try block so that a
                # malformed bound yields None instead of raising.
                return (float(bounds[0]) + float(bounds[1])) / 2
        return float(x)
    except (TypeError, ValueError):
        return None

In [21]:
data['total_sqft'] = data['total_sqft'].apply(ConvertRange)

In [22]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


### Price per square foot
Used to remove outliers.

In [23]:
# Prices are given in lakhs (1 lakh = 100,000 rupees): convert to rupees first,
# then divide by the area
data['price_per_sqft'] = data['price'].mul(100000).div(data['total_sqft'])

In [24]:
data['price_per_sqft'] 

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [25]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [26]:
data['location'].value_counts()

Whitefield                            540
Sarjapur  Road                        400
Electronic City                       302
Kanakpura Road                        273
Thanisandra                           234
                                     ... 
Neelasandra                             1
Prakruthi Township                      1
Banashankari 6th stage , 2nd block      1
Mukkutam Nagar                          1
BEL Layout                              1
Name: location, Length: 1305, dtype: int64

In [27]:
# Strip surrounding whitespace so that differently padded spellings of the same
# location are counted together, then count the listings per location.
# The counts are used to reduce the number of distinct locations.
data['location'] = data['location'].map(lambda name: name.strip())
location_count = data['location'].value_counts()

In [28]:
# new locations
location_count

Whitefield                541
Sarjapur  Road            400
Electronic City           304
Kanakpura Road            273
Thanisandra               237
                         ... 
BEML Layout 5th stage       1
Neelasandra                 1
Prakruthi Township          1
Banashankari 2nd Stage      1
BEL Layout                  1
Name: location, Length: 1294, dtype: int64

In [29]:
# Locations that occur 10 times or fewer (note the `<=`); there are 1053 of them
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

Naganathapura             10
Dairy Circle              10
Ganga Nagar               10
Nagadevanahalli           10
1st Block Koramangala     10
                          ..
BEML Layout 5th stage      1
Neelasandra                1
Prakruthi Township         1
Banashankari 2nd Stage     1
BEL Layout                 1
Name: location, Length: 1053, dtype: int64

In [30]:
# Collapse every rare location (10 listings or fewer) into the category 'other'
data['location'] = data['location'].apply(
    lambda name: name if name not in location_count_less_10.index else 'other'
)

In [31]:
data['location'].value_counts()

other                        2885
Whitefield                    541
Sarjapur  Road                400
Electronic City               304
Kanakpura Road                273
                             ... 
Tindlu                         11
Marsur                         11
Kodigehalli                    11
2nd Phase Judicial Layout      11
LB Shastri Nagar               11
Name: location, Length: 242, dtype: int64

## Outlier detection and removal

In [32]:
data.describe()
# in min no flat should be of 1 sqft so remove it

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [33]:
(data['total_sqft']/data['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [34]:
# Keep only listings with at least 300 sqft per bedroom; rows whose total_sqft
# is missing (NaN) fail the comparison and are dropped as well
data = data.loc[data['total_sqft'].div(data['bhk']).ge(300)]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [35]:
data.shape

(12530, 7)

In [36]:
# max value must be an outlier
data.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [37]:
def remove_outliers_sqft(df):
    """Drop price-per-sqft outliers separately for each location.

    For every ``location`` group the mean ``m`` and standard deviation ``st``
    of ``price_per_sqft`` are computed (``np.std``, whose default is
    ``ddof=0``), and only rows with ``m - st < price_per_sqft <= m + st``
    are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of all locations, concatenated in group order with
        a fresh 0..n-1 index. An empty frame is returned when ``df`` has no
        groups.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        within_one_std = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
        kept_groups.append(subdf[within_one_std])
    if not kept_groups:
        # pd.concat raises on an empty list; mirror the empty result instead
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(kept_groups, ignore_index=True)
data = remove_outliers_sqft(data)
data.describe()
# reduce max value, mean, std

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [38]:
def bhk_outlier_remover(df):
    """Drop listings that are cheaper per sqft than smaller homes in the same location.

    Within each ``location`` the rows are grouped by ``bhk``. A row with ``n``
    bedrooms is removed when its ``price_per_sqft`` is below the mean
    ``price_per_sqft`` of the ``n - 1`` bedroom listings of that location,
    provided that smaller group contains more than 5 listings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``bhk`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the flagged rows; the remaining rows keep
        their original index labels.
    """
    # Index labels of the rows to drop, kept in a list so they retain their type
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        # Mean price per sqft and number of listings for each bedroom count
        bhk_stats = {
            bhk: {'mean': bhk_df.price_per_sqft.mean(), 'count': bhk_df.shape[0]}
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            # Only compare against the next-smaller group when it has more
            # than 5 listings, so that its mean is reasonably reliable
            if stats is not None and stats['count'] > 5:
                too_cheap = bhk_df.price_per_sqft < stats['mean']
                exclude_indices.extend(bhk_df[too_cheap].index)
    return df.drop(index=exclude_indices)

In [39]:
data = bhk_outlier_remover(data)

In [40]:
data.shape

(7360, 7)

In [41]:
data.head(20)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668
5,1st Block Jayanagar,4 BHK,2750.0,4.0,413.0,4,15018.181818
6,1st Block Jayanagar,4 BHK,2450.0,4.0,368.0,4,15020.408163
8,1st Phase JP Nagar,3 BHK,1875.0,3.0,167.0,3,8906.666667
9,1st Phase JP Nagar,5 Bedroom,1500.0,5.0,85.0,5,5666.666667
10,1st Phase JP Nagar,3 BHK,2065.0,4.0,210.0,3,10169.491525


In [42]:
# `size` is already captured by `bhk`, and `price_per_sqft` was used for the
# outlier removal above; neither is kept in the cleaned data
data = data.drop(columns=['size', 'price_per_sqft'])

### Cleaned Data

In [43]:
data

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2
...,...,...,...,...,...
10292,other,1200.0,2.0,70.0,2
10293,other,1800.0,1.0,200.0,1
10296,other,1353.0,2.0,110.0,2
10297,other,812.0,1.0,26.0,1


In [44]:
# Save the cleaned data as a CSV file.
# NOTE: with the pandas default (index=True) the row index is written as an
# extra, unnamed first column.
data.to_csv("cleaned_data.csv")

In [45]:
# Separate the dependent variable (price) from the independent features
y = data['price']
x = data.drop(columns='price')

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [47]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)

In [48]:
x_train

Unnamed: 0,location,total_sqft,bath,bhk
10044,other,660.0,2.0,2
9216,other,1200.0,4.0,4
3157,Hennur Road,1445.0,2.0,3
4844,Lakshminarayana Pura,1200.0,2.0,2
1,1st Block Jayanagar,1630.0,3.0,3
...,...,...,...,...
6062,Sarjapur,1175.0,2.0,2
3985,Kambipura,883.0,2.0,2
2007,Electronic City,1575.0,3.0,3
3197,Hoodi,706.0,1.0,1


In [49]:
print(x_train.shape)
print(x_test.shape)

(5888, 4)
(1472, 4)


## Applying Linear Regression

In [50]:
# One-hot encode `location` and pass the remaining (numeric) columns through.
# handle_unknown='ignore' encodes a location that was not seen during fit as an
# all-zero row instead of raising at predict time (e.g. a location that ends up
# only in the test split, or new input given to the saved model).
column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), ['location']),
    remainder='passthrough',
)

In [51]:
scaler = StandardScaler()

In [52]:
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2,
# and the pipeline already applies StandardScaler before this estimator --
# consider plain LinearRegression() when upgrading scikit-learn.
lr = LinearRegression(normalize=True)

In [53]:
pipe = make_pipeline(column_trans, scaler, lr)

In [54]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [55]:
y_pred_lr = pipe.predict(x_test)

In [56]:
r2_score(y_test, y_pred_lr)

0.8294338752483829

## Applying Lasso

In [57]:
lasso = Lasso()

In [58]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [59]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [60]:
y_pred_lasso = pipe.predict(x_test)
r2_score(y_test, y_pred_lasso)

0.8199181874762704

## Applying Ridge

In [61]:
ridge = Ridge()

In [62]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [63]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [64]:
y_pred_ridge = pipe.predict(x_test)
r2_score(y_test, y_pred_ridge)

0.8296651410179644

In [65]:
print("No Regularization: ",  r2_score(y_test, y_pred_lr))
print("Lasso: ", r2_score(y_test, y_pred_lasso))
print("Ridge: ", r2_score(y_test, y_pred_ridge))

No Regularization:  0.8294338752483829
Lasso:  0.8199181874762704
Ridge:  0.8296651410179644


In [66]:
import pickle

In [None]:
# Persist the most recently fitted pipeline (the Ridge model). The context
# manager guarantees the file is flushed and closed even if pickling fails.
with open('RidgeModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)