In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
data.head()
# price is in lakhs

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.info()
# Null valuues in society, location, size, bath, balcony

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
# to count values in columns
for column in data.columns:
    print(data[column].value_counts())
    print('*'*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64
********************
siz

In [7]:
# Count Null values
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
# Remove irrelevant and most Null values columns
data.drop(columns=['area_type', 'availability', 'society', 'balcony'], inplace= True)

In [9]:
data.describe()
# Only Int or float datatype will be executed

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [11]:
# Drop Null values
data = data.dropna()

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13246 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13246 non-null  object 
 1   size        13246 non-null  object 
 2   total_sqft  13246 non-null  object 
 3   bath        13246 non-null  float64
 4   price       13246 non-null  float64
dtypes: float64(2), object(3)
memory usage: 620.9+ KB


In [13]:
# Fix bedroom and bedroom problem
# create new bhk column
data['bhk'] = data['size'].str.split().str.get(0).astype(int)
# split string by space and taken 1st value as int type

In [14]:
data[data['bhk'] > 20]
# Kind of outlier

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [15]:
data['total_sqft'].unique()
# As some of the values are not in structured way

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [16]:
# define function to convert range from object to float
def convertRange(x):
    y = x.split('-')
    if y==2:
        return (float(y[0]) + float(y[1]))/2
    try:
        return float(x)
    except:
        return None

In [17]:
# apply convertRange
data['total_sqft'] = data['total_sqft'].apply(convertRange)

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13246 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13246 non-null  object 
 1   size        13246 non-null  object 
 2   total_sqft  13056 non-null  float64
 3   bath        13246 non-null  float64
 4   price       13246 non-null  float64
 5   bhk         13246 non-null  int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 724.4+ KB


In [19]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [20]:
# Price per square feet
# convert price from decimal to actula price in rupees and then divide by total sqft
data['price_per_sqft'] = data['price']* 100000/data['total_sqft']

In [21]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [22]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13056.0,13246.0,13246.0,13246.0,13056.0
mean,1554.458192,2.692586,112.389392,2.801902,7952.593
std,1238.479835,1.341506,149.076587,1.295758,107313.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.434
50%,1275.0,2.0,72.0,3.0,5454.545
75%,1670.0,3.0,120.0,3.0,7340.052
max,52272.0,40.0,3600.0,43.0,12000000.0


In [23]:
# count location
data['location'].value_counts()

location
Whitefield                         534
Sarjapur  Road                     392
Electronic City                    302
Kanakpura Road                     266
Thanisandra                        233
                                  ... 
KAMAKIYA                             1
Wheelers Road                        1
MM Layout                            1
Devarabeesana Halli                  1
beml layout, basaveshwara nagar      1
Name: count, Length: 1304, dtype: int64

In [24]:
# strip will remove extra character, here will remove spaces
data['location'] = data['location'].apply(lambda x: x.strip()) 
location_count = data['location'].value_counts()

In [25]:
location_count

location
Whitefield                              535
Sarjapur  Road                          392
Electronic City                         304
Kanakpura Road                          266
Thanisandra                             236
                                       ... 
Duddanahalli                              1
Doddanakunte                              1
Jogupalya                                 1
Subhash Nagar                             1
Kengeri Satellite Town KHB Apartment      1
Name: count, Length: 1293, dtype: int64

In [26]:
# Find out location value counts less tha 10
location_count_less_than_10 = location_count[location_count <=10]

In [27]:
location_count_less_than_10

location
Sector 1 HSR Layout                     10
Basapura                                10
Nagadevanahalli                         10
BTM 1st Stage                           10
Nagappa Reddy Layout                    10
                                        ..
Duddanahalli                             1
Doddanakunte                             1
Jogupalya                                1
Subhash Nagar                            1
Kengeri Satellite Town KHB Apartment     1
Name: count, Length: 1052, dtype: int64

In [28]:
# reduce locations who has less than 10 house by replacing with others

data['location'] = data['location'].apply(lambda x: 'other' if x in location_count_less_than_10 else x)

In [29]:
data['location'].value_counts()

location
other                        2881
Whitefield                    535
Sarjapur  Road                392
Electronic City               304
Kanakpura Road                266
                             ... 
Tindlu                         11
Marsur                         11
2nd Phase Judicial Layout      11
Thyagaraja Nagar               11
HAL 2nd Stage                  11
Name: count, Length: 242, dtype: int64

In [30]:
# removing Outliers
# there is 1 square feet flat, how's possible?

# how much square feet in 1 bhk?
(data['total_sqft']/data['bhk']).describe()
# here 1 flat's bhk is 0.25 sqft

count    13056.000000
mean       572.904171
std        389.662389
min          0.250000
25%        471.666667
50%        551.000000
75%        625.000000
max      26136.000000
dtype: float64

In [31]:
data['total_sqft']/data['bhk']

0         528.00
1         650.00
2         480.00
3         507.00
4         600.00
          ...   
13315     690.60
13316     900.00
13317     570.50
13318    1172.25
13319     550.00
Length: 13246, dtype: float64

In [32]:
# remove flats less than 300 sqft per bhk

data=data[data['total_sqft']/data['bhk'] >= 300]

In [33]:
data.describe()
# min values in sqft is 300  

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12312.0,12312.0,12312.0,12312.0,12312.0
mean,1589.702335,2.562135,111.421129,2.651966,6323.403514
std,1261.895153,1.073093,152.771439,0.973438,4187.211055
min,300.0,1.0,9.0,1.0,267.829813
25%,1118.0,2.0,49.3825,2.0,4208.545855
50%,1300.0,2.0,70.0,3.0,5300.0
75%,1700.0,3.0,115.0,3.0,6938.987948
max,52272.0,16.0,3600.0,16.0,176470.588235


In [34]:
data.shape

(12312, 7)

In [35]:
data.price_per_sqft.describe()
# max value is definately an outlier

count     12312.000000
mean       6323.403514
std        4187.211055
min         267.829813
25%        4208.545855
50%        5300.000000
75%        6938.987948
max      176470.588235
Name: price_per_sqft, dtype: float64

In [36]:
# function to remove outliers in sqft
def remove_outliers_sqft(df):

    # Create an output dataframe
    df_output = pd.DataFrame()
    
    for key,subdf in df.groupby('location'):
        
        # mean of price per sqft for every location
        m = np.mean(subdf.price_per_sqft)

        # Standard variation of price per sqft for every location
        st = np.std(subdf.price_per_sqft)

        # Keeping values between mean - st and mean + st 
        # removing extream values
        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]

        # concating in output data frame
        df_output = pd.concat([df_output,gen_df], ignore_index = True)

    return df_output

In [37]:
# Applying outlier function
data = remove_outliers_sqft(data)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10140.0,10140.0,10140.0,10140.0,10140.0
mean,1502.295919,2.472288,91.070015,2.573964,5670.117874
std,873.190439,0.972408,86.173386,0.893121,2275.56879
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.178277
50%,1283.0,2.0,67.0,2.0,5184.504357
75%,1650.0,3.0,100.0,3.0,6451.612903
max,30400.0,16.0,2200.0,16.0,24509.803922


In [38]:
# Function to remove bhk outlier
def bhk_outlier_remover(df):

    # Create an array to store and then remove indices
    exclude_indices = np.array([])

    # groupby by location
    for location, location_df in df.groupby('location'):

        # create empy dict for bhk stats
        bhk_stats = {}

        # group by bhk on location
        for bhk, bhk_df in location_df.groupby('bhk'):

            # mean, std and count for each bhk(1,2...)
            bhk_stats[bhk] = {
                'mean' : np.mean(bhk_df.price_per_sqft),
                'std' : np.std(bhk_df.price_per_sqft),
                'count' : bhk_df.shape[0]
            }

        # price of 3bhk flat per square feet's lower bound(price) can be 2bhk's mean price
        # so if 3 bhk mean per square feet is higher than 2 bhk per square feet, then we will kep other wise will drop
        for bhk, bhk_df in location_df.groupby('bhk'):

            # so if bhk is 3, then we will look 2 bhi stats
            stats = bhk_stats.get(bhk-1) # get is used as if there is no 3 bhk flat and oly 2 then we will get none
            # if we used like this bhk_stats[bhk-1], it can show error

            # should be 5 data points
            if stats and stats['count']>5:

                # if value of 2 bhk mean per square feet is higher than 3 bhk, then we will add in exclude_indices
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)

    return df.drop(exclude_indices,axis='index')

In [39]:
# applying the bhk_outlier_remover function
data = bhk_outlier_remover(data)

In [40]:
data.shape

(7217, 7)

In [41]:
data
# this is the clean data

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.543860
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...,...
10131,other,2 BHK,1200.0,2.0,70.0,2,5833.333333
10132,other,1 BHK,1800.0,1.0,200.0,1,11111.111111
10135,other,2 BHK,1353.0,2.0,110.0,2,8130.081301
10136,other,1 Bedroom,812.0,1.0,26.0,1,3201.970443


In [42]:
# we added price_per_sqft to check and remove outlier, as data is cleaned we can drop this column
# there is alos no use of size column
data.drop(columns = ['size', 'price_per_sqft'], inplace=True)

In [43]:
#super clean data
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [44]:
#saving clean data
data.to_csv("Cleaned_data.csv")

In [45]:
# data splitting
X = data.drop(columns=['price'])
y = data['price']

In [46]:
# Import regression and machine leaening model for prediction
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [47]:
# data splitting sung train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

In [48]:
print(X_train.shape)
print(X_test.shape)

(5773, 4)
(1444, 4)


In [49]:
# Applying Linear Rergression
# one hot encoder on location colums
column_trans = make_column_transformer((OneHotEncoder(sparse_output=False), ['location']), remainder = 'passthrough')

In [50]:
scaler = StandardScaler()

In [51]:
lr = LinearRegression()

In [52]:
pipe = make_pipeline(column_trans, scaler, lr)

In [53]:
pipe.fit(X_train, y_train)
# with this pipe, data will enter in column_trans, will apply on hot encoding on
# Location column and then data will be scaled on scaler and will fit in linear regression


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [54]:
y_pred_lr = pipe.predict(X_test)

In [55]:
r2_score(y_test, y_pred_lr)

0.8318474563264686

In [56]:
# Applying lasso
lasso = Lasso()

In [57]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [58]:
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [59]:
y_pred_lasso = pipe.predict(X_test)

In [60]:
r2_score(y_test, y_pred_lasso)

0.8179777305421649

In [61]:
# applying ridge
ridge=Ridge()
pipe = make_pipeline(column_trans, scaler, ridge)
pipe.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [62]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test, y_pred_ridge)

0.8319168210064901

In [63]:
print('No Regularization', r2_score(y_test, y_pred_lr))
print('Lasso', r2_score(y_test, y_pred_lasso))
print('Ridge', r2_score(y_test, y_pred_ridge))

No Regularization 0.8318474563264686
Lasso 0.8179777305421649
Ridge 0.8319168210064901


In [64]:
# Here Linear Reg and Ridge almost giving same prediction

In [65]:
import pickle
pickle.dump(pipe, open('RidgeModel.pkl', 'wb'))