In [111]:
import pandas as pd
import numpy as np

In [112]:
# Load the raw listings; the path is relative to the notebook's working directory.
df = pd.read_csv('data/Bengaluru_House_Data.csv')

In [113]:
# Check the null values
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [114]:
# Drop four columns the author judged unhelpful for the model (per a
# correlation check that is not shown in this notebook).
# errors='ignore' keeps this cell re-runnable: `df` is rebound from itself, so
# on a second execution the columns are already gone and a plain drop would
# raise KeyError.
df = df.drop(columns=['area_type', 'availability', 'society', 'balcony'], errors='ignore')

In [115]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [116]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [118]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def convert_total_sq(x):
    """Parse a raw ``total_sqft`` entry into a float.

    Parameters
    ----------
    x : str, number or None
        Raw cell value. Strings may be a plain number (``'1056'``) or a
        ``'low - high'`` range (``'1133 - 1384'``).

    Returns
    -------
    float or None
        The numeric value, or the midpoint for a range. ``None`` when the
        value cannot be interpreted as a number (for example
        ``'34.46Sq. Meter'``).
    """
    if x is None:
        return None

    # Already-numeric input (e.g. when the column was converted earlier):
    # pass it through instead of failing on the missing .split attribute.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    parts = x.split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Not a clean "low - high" range (e.g. '1e-3' or '-5');
            # fall through and try the whole string as a single number.
            pass

    try:
        return float(x)
    except ValueError:
        return None
    
def remove_outliers(df):
    """Keep, per location, only rows whose price per square foot is within one
    standard deviation of that location's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``Price_per_sq_feet``.

    Returns
    -------
    pandas.DataFrame
        Surviving rows, grouped in the order ``groupby('location')`` yields
        them, with a fresh 0..n-1 index. An input that produces no groups
        returns an empty frame that still carries the input columns.

    Notes
    -----
    The kept interval is ``(mean - sd, mean + sd]`` using the population
    standard deviation (``ddof=0``). A location whose standard deviation is 0
    (for example a single listing) is therefore dropped entirely, because the
    lower bound is strict.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        prices = subdf['Price_per_sq_feet']
        mean = prices.mean()
        sd = prices.std(ddof=0)  # population std, same as np.std on a Series

        # Outlier removal: keep data within one standard deviation either side.
        kept.append(subdf[(prices > (mean - sd)) & (prices <= (mean + sd))])

    if not kept:
        # No groups at all: return an empty frame with the original schema so
        # downstream column access keeps working.
        return df.iloc[0:0].reset_index(drop=True)

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)


def cleaning_data(df, rare_location_max_count=10):
    """Return a cleaned copy of the raw listings frame.

    Steps, in order: fill missing ``location``/``bath``/``size``, derive an
    integer ``BHK`` column from the leading token of ``size``, convert
    ``total_sqft`` with ``convert_total_sq``, strip whitespace from
    ``location``, relabel infrequent locations as ``'other'``, and drop
    ``size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bath``, ``size`` and ``total_sqft``.
        It is not modified.
    rare_location_max_count : int, default 10
        Locations that occur this many times or fewer are relabelled
        ``'other'``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``BHK`` added and ``size`` removed.

    Raises
    ------
    ValueError
        If the leading token of a ``size`` value is not an integer.
    """
    # Work on a copy: the caller's frame stays raw, so the calling cell can be
    # re-run without tripping over already-converted columns.
    cleaned = df.copy()

    # Handling null values.
    # NOTE(review): 'Whitefield' and '2 BHK' are hard-coded fill values,
    # presumably the most frequent entries in this dataset. TODO confirm.
    cleaned['location'] = cleaned['location'].fillna('Whitefield')
    cleaned['bath'] = cleaned['bath'].fillna(cleaned['bath'].median())
    cleaned['size'] = cleaned['size'].fillna('2 BHK')

    # Converting size (object, e.g. '2 BHK' / '4 Bedroom') into BHK (int).
    cleaned['BHK'] = cleaned['size'].str.split(' ').str.get(0).astype(int)

    # Cleaning total_sqft: ranges become midpoints, unparseable values become missing.
    cleaned['total_sqft'] = cleaned['total_sqft'].apply(convert_total_sq)

    # Handling the location column: normalise whitespace, then bucket rare locations.
    cleaned['location'] = cleaned['location'].str.strip()
    location_count = cleaned['location'].value_counts()
    rare_locations = location_count[location_count <= rare_location_max_count].index
    cleaned['location'] = cleaned['location'].where(
        ~cleaned['location'].isin(rare_locations), 'other'
    )

    return cleaned.drop(columns=['size'])

new_df = cleaning_data(df)

def remove_out(df, min_sqft_per_bhk=300):
    """Filter implausible and outlying listings out of a cleaned frame.

    Steps, in order:
      1. compute a temporary ``Price_per_sq_feet`` column;
      2. drop rows with less than ``min_sqft_per_bhk`` square feet per BHK;
      3. apply ``remove_outliers`` (per-location price-per-sqft filter);
      4. drop rows whose ``BHK`` lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``price``, ``total_sqft``, ``BHK`` and ``location``.
        It is not modified.
    min_sqft_per_bhk : int or float, default 300
        Minimum ``total_sqft / BHK`` a row needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Filtered rows without the temporary ``Price_per_sq_feet`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain the helper column.
    # NOTE(review): the factor 100000 presumably converts a price quoted in
    # lakhs into rupees. Confirm against the dataset description.
    priced = df.assign(Price_per_sq_feet=df['price'] * 100000 / df['total_sqft'])

    # Remove rows with too little area per bedroom. Rows with a missing
    # total_sqft compare as False and are dropped here as well.
    roomy = priced[(priced['total_sqft'] / priced['BHK']) >= min_sqft_per_bhk]

    trimmed = remove_outliers(roomy)
    if trimmed.empty:
        # Nothing survived: return an empty frame with the expected columns
        # rather than failing on a missing 'BHK' column below.
        return roomy.iloc[0:0].drop(columns=['Price_per_sq_feet'])

    # IQR fences on the BHK count.
    q1 = trimmed['BHK'].quantile(0.25)
    q3 = trimmed['BHK'].quantile(0.75)
    iqr = q3 - q1
    lower_threshold = q1 - 1.5 * iqr
    upper_threshold = q3 + 1.5 * iqr

    is_outlier = (trimmed['BHK'] < lower_threshold) | (trimmed['BHK'] > upper_threshold)
    return trimmed[~is_outlier].drop(columns=['Price_per_sq_feet'])


final_df = remove_out(new_df)

In [119]:
final_df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [120]:
# NOTE: plain assignment. `d` is a second name for the same DataFrame object
# as final_df, not a copy, so in-place changes through either name affect both.
d = final_df

Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2
