In [1]:
import numpy as np 
import pandas as pd

In [2]:
dataset = pd.read_csv("Bengaluru_House_RawData.csv")
dataset.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [4]:
# checking all column and there values 
for column in dataset.columns:
    print(dataset[column].value_counts())
    print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

In [5]:
# checking null values
dataset.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [6]:
# drop useless column for ml model 
dataset.drop(columns=["society", "availability", "balcony"] , inplace=True)

In [7]:
# filling the missing values
dataset["location"].fillna("Whitefield", inplace=True)
dataset["size"].fillna("2 BHK", inplace=True)
dataset["bath"].fillna(dataset["bath"].median() , inplace=True)

In [8]:
# created new column "bhk" and and added only number of bhk and bedrooms
dataset["bhk"] = dataset["size"].str.split().str.get(0).astype("int")
dataset.drop(columns=["size"] , inplace=True)

In [9]:
# here column is in object and have string values '1133 - 1384' so i will replace it with mean  
# of this value after float coversion
print(dataset["total_sqft"].unique())

def convert_to_float(x):
    # temp will store value in list 
    temp = x.split("-")
    if len(temp) == 2:
        # returning mean of temp
        return (float(temp[0]) + float(temp[1])) / 2
    try:
        return float(x)
    except:
        return None

    
dataset["total_sqft"] = dataset["total_sqft"].apply(convert_to_float)

['1056' '2600' '1440' ... '1133 - 1384' '774' '4689']


In [10]:
# float type with mean 
dataset["total_sqft"].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [11]:
# new column - price per sq feet 
dataset["price_per_sqft"] = dataset["price"]*100000 / dataset["total_sqft"]

In [12]:
dataset["price_per_sqft"]

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [13]:
dataset.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [14]:
# we have to apply one hot encoding on location and there are 1305 locations
# so replacing location names with other where value count is < 10 
dataset["location"].value_counts()

location
Whitefield                        541
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [15]:
dataset["location"] = dataset["location"].apply(lambda x : x.strip())
location_count = dataset["location"].value_counts()
location_lessthan_10 = location_count[location_count<=10]
location_lessthan_10

location
Dairy Circle                      10
Nagappa Reddy Layout              10
Basapura                          10
1st Block Koramangala             10
Sector 1 HSR Layout               10
                                  ..
Bapuji Layout                      1
1st Stage Radha Krishna Layout     1
BEML Layout 5th stage              1
singapura paradise                 1
Abshot Layout                      1
Name: count, Length: 1053, dtype: int64

In [16]:
# changing location to other here where count is <=10
dataset["location"] = dataset["location"].apply(lambda value : "other" if value in location_lessthan_10 else value)

In [17]:
dataset["location"].value_counts()

location
other                 2885
Whitefield             542
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

## here , i am using price_per_sqft column to view invalid data
1. removing data by row where price_per_sqft is less than 3500 and greater than 25000 
2. total_sqft - size of home cannot be 1 , so changing minimum to 300
as they are invalid.

So to make ML model accurate , removed this outliner data


In [18]:
# here minimum and maximum values are invalid 
# by math average value for min should be around 1000 and max should be around 25000

dataset.describe().round()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1560.0,3.0,113.0,3.0,7908.0
std,1238.0,1.0,149.0,1.0,106430.0
min,1.0,1.0,8.0,1.0,268.0
25%,1100.0,2.0,50.0,2.0,4267.0
50%,1276.0,2.0,72.0,3.0,5434.0
75%,1680.0,3.0,120.0,3.0,7312.0
max,52272.0,40.0,3600.0,43.0,12000000.0


In [19]:
# total_sqft - size of home cannot be 1 , so changing minimum to 300
dataset = dataset[((dataset["total_sqft"] / dataset["bhk"]) >= 300)] 
dataset.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [20]:
# this will remove sqft outliners as total_sqft have invalid data acc to mean and std 

def remove_outliers_sqft(df):
    df_output = pd.DataFrame()

    # Iterate over each unique location in the DataFrame
    for location, subdf in df.groupby("location"):
        # Calculate the mean and standard deviation of "price_per_sqft" for the current location
        mean_price = np.mean(subdf["price_per_sqft"])
        std_price = np.std(subdf["price_per_sqft"])

        # Filter the data for the current location to keep values within one standard deviation from the mean
        filtered_df = subdf[(subdf["price_per_sqft"] >= (mean_price - std_price)) &
                            (subdf["price_per_sqft"] <= (mean_price + std_price))]

        # Concatenate the filtered data for this location to the output DataFrame
        df_output = pd.concat([df_output, filtered_df], ignore_index=True)

    return df_output

dataset = remove_outliers_sqft(dataset)
dataset.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [21]:
dataset.shape

(10301, 7)

In [22]:
# this will remove bhk outliners

def remove_outline_bhk(df):
    exclude_indices = np.array([])

    # Iterate over each location in the DataFrame
    for location, location_df in df.groupby("location"):
        bhk_stats = {}
        
        # Calculate statistics for each bhk group within the location
        for bhk, bhk_df in location_df.groupby("bhk"):
            bhk_stats[bhk] = {
                "mean": np.mean(bhk_df["price_per_sqft"]),
                "std": np.std(bhk_df["price_per_sqft"]),
                "count": bhk_df.shape[0]
            }
        
        # Iterate over each bhk group within the location
        for bhk, bhk_df in location_df.groupby("bhk"):
            stats = bhk_stats.get(bhk - 1)
            
            # Exclude data points that fall below the mean of the previous bhk group (if it exists) and have a count greater than 5
            if stats and stats["count"] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df["price_per_sqft"] < stats["mean"]].index.values)

    # Drop the excluded indices from the DataFrame
    return df.drop(exclude_indices, axis="index")

In [23]:
dataset = remove_outline_bhk(dataset)
dataset

Unnamed: 0,area_type,location,total_sqft,bath,price,bhk,price_per_sqft
0,Super built-up Area,1st Block Jayanagar,2850.0,4.0,428.0,4,15017.543860
1,Super built-up Area,1st Block Jayanagar,1630.0,3.0,194.0,3,11901.840491
2,Super built-up Area,1st Block Jayanagar,1875.0,2.0,235.0,3,12533.333333
3,Built-up Area,1st Block Jayanagar,1200.0,2.0,130.0,3,10833.333333
4,Super built-up Area,1st Block Jayanagar,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...,...
10292,Carpet Area,other,1200.0,2.0,70.0,2,5833.333333
10293,Super built-up Area,other,1800.0,1.0,200.0,1,11111.111111
10296,Super built-up Area,other,1353.0,2.0,110.0,2,8130.081301
10297,Plot Area,other,812.0,1.0,26.0,1,3201.970443


In [24]:
# sorting by price_per_sqft and removing non useful columns
dataset = dataset.sort_values(by='price_per_sqft', ascending=False)
dataset.drop(columns=["area_type" , "price_per_sqft"] , inplace=True)

# converted bhk to float 
dataset['bhk'] = dataset['bhk'].astype(float)

In [25]:
dataset.info


<bound method DataFrame.info of              location  total_sqft  bath   price  bhk
2541    HAL 2nd Stage      2040.0   4.0   500.0  5.0
2539    HAL 2nd Stage       600.0   3.0   145.0  2.0
1721  Cunningham Road      7500.0   6.0  1800.0  4.0
1712  Cunningham Road      5270.0   4.0  1250.0  4.0
1713  Cunningham Road      3875.0   3.0   864.0  3.0
...               ...         ...   ...     ...  ...
9294            other       470.0   2.0    10.0  1.0
561          Attibele       550.0   1.0    11.5  1.0
9455            other      1000.0   1.0    19.0  1.0
9729            other     10030.0   1.0   150.0  1.0
8703            other      1500.0   1.0    19.5  1.0

[7361 rows x 5 columns]>

In [26]:
# our data Cleaning is complete and ready for ML model 
# changing order for dependent and independent variable
dataset = dataset[["location", "total_sqft", "bath", "bhk", "price"]]

file_path = 'Cleaned_Data.csv'
dataset.to_csv(file_path, index=False) 