In [1]:
import pandas as pd
df = pd.read_csv('Bengaluru_House_Data (1).csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [2]:
print("Missing values in 'society' before imputation:", df['society'].isnull().sum())
df['society'].fillna('NoSociety', inplace=True)
print("Missing values in 'society' after imputation:", df['society'].isnull().sum())
print("First 5 rows of 'society' column after imputation:")
print(df['society'].head())

Missing values in 'society' before imputation: 5502
Missing values in 'society' after imputation: 0
First 5 rows of 'society' column after imputation:
0       Coomee
1      Theanmp
2    NoSociety
3      Soiewre
4    NoSociety
Name: society, dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['society'].fillna('NoSociety', inplace=True)


In [3]:
print("Missing values in 'society' before imputation:", df['society'].isnull().sum())
df['society'] = df['society'].fillna('NoSociety')
print("Missing values in 'society' after imputation:", df['society'].isnull().sum())
print("First 5 rows of 'society' column after imputation:")
print(df['society'].head())

Missing values in 'society' before imputation: 0
Missing values in 'society' after imputation: 0
First 5 rows of 'society' column after imputation:
0       Coomee
1      Theanmp
2    NoSociety
3      Soiewre
4    NoSociety
Name: society, dtype: object


In [4]:
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outlier_indices = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)].index

print(f"First Quartile (Q1) for 'price': {Q1:.2f}")
print(f"Third Quartile (Q3) for 'price': {Q3:.2f}")
print(f"Interquartile Range (IQR) for 'price': {IQR:.2f}")
print(f"Lower bound for outlier detection: {lower_bound:.2f}")
print(f"Upper bound for outlier detection: {upper_bound:.2f}")
print(f"\nNumber of outliers identified: {len(outlier_indices)}")
print(f"Indices of 'price' outliers:\n{outlier_indices.tolist()}")

First Quartile (Q1) for 'price': 50.00
Third Quartile (Q3) for 'price': 120.00
Interquartile Range (IQR) for 'price': 70.00
Lower bound for outlier detection: -55.00
Upper bound for outlier detection: 225.00

Number of outliers identified: 1276
Indices of 'price' outliers:
[7, 9, 11, 18, 22, 60, 62, 79, 96, 107, 122, 140, 153, 159, 163, 185, 192, 193, 210, 225, 248, 260, 277, 282, 324, 325, 337, 355, 358, 362, 373, 401, 407, 408, 413, 434, 435, 440, 451, 455, 459, 472, 477, 480, 485, 490, 518, 524, 534, 539, 545, 550, 566, 570, 583, 605, 609, 610, 615, 618, 634, 639, 641, 648, 649, 656, 669, 672, 681, 702, 736, 743, 749, 794, 801, 823, 840, 855, 861, 888, 889, 908, 911, 928, 935, 936, 937, 938, 941, 948, 950, 953, 977, 1024, 1044, 1050, 1063, 1065, 1068, 1078, 1090, 1094, 1097, 1124, 1146, 1153, 1176, 1184, 1186, 1227, 1234, 1245, 1246, 1255, 1257, 1262, 1269, 1287, 1296, 1299, 1302, 1311, 1330, 1352, 1357, 1361, 1370, 1379, 1383, 1385, 1393, 1398, 1400, 1405, 1442, 1443, 1478, 1552, 1

In [5]:
print("Missing values in other columns before dropping:\n", df.drop(columns=['society']).isnull().sum()[df.drop(columns=['society']).isnull().sum() > 0])

# Justification for dropping rows with missing values:
# Dropping rows with missing values is a suitable strategy here because for critical columns like 'total_sqft', 'bath', and 'balcony',
# imputation without domain knowledge might introduce inaccuracies. Given the context of house price prediction,
# having complete and accurate information for these features is important. If the percentage of missing values is relatively small,
# dropping them helps maintain data integrity and avoids complex imputation methods that might distort the data distribution.

# Drop rows with any remaining missing values (excluding 'society' which is already imputed)
df_no_nan = df.dropna(subset=df.drop(columns=['society']).columns.tolist())

# Remove rows identified as outliers in the 'price' column
df_cleaned = df_no_nan.drop(outlier_indices, errors='ignore')

print(f"\nShape of the cleaned DataFrame (df_cleaned): {df_cleaned.shape}")
print("First 5 rows of the cleaned DataFrame:")
print(df_cleaned.head())

Missing values in other columns before dropping:
 location      1
size         16
bath         73
balcony     609
dtype: int64

Shape of the cleaned DataFrame (df_cleaned): (11658, 9)
First 5 rows of the cleaned DataFrame:
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

     society total_sqft  bath  balcony   price  
0     Coomee       1056   2.0      1.0   39.07  
1    Theanmp       2600   5.0      3.0  120.00  
2  NoSociety       1440   2.0      3.0   62.00  
3    Soiewre       1521   3.0      1.0   95.00  
4  NoSociety       1200   2.0      1.0   51.00  
