In [1]:
import pandas as pd

# Load the dataset into a pandas DataFrame
df = pd.read_csv('/content/Bengaluru_House_Data (1).csv')

# Display the first 5 rows of the DataFrame
print("First 5 rows of the DataFrame:")
print(df.head())

# Display a summary of the DataFrame, including data types and non-null values
print("\nDataFrame Info:")
df.info()

# Generate descriptive statistics of the numerical columns
print("\nDescriptive Statistics:")
print(df.describe())

First 5 rows of the DataFrame:
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0   Coomee       1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 

In [2]:
print("Missing values in 'society' column before treatment:")
missing_society_count = df['society'].isnull().sum()
missing_society_percentage = (missing_society_count / len(df)) * 100
print(f"Number of missing values: {missing_society_count}")
print(f"Percentage of missing values: {missing_society_percentage:.2f}%")

# Fill missing values in 'society' column with 'Not Available'
df['society'].fillna('Not Available', inplace=True)

print("\nMissing values in 'society' column after treatment:")
print(f"Number of missing values: {df['society'].isnull().sum()}")

Missing values in 'society' column before treatment:
Number of missing values: 5502
Percentage of missing values: 41.31%

Missing values in 'society' column after treatment:
Number of missing values: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['society'].fillna('Not Available', inplace=True)


In [3]:
print("Missing values in 'society' column before treatment:")
missing_society_count = df['society'].isnull().sum()
missing_society_percentage = (missing_society_count / len(df)) * 100
print(f"Number of missing values: {missing_society_count}")
print(f"Percentage of missing values: {missing_society_percentage:.2f}%")

# Fill missing values in 'society' column with 'Not Available'
df['society'] = df['society'].fillna('Not Available')

print("\nMissing values in 'society' column after treatment:")
print(f"Number of missing values: {df['society'].isnull().sum()}")

Missing values in 'society' column before treatment:
Number of missing values: 0
Percentage of missing values: 0.00%

Missing values in 'society' column after treatment:
Number of missing values: 0


In [4]:
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]

print(f"Number of outliers in 'price' column: {len(outliers)}")
print(f"Percentage of outliers: {(len(outliers) / len(df)) * 100:.2f}%")

print("\nDescriptive statistics for 'price' column (original):")
print(df['price'].describe())

print("\nDescriptive statistics for 'price' column (outliers):")
print(outliers['price'].describe())

Number of outliers in 'price' column: 1276
Percentage of outliers: 9.58%

Descriptive statistics for 'price' column (original):
count    13320.000000
mean       112.565627
std        148.971674
min          8.000000
25%         50.000000
50%         72.000000
75%        120.000000
max       3600.000000
Name: price, dtype: float64

Descriptive statistics for 'price' column (outliers):
count    1276.000000
mean      425.746865
std       324.368791
min       226.000000
25%       260.000000
50%       325.000000
75%       450.000000
max      3600.000000
Name: price, dtype: float64


In [5]:
df_cleaned = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]

print(f"Original DataFrame row count: {len(df)}")
print(f"Cleaned DataFrame row count after removing price outliers: {len(df_cleaned)}")
print(f"Number of rows dropped due to price outliers: {len(df) - len(df_cleaned)}")

Original DataFrame row count: 13320
Cleaned DataFrame row count after removing price outliers: 12044
Number of rows dropped due to price outliers: 1276
