In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv(r"D:\DAML-LAB\Housing.csv")

In [3]:
data.head(10)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
# Use 'data' instead of 'df'
# Also, columns 'Dimensions', 'Plot Area', and 'Price (in rupees)' do not exist in 'data'
# Adjust column names to match 'data'

# Example: drop columns if they exist (safe check)
cols_to_drop = ['Dimensions', 'Plot Area']
data = data.drop(columns=[col for col in cols_to_drop if col in data.columns], inplace=False)

# Fill missing values for 'price' column
data['price'].fillna(data['price'].median(), inplace=True)

# Fill missing values for categorical columns
categorical_cols = data.select_dtypes(include='object').columns
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

print(data.isnull().sum())


price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['price'].fillna(data['price'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [7]:
duplicate_count = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
data.drop_duplicates(inplace=True)
print(f"Data shape after removing duplicates: {data.shape}")


Number of duplicate rows: 0
Data shape after removing duplicates: (545, 13)


In [9]:
print("Null values before dropping:\n", data.isnull().sum())
data.dropna(inplace=True)
print(f"Shape after dropping nulls: {data.shape}")
print("Null values after dropping:\n", data.isnull().sum())


Null values before dropping:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64
Shape after dropping nulls: (545, 13)
Null values after dropping:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [11]:
# Use 'data' instead of 'df', and use the correct column name 'price'
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[(data['price'] >= lower_bound) & (data['price'] <= upper_bound)]
print(f"Data shape after removing outliers: {data.shape}")



Data shape after removing outliers: (530, 13)
