### This notebook contains code related to features added.

In [1]:
import pandas as pd

In [2]:
# Load csv
df = pd.read_csv("../data/parsed/pune/pune_listings_cleaned.csv")

In [6]:
print(f"Shape : {df.shape}")

Shape : (2829, 7)


In [4]:
print(f"Columns :  {df.columns}")

Columns :  Index(['price_num', 'bhk_number', 'area_sqft', 'price_per_sqft', 'locality',
       'city', 'listing_url'],
      dtype='object')


In [5]:
print(f"Data Types : {df.dtypes}")

Data Types : price_num          int64
bhk_number         int64
area_sqft          int64
price_per_sqft     int64
locality          object
city              object
listing_url       object
dtype: object


In [8]:
#Check for invalid prices
invalid = df[df['price_num'] < 100000]
if len(invalid) > 0:
    print("Unrealistic price rows detected (below 1 lakh):")
    print(invalid.head())
else:
    print("Prices are Valid")

Prices are Valid


In [9]:
#Create price_lakhs column
df['price_lakhs'] = (df['price_num'] / 100000).round(2)

In [10]:
print(df[['price_num', 'price_lakhs']].head())

   price_num  price_lakhs
0    6500000         65.0
1    7000000         70.0
2    2500000         25.0
3   13500000        135.0
4    5000000         50.0


In [11]:
df.to_csv("../data/parsed/pune/pune_listings_cleaned.csv", index=False)

In [12]:
# Before progress validate
df.isna().sum()

price_num         0
bhk_number        0
area_sqft         0
price_per_sqft    0
locality          0
city              0
listing_url       4
price_lakhs       0
dtype: int64

In [13]:
# Fill the urls
df['listing_url'] = df['listing_url'].fillna('Unknown')

In [14]:
#final Check
df.isna().sum()

price_num         0
bhk_number        0
area_sqft         0
price_per_sqft    0
locality          0
city              0
listing_url       0
price_lakhs       0
dtype: int64

### Outlier Detection & Removal

In [15]:
df.columns

Index(['price_num', 'bhk_number', 'area_sqft', 'price_per_sqft', 'locality',
       'city', 'listing_url', 'price_lakhs'],
      dtype='object')

In [16]:
df['price_lakhs'].describe()
df['area_sqft'].describe()
df['price_per_sqft'].describe()

count    2.829000e+03
mean     1.393701e+04
std      1.480987e+05
min      1.500000e+03
25%      7.589000e+03
50%      1.027200e+04
75%      1.356900e+04
max      7.882533e+06
Name: price_per_sqft, dtype: float64

In [17]:
out_price = df[(df['price_lakhs'] < 5) | (df['price_lakhs'] > 1500)]
out_area  = df[(df['area_sqft'] < 150) | (df['area_sqft'] > 10000)]
out_pps   = df[(df['price_per_sqft'] < 1000) | (df['price_per_sqft'] > 50000)]

print("Price outliers:", len(out_price))
print("Area outliers:", len(out_area))
print("PPSQFT outliers:", len(out_pps))

Price outliers: 4
Area outliers: 2
PPSQFT outliers: 5


In [18]:
#Filter price_lakhs
df = df[(df['price_lakhs'] >= 5) & (df['price_lakhs'] <= 1500)]


In [19]:
#Filter area_sqft
df = df[(df['area_sqft'] >= 150) & (df['area_sqft'] <= 10000)]


In [20]:
# Filter price_per_sqft
df = df[(df['price_per_sqft'] >= 1000) & (df['price_per_sqft'] <= 50000)]

In [21]:
df.shape

(2820, 8)

In [22]:
df[['price_lakhs','area_sqft','price_per_sqft']].describe()

Unnamed: 0,price_lakhs,area_sqft,price_per_sqft
count,2820.0,2820.0,2820.0
mean,122.207518,969.697163,11042.529787
std,131.58972,615.024306,4815.200662
min,8.0,180.0,2400.0
25%,47.0,619.75,7587.25
50%,81.8,814.5,10257.0
75%,148.25,1138.0,13553.25
max,1500.0,7500.0,40732.0


In [23]:
df.to_csv("../data/parsed/pune/pune_listings_cleaned.csv", index=False)