In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the source CSV into a DataFrame and preview its first rows.
raw_csv_path = 'real-estate.csv'
data = pd.read_csv(raw_csv_path)
data.head()

In [None]:
def _normalize_column_name(name):
    """Return `name` stripped, with spaces and hyphens turned into underscores, lower-cased."""
    trimmed = name.strip()
    underscored = trimmed.replace(' ', '_').replace('-', '_')
    return underscored.lower()


# Apply the normalisation to every header and show the resulting column index.
cleaned_columns = list(map(_normalize_column_name, data.columns))
data.columns = cleaned_columns
data.columns

In [None]:
# Count the missing values in each column before anything is filled or dropped.
missing_summary = data.isna().sum()
print("Missing Values Summary:\n", missing_summary)

# Each of these columns has its gaps filled with that column's own median;
# any row that still contains a missing value afterwards is discarded.
median_filled_columns = ['bath', 'balcony', 'price']
column_medians = {name: data[name].median() for name in median_filled_columns}
data = data.fillna(value=column_medians).dropna()

# Confirm that no missing values remain.
data.isna().sum()

In [None]:
# Keep only rows whose availability is 'Ready To Move' and whose price exceeds 50.
is_ready_to_move = data['availability'].eq('Ready To Move')
is_above_price_floor = data['price'].gt(50)
subset_data = data.loc[is_ready_to_move & is_above_price_floor]
subset_data.head()

In [None]:
# One-hot encode the categorical columns. With drop_first=True the first
# category of each encoded column gets no indicator column of its own.
categorical_columns = ['area_type', 'availability', 'location']
encoded_data = pd.get_dummies(
    subset_data,
    columns=categorical_columns,
    drop_first=True,
)
encoded_data.head()

In [None]:
# Average price per location, computed from the un-encoded subset.
#
# The one-hot frame was built with drop_first=True, so its "location_*"
# columns leave out the first location category; averaging over those columns
# would silently exclude that location from the ranking. Grouping on the
# original 'location' column covers every location, keeps the names exactly
# as they appear in the data, and avoids a Python loop over every dummy column.
location_avg_prices = subset_data.groupby("location")["price"].mean()

location_avg_prices_df = (
    location_avg_prices
    .rename_axis("Location")
    .reset_index(name="Average Price")
    .sort_values(by="Average Price", ascending=False)
)

# Horizontal bar chart of the ten locations with the highest average price.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=location_avg_prices_df.head(10),
    x="Average Price",
    y="Location",
    palette="viridis",
    ax=ax,
)
ax.set_title("Top 10 Locations by Average Sale Price")
ax.set_xlabel("Average Price")
ax.set_ylabel("Location")
plt.show()

In [None]:
# Remove price outliers with the 1.5 * IQR rule.
#
# The quartiles are taken from the same frame that is being filtered
# (encoded_data). Computing them on the full `data` frame, which includes the
# rows already removed by the earlier subset step, would derive the bounds
# from a different population than the one they are applied to.
encoded_prices = encoded_data['price']

Q1 = encoded_prices.quantile(0.25)
Q3 = encoded_prices.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Series.between is inclusive on both ends by default, i.e. >= lower and <= upper.
cleaned_data = encoded_data[encoded_prices.between(lower_bound, upper_bound)]
cleaned_data.head()

In [None]:
# Write the outlier-filtered frame to disk without the index column.
cleaned_csv_path = "Cleaned_RealEstate_Prices.csv"
cleaned_data.to_csv(cleaned_csv_path, index=False)