# Housing Price Prediction - Exploratory Data Analysis

This notebook performs exploratory data analysis on the housing dataset.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# 12x6 default figure size (individual cells may override the size).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load the Data

In [None]:
# Read the housing CSV (path is relative to the notebook's working directory)
housing_csv_path = '../data/housing.csv'
df = pd.read_csv(housing_csv_path)

# Report the (rows, columns) shape, then preview the leading rows.
# The bare `df.head()` must stay last so the notebook renders it as a table.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()

## 3. Data Overview and Statistics

In [None]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the report.
print("Data Types and Missing Values:")
df.info()
print("\n")

# Descriptive statistics; left as the last expression so the notebook
# renders the result as a table.
print("Statistical Summary:")
df.describe()

## 4. Univariate Analysis

In [None]:
# Target variable (price): histogram on the left, box plot on the right
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 4))
price_series = df['price']

# Left panel: frequency histogram with 30 bins
price_series.hist(bins=30, edgecolor='black', ax=hist_ax)
hist_ax.set_title('Distribution of House Prices')
hist_ax.set_xlabel('Price')
hist_ax.set_ylabel('Frequency')

# Right panel: box plot of the same column
price_series.plot(kind='box', ax=box_ax)
box_ax.set_title('Box Plot of House Prices')
box_ax.set_ylabel('Price')

fig.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations, restricted to the numeric columns of df
correlation_matrix = df.corr(numeric_only=True)

# Correlation of every numeric column with price, largest value first
price_correlations = correlation_matrix['price'].sort_values(ascending=False)
print("Correlation with Price:")
print(price_correlations)

In [None]:
# Annotated heatmap of the correlation matrix; the diverging colormap is
# centered on 0 and each cell shows its value with two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title('Correlation Matrix of Housing Features')
fig.tight_layout()
plt.show()

## 6. Bivariate Analysis

In [None]:
# Scatter plots of the three features most strongly correlated with price
# (ranked by absolute correlation).
#
# 'price' is removed by label *before* ranking. Slicing it off by position
# after nlargest() assumes it always ranks first, which breaks when another
# column is perfectly correlated with price and appears earlier in the
# column order: price would then be plotted against itself.
top_features = (
    correlation_matrix['price']
    .drop('price')
    .abs()
    .nlargest(3)
    .index
)

fig = plt.figure(figsize=(14, 8))
for position, feature in enumerate(top_features, start=1):
    # 2x2 grid; with three features the last slot is left empty
    ax = fig.add_subplot(2, 2, position)
    ax.scatter(df[feature], df['price'], alpha=0.5)
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    ax.set_title(f'Price vs {feature}')

fig.tight_layout()
plt.show()

## 7. Missing Values Analysis

In [None]:
# Per-column count of null entries and the share of rows they represent
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percentage
})

# Keep only the columns that actually have missing entries. Computed once
# and reused, so the report and the "nothing missing" message cannot disagree.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]

print("Missing Values Summary:")
if columns_with_missing.empty:
    # Avoid printing an "Empty DataFrame" repr when there is nothing to report
    print("No missing values found!")
else:
    print(columns_with_missing)

## 8. Key Insights and Conclusions

In [None]:
# Headline numbers for the dataset and the price column.
# Note: the feature count is the number of columns in df, so it includes
# the 'price' column itself.
n_rows, n_columns = df.shape
price_series = df['price']

summary_lines = [
    "Key Findings:",
    "=" * 50,
    f"1. Total samples: {n_rows}",
    f"2. Total features: {n_columns}",
    f"3. Price range: ${price_series.min():.2f} - ${price_series.max():.2f}",
    f"4. Mean price: ${price_series.mean():.2f}",
    f"5. Median price: ${price_series.median():.2f}",
    f"6. Standard deviation: ${price_series.std():.2f}",
]
print("\n".join(summary_lines))