In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import StrMethodFormatter

# NOTE: this cell expects `df_filtered` to already exist in the session.
print("\n--- Section 4: Exploratory Data Analysis (EDA) ---")

# --- 4.1 Data Overview ---
# Print the frame's dimensions followed by pandas' own per-column summary.
print("\n4.1 Data Overview:")
print(f"Shape of the filtered data: {df_filtered.shape}")
print("\nDataFrame Info:")
df_filtered.info()

# Tabulate nulls per column as an absolute count and as a share of all rows,
# then display only the columns that contain at least one missing value.
print("\nMissing Values (Count and Percentage):")
missing_data = df_filtered.isnull().sum()
missing_percentage = (missing_data / len(df_filtered)) * 100
missing_info = pd.DataFrame(
    {
        'Missing Count': missing_data,
        'Missing Percentage (%)': missing_percentage,
    }
)
has_missing = missing_info['Missing Count'] > 0
print(missing_info.loc[has_missing])

# --- 4.2 Descriptive Statistics ---
print("\n4.2 Descriptive Statistics:")
print("\nDescriptive Statistics for Numerical Columns:")
numeric_summary = df_filtered.describe()
print(numeric_summary)

# Frequency table for each categorical column of interest; columns that are
# absent from the frame are skipped silently rather than raising.
print("\nValue Counts for Categorical Columns:")
categorical_columns = (
    'Property Type',
    'Old/New',
    'Duration',
    'PPD Category Type',
    'Record Status',
)
for col in categorical_columns:
    if col not in df_filtered.columns:
        continue
    print(f"\n--- {col} ---")
    print(df_filtered[col].value_counts())

# --- 4.3 Distribution of Key Variables ---
print("\n4.3 Distribution of Key Variables:")

# Price: histogram with a KDE overlay (left) beside a box plot (right).
price_values = df_filtered['Price']
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 6))

sns.histplot(price_values, bins=50, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Price')
hist_ax.set_xlabel('Price (£)')
hist_ax.set_ylabel('Frequency')
# Show full numbers on the price axis instead of scientific notation.
hist_ax.ticklabel_format(style='plain', axis='x')

sns.boxplot(y=price_values, ax=box_ax)
box_ax.set_title('Box Plot of Price')
box_ax.set_ylabel('Price (£)')
box_ax.ticklabel_format(style='plain', axis='y')
plt.show()

# Property Type: bars ordered from most to least frequent category.
property_type_order = df_filtered['Property Type'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=df_filtered,
    x='Property Type',
    order=property_type_order,
    ax=ax,
)
ax.set_title('Distribution of Property Types')
ax.set_xlabel('Property Type')
ax.set_ylabel('Count')
plt.show()

# Old/New and Duration: identical count plots, differing only in the column
# and the title, drawn one figure at a time in that order.
for column, plot_title in (
    ('Old/New', 'Distribution of Old/New Properties'),
    ('Duration', 'Distribution of Property Duration (Tenure)'),
):
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.countplot(data=df_filtered, x=column, ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    plt.show()

# --- 4.4 Relationships with Price ---
print("\n4.4 Relationships with Price:")


def _plot_log_price_boxplot(data, column, title, xlabel, figsize=(8, 6), order=None):
    """Show a box plot of ``Price`` grouped by ``column`` on a log-scaled y-axis.

    Args:
        data: DataFrame containing ``column`` and a ``Price`` column.
        column: Name of the grouping column placed on the x-axis.
        title: Figure title.
        xlabel: Label for the x-axis.
        figsize: Figure size in inches as ``(width, height)``.
        order: Optional category order forwarded to ``sns.boxplot``.
    """
    plt.figure(figsize=figsize)
    sns.boxplot(data=data, x=column, y='Price', order=order)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Price (£)')
    plt.yscale('log')
    # plt.ticklabel_format() only works with a ScalarFormatter and raises
    # AttributeError on a log-scaled axis, so plain (non-scientific) major
    # tick labels are requested through an explicit formatter instead.
    plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:.0f}'))
    plt.show()


# Price vs. Property Type (categories ordered by frequency)
_plot_log_price_boxplot(
    df_filtered,
    column='Property Type',
    title='Price Distribution by Property Type',
    xlabel='Property Type',
    figsize=(10, 6),
    order=df_filtered['Property Type'].value_counts().index,
)

# Price vs. Old/New
_plot_log_price_boxplot(
    df_filtered,
    column='Old/New',
    title='Price Distribution by Old/New Property Status',
    xlabel='Old/New',
)

# Price vs. Duration
_plot_log_price_boxplot(
    df_filtered,
    column='Duration',
    title='Price Distribution by Duration (Tenure)',
    xlabel='Duration',
)

# Price vs. Year (Average Price Trend)
# The y-axis stays linear here, so ticklabel_format() is valid.
plt.figure(figsize=(12, 7))
avg_price_per_year = df_filtered.groupby('year')['Price'].mean().reset_index()
sns.lineplot(data=avg_price_per_year, x='year', y='Price', marker='o')
plt.title('Average House Price Trend by Year (2015-2024)')
plt.xlabel('Year')
plt.ylabel('Average Price (£)')
plt.grid(True)
plt.ticklabel_format(style='plain', axis='y')
plt.show()

# Price vs. is_post_covid
_plot_log_price_boxplot(
    df_filtered,
    column='is_post_covid',
    title='Price Distribution: Pre-COVID vs. Post-COVID (from March 2020)',
    xlabel='Is Post-COVID (True if >= Mar 2020)',
)

# --- 4.5 Geographical Analysis ---
print("\n4.5 Geographical Analysis:")

# Top 15 Towns by Average Price
# Mean price per town, keeping the 15 highest as a two-column frame.
mean_price_by_town = df_filtered.groupby('Town')['Price'].mean()
top_towns = mean_price_by_town.nlargest(15).reset_index()
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_towns, x='Price', y='Town', ax=ax)
ax.set_title('Top 15 Towns by Average House Price (2015-2024)')
ax.set_xlabel('Average Price (£)')
ax.set_ylabel('Town')
# Show full numbers on the price axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='x')
fig.tight_layout()
plt.show()

# Top 15 Counties by Average Price
# Same aggregation as above, grouped by county instead of town.
mean_price_by_county = df_filtered.groupby('County')['Price'].mean()
top_counties = mean_price_by_county.nlargest(15).reset_index()
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_counties, x='Price', y='County', ax=ax)
ax.set_title('Top 15 Counties by Average House Price (2015-2024)')
ax.set_xlabel('Average Price (£)')
ax.set_ylabel('County')
ax.ticklabel_format(style='plain', axis='x')
fig.tight_layout()
plt.show()

print("\nEDA complete. Review the plots and printed outputs for insights.")