📊 EDA Goals
Understand the distribution of prices and reviews.

Identify correlations between features (e.g., availability, room type, host listings).

Analyze popular neighbourhoods and listing types.

Spot missing/erroneous data visually.

Lay the groundwork for modeling by collecting feature insights.

In [None]:

# Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the figures created below.
sns.set(style="whitegrid")
# IPython magic (notebook-only, not plain Python): render figures inline
# beneath the cell that produces them.
%matplotlib inline

In [None]:
# Read the three processed CSV files (paths are relative to the notebook's
# working directory) into DataFrames, in the order listed.
listings, reviews, merged = map(
    pd.read_csv,
    (
        "../data/processed/clean_listings.csv",
        "../data/processed/clean_reviews.csv",
        "../data/processed/merged_data.csv",
    ),
)

In [None]:
# Quick look at the two base tables: (rows, columns) first, then the
# leading rows of each.
for label, frame in (("Listings shape:", listings), ("Reviews shape:", reviews)):
    print(label, frame.shape)

for frame in (listings, reviews):
    display(frame.head())

In [None]:
# Price distribution, restricted to the 0-500 window that is displayed.
# Binning the full price range and then cropping the x-axis would let a
# few extreme prices stretch the 50 bins (and the KDE grid) so that only a
# handful of coarse bars fall inside the visible window. Restricting the
# data and the bin range to the window gives 50 equal-width bins across
# exactly what is shown.
price_min, price_max = 0, 500
prices = listings['price']
visible_prices = prices[(prices >= price_min) & (prices <= price_max)]

plt.figure(figsize=(10, 5))
sns.histplot(visible_prices, bins=50, binrange=(price_min, price_max), kde=True)
plt.title('Distribution of Listing Prices')
plt.xlabel('Price (Euro)')  # NOTE(review): currency is not verifiable from this notebook - confirm
plt.xlim(price_min, price_max)
plt.show()

In [None]:
# Room type frequency, bars ordered from most to least common
# (value_counts() returns categories in descending order of frequency).
room_type_order = listings['room_type'].value_counts().index

plt.figure(figsize=(8, 4))
sns.countplot(data=listings, x='room_type', order=room_type_order)
plt.title("Room Type Frequency")
plt.show()

In [None]:
# Spread of availability_365 within each room type, drawn on an explicit
# Axes with the y-axis fixed to 0-365.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=listings, x='room_type', y='availability_365', ax=ax)
ax.set_title("Availability by Room Type")
ax.set_ylim(0, 365)
plt.show()

In [None]:
# Correlation heatmap of the numeric listing features.
numeric_features = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'availability_365',
]
corr = listings[numeric_features].corr()

plt.figure(figsize=(8, 6))
# Pin the colour scale to the full correlation range [-1, 1]. 'coolwarm' is
# a diverging map; without fixed limits it is stretched between the data's
# own min and max, so the neutral midpoint would not correspond to zero
# correlation and weak negative values could be drawn as strongly negative.
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Between Numerical Features")
plt.show()

In [None]:
# Ten neighbourhoods with the highest mean price, shown as horizontal bars.
# NOTE(review): the mean is taken over rows of `merged`; if that table has
# more than one row per listing, listings are weighted by their row count -
# confirm against how merged_data.csv is built.
mean_price_by_neigh = merged.groupby("neighbourhood")["price"].mean()
top_neigh = mean_price_by_neigh.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_neigh.values, y=top_neigh.index, ax=ax)
ax.set_title("Top 10 Neighbourhoods by Average Price")
ax.set_xlabel("Average Price")
plt.show()

In [None]:
# Listings per host.
# value_counts() counts rows. Run directly on `listings`, a host that owns
# k listings contributes k rows with the value k, so the bar heights would
# be numbers of listings even though the axis says "Number of Hosts".
# NOTE(review): assumes one row per listing and that `host_id`, when
# present, identifies the host - confirm against the cleaned schema.
count_col = 'calculated_host_listings_count'
if 'host_id' in listings.columns:
    # Keep one row per host so every host is counted exactly once.
    per_host_counts = listings.drop_duplicates(subset='host_id')[count_col]
    y_label = "Number of Hosts"
else:
    # No host identifier available: label the axis as what is really counted.
    per_host_counts = listings[count_col]
    y_label = "Number of Listings"

host_listings = per_host_counts.value_counts().sort_index()
plt.figure(figsize=(10, 5))
sns.barplot(x=host_listings.index, y=host_listings.values)
plt.title("Number of Listings per Host")
plt.xlabel("Listings per Host")
plt.ylabel(y_label)
plt.show()