# Airbnb Amsterdam 探索性数据分析 / Airbnb Amsterdam Exploratory Data Analysis

本 Notebook 包含对 Airbnb 阿姆斯特丹房源数据的完整探索性数据分析。

This notebook contains a complete exploratory data analysis of Airbnb Amsterdam listings data.

## 分析内容 / Analysis Contents

1. 数据加载 / Data Loading
2. 缺失值检查 / Missing Value Check
3. 数据清洗 / Data Cleaning
4. 数据基本信息描述 / Basic Data Information
5. 房源活动分析 / Activity Analysis
6. 描述性统计分析 / Descriptive Statistics Analysis
7. 许可证分析 / License Analysis
8. 顶级房东分析 / Top Hosts Analysis


In [None]:
# 导入必要的库 / Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Configure matplotlib for CJK text: prefer fonts that contain Chinese glyphs
# and keep the minus sign rendering correctly when such a font is active.
plt.rcParams.update(
    {
        'font.sans-serif': ['Arial Unicode MS', 'SimHei'],
        'axes.unicode_minus': False,
    }
)

# Create the chart output directory up front so later savefig calls succeed.
os.makedirs("../../charts", exist_ok=True)

print("库导入完成 / Libraries imported successfully")


## 1. 数据加载 / Data Loading


In [None]:
# Load the listings table from a path relative to this notebook's location.
listings_csv_path = "../../data/listings.csv"
listing = pd.read_csv(listings_csv_path)

# Report the table dimensions, then preview the first rows.
print(f"数据形状 / Data shape: {listing.shape}")
print(f"列数 / Number of columns: {listing.shape[1]}")
print("\n前5行数据 / First 5 rows:")
listing.head()


## 2. 缺失值检查 / Missing Value Check


In [None]:
# Count the null entries in every column.
missing_value_count = listing.isna().sum()
print("---Number of null in each feature")
print(missing_value_count)

# Narrow the report down to columns that actually contain nulls.
missing_feature = missing_value_count.loc[missing_value_count.gt(0)]
print("\n---Number of null value in this feature")
print("No null value feature" if missing_feature.empty else missing_feature)


## 3. 数据清洗 / Data Cleaning


In [None]:
# Data cleaning: drop an unused column and replace nulls with sentinel values.

# errors="ignore" keeps this cell re-runnable: once the column has been
# dropped, a plain drop would raise KeyError on the second execution.
listing = listing.drop(columns="neighbourhood_group", errors="ignore")

# Sentinel used for nulls in each column.
# - license: the license classification cell matches unlicensed listings with
#   `listing["license"] == 0`, so the 0 sentinel must stay as-is.
# - NOTE(review): last_review presumably holds date strings; filling it with 0
#   mixes types in that column -- confirm before parsing it as dates.
null_fill_values = {
    "last_review": 0,
    "reviews_per_month": 0,
    "name": "blank_name",
    "host_name": "blank_host_name",
    "license": 0,
}
for column_name, fill_value in null_fill_values.items():
    listing[column_name] = listing[column_name].fillna(fill_value)

print("数据清洗完成 / Data cleaning completed")
print(f"清洗后数据形状 / Data shape after cleaning: {listing.shape}")


## 4. 数据基本信息描述 / Basic Data Information


In [None]:
# Basic dataset information: column dtypes / non-null counts, then summary
# statistics for the numeric columns.
print("数据信息 / Data Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
listing.info()

print("\n数据统计描述 / Data Description:")
listing.describe()


## 5. 房源活动分析 / Activity Analysis


In [None]:
# Activity analysis: estimate booked nights and plot their distribution.

# Occupancy is approximated as the complement of availability over one year.
# NOTE(review): availability_365 presumably also counts host-blocked days as
# unavailable, so this is an upper bound on real bookings -- confirm.
listing["occupancy days"] = 365 - listing["availability_365"]
print(f"The average nights booked is {round(listing['occupancy days'].mean())} days")

# Average nightly price; labelled in euro to match the price histogram's
# "price(€)" axis used elsewhere in this notebook.
print(f"The average price per night is €{round(listing['price'].mean())}")

# 30-day bins whose last edge is 365. Edges built with range(0, 365, 30) stop
# at 360, which silently drops every listing with 361-365 occupancy days
# (i.e. availability_365 between 0 and 4) from the histogram.
occupancy_bins = list(range(0, 360, 30)) + [365]

plt.figure(figsize=(8, 8))
listing["occupancy days"].plot(kind="hist", bins=occupancy_bins, edgecolor="black")
plt.title("Occupancy distribution")
plt.xlabel("Occupancy (last 12 months)")
plt.ylabel("listing")
plt.tight_layout()
plt.savefig("../../charts/occupancy_distribution.png", dpi=300, bbox_inches='tight')
plt.show()
print("图表已保存至: charts/occupancy_distribution.png")


## 6. 描述性统计分析 / Descriptive Statistics Analysis


In [None]:
# Price distribution: summary statistics followed by a 60-bin histogram.
price_series = listing["price"]
print("价格统计描述 / Price Statistics:")
print(price_series.describe())

price_fig, price_ax = plt.subplots(figsize=(8, 5))
price_series.plot.hist(bins=60, edgecolor='black', ax=price_ax)
price_ax.set_title('Price distribution')
price_ax.set_xlabel("price(€)")
price_ax.set_ylabel("Number of listing")
price_fig.tight_layout()
price_fig.savefig("../../charts/price_distribution.png", dpi=300, bbox_inches='tight')
plt.show()
print("图表已保存至: charts/price_distribution.png")


In [None]:
# Room type share: pie chart of listing counts per room type.
room_type_counts = listing["room_type"].value_counts()

room_fig, room_ax = plt.subplots(figsize=(6, 6))
room_type_counts.plot.pie(autopct="%1.1f%%", ax=room_ax)
room_ax.set_title('Room Type share')
# Blank out the y-label that pandas would otherwise derive from the Series.
room_ax.set_ylabel("")
room_fig.tight_layout()
room_fig.savefig("../../charts/room_type_distribution.png", dpi=300, bbox_inches='tight')
plt.show()
print("图表已保存至: charts/room_type_distribution.png")


In [None]:
# Review count distribution: 60-bin histogram of number_of_reviews.
review_fig, review_ax = plt.subplots(figsize=(8, 5))
listing["number_of_reviews"].plot.hist(bins=60, edgecolor="black", ax=review_ax)
review_ax.set(
    title="Review distribution",
    xlabel="Number of Reviews",
    ylabel="Number of Listing",
)
review_fig.tight_layout()
review_fig.savefig("../../charts/review_distribution.png", dpi=300, bbox_inches='tight')
plt.show()
print("图表已保存至: charts/review_distribution.png")


In [None]:
# Mean price per room type, drawn as a bar chart in ascending order.
mean_price_by_room_type = listing.groupby("room_type")["price"].mean()
mean_price_by_room_type = mean_price_by_room_type.sort_values()

room_price_fig, room_price_ax = plt.subplots(figsize=(10, 10))
mean_price_by_room_type.plot.bar(ax=room_price_ax)
room_price_ax.set_title("average price of room type")
room_price_ax.set_xlabel("room type")
room_price_ax.set_ylabel("average price")
room_price_fig.tight_layout()
room_price_fig.savefig("../../charts/avg_price_by_room_type.png", dpi=300, bbox_inches='tight')
plt.show()
print("图表已保存至: charts/avg_price_by_room_type.png")


In [None]:
# Mean price per neighbourhood, most expensive first, drawn as a bar chart.
mean_price_by_neighbourhood = (
    listing.groupby("neighbourhood")["price"]
    .mean()
    .sort_values(ascending=False)
)

hood_fig, hood_ax = plt.subplots(figsize=(15, 12))
mean_price_by_neighbourhood.plot.bar(ax=hood_ax)
hood_ax.set_title("Average Price of neighbourhoods")
hood_ax.set_xlabel("Neighbourhoods")
hood_ax.set_ylabel("Average Price")
# Slant the neighbourhood names so they remain legible along the x-axis.
plt.xticks(rotation=45, ha='right')
hood_fig.tight_layout()
hood_fig.savefig("../../charts/avg_price_by_neighbourhood.png", dpi=300, bbox_inches='tight')
plt.show()
print("图表已保存至: charts/avg_price_by_neighbourhood.png")


## 7. 许可证分析 / License Analysis


In [None]:
# License analysis: label every listing by license status and chart the split.
#
# Classification rules, evaluated in order (first match wins):
#   0 or null            -> "Unlicensed"
#   "Exempt"             -> "Exempt"
#   starts with "0363"   -> "Licensed"
#   anything else        -> "pending"
license_values = listing["license"]
pie_label = np.select(
    [
        # Nulls are matched explicitly so the labels stay correct even if the
        # fill-with-0 cleaning step has not been run on this DataFrame.
        (license_values == 0) | license_values.isna(),
        license_values == "Exempt",
        license_values.astype(str).str.startswith("0363"),
    ],
    ["Unlicensed", "Exempt", "Licensed"],
    default="pending",
)
listing["License_label"] = pie_label

# Compute the counts once and reuse them for both the chart and the printout.
license_counts = listing["License_label"].value_counts()

# License distribution pie chart.
plt.figure(figsize=(8, 8))
license_counts.plot(kind="pie", autopct="%1.1f%%")
plt.title("Licenses")
# Blank the auto-generated y-label, matching the room type pie chart.
plt.ylabel("")
plt.tight_layout()
plt.savefig("../../charts/license_distribution.png", dpi=300, bbox_inches='tight')
plt.show()
print("图表已保存至: charts/license_distribution.png")

# License classification statistics.
print("\n许可证分类统计 / License Classification Statistics:")
print(license_counts)


## 8. 顶级房东分析 / Top Hosts Analysis


In [None]:
# Top hosts analysis: listings per host, broken down by room type.
#
# Different hosts can share the same display name, so counting by host_name
# alone merges unrelated hosts into one row. When a host_id column is present
# it is used as the primary key, with host_name kept alongside for readability.
# NOTE(review): host_id is not referenced anywhere else in this notebook; it is
# assumed to uniquely identify a host -- confirm against the CSV schema. When
# the column is absent the original host_name-only grouping is used.
if "host_id" in listing.columns:
    host_keys = [listing["host_id"], listing["host_name"]]
else:
    host_keys = listing["host_name"]

# Cross-tabulate hosts against room types (cell value = number of listings).
room_count = pd.crosstab(host_keys, listing["room_type"])
# Total listings per host across all room types.
room_count["listings"] = room_count.sum(axis=1)
# Largest hosts first.
room_count = room_count.sort_values(by="listings", ascending=False)
# Show the ten hosts with the most listings.
print("前10名房东及其房源分布 / Top 10 Hosts and their Room Type Distribution:")
room_count.head(10)


## 分析总结 / Analysis Summary

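本 Notebook 完成了以下分析 / This notebook completed the following analyses:

1. ✅ 数据加载和基本信息查看 / Data loading and basic information
2. ✅ 缺失值检查和统计 / Missing value check and counts
3. ✅ 数据清洗（删除列、填充缺失值）/ Data cleaning (dropping a column, filling missing values)
4. ✅ 房源活动分析（入住率、平均价格）/ Activity analysis (occupancy, average price)
5. ✅ 描述性统计分析（价格、房型、评论分布）/ Descriptive statistics (price, room type, and review distributions)
6. ✅ 许可证分类和分布分析 / License classification and distribution
7. ✅ 顶级房东排名分析 / Top hosts ranking

所有图表已保存至 `charts/` 目录。 / All charts are saved to the `charts/` directory.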
