In [1]:
# Data Source: Inside Airbnb
# Snapshot: June 2025
# File: listings.csv (summary, 18 columns)
# Goal: city-level preprocessing + light EDA
# Note: Do NOT drop columns, row-level cleaning only

In [2]:
# Mount Google Drive so files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Load the listings CSV from Drive and check its size (rows, columns).
import pandas as pd

path = '/content/drive/MyDrive/bangkok_listings_clean.csv'
# low_memory=False reads the file in a single pass so each column gets one
# consistently inferred dtype.
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

df.shape

(28196, 16)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)


Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
0,27934,Nice room with superb city view,120437,Nuttee,Ratchathewi,13.75983,100.54134,Entire home/apt,1625.0,15,65,2024-09-17,0.4,1,362,1
1,27979,"Easy going landlord,easy place",120541,Emy,Bang Na,13.66818,100.61674,Private room,,1,0,,,2,0,0
2,28745,modern-style apartment in Bangkok,123784,Familyroom,Bang Kapi,13.75232,100.62402,Private room,,60,0,,,1,0,0
3,47516,Beautiful waterfront house,214456,Anuradha,Don Mueang,13.92726,100.58529,Entire home/apt,4266.0,3,0,,,1,365,0
4,48736,Condo with Chaopraya River View,222005,Athitaya,Rat Burana,13.68556,100.49535,Private room,1477.0,14,1,2014-02-03,0.01,1,365,0


In [5]:
# Print column dtypes and non-null counts of the loaded frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28196 entries, 0 to 28195
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              28196 non-null  int64  
 1   name                            28196 non-null  object 
 2   host_id                         28196 non-null  int64  
 3   host_name                       27256 non-null  object 
 4   neighbourhood                   28196 non-null  object 
 5   latitude                        28196 non-null  float64
 6   longitude                       28196 non-null  float64
 7   room_type                       28196 non-null  object 
 8   price                           22666 non-null  float64
 9   minimum_nights                  28196 non-null  int64  
 10  number_of_reviews               28196 non-null  int64  
 11  last_review                     18385 non-null  object 
 12  reviews_per_month               

In [6]:
# Share of missing values per column, most incomplete columns first.
missing_ratio = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
)
missing_ratio

Unnamed: 0,0
reviews_per_month,0.347957
last_review,0.347957
price,0.196127
host_name,0.033338
neighbourhood,0.0
id,0.0
name,0.0
host_id,0.0
room_type,0.0
longitude,0.0


In [7]:
# Rows without a price cannot be used for price analysis, so drop them.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells modify it directly and cannot trigger pandas'
# SettingWithCopyWarning.
df = df.dropna(subset=["price"]).copy()

In [8]:
# Listings without reviews have no monthly review rate; interpret the gap as 0.
df["reviews_per_month"] = df["reviews_per_month"].fillna(value=0)

In [9]:
# Re-check the missing-value ratios and the frame size after cleaning.
# A notebook cell only auto-displays its last expression, so the ratios are
# printed explicitly; otherwise they would be computed and silently discarded.
print(df.isna().mean().sort_values(ascending=False))
df.shape

(22666, 16)

In [10]:
# Base exchange rate posted by the bank on 2025-06-24.
# NOTE(review): the ratio looks like a cross rate through a third currency
# (presumably KRW per THB divided by KRW per USD) — confirm against the quotes used.
THB_TO_USD_20250624 = 42.15 / 1381.90

# Add a USD price column (assumes `price` is denominated in THB — TODO confirm).
df["price_usd"] = df["price"].mul(THB_TO_USD_20250624)

In [11]:
# Compare the original and converted prices, then summarise the USD column.
# A notebook cell only auto-displays its last expression, so the preview is
# printed explicitly; otherwise it would be computed and silently discarded.
print(df[["price", "price_usd"]].head())
df["price_usd"].describe()

Unnamed: 0,price_usd
count,22666.0
mean,75.358759
std,503.462843
min,1.525074
25%,27.54284
50%,41.177003
75%,64.35813
max,30501.483465


In [12]:
# Keep the USD price at one decimal place.
df["price_usd"] = df["price_usd"].round(decimals=1)

In [13]:
# Preview the first five rows, now including the price_usd column.
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,price_usd
0,27934,Nice room with superb city view,120437,Nuttee,Ratchathewi,13.75983,100.54134,Entire home/apt,1625.0,15,65,2024-09-17,0.4,1,362,1,49.6
3,47516,Beautiful waterfront house,214456,Anuradha,Don Mueang,13.92726,100.58529,Entire home/apt,4266.0,3,0,,0.0,1,365,0,130.1
4,48736,Condo with Chaopraya River View,222005,Athitaya,Rat Burana,13.68556,100.49535,Private room,1477.0,14,1,2014-02-03,0.01,1,365,0,45.1
5,55681,Sathorn Terrace Apartment(61),263049,Tor,Bang Rak,13.71934,100.5176,Private room,1354.0,2,38,2025-02-28,0.22,7,364,4,41.3
8,105042,Central Bangkok 3 Bedroom Apartment,545890,Henry,Khlong Toei,13.73378,100.56303,Entire home/apt,5704.0,28,147,2020-01-07,0.88,1,362,0,174.0


In [14]:
# Re-check missing values: per-column missing share, largest first (top 10 shown).
missing_ratio_after = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
)
print(missing_ratio_after.iloc[:10])

last_review      0.302215
host_name        0.033045
id               0.000000
host_id          0.000000
name             0.000000
latitude         0.000000
longitude        0.000000
room_type        0.000000
neighbourhood    0.000000
price            0.000000
dtype: float64


In [15]:
# Price sanity check: count listings whose price is zero or negative.
print("price <= 0:", df["price"].le(0).sum())
if "price_usd" in df.columns:
    print("price_usd <= 0:", df["price_usd"].le(0).sum())
else:
    # Message shown when the USD column has not been created.
    print("price_usd <= 0:", "price_usd 없음")

price <= 0: 0
price_usd <= 0: 0


In [16]:
# Duplicate check: fully identical rows, then repeated listing ids.
print("duplicated rows:", df.duplicated(keep="first").sum())
print("duplicated id:", df.duplicated(subset=["id"]).sum())

duplicated rows: 0
duplicated id: 0


In [17]:
# Summary statistics for the price columns; fall back to `price` alone when
# the USD column is absent.
if "price_usd" in df.columns:
    print(df[["price", "price_usd"]].describe())
else:
    print(df[["price"]].describe())

                price     price_usd
count    22666.000000  22666.000000
mean      2470.658828     75.361008
std      16506.175624    503.462887
min         50.000000      1.500000
25%        903.000000     27.500000
50%       1350.000000     41.200000
75%       2110.000000     64.400000
max    1000000.000000  30501.500000


“방콕 숙소 데이터는 가격·리뷰 결측을 규칙 기반으로 정리했고,
2025년 6월 24일 환율 기준으로 USD 환산 컬럼을 추가한 상태로,
NYC 데이터와 직접 비교 가능한 클린 버전이다.”

In [20]:
# Tag every row with its city.
df = df.assign(city="Bangkok")

In [21]:
import os

save_path = "/content/drive/MyDrive/airbnb/clean/bangkok_listings_jun2025_summary_clean.csv"

# DataFrame.to_csv does not create missing parent folders and raises OSError
# when the target directory is absent, so make sure it exists first.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# index=False: the row index is not written to the file.
df.to_csv(save_path, index=False)

# Verify the write: the file must exist, and reading it back shows its shape.
print("Saved:", os.path.exists(save_path))
print("Reload shape:", pd.read_csv(save_path, low_memory=False).shape)

Saved: True
Reload shape: (22666, 18)


In [22]:
# Print column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22666 entries, 0 to 28195
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              22666 non-null  int64  
 1   name                            22666 non-null  object 
 2   host_id                         22666 non-null  int64  
 3   host_name                       21917 non-null  object 
 4   neighbourhood                   22666 non-null  object 
 5   latitude                        22666 non-null  float64
 6   longitude                       22666 non-null  float64
 7   room_type                       22666 non-null  object 
 8   price                           22666 non-null  float64
 9   minimum_nights                  22666 non-null  int64  
 10  number_of_reviews               22666 non-null  int64  
 11  last_review                     15816 non-null  object 
 12  reviews_per_month               22666