In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Map each logical table name to the CSV file it is loaded from.
file_paths = dict(
    orders='olist_orders_dataset.csv',
    customers='olist_customers_dataset.csv',
    sellers='olist_sellers_dataset.csv',
    order_items='olist_order_items_dataset.csv',
)

In [9]:
# Load the four source tables, one DataFrame per entry in file_paths.
orders, customers, sellers, order_items = (
    pd.read_csv(file_paths[table_name])
    for table_name in ('orders', 'customers', 'sellers', 'order_items')
)


In [11]:
# Report the row count of each loaded table, one table per output line.
print(
    f"Orders: {len(orders)} rows",
    f"Customers: {len(customers)} rows",
    f"Sellers: {len(sellers)} rows",
    f"Order Items: {len(order_items)} rows",
    sep="\n",
)

Orders: 99441 rows
Customers: 99441 rows
Sellers: 3095 rows
Order Items: 112650 rows


In [12]:
# Keep only the orders whose status is 'delivered'. The result is copied so
# that columns added later live on an independent frame, not a slice of `orders`.
delivered_orders = orders.loc[orders['order_status'].eq('delivered')].copy()
print(f"Delivered orders: {len(delivered_orders)} rows")

Delivered orders: 96478 rows


In [13]:
# Parse the timestamp columns needed for the delivery-time features into
# datetime values.
date_columns = [
    'order_purchase_timestamp',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

delivered_orders = delivered_orders.assign(
    **{name: pd.to_datetime(delivered_orders[name]) for name in date_columns}
)


In [14]:
# Delivery-time features, all in whole days.
#
# estimated_days / actual_days: days elapsed from the purchase timestamp to the
# estimated / actual delivery timestamp. `.dt.days` keeps only the whole-day
# component, so any fractional day is discarded.
delivered_orders['estimated_days'] = (delivered_orders['order_estimated_delivery_date'] -
                                     delivered_orders['order_purchase_timestamp']).dt.days
delivered_orders['actual_days'] = (delivered_orders['order_delivered_customer_date'] -
                                  delivered_orders['order_purchase_timestamp']).dt.days

# delta: actual delivery date minus estimated delivery date
# (positive = late, negative = early, 0 = delivered on the estimated date).
#
# This is computed from the two calendar dates directly rather than as
# actual_days - estimated_days. Each of those counts is truncated on its own
# relative to the purchase time, so their difference depends on the time of day
# the order was placed: the same (delivered, estimated) pair could come out as
# 0 or 1 depending on the purchase hour. Normalizing both timestamps to
# midnight removes the purchase time from the result.
# Rows without a delivery timestamp get NaN here.
delivered_orders['delta'] = (
    delivered_orders['order_delivered_customer_date'].dt.normalize()
    - delivered_orders['order_estimated_delivery_date'].dt.normalize()
).dt.days


In [15]:
# Discard the rows for which no delta could be computed (delta is NaN).
delivered_orders = delivered_orders.loc[delivered_orders['delta'].notna()].copy()
print(f"Valid delta records: {len(delivered_orders)} rows")


Valid delta records: 96470 rows


In [17]:
# Attach customer, order-item and seller information to each delivered order.
# Every join is a left join, so no row from the left-hand frame is dropped;
# joining order_items can produce several rows for a single order_id.
order_customer = pd.merge(delivered_orders, customers, how='left', on='customer_id')
final_df = pd.merge(
    pd.merge(order_customer, order_items, how='left', on='order_id'),
    sellers,
    how='left',
    on='seller_id',
)

In [18]:
# Feature engineering: calendar features derived from the purchase timestamp.
#   purchase_weekday - day of the week (Monday = 0 ... Sunday = 6)
#   purchase_month   - month number (1-12)
final_df = final_df.assign(
    purchase_weekday=lambda frame: frame['order_purchase_timestamp'].dt.dayofweek,
    purchase_month=lambda frame: frame['order_purchase_timestamp'].dt.month,
)

In [20]:
# Per-order aggregate features, one row per order_id:
#   n_items     - number of non-null order_item_id values
#   n_sellers   - number of distinct sellers
#   sum_freight - sum of freight_value over the order's rows
#   total_price - sum of price over the order's rows
agg_features = (
    final_df
    .groupby('order_id')
    .agg(
        n_items=('order_item_id', 'count'),
        n_sellers=('seller_id', 'nunique'),
        sum_freight=('freight_value', 'sum'),
        total_price=('price', 'sum'),
    )
    .reset_index()
)

In [21]:
# Final dataset: collapse to one row per order and attach the per-order aggregates.
#
# drop_duplicates keeps the first row of each order_id, so the row-level columns
# that remain come from that first row only.
#
# Any aggregate columns already present on final_df are dropped first. Without
# this, running the cell a second time would merge them again and produce
# suffixed duplicates (n_items_x / n_items_y, ...), which breaks later column
# selection by name. On the first run the drop is a no-op.
final_df = (
    final_df
    .drop(columns=[name for name in agg_features.columns if name != 'order_id'],
          errors='ignore')
    .drop_duplicates(subset=['order_id'])
    .merge(agg_features, on='order_id')
)

In [22]:
# Columns selected as features for the training dataset.
# NOTE(review): the original comment on this list said date columns are
# excluded, yet 'shipping_limit_date' and 'order_purchase_timestamp' are listed
# here - confirm whether they are meant to stay.
# NOTE(review): 'order_status' is always 'delivered' after the earlier filter,
# so it presumably carries no information - confirm.
feature_columns = [
    # customer location
    'customer_zip_code_prefix',
    'customer_city',
    'customer_state',
    # seller location
    'seller_zip_code_prefix',
    'seller_city',
    'seller_state',
    # item-level fields
    'shipping_limit_date',
    'price',
    'freight_value',
    'order_item_id',
    # order-level fields
    'order_purchase_timestamp',
    'order_status',
    # purchase calendar features
    'purchase_weekday',
    'purchase_month',
    # per-order aggregates
    'n_items',
    'n_sellers',
    'sum_freight',
    'total_price',
]


In [23]:
# Select the feature columns plus the 'delta' target, then discard every row
# that has a missing value in any of the selected columns.
train_dataset = final_df.loc[:, [*feature_columns, 'delta']].dropna(axis=0, how='any')
print(f"Final training dataset: {len(train_dataset)} rows")

Final training dataset: 96470 rows
