In [1]:
import kagglehub
import os
import shutil
from pathlib import Path

def download_to_folder(dataset_id, target_folder, dataset_name=None):
    """
    Download a Kaggle dataset and copy its files into a chosen folder.

    The dataset is fetched with ``kagglehub.dataset_download`` and whatever
    path that call returns (a single file or a directory) is copied into
    ``target_folder``. If ``target_folder`` already holds anything, it is
    wiped first so the result contains only the freshly copied files.

    Args:
        dataset_id: Kaggle dataset ID (e.g. 'yasserh/titanic-dataset').
        target_folder: Destination folder path; created if it is missing.
        dataset_name: Dataset name, used only in the progress messages.

    Returns:
        ``str(Path(target_folder))`` on success, or ``None`` if any step
        failed. Failures are printed rather than raised so that a batch of
        downloads can continue past one bad dataset.
    """
    try:
        # Download via KaggleHub; the returned path is where the files live.
        print(f"📥 正在下載 {dataset_name or dataset_id}...")
        cache_path = kagglehub.dataset_download(dataset_id)
        print(f"✅ 下載完成，緩存路徑: {cache_path}")

        cache_path_obj = Path(cache_path)
        target_path = Path(target_folder)

        # Refuse to continue when the target overlaps the downloaded path:
        # clearing the target would delete the source files (target is the
        # source or one of its ancestors), or the copy would recurse into
        # itself (target lives inside the source directory).
        cache_resolved = cache_path_obj.resolve()
        target_resolved = target_path.resolve()
        if (
            cache_resolved == target_resolved
            or target_resolved in cache_resolved.parents
            or cache_resolved in target_resolved.parents
        ):
            raise ValueError(
                f"目標資料夾 {target_folder} 與緩存路徑 {cache_path} 重疊"
            )

        print(f"📁 複製檔案到: {target_folder}")

        # Start from an empty target: drop any previous content, then
        # (re)create the folder together with missing parents.
        if target_path.exists() and any(target_path.iterdir()):
            shutil.rmtree(target_path)
        target_path.mkdir(parents=True, exist_ok=True)

        if cache_path_obj.is_file():
            # The download resolved to a single file.
            shutil.copy2(cache_path_obj, target_path / cache_path_obj.name)
        else:
            # The download resolved to a directory: copy each entry across.
            for item in cache_path_obj.iterdir():
                if item.is_file():
                    shutil.copy2(item, target_path / item.name)
                elif item.is_dir():
                    shutil.copytree(item, target_path / item.name)

        print("✅ 複製完成！")

        # Show the top-level entries that ended up in the target folder.
        files = list(target_path.glob('*'))
        print(f"📄 資料夾中的檔案: {[f.name for f in files]}")

        return str(target_path)

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure
        # with None instead of propagating the exception.
        print(f"❌ 下載 {dataset_name or dataset_id} 失敗: {e}")
        return None

# Root directory under which every dataset gets its own sub-folder
base_dir = "datasets/raw"
Path(base_dir).mkdir(parents=True, exist_ok=True)

# Banner announcing the start of the downloads
print("🚀 開始下載 Kaggle 資料集到指定資料夾...", "=" * 50, sep="\n")

# Fetch the Titanic dataset into <base_dir>/titanic
titanic_path = download_to_folder(
    "yasserh/titanic-dataset",
    base_dir + "/titanic",
    "Titanic Dataset",
)

Downloading from https://www.kaggle.com/api/v1/datasets/download/yasserh/titanic-dataset?dataset_version_number=1...


100%|██████████| 22.0k/22.0k [00:00<00:00, 1.65MB/s]

Extracting files...
Path to dataset files: /home/os-sunnie.gd.weng/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1





In [None]:
# Remaining datasets to fetch: Kaggle ID, display name, and the sub-folder
# (under base_dir) that receives the files
datasets = [
    {"id": "mirichoi0218/insurance",
     "name": "Medical Cost Personal Dataset",
     "folder": "insurance"},
    {"id": "uciml/breast-cancer-wisconsin-data",
     "name": "Breast Cancer Wisconsin",
     "folder": "breast_cancer"},
    {"id": "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
     "name": "IMDB 50K Movie Reviews",
     "folder": "imdb_reviews"},
]

# Display names sorted by outcome, in processing order
successful_downloads = []
failed_downloads = []

for dataset in datasets:
    print("\n" + "=" * 60)
    result = download_to_folder(
        dataset_id=dataset["id"],
        target_folder=f"{base_dir}/{dataset['folder']}",
        dataset_name=dataset["name"],
    )

    # A truthy result is the target path; None marks a failed download
    outcome_list = successful_downloads if result else failed_downloads
    outcome_list.append(dataset["name"])

# Summary of what succeeded and what failed
print("\n" + "=" * 60)
print("📊 下載結果總結:")
print(f"✅ 成功下載 ({len(successful_downloads)} 個):")
for name in successful_downloads:
    print(f"   • {name}")

# The failure section is only shown when at least one download failed
if len(failed_downloads) > 0:
    print(f"\n❌ 下載失敗 ({len(failed_downloads)} 個):")
    for name in failed_downloads:
        print(f"   • {name}")


In [None]:
# Inspect the final folder structure
print("\n📁 最終資料夾結構:")
print("=" * 40)

import os  # also imported in the first cell; kept so this cell's import is unchanged

for root, dirs, files in os.walk(base_dir):
    # Sorting dirs in place makes os.walk descend in a stable order.
    dirs.sort()
    # Depth is derived from the path relative to base_dir, so a folder whose
    # name happens to repeat the base_dir text cannot distort the indentation.
    rel_root = os.path.relpath(root, base_dir)
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in sorted(files):
        print(f"{subindent}{file}")

# Show per-dataset details
print(f"\n📊 各資料集詳細資訊:")
print("=" * 40)

for folder in sorted(os.listdir(base_dir)):
    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    # Walk the whole dataset folder so files in nested sub-directories are
    # counted and sized too, and directories are not reported as files.
    files = []
    total_size = 0
    for sub_root, _sub_dirs, sub_files in os.walk(folder_path):
        for file_name in sub_files:
            full_path = os.path.join(sub_root, file_name)
            # Skip entries that are not regular files (e.g. broken symlinks),
            # for which getsize would raise.
            if os.path.isfile(full_path):
                files.append(os.path.relpath(full_path, folder_path))
                total_size += os.path.getsize(full_path)
    files.sort()

    file_count = len(files)
    size_mb = total_size / (1024 * 1024)

    print(f"\n📂 {folder}:")
    print(f"   • 檔案數量: {file_count}")
    print(f"   • 總大小: {size_mb:.2f} MB")
    print(f"   • 檔案列表: {', '.join(files)}")

print(f"\n🎉 資料集下載完成！所有檔案已存放在 '{base_dir}' 資料夾中。")
