In [38]:
# Load libraries
import os
import subprocess
import sys
import zipfile
import kagglehub
import shutil
# !pip install kagglehub
# Download latest version
import pandas as pd
import glob
import duckdb
from logging import root
import os

In [None]:
# Download the data from Kaggle and move CSV files to destination folder
def downloadData(dest_folder, kaggle_path=None):
    """
    Fetch a dataset and collect its CSV files directly inside dest_folder.

    The source location is either the path returned by
    ``kagglehub.dataset_download(kaggle_path)`` or, when no kaggle_path is
    given, the module-level global ``path``. Three source shapes are handled:

    * a zip archive: every ``.csv`` member is written into dest_folder
      (archive sub-folders are flattened) and the archive is then deleted;
    * a directory: every ``.csv`` file found recursively is moved into
      dest_folder, replacing same-named files, and the directory is then
      removed;
    * a single ``.csv`` file: it is moved into dest_folder.

    Args:
        dest_folder: Folder that receives the CSV files; created if missing.
        kaggle_path: Optional Kaggle dataset handle passed to kagglehub.

    Raises:
        ValueError: If neither kaggle_path nor a global ``path`` is available,
            or if the source is not a zip archive, a directory or a ``.csv``
            file.
    """
    # determine download path
    if kaggle_path:
        download_path = kagglehub.dataset_download(kaggle_path)
        print("Path to dataset files:", download_path)
    else:
        download_path = globals().get("path")
        if not download_path:
            raise ValueError("No kaggle_path provided and no global 'path' found.")

    # ensure destination exists
    os.makedirs(dest_folder, exist_ok=True)

    if os.path.isfile(download_path) and zipfile.is_zipfile(download_path):
        moved = _extract_zip_csvs(download_path, dest_folder)
        # The archive is no longer needed; failing to delete it is not fatal,
        # but it should not go unnoticed either.
        try:
            os.remove(download_path)
        except OSError as e:
            print("Warning during cleanup:", e)
    elif os.path.isdir(download_path):
        moved = _move_dir_csvs(download_path, dest_folder)
        # Remove what is left of the download folder, unless the destination
        # lives inside it: deleting it would also delete the collected files.
        if _is_within(dest_folder, download_path):
            print("Warning during cleanup:",
                  f"{dest_folder} is inside {download_path}; download folder kept.")
        else:
            shutil.rmtree(download_path, ignore_errors=True)
    elif download_path.endswith(".csv"):
        # single file that is not a zip: move it as-is
        dst = os.path.join(dest_folder, os.path.basename(download_path))
        if not _same_file(download_path, dst):
            shutil.move(download_path, dst)
        moved = 1
    else:
        raise ValueError(f"Unsupported download path: {download_path}")

    print(f"Data downloaded and moved to {dest_folder}. Files moved: {moved}")


def _extract_zip_csvs(zip_path, dest_folder):
    """Write every ``.csv`` member of zip_path straight into dest_folder; return the count."""
    count = 0
    with zipfile.ZipFile(zip_path, "r") as archive:
        for member in archive.namelist():
            if not member.endswith(".csv"):
                continue
            # Stream the member to its flattened target instead of extracting
            # it with its archive sub-path, so no empty folders are left over.
            target = os.path.join(dest_folder, os.path.basename(member))
            with archive.open(member) as src, open(target, "wb") as out:
                shutil.copyfileobj(src, out)
            count += 1
    return count


def _move_dir_csvs(src_folder, dest_folder):
    """Move every ``.csv`` file under src_folder (recursively) into dest_folder; return the count."""
    count = 0
    for current_dir, _, filenames in os.walk(src_folder):
        for fname in filenames:
            if not fname.endswith(".csv"):
                continue
            src = os.path.join(current_dir, fname)
            dst = os.path.join(dest_folder, fname)
            # A file that already sits at its destination must not be
            # "overwritten": removing dst would delete the only copy.
            if _same_file(src, dst):
                continue
            # if destination file exists, overwrite
            if os.path.exists(dst):
                os.remove(dst)
            shutil.move(src, dst)
            count += 1
    return count


def _same_file(first, second):
    """Return True when both paths exist and refer to the same file."""
    return (
        os.path.exists(first)
        and os.path.exists(second)
        and os.path.samefile(first, second)
    )


def _is_within(child, parent):
    """Return True when child is parent itself or is located somewhere below it."""
    child_real = os.path.realpath(child)
    parent_real = os.path.realpath(parent)
    try:
        return os.path.commonpath([child_real, parent_real]) == parent_real
    except ValueError:
        # commonpath refuses paths on different drives; they cannot be nested.
        return False

# load dataset
from logging import root
from isort import file
import pandas as pd
import os

# Summarize each CSV file (shape and column names) without loading it into memory

def showDataShapeColumnNames(csv_folder):
    """
    Print the shape and column names of every CSV file in the given folder.

    Files are processed in sorted name order; entries not ending in ``.csv``
    are skipped. Column names are read with pandas from the header only
    (``nrows=0``). Rows are counted by streaming the file through the ``csv``
    module, so a quoted field that contains line breaks counts as one row and
    empty lines are not counted. The first record is assumed to be the header
    and is subtracted from the count.

    A file that cannot be read or parsed is reported and skipped; the
    remaining files are still processed.

    Args:
        csv_folder: Folder containing the CSV files to summarize.

    Raises:
        ValueError: If csv_folder is not an existing directory.
    """
    import csv  # local import: only this function needs it

    if not os.path.isdir(csv_folder):
        raise ValueError(f"Folder not found: {csv_folder}")

    for fname in sorted(os.listdir(csv_folder)):
        if not fname.endswith('.csv'):
            continue
        path = os.path.join(csv_folder, fname)
        try:
            # get columns without reading full data
            cols = pd.read_csv(path, nrows=0).columns.tolist()
            # Count records rather than raw lines; newline='' lets the csv
            # module see line breaks embedded in quoted fields.
            with open(path, 'r', encoding='utf-8', errors='ignore', newline='') as f:
                total_records = sum(1 for record in csv.reader(f) if record)
        except (OSError, ValueError, csv.Error) as e:
            # pandas parser/empty-data errors and decode errors are ValueErrors
            print(f"Could not read file {path}: {e}")
            continue

        # Subtract 1 for the header record.
        nrows = max(0, total_records - 1)
        print(f"File: {fname}")
        print(f"Shape: ({nrows}, {len(cols)})")
        print("Columns:", cols)
        print()


if __name__ == "__main__":
    # Kaggle dataset handle and the local folder expected to hold its CSV files.
    URL = "yasserh/instacart-online-grocery-basket-analysis-dataset"
    DEST_FOLDER = "../data"

    # Create the target folder up front so that listing it below cannot fail.
    folder_missing = not os.path.isdir(DEST_FOLDER)
    if folder_missing:
        print(f"Destination folder {DEST_FOLDER} does not exist. Creating it.")
        os.makedirs(DEST_FOLDER, exist_ok=True)

    # Download only when the folder does not contain any CSV file yet.
    csv_files = [
        entry for entry in os.listdir(DEST_FOLDER) if entry.endswith(".csv")
    ]
    if not csv_files:
        print(f"No CSV files found in {DEST_FOLDER}, proceeding to download.")
        downloadData(DEST_FOLDER, kaggle_path=URL)
    else:
        print(f"CSV files in {DEST_FOLDER}: {csv_files}")

    # Report shape and column names for whatever CSV files are now present.
    showDataShapeColumnNames(csv_folder=DEST_FOLDER)