# 01 — Explore Cigarette & Alcohol Data (CCDS)

This notebook uses `addiction.config` for project paths and runs a quick EDA:
- Environment sanity check (prints the Python executable, Conda env, and pandas import spec)
- Directory setup from `config.py`
- CSV auto-discovery (`data/` → `data/raw/`)
- Schema, missingness, summaries, correlations
- Simple histograms/bar charts
- Save cleaned Parquet to `data/interim/` and `data/processed/`

In [None]:
# --- Env/KERNEL sanity (helpful in VS Code) ---
import sys, os, importlib.util
# Which interpreter is this kernel actually running?
print(f"PYTHON: {sys.executable}")
# Name of the active conda environment (prints None outside conda).
print(f"CONDA_DEFAULT_ENV: {os.environ.get('CONDA_DEFAULT_ENV')}")
# Where pandas would be imported from, resolved without importing it yet.
print(f"pandas spec (before import): {importlib.util.find_spec('pandas')}")

# --- Standard libraries ---
from pathlib import Path

# --- Third-party (analysis/EDA) ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from loguru import logger

# --- Make local package importable from repo root, notebooks/, or a nested subfolder ---
# Walk upward from the working directory and take the nearest directory that
# contains an `addiction` entry. Checking only the cwd and its immediate parent
# breaks when the notebook is run from a deeper folder (e.g. notebooks/eda/).
HERE = Path.cwd()
PROJ_ROOT = next(
    (p for p in (HERE, *HERE.parents) if (p / "addiction").exists()),
    HERE.parent,  # fallback when no ancestor contains the package folder
)
# Prepend once so re-running the cell does not pile up duplicate entries.
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

# --- CCDS package: paths + setup helper ---
from addiction import (
    setup,                         # creates data/, reports/, models/, etc. if missing
    DATA_DIR, RAW_DATA_DIR,        # paths
    INTERIM_DATA_DIR, PROCESSED_DATA_DIR,
    REPORTS_DIR, FIGURES_DIR,
)

# Build the CCDS directory tree up front (setup() is the project's helper for this)
setup()

# Echo the resolved project paths so a wrong working directory is obvious early
print(f"DATA_DIR: {DATA_DIR}")
print(f"RAW_DATA_DIR: {RAW_DATA_DIR}")
print(f"INTERIM_DATA_DIR: {INTERIM_DATA_DIR}")
print(f"PROCESSED_DATA_DIR: {PROCESSED_DATA_DIR}")
print(f"FIGURES_DIR: {FIGURES_DIR}")

# Render Matplotlib figures inline in the notebook output.
# NOTE: this is an IPython magic, not Python syntax — it only works inside a
# Jupyter/IPython kernel and must be removed if this cell is exported to a .py script.
%matplotlib inline


In [None]:
# Expected location of the input CSV under the project's data directory.
# Built from separate path segments so it is assembled portably by pathlib.
candidate = DATA_DIR / "raw" / "addiction_population_data.csv"

# Fail fast with an actionable message before attempting to load anything.
# is_file() (rather than exists()) also rejects a directory that happens to
# sit at this path, which would otherwise only fail later inside pd.read_csv.
if not candidate.is_file():
    raise FileNotFoundError(
        f"Expected CSV at {candidate} but it was not found.\n"
        "If your file is named differently, update the filename here."
    )

print("Using CSV:", candidate)


In [None]:
# Load the CSV located by the previous cell into a DataFrame.
df = pd.read_csv(candidate)

# Quick structural overview: dimensions first, then the column index.
print(f"Rows x Cols: {df.shape}")
print(f"Columns:  {df.columns}")

# Last expression in the cell, so the notebook renders the first 10 rows.
df.head(n=10)