In [1]:
# --- Env/KERNEL sanity (helpful in VS Code) ---
import sys, os, importlib.util
print("PYTHON:", sys.executable)
print("CONDA_DEFAULT_ENV:", os.environ.get("CONDA_DEFAULT_ENV"))
print("pandas spec (before import):", importlib.util.find_spec("pandas"))

# --- Standard libraries ---
from pathlib import Path

# --- Third-party (analysis/EDA) ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from loguru import logger

# --- Make local package importable from repo root or notebooks/ ---
HERE = Path.cwd()
PROJ_ROOT = HERE if (HERE / "addiction").exists() else HERE.parent
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

# --- CCDS package: paths + setup helper ---
from addiction import (
    setup,                         # creates data/, reports/, models/, etc. if missing
    DATA_DIR, RAW_DATA_DIR,        # paths
    INTERIM_DATA_DIR, PROCESSED_DATA_DIR,
    REPORTS_DIR, FIGURES_DIR,
)

# Ensure CCDS directory tree exists
setup()

# Quick path printout
print("DATA_DIR:", DATA_DIR)
print("RAW_DATA_DIR:", RAW_DATA_DIR)
print("INTERIM_DATA_DIR:", INTERIM_DATA_DIR)
print("PROCESSED_DATA_DIR:", PROCESSED_DATA_DIR)
print("FIGURES_DIR:", FIGURES_DIR)

# Matplotlib inline for notebooks
%matplotlib inline


PYTHON: /Users/christianfullerton/miniforge3/envs/cigarette-and-drinking-data/bin/python
CONDA_DEFAULT_ENV: cigarette-and-drinking-data
pandas spec (before import): ModuleSpec(name='pandas', loader=<_frozen_importlib_external.SourceFileLoader object at 0x10d3cd670>, origin='/Users/christianfullerton/miniforge3/envs/cigarette-and-drinking-data/lib/python3.12/site-packages/pandas/__init__.py', submodule_search_locations=['/Users/christianfullerton/miniforge3/envs/cigarette-and-drinking-data/lib/python3.12/site-packages/pandas'])


[32m2025-11-04 15:16:42.030[0m | [34m[1mDEBUG   [0m | [36maddiction.config[0m:[36m<module>[0m:[36m29[0m - [34m[1mResolved PROJ_ROOT: /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data[0m


[32m2025-11-04 15:16:42.054[0m | [34m[1mDEBUG   [0m | [36maddiction.config[0m:[36m_configure_logging[0m:[36m65[0m - [34m[1mConfigured loguru with tqdm sink.[0m
[32m2025-11-04 15:16:42.087[0m | [34m[1mDEBUG   [0m | [36maddiction.config[0m:[36mensure_project_dirs[0m:[36m100[0m - [34m[1mEnsured directory exists: /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data/data[0m
[32m2025-11-04 15:16:42.087[0m | [34m[1mDEBUG   [0m | [36maddiction.config[0m:[36mensure_project_dirs[0m:[36m100[0m - [34m[1mEnsured directory exists: /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data/data/raw[0m
[32m2025-11-04 15:16:42.088[0m | [34m[1mDEBUG   [0m | [36maddiction.config[0m:[36mensure_project_dirs[0m:[36m100[0m - [34m[1mEnsured directory exists: /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data/data/interim[0m
[32m2025-11-04 15:16:42.088[0m | [34m[1mDEBUG   

In [2]:
candidate = RAW_DATA_DIR / "addiction_population_data.csv"

if not candidate.exists():
    raise FileNotFoundError(
        f"Expected CSV at {candidate} but it was not found.\n"
        "If your file is named differently, update the filename here."
    )

print("Using CSV:", candidate)


Using CSV: /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data/data/raw/addiction_population_data.csv


In [3]:
df = pd.read_csv(candidate)
print("Rows x Cols:", df.shape)
print("Columns: ", df.columns)
df.head(10)

Rows x Cols: (3000, 25)
Columns:  Index(['id', 'name', 'age', 'gender', 'country', 'city', 'education_level',
       'employment_status', 'annual_income_usd', 'marital_status',
       'children_count', 'smokes_per_day', 'drinks_per_week',
       'age_started_smoking', 'age_started_drinking',
       'attempts_to_quit_smoking', 'attempts_to_quit_drinking',
       'has_health_issues', 'mental_health_status', 'exercise_frequency',
       'diet_quality', 'sleep_hours', 'bmi', 'social_support',
       'therapy_history'],
      dtype='object')


Unnamed: 0,id,name,age,gender,country,city,education_level,employment_status,annual_income_usd,marital_status,...,attempts_to_quit_smoking,attempts_to_quit_drinking,has_health_issues,mental_health_status,exercise_frequency,diet_quality,sleep_hours,bmi,social_support,therapy_history
0,1,Michael Bates,66,Other,Yemen,Martinmouth,Secondary,Student,45595,Married,...,6,2,True,Good,Daily,Average,5.6,22.4,,Current
1,2,Brian Thompson,29,Male,Saudi Arabia,Harperhaven,Primary,Self-Employed,145842,Single,...,1,6,False,Poor,Weekly,Good,6.7,24.1,Moderate,
2,3,Steven Little,75,Male,Togo,Chanport,Postgraduate,Unemployed,162480,Single,...,9,9,True,Good,Never,Good,6.2,22.2,Weak,
3,4,Michael Mathews,35,Other,Togo,North Cory,University,Unemployed,16023,In a relationship,...,5,7,False,Average,Daily,Good,7.2,25.5,Moderate,Current
4,5,Nicholas Sanchez,38,Female,Morocco,Danielberg,College,Self-Employed,62933,In a relationship,...,4,7,True,Poor,Weekly,Good,8.5,31.2,Weak,Past
5,6,Carrie Sellers,17,Male,Belgium,West Arthur,Secondary,Employed,178640,Single,...,2,3,True,Good,Never,Average,6.4,29.3,Weak,Past
6,7,Jessica Riley MD,36,Male,Saint Kitts and Nevis,Christopherstad,College,Self-Employed,20491,Widowed,...,4,9,False,Good,Weekly,Good,4.5,32.6,Moderate,Past
7,8,Walter Tran MD,67,Male,Equatorial Guinea,Silvaberg,College,Unemployed,11275,Single,...,9,1,True,Average,Weekly,Good,3.6,24.6,Weak,
8,9,Melanie Walker,16,Other,Burkina Faso,Brandystad,Postgraduate,Retired,85495,Divorced,...,6,5,False,Good,Weekly,Good,4.6,24.6,,
9,10,Carl Mathews,44,Male,Barbados,Roseborough,University,Self-Employed,192822,Single,...,2,2,False,Average,Weekly,Poor,4.5,26.0,Strong,Current


In [14]:
print(df.columns[df.isna().any()])

Index(['education_level', 'social_support', 'therapy_history'], dtype='object')


In [15]:
df_processed = df.copy()

In [16]:
df_processed = df_processed.drop(columns=["id"])
df_processed = df_processed.set_index("name")

In [17]:
min_income = df_processed["annual_income_usd"].min()
max_income = df_processed["annual_income_usd"].max()
med_income = df_processed["annual_income_usd"].median()
avg_income = df_processed["annual_income_usd"].mean()
std_income = df_processed["annual_income_usd"].std()

print("Minimum income: ", min_income)
print("Maximum income: ", max_income)
print("Median Income: ", med_income)
print("Average Income: ", avg_income)
print("STD Income: ", std_income)

Minimum income:  560
Maximum income:  199951
Median Income:  98616.5
Average Income:  98904.178
STD Income:  57288.035963336144


In [19]:
bins   = [-np.inf, 25000, 50000, 75000, 100000, 150000, np.inf]
labels = ["<25k", "25–50k", "50–75k", "75–100k", "100–150k", "150k+"]
df_processed["income_band"] = pd.cut(df_processed["annual_income_usd"], bins=bins, labels=labels, right=True, include_lowest=True)

In [21]:
mode_by_group = (
    df_processed.groupby(["marital_status", "children_count"])["social_support"]
      .transform(lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan)
)
df_processed["social_support"] = df_processed["social_support"].fillna(mode_by_group)\
                                         .fillna(df["social_support"].mode())

In [23]:
mode_by_group = (
    df_processed.groupby(["employment_status", "income_band"])['education_level']
        .transform(lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan)
)

df_processed["education_level"] = df_processed["education_level"].fillna(mode_by_group)\
                                         .fillna(df["education_level"].mode())

  df_processed.groupby(["employment_status", "income_band"])['education_level']


In [None]:
mode_by_group = (
    df_processed.groupby(['education_level','arital_status','mental_health_status'])['therapy_history']
        .transform(lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan)
)

df_processed["therapy_history"] = df_processed["therapy_history"].fillna(mode_by_group)\
                                         .fillna(df["therapy_history"].mode())

In [None]:
df_processed["smoke_intensity"]  = pd.cut(df_processed["smokes_per_day"], [0,1,5,10,20, np.inf], labels=["none","ultra","light","med","heavy"])
df_processed["drink_intensity"]  = pd.cut(df_processed["drinks_per_week"], [0,1,7,14,28, np.inf], labels=["none","very_low","low","mod","high"])

In [None]:
df["dependents_ratio"] = (df["children_count"] + 1) / (df["income"] + 1)

In [None]:
df["quit_effort_smoke_norm"] = df["attempts_to_quit_smoking"] / (df["smokes_per_day"]+1)
df["quit_effort_drink_norm"] = df["attempts_to_quit_drinking"] / (df["drinks_per_week"]+1)
