In [11]:
# Cell 1 — EDA bootstrap & imports (project-aware)

import sys
from pathlib import Path

def _find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` holding a project marker.

    A directory qualifies when it contains ``pyproject.toml`` or ``.git``.
    When no directory on the way up qualifies, ``start`` itself is returned.
    """
    for directory in (start, *start.parents):
        if any((directory / marker).exists() for marker in ("pyproject.toml", ".git")):
            return directory
    return start


# Anchor paths at the project root so the notebook resolves them the same way
# regardless of which sub-folder it is launched from.
ROOT = _find_project_root(Path.cwd())
SRC = ROOT / "src"

# Put src/ at the front of the import path (only once) so packages under it can be imported.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from addiction.utilities.config import load_config 

import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["figure.dpi"] = 150

# Jupyter magic for inline plots
%matplotlib inline

In [None]:
# Set CSV_OVERRIDE to a path to bypass the configured raw CSV location.
CSV_OVERRIDE = None
CFG = load_config(ROOT / "config" / "config.yaml")

# An explicit override wins; otherwise use the path from the project config.
if CSV_OVERRIDE:
    csv_path = Path(CSV_OVERRIDE)
else:
    csv_path = CFG.paths.raw_csv
assert csv_path.exists(), f"CSV not found: {csv_path}"

# Read with pandas' default decoding first; retry as latin-1 only if decoding fails.
read_kwargs = {"low_memory": False}
try:
    df = pd.read_csv(csv_path, **read_kwargs)
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding="latin-1", **read_kwargs)

# Size summary: row/column counts plus deep memory footprint in MiB (3 decimals).
rows, cols = len(df.index), len(df.columns)
mem_mb = round(df.memory_usage(deep=True).sum() / 2**20, 3)
print(f"Shape: {rows:,} × {cols:,} | Memory: {mem_mb} MB")

# Preview the first ten rows.
df.head(10)

Loaded: /Users/christianfullerton/Desktop/Coding Workspace/Python Workspace/Full Projects/Cigarette-and-Drinking-Data/data/raw/addiction_population_data.csv
Shape: 3,000 × 25 | Memory: 2.184 MB


Unnamed: 0,id,name,age,gender,country,city,education_level,employment_status,annual_income_usd,marital_status,children_count,smokes_per_day,drinks_per_week,age_started_smoking,age_started_drinking,attempts_to_quit_smoking,attempts_to_quit_drinking,has_health_issues,mental_health_status,exercise_frequency,diet_quality,sleep_hours,bmi,social_support,therapy_history
0,1,Michael Bates,66,Other,Yemen,Martinmouth,Secondary,Student,45595,Married,3,5,4,12,29,6,2,True,Good,Daily,Average,5.6,22.4,,Current
1,2,Brian Thompson,29,Male,Saudi Arabia,Harperhaven,Primary,Self-Employed,145842,Single,4,11,3,11,24,1,6,False,Poor,Weekly,Good,6.7,24.1,Moderate,
2,3,Steven Little,75,Male,Togo,Chanport,Postgraduate,Unemployed,162480,Single,5,13,4,18,27,9,9,True,Good,Never,Good,6.2,22.2,Weak,
3,4,Michael Mathews,35,Other,Togo,North Cory,University,Unemployed,16023,In a relationship,3,7,4,37,14,5,7,False,Average,Daily,Good,7.2,25.5,Moderate,Current
4,5,Nicholas Sanchez,38,Female,Morocco,Danielberg,College,Self-Employed,62933,In a relationship,0,8,2,20,35,4,7,True,Poor,Weekly,Good,8.5,31.2,Weak,Past
5,6,Carrie Sellers,17,Male,Belgium,West Arthur,Secondary,Employed,178640,Single,0,6,4,13,35,2,3,True,Good,Never,Average,6.4,29.3,Weak,Past
6,7,Jessica Riley MD,36,Male,Saint Kitts and Nevis,Christopherstad,College,Self-Employed,20491,Widowed,1,9,5,18,23,4,9,False,Good,Weekly,Good,4.5,32.6,Moderate,Past
7,8,Walter Tran MD,67,Male,Equatorial Guinea,Silvaberg,College,Unemployed,11275,Single,2,8,3,15,33,9,1,True,Average,Weekly,Good,3.6,24.6,Weak,
8,9,Melanie Walker,16,Other,Burkina Faso,Brandystad,Postgraduate,Retired,85495,Divorced,1,8,6,10,36,6,5,False,Good,Weekly,Good,4.6,24.6,,
9,10,Carl Mathews,44,Male,Barbados,Roseborough,University,Self-Employed,192822,Single,2,7,2,39,16,2,2,False,Average,Weekly,Poor,4.5,26.0,Strong,Current


In [7]:
# Display the column labels of the loaded frame.
df.columns

Index(['id', 'name', 'age', 'gender', 'country', 'city', 'education_level', 'employment_status', 'annual_income_usd', 'marital_status', 'children_count',
       'smokes_per_day', 'drinks_per_week', 'age_started_smoking', 'age_started_drinking', 'attempts_to_quit_smoking', 'attempts_to_quit_drinking',
       'has_health_issues', 'mental_health_status', 'exercise_frequency', 'diet_quality', 'sleep_hours', 'bmi', 'social_support', 'therapy_history'],
      dtype='object')

In [8]:
# Independent working copy, so later edits to df_copy do not modify `df`.
df_copy = df.copy(deep=True)

# Preview the copy keyed by `id`.
# NOTE(review): set_index returns a new frame and the result is not assigned here,
# so df_copy itself keeps its original index and still has `id` as a column.
# If the id index is meant to persist, assign the result — confirm against later cells.
df_copy.set_index(keys="id")

Unnamed: 0_level_0,name,age,gender,country,city,education_level,employment_status,annual_income_usd,marital_status,children_count,smokes_per_day,drinks_per_week,age_started_smoking,age_started_drinking,attempts_to_quit_smoking,attempts_to_quit_drinking,has_health_issues,mental_health_status,exercise_frequency,diet_quality,sleep_hours,bmi,social_support,therapy_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,Michael Bates,66,Other,Yemen,Martinmouth,Secondary,Student,45595,Married,3,5,4,12,29,6,2,True,Good,Daily,Average,5.6,22.4,,Current
2,Brian Thompson,29,Male,Saudi Arabia,Harperhaven,Primary,Self-Employed,145842,Single,4,11,3,11,24,1,6,False,Poor,Weekly,Good,6.7,24.1,Moderate,
3,Steven Little,75,Male,Togo,Chanport,Postgraduate,Unemployed,162480,Single,5,13,4,18,27,9,9,True,Good,Never,Good,6.2,22.2,Weak,
4,Michael Mathews,35,Other,Togo,North Cory,University,Unemployed,16023,In a relationship,3,7,4,37,14,5,7,False,Average,Daily,Good,7.2,25.5,Moderate,Current
5,Nicholas Sanchez,38,Female,Morocco,Danielberg,College,Self-Employed,62933,In a relationship,0,8,2,20,35,4,7,True,Poor,Weekly,Good,8.5,31.2,Weak,Past
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2996,Mary Bartlett,29,Other,Macedonia,Port Kimberly,Postgraduate,Employed,118269,Single,0,8,3,33,20,9,1,True,Poor,Never,Average,6.6,27.4,Moderate,Current
2997,Beth Sanchez,51,Male,Saudi Arabia,Smithbury,College,Employed,35898,Widowed,2,11,5,25,23,9,2,True,Poor,Rarely,Good,4.3,27.7,Moderate,Past
2998,Megan Duffy,15,Male,Falkland Islands (Malvinas),Clinebury,College,Retired,167501,Widowed,4,12,5,27,19,1,8,True,Poor,Rarely,Poor,6.6,23.5,Moderate,Past
2999,Jeffery Lawson,38,Male,Bermuda,Colleenport,Secondary,Retired,135893,Married,2,9,6,37,23,9,3,False,Poor,Rarely,Poor,4.9,27.2,,Current
