In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import os
import sys
from pathlib import Path
from datetime import datetime

In [None]:
# Load the project path configuration. The context manager guarantees the
# file handle is closed instead of leaving it open for the kernel's lifetime.
with open("configs/paths.yaml") as paths_file:
    paths = yaml.safe_load(paths_file)

RAW = Path(paths["raw_dir"])
INTERIM = Path(paths["interim_dir"])
PROCESSED = Path(paths["processed_dir"])
# Output directories are created up front so the export cell cannot fail on them.
for output_dir in (INTERIM, PROCESSED):
    output_dir.mkdir(parents=True, exist_ok=True)

# Prefer the known raw file; otherwise fall back to the alphabetically first CSV.
preferred_raw = RAW / "updated_cigarette_drinking_data.csv"
if preferred_raw.exists():
    raw_path = preferred_raw
else:
    csv_candidates = sorted(RAW.glob("*.csv"))
    if not csv_candidates:
        # Fail with an actionable message rather than an IndexError on [0].
        raise FileNotFoundError(f"No CSV files found in raw data directory: {RAW}")
    raw_path = csv_candidates[0]
raw_path

In [None]:
# Make the project's "src" directory importable; the guard keeps re-running
# this cell from stacking duplicate entries on sys.path.
if "src" not in sys.path:
    sys.path.append("src")

try:
    from cig_drink import basic_clean  # reusable project cleaner (why: consistency)
except ImportError:
    # Only a missing module / missing name triggers the fallback. Any other
    # error raised while importing cig_drink now surfaces instead of being
    # silently replaced by the local cleaner.
    def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with normalised column names.

        Column names are stripped of surrounding whitespace, lower-cased,
        spaces are replaced by underscores and ``&`` by ``and``. The input
        frame itself is not modified.
        """
        df = df.copy()
        df.columns = (df.columns.str.strip().str.lower()
                      .str.replace(" ", "_", regex=False)
                      .str.replace("&", "and", regex=False))
        return df

# Read the selected raw CSV and pass it straight through the column cleaner.
df = basic_clean(pd.read_csv(raw_path))
# Preview the first rows.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/christianfullerton/Desktop/Coding Workspace/Python Workspace/Data Science Practice/Cigarettes and Smoking Data/addiction_population_data.csv'

In [None]:
# Columns that are stored with the pandas 'category' dtype.
categorical_columns = [
    'gender',
    'marital_status',
    'mental_health_status',
    'education_level',
    'employment_status',
    'exercise_frequency',
    'diet_quality',
    'social_support',
    'therapy_history',
]
# Cast one column at a time so each keeps its own set of categories.
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype('category')

In [None]:
# Print a concise summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Display descriptive statistics using pandas' default column selection.
df.describe()

In [None]:
# Display the frame indexed by 'id'. set_index returns a new DataFrame and the
# result is not assigned here, so `df` keeps its original index and 'id' column.
# NOTE(review): if a persistent index was intended, the result must be assigned —
# confirm first, because the export at the end writes with index=False and would
# then omit 'id'.
df.set_index('id')

In [None]:
# Round annual income to the nearest thousand dollars (decimals=-3).
df['annual_income_usd'] = df['annual_income_usd'].round(-3)

In [None]:
# Quantile edges and the tier label for each of the six resulting bins.
quantiles = [0, 0.05, 0.25, 0.5, 0.75, 0.9, 1.0]
labels = [
    "Bottom 5%",
    "Bottom 25%",
    "Bottom 50%",
    "Top 50%",
    "Top 25%",
    "Top 10%",
]
# Bin incomes by quantile and store the tier as a categorical column.
income_tier = pd.qcut(df['annual_income_usd'], q=quantiles, labels=labels)
df['salary_percentile'] = income_tier.astype('category')

In [None]:
# Bucket ages into life-stage groups.
# np.select uses the first condition that matches, so each upper bound below
# implicitly excludes the ranges already claimed by earlier conditions. Using
# contiguous bounds (< 30, < 60) closes the gap the previous "<= 59" bound left
# for fractional ages between 59 and 60, which were labelled 'Unknown'.
choices = ['Teenager', 'Young Adult', 'Middle Age Adult', 'Elder']
ages = df['age']
conditions = [
    ages <= 18,   # Teenager: 18 and under
    ages < 30,    # Young Adult: over 18, under 30
    ages < 60,    # Middle Age Adult: 30 up to (not including) 60
    ages >= 60,   # Elder: 60 and over
]
# Missing ages match no condition and fall through to 'Unknown'.
df['age_group'] = np.select(conditions, choices, default='Unknown')
df['age_group'] = df['age_group'].astype('category')

In [None]:
# Flag rows with more than 8 hours of sleep; everything else (including exactly
# 8 hours and missing values) gets the "Not Adequet Sleep" label.
# NOTE(review): the strict "> 8" threshold and the "Adequet" spelling are kept
# as-is because they define the exported values — confirm both are intended.
sleeps_over_eight = df['sleep_hours'].gt(8)
sleep_label = np.where(sleeps_over_eight, "Adequet Sleep", "Not Adequet Sleep")
df['adequet_sleep'] = pd.Series(sleep_label, index=df.index).astype('category')

In [None]:
# Derive a household category from partnership status and number of children.
is_partnered = df['marital_status'].isin(['Married', 'In a relationship'])
has_children = df['children_count'] > 0
has_no_children = df['children_count'] == 0

conditions = [
    is_partnered & has_children,       # 1. Family with children
    ~is_partnered & has_children,      # 2. Single parent
    has_no_children & is_partnered,    # 3. Family with no kids
]
choices = ["Family with Children", "Single Parent", "Family with no Kids"]

# Rows matching none of the conditions are labelled "Other".
family_label = np.select(conditions, choices, default="Other")
df['family_status'] = pd.Series(family_label, index=df.index).astype('category')

In [None]:
# Count missing values per column before imputation.
df.isna().sum()

In [None]:
def _first_mode(values: pd.Series):
    """Return the most frequent value of ``values``, or NaN when it has no mode."""
    modes = values.mode()  # computed once instead of twice per group
    return modes.iloc[0] if not modes.empty else np.nan


# Fill missing education_level with the most frequent level among rows that
# share the same salary tier, employment status and (rounded) income.
# observed=True restricts grouping to category combinations that actually occur,
# matching the other imputation cells in this notebook.
df['education_level'] = df['education_level'].fillna(
    df.groupby(
        ['salary_percentile', 'employment_status', 'annual_income_usd'], observed=True)['education_level']
    .transform(_first_mode)
)

In [None]:
# Fill missing social_support with the most frequent value among rows that share
# the same family_status / marital_status / children_count. Groups without any
# mode yield NaN, so their rows stay missing.
support_group_keys = ['family_status', 'marital_status', 'children_count']
support_by_household = df.groupby(support_group_keys, observed=True)['social_support']
support_group_mode = support_by_household.transform(
    lambda grp: grp.mode().iloc[0] if not grp.mode().empty else np.nan
)
df['social_support'] = df['social_support'].fillna(support_group_mode)

In [None]:
# Fill missing therapy_history with the most frequent value among rows that share
# the same family_status / marital_status / children_count. Groups without any
# mode yield NaN, so their rows stay missing.
therapy_group_keys = ['family_status', 'marital_status', 'children_count']
therapy_by_household = df.groupby(therapy_group_keys, observed=True)['therapy_history']
therapy_group_mode = therapy_by_household.transform(
    lambda grp: grp.mode().iloc[0] if not grp.mode().empty else np.nan
)
df['therapy_history'] = df['therapy_history'].fillna(therapy_group_mode)

In [None]:
# Drop the rows whose education_level is still missing and rebind `df` to the
# result; the index is not reset.
df = df.dropna(subset=['education_level'])

In [None]:
# Re-count missing values per column to check the effect of the imputation and drop.
df.isna().sum()

In [None]:
# Print the frame summary again (dtypes, non-null counts, memory usage) after cleaning.
df.info()

In [None]:
# Mean cigarettes per day for each gender, sorted ascending.
gender_smoke = df.groupby('gender')['smokes_per_day'].mean().sort_values()

# Vertical bar chart of the averages.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(gender_smoke.index, gender_smoke.values, color='darkorange')
ax.set_title('Average Cigarettes per Day by Gender')
ax.set_ylabel('Avg Smokes per Day')
ax.set_xlabel('Gender')
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart: mean age at which smoking started, per education level,
# sorted ascending.
fig, ax = plt.subplots(figsize=(8, 4))
edu_smoking_age = df.groupby('education_level')['age_started_smoking'].mean().sort_values()
ax.barh(edu_smoking_age.index, edu_smoking_age.values)
ax.set_title('Average Age Started Smoking by Education Level')
ax.set_xlabel('Average Age Started Smoking')
ax.set_ylabel('Education Level')
fig.tight_layout()
plt.show()

In [None]:
# Histogram of BMI values using 25 bins.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df['bmi'], bins=25, edgecolor='black')
ax.set_title('BMI Distribution')
ax.set_xlabel('BMI')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of mean sleep hours for each age group.
fig, ax = plt.subplots(figsize=(8, 4))
age_sleep = df.groupby('age_group')['sleep_hours'].mean()
ax.bar(age_sleep.index, age_sleep.values)
ax.set_title('Average Sleep Hours by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Sleep Hours')
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart: mean number of attempts to quit smoking for each
# therapy_history value, sorted ascending.
plt.figure(figsize=(8, 4))
therapy_quit_smoke = df.groupby('therapy_history')['attempts_to_quit_smoking'].mean().sort_values()
plt.barh(therapy_quit_smoke.index, therapy_quit_smoke.values)
# Title typos fixed ("Attepted" -> "Attempted", "qui" -> "Quit").
plt.title('Average Times People Attempted to Quit Smoking with Therapy')
plt.xlabel('Average Attempts to Quit Smoking')
plt.ylabel('Therapy Session')
plt.tight_layout()
plt.show()

In [None]:
# Mean sleep hours and BMI per age group.
age_bmi_sleep = df.groupby('age_group')[['sleep_hours', 'bmi']].mean()

# Arrange the groups in life-stage order before plotting. The grouped index is
# otherwise in category (alphabetical) order, which makes the BMI line connect
# the points in a misleading sequence. Any group outside the known stages
# (e.g. 'Unknown') is kept and placed at the end.
life_stages = ['Teenager', 'Young Adult', 'Middle Age Adult', 'Elder']
groups_present = list(age_bmi_sleep.index)
ordered_groups = [grp for grp in life_stages if grp in groups_present]
ordered_groups += [grp for grp in groups_present if grp not in life_stages]
age_bmi_sleep = age_bmi_sleep.loc[ordered_groups]
group_labels = [str(grp) for grp in ordered_groups]

fig, ax1 = plt.subplots(figsize=(8, 4))

# Left axis: sleep hours as bars.
ax1.set_title('Average Sleep Hours and BMI by Age Group')
ax1.set_xlabel('Age Group')
ax1.set_ylabel('Average Sleep Hours', color='tab:blue')
ax1.bar(group_labels, age_bmi_sleep['sleep_hours'], alpha=0.6, label='Sleep Hours')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Right axis (shared x): BMI as a line with markers.
ax2 = ax1.twinx()
ax2.set_ylabel('Average BMI', color='tab:red')
ax2.plot(group_labels, age_bmi_sleep['bmi'], color='tab:red', marker='o', label='BMI')
ax2.tick_params(axis='y', labelcolor='tab:red')

fig.tight_layout()
plt.show()

In [None]:
# Number of rows per marital status.
marital_counts = df['marital_status'].value_counts()

# Pie chart of the shares, labelled with one-decimal percentages.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    marital_counts,
    labels=marital_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Marital Status Distribution')
fig.tight_layout()
plt.show()

In [None]:
# Timestamped work-in-progress snapshot plus the fixed-name final output.
stamp = datetime.now().strftime("%Y%m%d-%H%M")
wip_path = INTERIM / f"cleaned_wip_{stamp}.csv"
final_path = PROCESSED / "cleaned.csv"
export_targets = (wip_path, final_path)

# Write both files first (without the index), then report them.
for target in export_targets:
    df.to_csv(target, index=False)

for target in export_targets:
    print("Wrote:", target)