In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so every run regenerates the same dataset.
np.random.seed(42)

# Occupations; each synthetic person gets one, sampled uniformly.
jobs = ['office', 'construction', 'healthcare', 'teacher',
        'driver', 'retail', 'software', 'student', 'retired', 'farmer']

# Mean BMI used as the centre of the BMI distribution for each occupation.
job_bmi_map = {
    'office': 27, 'construction': 23, 'healthcare': 24,
    'teacher': 25, 'driver': 27, 'retail': 25,
    'software': 26, 'student': 22, 'retired': 26, 'farmer': 22
}

n = 1000  # number of synthetic rows
job_choices = np.random.choice(jobs, n)
# Ages drawn from N(40, 12), cast to whole years, then clipped to [18, 75].
ages = np.clip(np.random.normal(40, 12, n).astype(int), 18, 75)
# Heights in cm drawn from N(170, 10), clipped to [140, 200].
heights = np.clip(np.random.normal(170, 10, n), 140, 200)

# Latent BMI per row: the job's mean BMI shifted by 0.02 per year of age
# away from 40, plus Gaussian noise with standard deviation 2.
# A single call with an array of per-row means replaces the per-row Python
# loop; samples are still produced in row order.
base_bmis = np.array([job_bmi_map[job] for job in job_choices], dtype=float)
age_effects = 0.02 * (ages - 40)
bmis = np.random.normal(loc=base_bmis + age_effects, scale=2.0)

# Invert the BMI formula: weight_kg = BMI * height_m ** 2.
# `bmis` is an ndarray here, so this is a plain element-wise product rather
# than a list being implicitly coerced by NumPy's reflected multiplication.
weights = bmis * (heights / 100) ** 2

df = pd.DataFrame({
    'job': job_choices,
    'age': ages,
    'height_cm': heights.round(1),
    'weight_kg': weights.round(1)
})

# Derived and simulated feature columns, added to `df` in place.
# The statement order fixes both the column order and the order in which the
# seeded global RNG is consumed, so it must not be rearranged.

# 1. BMI, recomputed from the rounded weight and height columns
height_m = df['height_cm'] / 100
df['BMI'] = (df['weight_kg'] / height_m ** 2).round(1)

# 2. age_group: bucket ages into four labelled bands
age_edges = [17, 29, 44, 59, 100]
age_labels = ['18-29', '30-44', '45-59', '60+']
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)

# 3. gender: the two labels are equally likely
gender_labels = ['male', 'female']
df['gender'] = np.random.choice(gender_labels, size=n)

# 4. smoker: 1 with probability 0.2, otherwise 0
smoker_flags = np.random.choice([0, 1], size=n, p=[0.8, 0.2])
df['smoker'] = smoker_flags

# 5. exercise_freq: Poisson counts with mean 3
df['exercise_freq'] = np.random.poisson(3, n)

# 6. alcohol_units_per_week: Poisson counts with mean 2
df['alcohol_units_per_week'] = np.random.poisson(2, n)

# 7. marital_status: the four statuses are equally likely
marital_labels = ['single', 'married', 'divorced', 'widowed']
df['marital_status'] = np.random.choice(marital_labels, size=n)

# 8. education_level: weighted towards highschool and bachelor
education_labels = ['highschool', 'bachelor', 'master', 'phd']
education_weights = [0.4, 0.4, 0.15, 0.05]
df['education_level'] = np.random.choice(education_labels, size=n, p=education_weights)

# 9. income_level (tied to occupation): each job belongs to exactly one tier
jobs_by_income = {
    'low': ('student', 'retired', 'retail'),
    'medium': ('farmer', 'driver', 'construction', 'teacher', 'healthcare'),
    'high': ('office', 'software'),
}
income_map = {
    job_name: tier
    for tier, tier_jobs in jobs_by_income.items()
    for job_name in tier_jobs
}
df['income_level'] = df['job'].map(income_map)

# 10. sleep_hours: N(7, 1) clipped to [4, 10], one decimal place
sleep_raw = np.random.normal(loc=7, scale=1, size=n)
df['sleep_hours'] = sleep_raw.clip(4, 10).round(1)

# 11. diet_quality: poor / average / good with weights 0.2 / 0.5 / 0.3
diet_labels = ['poor', 'average', 'good']
df['diet_quality'] = np.random.choice(diet_labels, size=n, p=[0.2, 0.5, 0.3])

# 12-14. blood_pressure_sys, blood_pressure_dia, cholesterol_level:
# independent normal draws (mean, standard deviation), rounded to whole numbers
vital_specs = (
    ('blood_pressure_sys', 120, 15),
    ('blood_pressure_dia', 80, 10),
    ('cholesterol_level', 190, 40),
)
for vital_column, vital_mean, vital_sd in vital_specs:
    df[vital_column] = np.random.normal(vital_mean, vital_sd, n).round(0)

# 15. diabetes: Bernoulli draw whose probability is 0.25 when BMI > 30, else 0.05
prob_diabetes = np.where(df['BMI'].gt(30), 0.25, 0.05)
df['diabetes'] = np.random.binomial(n=1, p=prob_diabetes)

# Save the dataset as a CSV file (without the index column).
# NOTE(review): the location is an absolute, Windows-specific path; it is kept
# unchanged so anything that already reads this file keeps working.
path = r"C:\DATA\data_4.2.csv"

# to_csv does not create missing folders and raises OSError if the target
# directory is absent, so make sure it exists before writing.
Path(path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(path, index=False)

print(f"Dataset saved at {path}")
print(df.head())


Dataset saved at C:\DATA\data_4.2.csv
        job  age  height_cm  weight_kg   BMI age_group  gender  smoker  \
0  software   50      186.3       98.1  28.3     45-59  female       1   
1   teacher   28      167.4       71.0  25.3     18-29    male       0   
2   student   29      179.4       72.7  22.6     18-29    male       0   
3    driver   42      163.1       75.4  28.3     30-44    male       0   
4  software   32      167.0       74.2  26.6     30-44  female       0   

   exercise_freq  alcohol_units_per_week marital_status education_level  \
0              3                       0       divorced      highschool   
1              3                       1         single        bachelor   
2              0                       4         single      highschool   
3              3                       2       divorced          master   
4              5                       1         single        bachelor   

  income_level  sleep_hours diet_quality  blood_pressure_sys  \
0 