### Importation des données

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns
import yaml
import importlib

from utils import HTMLParser

In [6]:
data = pd.read_csv("data/train.csv")

In [7]:
# Génération du dictionnaire .json à partir du HTML
parser = HTMLParser()
html_snippet = "data/USCODE22_LLCP_102523.HTML"

codebook = parser.parse_html_codebook(html_snippet)

In [8]:
config = "features.yaml"

with open(config, 'r') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)

In [9]:
data_reduced = data[config['features'] + ['TARGET'] + ["ID"]].copy()

In [10]:
features_classification = {}

for feature in config['features']:
    features_classification[feature] = {}
    entry = next((item for item in codebook if item["sas_variable"] == feature), None)
    if entry:
        categories = list(set([cat['value'] for cat in entry['categories']]))
        categories.sort()
        if categories[0] != str(1):
            features_classification[feature]['type'] = 'numerical'

            print(f"Feature: {feature}")
            print(categories)

        else:
            features_classification[feature]['type'] = 'categorical'
    else:
        features_classification[feature]['type'] = 'unknown'

Feature: PHYSHLTH
['1 - 30', '77', '88', '99', 'BLANK']
Feature: MENTHLTH
['1 - 30', '77', '88', '99', 'BLANK']
Feature: POORHLTH
['1 - 30', '77', '88', '99', 'BLANK']
Feature: SLEPTIM1
['1 - 24', '77', '99', 'BLANK']
Feature: WEIGHT2
['50 - 0776', '7777', '9023 - 9352', '9999', 'BLANK']
Feature: HEIGHT3
['200 - 711', '7777', '9061 - 9998', '9999', 'BLANK']
Feature: MARIJAN1
['1 - 30', '77', '88', '99', 'BLANK']
Feature: ALCDAY4
['101 - 199', '201 - 299', '777', '888', '999', 'BLANK']
Feature: AVEDRNK3
['1 - 76', '77', '88', '99', 'BLANK']
Feature: DRNK3GE5
['1 - 76', '77', '88', '99', 'BLANK']
Feature: MAXDRNKS
['0', '1 - 76', '77', '99', 'BLANK']
Feature: COPDSMOK
['1 - 76', '77', '88', '99', 'BLANK']
Feature: _PACKDAY
['0 - 100', 'BLANK']


In [14]:
# Handle missing values for numerical features

continuous_columns = ['PHYSHLTH','MENTHLTH','POORHLTH',
                      'SLEPTIM1','WEIGHT2','HEIGHT3',
                      'MARIJAN1','ALCDAY4','AVEDRNK3','DRNK3GE5',
                      'MAXDRNKS','COPDSMOK','_PACKDAY',
                      ]

data_reduced["_PACKDAY"] = data_reduced["_PACKDAY"].fillna(data_reduced["_PACKDAY"].median())

data_reduced["COPDSMOK"] = data_reduced["COPDSMOK"].replace({88:0, 77:np.nan, 99:np.nan}).fillna(data_reduced["COPDSMOK"].median())

data_reduced["MAXDRNKS"] = data_reduced["MAXDRNKS"].replace({77:np.nan, 99:np.nan}).fillna(data_reduced["MAXDRNKS"].median())

data_reduced["DRNK3GE5"] = data_reduced["DRNK3GE5"].replace({88:0, 77:np.nan, 99:np.nan}).fillna(data_reduced["DRNK3GE5"].median())

data_reduced["AVEDRNK3"] = data_reduced["AVEDRNK3"].replace({88:0, 77:np.nan, 99:np.nan}).fillna(data_reduced["AVEDRNK3"].median())

data_reduced["MARIJAN1"] = data_reduced["MARIJAN1"].replace({88:0, 77:np.nan, 99:np.nan}).fillna(data_reduced["MARIJAN1"].median())

mask_alc_week = data_reduced["ALCDAY4"].astype(float).between(101, 199)
data_reduced.loc[mask_alc_week, "ALCDAY4"] = (data_reduced.loc[mask_alc_week, "ALCDAY4"] % 100) * 4
mask_alc_month = data_reduced["ALCDAY4"].between(201, 299)  
data_reduced.loc[mask_alc_month, "ALCDAY4"] = (data_reduced.loc[mask_alc_month, "ALCDAY4"] % 200)

data_reduced["ALCDAY4"] = data_reduced["ALCDAY4"].replace({888:0, 777:np.nan, 999:np.nan}).fillna(data_reduced["ALCDAY4"].median())

data_reduced["SLEPTIM1"] = data_reduced["SLEPTIM1"].replace({77:np.nan, 99:np.nan}).fillna(data_reduced["SLEPTIM1"].median())

data_reduced["POORHLTH"] = data_reduced["POORHLTH"].replace({88:0, 77:np.nan, 99:np.nan}).fillna(data_reduced["POORHLTH"].median())

data_reduced["MENTHLTH"] = data_reduced["MENTHLTH"].replace({88:0, 77:np.nan, 99:np.nan}).fillna(data_reduced["MENTHLTH"].median())

data_reduced["PHYSHLTH"] = data_reduced["PHYSHLTH"].replace({88:0, 77:np.nan, 99:np.nan}).fillna(data_reduced["PHYSHLTH"].median())

def ft_in_to_cm(s):
    s = str(int(float(s))).zfill(3)  
    feet = int(s[0])
    inches = int(s[1:])
    return feet * 30.48 + inches * 2.54

 
data_reduced["WEIGHT2"] = data_reduced["WEIGHT2"].astype(float)
mask_lb = data_reduced["WEIGHT2"].between(50, 776)
data_reduced.loc[mask_lb, "WEIGHT2"] = data_reduced.loc[mask_lb, "WEIGHT2"] * 0.45359237

mask_kg = data_reduced["WEIGHT2"].between(9023, 9352) 
data_reduced.loc[mask_kg, "WEIGHT2"] = data_reduced.loc[mask_kg, "WEIGHT2"] - 9000

mask_ft_in = data_reduced["HEIGHT3"].astype(float).between(200, 711)
data_reduced.loc[mask_ft_in, "HEIGHT3"] = data_reduced.loc[mask_ft_in, "HEIGHT3"].apply(ft_in_to_cm)

mask_kg = data_reduced["HEIGHT3"].between(9061, 9998)  
data_reduced.loc[mask_kg, "HEIGHT3"] = data_reduced.loc[mask_kg, "HEIGHT3"] - 9000

data_reduced["WEIGHT2"] = data_reduced["WEIGHT2"].replace({7777:np.nan, 9999:np.nan}).fillna(data_reduced["WEIGHT2"].median())

data_reduced["HEIGHT3"] = data_reduced["HEIGHT3"].replace({7777:np.nan, 9999:np.nan}).fillna(data_reduced["HEIGHT3"].median())


In [15]:
data_reduced[continuous_columns].head()

Unnamed: 0,PHYSHLTH,MENTHLTH,POORHLTH,SLEPTIM1,WEIGHT2,HEIGHT3,MARIJAN1,ALCDAY4,AVEDRNK3,DRNK3GE5,MAXDRNKS,COPDSMOK,_PACKDAY
0,30.0,0.0,0.0,7.0,33.948096,157.48,0.0,8.0,1.0,0.0,2.0,0.0,2.0
1,0.0,15.0,14.0,6.0,29.421683,157.48,0.0,30.0,2.0,0.0,2.0,0.0,0.5
2,0.0,0.0,0.0,8.0,26.597577,180.34,0.0,0.0,2.0,0.0,2.0,0.0,0.75
3,0.0,0.0,0.0,10.0,22.864584,182.88,0.0,0.0,2.0,0.0,2.0,0.0,0.5
4,0.0,3.0,0.0,7.0,37.034287,185.42,0.0,5.0,4.0,2.0,7.0,0.0,0.5
