In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train_set = pd.read_csv("train.csv",delimiter=",")
test_set = pd.read_csv("test.csv",delimiter=",")
print('training set size : ', train_set.shape, '\ntest set size : ', test_set.shape)

training set size :  (20758, 18) 
test set size :  (13840, 17)


#### Categorical Variables

In [3]:
categorical_cols = train_set.select_dtypes(include=['object', 'bool']).columns
target = train_set['NObeyesdad']
categorical_data = train_set[categorical_cols].drop('NObeyesdad', axis=1)
pd.DataFrame(categorical_data).head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS
0,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation
1,Female,yes,yes,Frequently,no,no,no,Automobile
2,Female,yes,yes,Sometimes,no,no,no,Public_Transportation
3,Female,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation
4,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation


In [4]:
encoded_categorical_data = pd.get_dummies(pd.DataFrame(categorical_data))
pd.DataFrame(encoded_categorical_data).head()

Unnamed: 0,Gender_Female,Gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,...,SCC_no,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,False,True,False,True,False,True,False,False,True,False,...,True,False,False,True,False,False,False,False,True,False
1,True,False,False,True,False,True,False,True,False,False,...,True,False,False,False,True,True,False,False,False,False
2,True,False,False,True,False,True,False,False,True,False,...,True,False,False,False,True,False,False,False,True,False
3,True,False,False,True,False,True,False,False,True,False,...,True,False,False,True,False,False,False,False,True,False
4,False,True,False,True,False,True,False,False,True,False,...,True,False,False,True,False,False,False,False,True,False


#### Numerical Variables

In [5]:
numerical_cols = train_set.select_dtypes(include=['int64', 'float64']).columns
numerical_data = train_set[numerical_cols].drop('id', axis=1)
pd.DataFrame(numerical_data).head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473
1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0
2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584
3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721


In [6]:
dataset = pd.concat([numerical_data, encoded_categorical_data], axis=1)
pd.DataFrame(dataset).head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_no,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473,False,True,...,True,False,False,True,False,False,False,False,True,False
1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0,True,False,...,True,False,False,False,True,True,False,False,False,False
2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,True,False,...,True,False,False,False,True,False,False,False,True,False
3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199,True,False,...,True,False,False,True,False,False,False,False,True,False
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,False,True,...,True,False,False,True,False,False,False,False,True,False


## Dimensionality Reduction

#### Principal Component Analysis

After One Hot Encoding, 22 components are necessary to maintain 100% variance (19 components for 95%)

In [7]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(dataset)
scaled_dataset = scaler.transform(dataset)

pca = PCA(n_components=22)
pca_out = pca.fit(scaled_dataset)
pca_data = pca.transform(scaled_dataset)

pc_list = ["PC"+str(i) for i in list(range(1, pca_out.n_components+1))]
pc_load = ["loading "+str(i) for i in list(range(1, pca_out.n_components+1))]
cum_explained_var =np.cumsum(pca_out.explained_variance_ratio_)
df_pca=pd.DataFrame({'Variance Ratio':pca_out.explained_variance_ratio_
                     ,'Cumulated Variance Ratio':cum_explained_var
                    }, index=pc_list)
df_pca.T

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22
Variance Ratio,0.134584,0.097587,0.080003,0.070238,0.063661,0.061621,0.057095,0.048152,0.040748,0.037667,...,0.033343,0.032864,0.032025,0.028927,0.026112,0.025211,0.02285,0.015389,0.013252,0.008008
Cumulated Variance Ratio,0.134584,0.232171,0.312174,0.382413,0.446073,0.507695,0.56479,0.612941,0.653689,0.691356,...,0.79536,0.828224,0.860249,0.889176,0.915289,0.9405,0.96335,0.978739,0.991992,1.0


#### Truncated SVD

In [9]:
# from sklearn.decomposition import TruncatedSVD

# scaler = StandardScaler()
# scaler.fit(dataset)
# scaled_dataset = scaler.transform(dataset)

# svd = TruncatedSVD(n_components=22, random_state=42) 

# svd_data = svd.fit_transform(scaled_dataset)

# svd_list = ["SVD"+str(i) for i in range(1, svd.n_components+1)]

# explained_variance_ratio = svd.explained_variance_ratio_
# cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# df_svd = pd.DataFrame({
#     'Variance Ratio': explained_variance_ratio,
#     'Cumulated Variance Ratio': cumulative_explained_variance
# }, index=svd_list)

# df_svd.T


## Feature Selection

#### SelectKBest

In [20]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler

# Préparation des données
X_norm = MinMaxScaler().fit_transform(pca_data)  # Assurez-vous que les données sont normalisées pour le test Chi2

results = []
for k in range(1, X_norm.shape[1] + 1):  # Tester de 1 à nombre total de features
    selector = SelectKBest(score_func=chi2, k=k)
    X_new = selector.fit_transform(X_norm, target)
    model = GradientBoostingClassifier()
    scores = cross_val_score(model, X_new, target, cv=5)  # cv est le nombre de plis dans la validation croisée
    results.append(scores.mean())

best_k = results.index(max(results)) + 1  # +1 car l'index commence à 0
print(f"Le meilleur k est {best_k} avec une accuracy de {max(results)}")

KeyboardInterrupt: 

## Evaluating

In [16]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# #1 Get matrix of data X
# train_X = train_set.drop(['NObeyesdad'], axis=1)

# #2 Get Y, the vector or value we predict
# y = train_set['NObeyesdad']
numerical_cols = dataset.columns

y = target

#3 Identify data types: numerical value / categoriccal Columns
# numerical_cols = train_X.select_dtypes(include=['int64', 'float64']).columns
# categorical_cols = train_X.select_dtypes(include=['object']).columns

#4 Replace missing numerical features by the median of the feature
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=22)),             
    ('scaler', StandardScaler())])

#5 Encode categorical features using the "one hot" encoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#6 Create a preprocessor class used before doing predictions to apply transformations on the features
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols)])


print(numerical_cols)
print(categorical_cols)

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Female', 'Gender_Male', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_no', 'SMOKE_yes', 'SCC_no', 'SCC_yes', 'CALC_Frequently',
       'CALC_Sometimes', 'CALC_no', 'MTRANS_Automobile', 'MTRANS_Bike',
       'MTRANS_Motorbike', 'MTRANS_Public_Transportation', 'MTRANS_Walking'],
      dtype='object')
Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# List of models to try
models = [
    ('Random Forest', RandomForestClassifier(n_estimators=100)),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Support Vector Machine', SVC()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier())
]

# Dictionary to store models and their scores for later use
model_scores = {}

# Diviser les données
X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size=0.2, random_state=42)

for name, model in models:
    # Create a new pipeline with the current model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Evaluate the model and print the score
    score = pipeline.score(X_test, y_test)
    print(f"{name} Accuracy: {score}")
    
    # Store the score in the dictionary
    model_scores[name] = score

Random Forest Accuracy: 0.8629576107899807
Logistic Regression Accuracy: 0.8641618497109826
Support Vector Machine Accuracy: 0.853082851637765
K-Nearest Neighbors Accuracy: 0.7577071290944123
Decision Tree Accuracy: 0.8155105973025049
Gradient Boosting Accuracy: 0.8807803468208093
