In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train=pd.read_csv("/kaggle/input/ai-201-b-mse-2-ai-c/train.csv")
test=pd.read_csv("/kaggle/input/ai-201-b-mse-2-ai-c/test.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18306 entries, 0 to 18305
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          17071 non-null  object 
 1   Age                             16495 non-null  float64
 2   Height                          16702 non-null  float64
 3   Weight                          16838 non-null  float64
 4   family_history_with_overweight  17241 non-null  object 
 5   FAVC                            17257 non-null  object 
 6   FCVC                            17331 non-null  float64
 7   NCP                             16586 non-null  float64
 8   CAEC                            16841 non-null  object 
 9   SMOKE                           16759 non-null  object 
 10  CH2O                            17373 non-null  float64
 11  SCC                             16503 non-null  object 
 12  FAF                             

In [4]:
train.head(10)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,33.226808,1.766888,83.337721,yes,,2.0,,Sometimes,no,1.964435,no,0.750111,0.0,no,Automobile,Overweight_Level_II
1,Female,18.0,1.6,55.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Normal_Weight
2,,26.0,1.640741,111.9307,yes,yes,3.0,3.0,Sometimes,no,2.617988,no,0.0,,Sometimes,Public_Transportation,Obesity_Type_III
3,Male,19.0,1.69,64.0,no,yes,2.0,,Frequently,no,2.0,no,0.0,1.0,Sometimes,Automobile,Normal_Weight
4,Male,,1.606474,94.189167,yes,yes,2.0,2.812283,Sometimes,no,2.0,no,1.0,0.0,no,,Obesity_Type_I
5,,21.0,1.72,68.0,yes,yes,1.0,1.0,Frequently,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Normal_Weight
6,Male,23.329344,,95.290429,yes,yes,2.0,3.0,Sometimes,no,3.0,no,3.0,2.0,Sometimes,Public_Transportation,Obesity_Type_I
7,Male,25.472995,,88.633616,yes,yes,1.846452,1.001633,Sometimes,no,2.0,no,0.038809,0.37465,Sometimes,Public_Transportation,Overweight_Level_II
8,Male,17.451085,1.787379,59.612717,yes,yes,3.0,3.762778,Sometimes,,2.0,no,1.0,,Sometimes,Automobile,Insufficient_Weight
9,,18.0,1.78,70.0,no,yes,2.0,3.0,Sometimes,no,2.0,,1.0,1.0,no,Public_Transportation,Normal_Weight


In [5]:
train.isnull().sum()

Gender                            1235
Age                               1811
Height                            1604
Weight                            1468
family_history_with_overweight    1065
FAVC                              1049
FCVC                               975
NCP                               1720
CAEC                              1465
SMOKE                             1547
CH2O                               933
SCC                               1803
FAF                               1724
TUE                               1120
CALC                              1090
MTRANS                            1089
NObeyesdad                           0
dtype: int64

In [6]:
test.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
dtype: int64

In [7]:
test_id=test['id']
test=test.drop(columns=['id'])

In [8]:
X=train.drop(columns=['NObeyesdad'])
y=train['NObeyesdad']

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
numeric_feature=X.select_dtypes(include=['int64','float64']).columns
categorical_features=X.select_dtypes(include=['object']).columns

In [11]:
numeric_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

In [12]:
categorical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('scaler',OneHotEncoder(handle_unknown='ignore'))
])

In [13]:
preprocessor=ColumnTransformer(transformers=[
    ('num',numeric_pipeline,numeric_feature),
    ('cat',categorical_pipeline,categorical_features)
])

In [None]:
model=GradientBoostingClassifier(
    n_estimators=920,        # number of trees
    learning_rate=0.025,      # smaller learning rate usually better
    max_depth=4,             # depth of each tree
    subsample=0.7,           # fraction of samples for each tree
    min_samples_split=5,     # minimum samples to split a node
    min_samples_leaf=2,      # minimum samples in a leaf
    max_features='sqrt',     # number of features to consider for best split
    random_state=42
)

In [15]:
pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',model)
])

In [None]:
pipeline.fit(X_train,y_train)

In [17]:
y_pred=pipeline.predict(X_test)

In [18]:
accu=accuracy_score(y_pred,y_test)

In [19]:
print(accu)

0.8795740032768978


In [20]:
y_final=pipeline.predict(test)

In [21]:
print(y)

0        Overweight_Level_II
1              Normal_Weight
2           Obesity_Type_III
3              Normal_Weight
4             Obesity_Type_I
                ...         
18301       Obesity_Type_III
18302        Obesity_Type_II
18303    Insufficient_Weight
18304    Insufficient_Weight
18305     Overweight_Level_I
Name: NObeyesdad, Length: 18306, dtype: object


In [22]:
print(y_final)

['Obesity_Type_III' 'Overweight_Level_II' 'Obesity_Type_I' ...
 'Obesity_Type_I' 'Overweight_Level_I' 'Overweight_Level_II']


In [23]:
submission = pd.DataFrame({
    'id': test_id,
    'NObeyesdad': y_final
})

In [24]:
submission.to_csv('submission.csv', index=False)