In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling as ydp


In [2]:
#global variables


SEED = 42

FOLDS = 5

FILEPATH = '../data/'

In [3]:
test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [4]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

DataFrame Information:
______________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null 

None



DataFrame Head:
______________________


Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II




DataFrame Tail:
______________________


Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.0,Sometimes,no,2.151809,no,1.330519,0.19668,Sometimes,Public_Transportation,Obesity_Type_II
20754,20754,Male,18.0,1.71,50.0,no,yes,3.0,4.0,Frequently,no,1.0,no,2.0,1.0,Sometimes,Public_Transportation,Insufficient_Weight
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.0,Sometimes,no,2.0,no,1.15804,1.198439,no,Public_Transportation,Obesity_Type_II
20756,20756,Male,33.852953,1.7,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.0,0.973834,no,Automobile,Overweight_Level_II
20757,20757,Male,26.680376,1.816547,118.134898,yes,yes,3.0,3.0,Sometimes,no,2.003563,no,0.684487,0.713823,Sometimes,Public_Transportation,Obesity_Type_II




DataFrame Description:
______________________


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20758.0,10378.5,5992.46278,0.0,5189.25,10378.5,15567.75,20757.0
Age,20758.0,23.841804,5.688072,14.0,20.0,22.815416,26.0,61.0
Height,20758.0,1.700245,0.087312,1.45,1.631856,1.7,1.762887,1.975663
Weight,20758.0,87.887768,26.379443,39.0,66.0,84.064875,111.600553,165.057269
FCVC,20758.0,2.445908,0.533218,1.0,2.0,2.393837,3.0,3.0
NCP,20758.0,2.761332,0.705375,1.0,3.0,3.0,3.0,4.0
CH2O,20758.0,2.029418,0.608467,1.0,1.792022,2.0,2.549617,3.0
FAF,20758.0,0.981747,0.838302,0.0,0.008013,1.0,1.587406,3.0
TUE,20758.0,0.616756,0.602113,0.0,0.0,0.573887,1.0,2.0




Number of Null Values:
______________________


id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64



Number of Duplicated Rows:
______________________


0



Number of Unique Values:
______________________


id                                20758
Gender                                2
Age                                1703
Height                             1833
Weight                             1979
family_history_with_overweight        2
FAVC                                  2
FCVC                                934
NCP                                 689
CAEC                                  4
SMOKE                                 2
CH2O                               1506
SCC                                   2
FAF                                1360
TUE                                1297
CALC                                  3
MTRANS                                5
NObeyesdad                            7
dtype: int64



DataFrame Shape:
______________________
Rows: 20758, Columns: 18


DataFrame Columns:
______________________


Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

# Feature Engineering
___

In [29]:

def preprocess_data(df, cat_features, num_features, scaler):
    
    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features
    df[num_features] = scaler.fit_transform(df[num_features])


    return df

In [31]:
cat_features = [
    "Gender",
    "family_history_with_overweight",
    "FAVC",
    "CAEC",
    "SMOKE",
    "SCC",
    "CALC",
    "MTRANS",
]
num_features = [
    "Age",
    "Height",
    "Weight",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE",
]


In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score,confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold


scaler = StandardScaler()

# Preprocess the training data
train_df = preprocess_data(train, cat_features, num_features, scaler)
test_df = preprocess_data(test, cat_features, num_features, scaler)

# Initialize the label encoder
label_encoder = LabelEncoder()
X_train = train_df.drop('NObeyesdad', axis=1)

# Fit and transform the labels to integers
y_train_encoded = label_encoder.fit_transform(train_df['NObeyesdad'])


# Initialize the stratified k-fold cross-validator
skf = StratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=True)

# Split the data into training and validation sets using stratified k-fold cross-validation
for train_index, val_index in skf.split(X_train, y_train_encoded):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train_encoded[train_index], y_train_encoded[val_index]

    # Train the XGBoost model
    model = xgb.XGBClassifier(objective='multi:softprob', random_state=SEED)
    model.fit(X_train_fold, y_train_fold)

    # Make predictions on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Calculate accuracy for each fold
    accuracy_fold = accuracy_score(y_val_fold, y_pred_fold)
    print(f'Accuracy for fold: {accuracy_fold}')

Accuracy for fold: 0.9065510597302505
Accuracy for fold: 0.9026974951830443
Accuracy for fold: 0.9070327552986512
Accuracy for fold: 0.9019513370272224
Accuracy for fold: 0.9033967718622019


In [35]:
# Assuming 'train_df' is your preprocessed training dataframe and 'test_df' is your preprocessed test dataframe

# Get the list of columns in the training dataset
train_columns = X_train.columns

# Preprocess the test data
# Make sure to include the step where you preprocess 'test' to get 'test_df' here

# Align the test dataframe columns with the training dataframe columns
# Add missing columns in test_df with all zeros
for column in train_columns:
    if column not in test_df.columns:
        test_df[column] = 0

# Ensure the order of columns in test_df matches that of train_columns
test_df = test_df[train_columns]

# Now, make predictions on the test set
y_pred_test = model.predict(test_df)

# Inverse transform the predicted labels
y_pred_test_labels = label_encoder.inverse_transform(y_pred_test)

# Create a submission DataFrame. Assuming 'id' was stored or can be retrieved for the test set
submission_df = pd.DataFrame({
    'id': test['id'],  # Make sure to have saved or to retrieve the 'id' from the original test dataset
    'NObeyesdad': y_pred_test_labels
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

# Display the first few rows of the submission DataFrame
submission_df.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
