In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Load the training dataset
data = pd.read_csv("train.csv")

# Columns treated as irrelevant for the model; they are removed from the
# training data here, and the same list is reused later for the test data
irrelevant_columns = [
    'candidate_id',
    'left_eyesight_lvl',
    'right_eyesight_lvl',
    'hemoglobin_lvl',
    'urea_lvl',
    'creatinine_lvl',
    'liver_enzyme_lvl1',
    'liver_enzyme_lvl2',
    'residential_area',
]
data = data.drop(columns=irrelevant_columns)

# Feature engineering:
# when both weight and height are available, replace them with a single BMI
# feature (weight converted from lbs to kg, height from cm to m)
if {'weight_in_lbs', 'height_in_cm'}.issubset(data.columns):
    data = (
        data
        .assign(bmi=lambda df: df['weight_in_lbs'] * 0.453592 / (df['height_in_cm'] / 100) ** 2)
        .drop(columns=['height_in_cm', 'weight_in_lbs'])
    )

# Separate the features from the target column
X = data.drop(columns='triglyceride_lvl')
y = data['triglyceride_lvl']

# Split the data into training and validation sets (80% / 20%, fixed seed so
# the split is reproducible)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups for preprocessing, derived from the dtypes of the training split.
# Only int64/float64 and object columns are selected; a column of any other
# dtype is not listed in either group and so is not passed on by the
# ColumnTransformer below.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Numerical features: fill missing values with the median, then min-max scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode (dropping the first level of each feature).
# handle_unknown='ignore' keeps transform() from raising when the validation
# split or the test file contains a category that never appeared in the
# training split; such a value is encoded as all zeros instead.
# Combining drop= with handle_unknown='ignore' needs scikit-learn >= 1.0.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the validation split (no statistics are learned from it)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_valid_preprocessed = preprocessor.transform(X_valid)

# Gradient-boosted regressor; early stopping is configured with a patience of
# 10 rounds on the RMSE of the evaluation set passed to fit()
model = XGBRegressor(
    n_estimators=100,
    early_stopping_rounds=10,
    eval_metric='rmse',
    random_state=42,
)

# Train on the preprocessed training split, monitoring the validation split
model.fit(
    X_train_preprocessed,
    y_train,
    eval_set=[(X_valid_preprocessed, y_valid)],
    verbose=False,
)

# Score the model on the validation split.
# NOTE(review): this is the same split that drives early stopping above, so
# these metrics may be slightly optimistic - consider a separate hold-out set.
y_pred = model.predict(X_valid_preprocessed)
mse, mae, r2 = (
    score(y_valid, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

# Load the new data to predict on, i.e. the test data
data_new = pd.read_csv("test.csv")

# Keep the candidate IDs for the output file before that column is dropped
candidate_ids = data_new['candidate_id'].copy()

# Remove the same irrelevant columns as for training; columns that are absent
# from the test file are skipped rather than raising
data_new = data_new.drop(columns=irrelevant_columns, errors='ignore')

# Derive the BMI feature the same way as for the training data
if {'weight_in_lbs', 'height_in_cm'}.issubset(data_new.columns):
    data_new = (
        data_new
        .assign(bmi=lambda df: df['weight_in_lbs'] * 0.453592 / (df['height_in_cm'] / 100) ** 2)
        .drop(columns=['height_in_cm', 'weight_in_lbs'])
    )

# Apply the already-fitted preprocessing (transform only, no refitting)
X_new_preprocessed = preprocessor.transform(data_new)

# Predict triglyceride levels with the trained model
y_new_pred = model.predict(X_new_preprocessed)

# Pair every candidate ID with its predicted triglyceride level
results_df = pd.DataFrame(
    {
        'candidate_id': candidate_ids,
        'predicted_triglyceride_lvl': y_new_pred,
    }
)

# Write the predictions to a CSV file, without the index column
results_df.to_csv("sample_submission.csv", index=False)

# Cell result: validation metrics plus a preview of the predictions
(mse, mae, r2, results_df.head())


(816.0905074315833,
 15.957699284436448,
 0.8784521921780615,
   candidate_id  predicted_triglyceride_lvl
 0    CAN_22401                  133.079498
 1    CAN_22402                   79.903091
 2    CAN_22403                   56.338291
 3    CAN_22404                  257.398254
 4    CAN_22405                  241.001617)