### ✅ Submission

This section will contain a reusable utility function to submit the predictions to the competition.

In [3]:
%pip install -q pandas numpy scikit-learn xgboost imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
# DATA_DIR = '/kaggle/input/costa-rican-household-poverty-prediction/'
DATA_DIR = '../data/'

TEST_CSV = DATA_DIR + 'test.csv'
TRAIN_CSV = DATA_DIR + 'train.csv'
TEST_CSV = DATA_DIR + 'test.csv'

TARGET_COLUMN = "Target"
ID_COLUMN = "Id"
HOUSE_HOLD_ID_COLUMN = "idhogar"

DEFAULT_RANDOM_STATE = 369
DEFAULT_TEST_SIZE = 0.3
DEFAULT_VALIDATION_SIZE = 0.3
DEFAULT_CROSS_VALIDATION = 4

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

### Data Preparation

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from copy import deepcopy

pipeline = None

def fill_and_encode(data, fit=False):
    global pipeline

    num_cols = data.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = data.select_dtypes(include=['object', 'bool']).columns  

    num_transformer = SimpleImputer(strategy='median') 
    cat_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols),
            ('cat', cat_transformer, cat_cols)
        ])

    if pipeline is None or fit:
        pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
        fit_data = pipeline.fit_transform(data)
    else:
        fit_data = pipeline.transform(data)
    
    onehot_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
    onehot_cols = [f"{col}_{val}" for col, vals in zip(cat_cols, onehot_encoder.categories_) for val in vals]
    feature_names = np.append(num_cols, onehot_cols)
    return pd.DataFrame(fit_data, columns = feature_names)

def remove_columns(data):
    cols_to_remove = [ ID_COLUMN, TARGET_COLUMN, HOUSE_HOLD_ID_COLUMN ]
    for col in cols_to_remove:
        if col in data.columns:
            data = data.drop(columns = col)
    return data

def prepare(csv_path = None, data = None):
    if csv_path is None and data is None:
        raise ValueError("Either csv_path or data must be provided")
    if csv_path is not None and data is not None:
        raise ValueError("Only one of csv_path or data must be provided")
    raw_data = pd.read_csv(csv_path) if csv_path is not None else deepcopy(data)
    raw_data = remove_columns(raw_data)
    return fill_and_encode(raw_data)

### Training

In [19]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

def train_model(
    model, 
    train_x, 
    train_y, 
    param_grid = None, 
    boosting = 'not-xgb',
    encoder = None,
    test_size=DEFAULT_VALIDATION_SIZE, 
    cv = DEFAULT_CROSS_VALIDATION):

    if boosting == 'xgb' and encoder is None:
        raise ValueError("encoder must be provided for xgb boosting")

    X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=test_size, random_state=DEFAULT_RANDOM_STATE)

    if param_grid is not None:
        model = GridSearchCV(model, param_grid, cv=cv, scoring='f1_weighted', n_jobs=-1)

    model.fit(X_train, y_train if boosting != 'xgb' else encoder.fit_transform(y_train))
    
    predictions = model.predict(X_test)
    predictions = predictions if boosting != 'xgb' else encoder.inverse_transform(predictions)
    accuracy = accuracy_score(y_test, predictions)

    return model, accuracy

### Sampling

In [20]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler

def oversample(X, y):
    ros = RandomOverSampler(random_state=0)
    return ros.fit_resample(X, y)

def undersample(X, y):
    ros = RandomUnderSampler(random_state=0)
    return ros.fit_resample(X, y)

def smotsample(X, y):
    ros = SMOTETomek(sampling_strategy='auto')
    return ros.fit_resample(X, y)

### Submission

In [9]:
def submit(model, xgb_encoder = None):
    test_data = pd.read_csv(TEST_CSV)
    pred_input = prepare(TEST_CSV)
    predictions = model.predict(pred_input)
    predictions = predictions if xgb_encoder is None else xgb_encoder.inverse_transform(predictions)
    submission_df = pd.DataFrame({ID_COLUMN: test_data[ID_COLUMN], TARGET_COLUMN: predictions})
    submission_df.to_csv('submission.csv', index=False)

In [21]:
csv_data = pd.read_csv(TRAIN_CSV)
TARGET = csv_data[TARGET_COLUMN]
DATA = prepare(data=csv_data)
TRAINING_FEATURES = DATA.columns

X_train, X_test, y_train, y_test = train_test_split(DATA, TARGET, test_size=DEFAULT_TEST_SIZE, random_state=DEFAULT_RANDOM_STATE)
oversampled_X, oversampled_y = oversample(X_train, y_train)

In [None]:
params = {'estimator__max_depth': [10], 'estimator__min_samples_leaf': [5], 'learning_rate': [0.5]}
gs, _ = train_model(AdaBoostClassifier(estimator=DecisionTreeClassifier()), oversampled_X, oversampled_y, param_grid=params)

In [20]:
g_params = {'n_estimators': [300], 'min_samples_leaf': [5], 'max_depth': [10], 'learning_rate': [0.5] }
gb_result = train_model(GradientBoostingClassifier(), oversampled_X, oversampled_y, param_grid=g_params)

In [None]:
gb_result[1]

In [22]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

le = LabelEncoder()

x_params = {
    'objective': ['multi:logistic'], 
    'num_class': [4], 
    'n_estimators': [100], 
    'max_depth': [10], 
    'learning_rate': [0.1], 
    'eval_metric': ['merror']
}

xgb_result = train_model(
    XGBClassifier(), 
    oversampled_X, 
    oversampled_y, 
    param_grid=x_params, 
    boosting='xgb', 
    encoder=le,
)

submit(xgb_result[0], le)

In [23]:
xgb_result[1]

0.9747565096402305

In [None]:
submit(gs)