###  Classification Targets

Target - the target is an ordinal variable indicating groups of income levels.

1 = extreme poverty \
2 = moderate poverty \
3 = vulnerable households \
4 = non vulnerable households


https://www.kaggle.com/competitions/costa-rican-household-poverty-prediction/data

### Setup

In [3]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


### Data Preparation

In [5]:
import pandas as pd

In [8]:
data_dir = '../data/'
test_csv = data_dir + 'test.csv'
train_csv = data_dir + 'train.csv'

prediction_column = "Target"

def prepare_mean_fill(data):
    data = data[:]
    for column in data.columns:
        if column == prediction_column:
            continue
        if data[column].dtype == 'object':
            data = data.drop(column, axis=1)
            continue

        data[column] = data[column].fillna(data[column].mean())
    return data

data_prep_factory = {
    'mean': prepare_mean_fill
}

def prepare(data, fill_method = "mean", drop_columns = None):
    data = data[:]
    if drop_columns:
        data = data.drop(drop_columns, axis=1)

    preparation_handler = data_prep_factory.get(fill_method)
    if not preparation_handler:
        raise Exception("Invalid fill method")
    return preparation_handler(data)

def prepare_from_csv(csv_path, fill_method = "mean", drop_columns = None):
    data = pd.read_csv(csv_path)
    return prepare(data, fill_method, drop_columns)

In [13]:
test = prepare_from_csv(test_csv, fill_method="mean")
train = prepare_from_csv(train_csv, fill_method="mean")

### Submission

In [17]:
def export_submission(df):
    df.to_csv('submission.csv', index=False)

### Data Overview

In [16]:
print(f"Training data shape: {train.shape}, test data shape: {test.shape}")
test.head()

Training data shape: (9557, 138), test data shape: (23856, 137)


Unnamed: 0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,...,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,174872.554316,0,5,0,1,1,0,1.348517,1,1,...,4,0,16,9,0,1,2.25,0.25,272.25,16
1,174872.554316,0,5,0,1,1,0,1.348517,1,1,...,41,256,1681,9,0,1,2.25,0.25,272.25,1681
2,174872.554316,0,5,0,1,1,0,1.348517,1,1,...,41,289,1681,9,0,1,2.25,0.25,272.25,1681
3,174872.554316,0,14,0,1,1,1,1.0,0,1,...,59,256,3481,1,256,0,1.0,0.0,256.0,3481
4,175000.0,0,4,0,1,1,1,1.0,0,0,...,18,121,324,1,0,1,0.25,64.0,100.50922,324


In [21]:
train.head()

Unnamed: 0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,190000.0,0,3,0,1,1,0,1.404063,0,1,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,135000.0,0,4,0,1,1,1,1.0,0,1,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,165231.606971,0,8,0,1,1,0,1.404063,0,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,180000.0,0,5,0,1,1,1,1.0,0,2,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,180000.0,0,5,0,1,1,1,1.0,0,2,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


## Ensemble learning

### Shared Code

In [48]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

def train_model(model, train, test_size=0.2, param_grid = None):
    X = train.drop(prediction_column, axis=1)
    y = train[prediction_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if param_grid is not None:
        model = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    return model, accuracy

def important_features(trained_model):
    columns = train[:]
    columns = columns.drop(prediction_column, axis=1)
    columns = columns.columns
    if trained_model.__class__ == GridSearchCV:
        trained_model = trained_model.best_estimator_
        
    feature_importance = pd.DataFrame(trained_model.feature_importances_,
                                    index = columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
    return feature_importance

### AdaBoost

In [35]:
ada_boost_param_grid = {
    # run this when you have time
    # 'learning_rate': [0.01, 0.1, 0.5],
    # 'base_estimator__max_depth': [3, 5, 7],
    # 'base_estimator__min_samples_leaf': [5, 10, 20]

    'learning_rate': [0.01],
    'estimator__max_depth': [3],
    'estimator__min_samples_leaf': [5, 10]
}

ada_boost_model, training_accuracy = train_model(AdaBoostClassifier(estimator=DecisionTreeClassifier()), train, param_grid=ada_boost_param_grid)
print(f"AdaBoost training accuracy: {training_accuracy}")


AdaBoost training accuracy: 0.6767782426778243


In [36]:
ada_boost_model

In [39]:
ada_boost_model.best_params_

{'estimator__max_depth': 3,
 'estimator__min_samples_leaf': 5,
 'learning_rate': 0.01}

In [49]:
important_features(ada_boost_model)

Unnamed: 0,importance
SQBdependency,0.228698
meaneduc,0.199066
SQBmeaned,0.168028
hogar_nin,0.072995
SQBhogar_nin,0.058119
...,...
public,0.000000
abastaguano,0.000000
abastaguafuera,0.000000
abastaguadentro,0.000000


### Gradient Boosting

In [51]:
gradient_boost_param_grid = {
    # Run this at before going to sleep
    # 'learning_rate': [0.01, 0.1, 0.5],
    # 'max_depth': [3, 5, 7],
    # 'min_samples_leaf': [5, 10, 20]

    'learning_rate': [0.1],
    'max_depth': [3],
    'min_samples_leaf': [5]
}

gradient_boost_model, training_accuracy = train_model(GradientBoostingClassifier(), train, param_grid=gradient_boost_param_grid)
print(f"GradientBoost training accuracy: {training_accuracy}")


GradientBoost training accuracy: 0.74581589958159


In [52]:
gradient_boost_model.__class__

sklearn.model_selection._search.GridSearchCV

In [53]:
important_features(gradient_boost_model)

Unnamed: 0,importance
SQBdependency,0.112112
meaneduc,0.108947
SQBmeaned,0.094643
SQBhogar_nin,0.052052
qmobilephone,0.042270
...,...
instlevel2,0.000000
instlevel4,0.000000
instlevel9,0.000000
instlevel8,0.000000


### XGBoost

In [72]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def train_xgboost_model(train, xgb_params, test_size=0.2):
    train = train[:]
    X = train.drop(prediction_column, axis=1)
    y = train[prediction_column]
    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
    train_labels = le.fit_transform(y_train)
    val_labels = le.transform(y_test)

    dtrain = xgb.DMatrix(X_train, label=train_labels, enable_categorical=True)
    dval = xgb.DMatrix(X_test, label=val_labels, enable_categorical=True)
    eval_set = [(dtrain, 'train'), (dval, 'eval')]
    
    xgb_model = xgb.train(
        xgb_params, 
        dtrain, 
        num_boost_round=1000, 
        evals=eval_set, 
        early_stopping_rounds=10, 
        verbose_eval=False
    )
    accuracy = accuracy_score(val_labels, xgb_model.predict(dval))

    return xgb_model, accuracy

def xgb_feature_importance(xgb_model):
    feature_importance = xgb_model.get_score(importance_type='weight')
    feature_importance_df = pd.DataFrame({
        'Feature': list(feature_importance.keys()),
        'Importance': list(feature_importance.values())
    })
    feature_importance_df.sort_values(by='Importance', ascending=False).head(5)
    return feature_importance_df

In [73]:
xgb_params = {
    "objective": "multi:softmax", 
    "num_class": 4,
    "eval_metric": "merror"
}

xgb_model, training_accuracy = train_xgboost_model(train, xgb_params)
print(f"XGBoost training accuracy: {training_accuracy}")

XGBoost training accuracy: 0.9257322175732218


In [74]:
feature_importance_df = xgb_feature_importance(xgb_model)
feature_importance_df

Unnamed: 0,Feature,Importance
0,v2a1,750.0
1,hacdor,33.0
2,rooms,838.0
3,hacapo,14.0
4,v14a,12.0
...,...,...
117,area1,290.0
118,age,1131.0
119,SQBedjefe,1046.0
120,SQBdependency,799.0


## Tasks
- ✅ A working setup for Ensemble learning with AdaBoost, Gradient Boosting and XGBoost
- ✅ Feature importance for the different models
- ✅ Compare the different models accuracy
- ❌ Add different data preprocessing methods
- ❌ Balance dataset by undersampling, and possibly oversampling
- ❌ Find optimum parameter for the different models