In [1]:
import sys
!{sys.executable} -m pip install pandas catboost numpy scikit-learn


Collecting pandas
  Downloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Downloading matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (52 kB)
Collecting scipy (from catboost)
  Downloading scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.5.2-py3-none-any.whl.metadata (8.5 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Do

In [2]:
import pandas as pd
import re
import numpy as np
import os
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import itertools

In [None]:
# One seed shared by every stochastic component in the notebook
# (NumPy's global RNG here, CatBoost via random_seed=SEED in the model cell).
SEED = 0

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ["PYTHONHASHSEED"] = f"{SEED}"
np.random.seed(SEED)

In [None]:
# Column order of the preprocessed feature files. Upper-case names are the
# per-household-size ratios derived from their lower-case counterparts, and
# C01..C50 are the renamed consumption indicator columns.
ALL_COLS = [
    'weight', 'hhid', 'survey_id', 'strata', 'utl_exp_ppp17', 'male', 'hsize',
    'num_children5', 'NUM_CHILDREN5',
    'num_children10', 'NUM_CHILDREN10',
    'num_children18', 'NUM_CHILDREN18',
    'age', 'owner', 'water', 'toilet', 'sewer', 'elect',
    'water_source', 'sanitation_source', 'dweltyp',
    'num_adult_female', 'NUM_ADULT_FEMALE',
    'num_adult_male', 'NUM_ADULT_MALE',
    'num_elderly', 'NUM_ELDERLY',
    'employed', 'sworkershh', 'share_secondary', 'educ_max', 'sfworkershh',
    'any_nonagric', 'sector1d', 'region', 'urban',
]
ALL_COLS.extend(f'C{number:02d}' for number in range(1, 51))

# Identifier / weight columns that are kept in the files but never fed to the model.
EXCLUDE_COLS = {'hhid', 'weight'}

FEATURE_COLS = list(filter(lambda name: name not in EXCLUDE_COLS, ALL_COLS))

# Columns the model treats as categorical rather than numeric.
CAT_COLS = set([
    'survey_id',
    'strata',
    'water_source',
    'sanitation_source',
    'dweltyp',
    'educ_max',
    'sector1d',
    'region',
    'urban',
])

In [None]:
# Ground-truth lookup: household id -> natural log of consumption (cons_ppp17).
# Built column-wise instead of with DataFrame.iterrows(): iterrows() is slow and
# upcasts every row to a single dtype, which turns integer hhid keys into floats.
# Zipping the columns keeps each hhid in the dtype read_csv gave it.
# If an hhid appears more than once, the last row wins (same as the row loop did).
gt_df = pd.read_csv("./train_hh_gt.csv")
answers = dict(zip(gt_df["hhid"], np.log(gt_df["cons_ppp17"].astype(float))))

In [None]:
# Placeholder category written wherever a categorical feature has no value.
CAT_MISSING_TOKEN = "__MISSING__"

# Spellings of yes/no style answers found in the raw files, mapped to 1/0.
BINARY_VALUE_MAP = {
    'yes': 1,
    'no': 0,
    'male': 1,
    'female': 0,
    'true': 1,
    'false': 0,
    'not owner': 0,
    'owner': 1,
    'access': 1,
    'no access': 0,
    'urban': 1,
    'rural': 0,
    'employed': 1,
    'not employed': 0,
}

BINARY_COLS = ['male', 'owner', 'water', 'toilet', 'sewer', 'elect', 'employed', 'any_nonagric', 'urban']

# Counts that are additionally expressed as a share of household size (upper-case name).
RATIO_SOURCE_COLS = ['num_children5', 'num_children10', 'num_children18',
                     'num_adult_female', 'num_adult_male', 'num_elderly']


def _encode_binary(series):
    """Convert a yes/no style column to integer 0/1.

    Text is matched case-insensitively against BINARY_VALUE_MAP; values that are
    already numeric are passed through; missing values become 0.

    Missingness is taken from the original column *before* it is converted to
    text: after astype(str) a missing value can be the literal string 'nan' or
    '<NA>', which fillna() no longer recognises.

    Raises:
        ValueError: if a non-missing value is neither a known label nor numeric.
    """
    missing = series.isna()
    labels = series.astype(str).str.lower()
    codes = pd.to_numeric(labels.map(BINARY_VALUE_MAP), errors='coerce')
    already_numeric = pd.to_numeric(series, errors='coerce').astype('float64')
    codes = codes.fillna(already_numeric)

    unknown = codes.isna() & ~missing
    if unknown.any():
        raise ValueError(
            f"Unrecognised binary values in column {series.name!r}: "
            f"{sorted(set(labels[unknown]))}"
        )
    return codes.fillna(0).astype(int)


# Set once: the option is global, so repeating it for every file adds nothing.
pd.set_option('future.no_silent_downcasting', True)

# Each feature file is cleaned and then written back over itself.
for csv_path in ['./train_hh_features.csv', './test_hh_features.csv']:

    df = pd.read_csv(csv_path)

    # Because the file is overwritten in place, re-running this cell would
    # otherwise divide survey_id by 100000 a second time. A file that already
    # has the final schema and none of the raw consumed*/region<N> columns has
    # been through this cell before, so it is left alone.
    has_raw_columns = any(
        re.fullmatch(r'consumed\d+', name) or re.fullmatch(r'region\d+', name)
        for name in df.columns
    )
    if not has_raw_columns and set(ALL_COLS).issubset(df.columns):
        print(f"{csv_path}: already preprocessed, skipping")
        continue

    df = df.drop(columns='com', errors='ignore')

    for col in BINARY_COLS:
        if col in df.columns:
            df[col] = _encode_binary(df[col])

    # Collapse the one-hot region<N> indicator columns into one nullable integer column.
    region_cols = [col for col in df.columns if re.match(r'region\d+', col)]
    if region_cols:
        df['region'] = pd.NA
        for col in region_cols:
            region_number = int(re.search(r'\d+', col).group())
            df.loc[df[col] == 1, 'region'] = region_number
        df = df.drop(columns=region_cols)
        df['region'] = df['region'].astype('Int64')

    # consumed100..consumed5000 -> C01..C50, encoded as 0/1 (missing -> 0).
    consumed_renames = {f'consumed{i}00': f'C{i:02d}' for i in range(1, 51)}
    consumed_renames = {old: new for old, new in consumed_renames.items() if old in df.columns}
    df = df.rename(columns=consumed_renames)
    for new_col in consumed_renames.values():
        df[new_col] = _encode_binary(df[new_col])

    # Lower-case every text column. Both the object dtype and the dedicated
    # string dtype are checked, since which one read_csv produces depends on
    # the pandas version. Missing values stay missing instead of becoming text.
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            missing = column.isna()
            lowered = column.astype(str).str.lower()
            df[col] = lowered.mask(missing | (lowered == 'nan'), pd.NA)

    df['survey_id'] = (df['survey_id'] / 100000).astype(int)

    for col in RATIO_SOURCE_COLS:
        df[col.upper()] = df[col] / df["hsize"]

    # .copy() so the assignments below write to an independent frame rather
    # than to a selection of the previous one.
    df = df[ALL_COLS].copy()

    # Categorical features are stored as text, with one explicit token for
    # "missing". The mask is computed before astype(str): afterwards a missing
    # value may read '<NA>', which read_csv would turn back into NaN when the
    # file is loaded again.
    for col in CAT_COLS:
        if col in df.columns:
            missing = df[col].isna()
            labels = df[col].astype(str)
            df[col] = labels.mask(missing | labels.isin(['nan', 'None']), CAT_MISSING_TOKEN)

    df.to_csv(csv_path, index=False)


In [None]:
# Positions (within FEATURE_COLS) of the columns listed in CAT_COLS; passed to
# model.fit() as cat_features.
CAT_FEATURE_INDICES = []
for position, column_name in enumerate(FEATURE_COLS):
    if column_name in CAT_COLS:
        CAT_FEATURE_INDICES.append(position)

In [8]:
# Load the preprocessed feature files written by the cleaning cell.
train_df, test_df = (
    pd.read_csv(path)
    for path in ("./train_hh_features.csv", "./test_hh_features.csv")
)


In [11]:
# Look up the log-consumption target for every training household.
y_train = train_df["hhid"].map(answers)

# Households without a ground-truth entry map to NaN and are left out of training.
train_mask = y_train.notna()

X_train = train_df.loc[train_mask, FEATURE_COLS]
y_train = y_train[train_mask]


In [13]:
# Same feature columns, in the same order, as X_train.
X_test = test_df.loc[:, FEATURE_COLS]


In [None]:
# Final regressor, fitted on every labelled household (target is log-consumption).
#
# early_stopping_rounds was removed: CatBoost's overfitting detector needs a
# validation set, and fit() below is called without eval_set, so the setting had
# no effect and all `iterations` trees are built either way. To actually stop
# early, hold out a validation split, pass it as eval_set=..., and re-add
# early_stopping_rounds.
#
# NOTE(review): task_type="GPU" requires a CUDA-capable device; switch to "CPU"
# to run this cell elsewhere (results may differ slightly between the two).
model = CatBoostRegressor(
    loss_function="RMSE",
    depth=10,
    learning_rate=0.01,
    l2_leaf_reg=5,
    iterations=5000,
    task_type="GPU",
    random_seed=SEED,
    verbose=200,  # print a progress line every 200 iterations
)

model.fit(
    X_train,
    y_train,
    cat_features=CAT_FEATURE_INDICES,
)


0:	learn: 0.6970707	total: 49.1ms	remaining: 4m 5s
200:	learn: 0.3909264	total: 8.64s	remaining: 3m 26s
400:	learn: 0.3563591	total: 17.4s	remaining: 3m 19s
600:	learn: 0.3448401	total: 26.1s	remaining: 3m 11s


In [None]:
# The model was trained on np.log(cons_ppp17) (see the `answers` cell), so its
# raw predictions are in log space.
y_pred_log = model.predict(X_test)
# Undo the log transform to get predictions on the original cons_ppp17 scale.
y_pred = np.exp(y_pred_log)


In [None]:
# Assemble the submission table with columns: survey_id, hhid, cons_ppp17.
# Preprocessing stored survey_id divided by 100000, so it is scaled back here.
# astype(int) guards against the column having been read back as text, where
# `* 100000` would repeat the string instead of multiplying.
# NOTE(review): the integer division in preprocessing truncates, so this only
# restores the original id if it was an exact multiple of 100000 - confirm.
output_df = pd.DataFrame({
    "survey_id": test_df["survey_id"].astype(int) * 100000,
    "hhid": test_df["hhid"],
    "cons_ppp17": y_pred,
})


In [None]:
# Write the submission table without the pandas row index.
output_df.to_csv("predicted_household_consumption.csv", index=False)


In [None]:
# Persist the fitted model to final_catboost_model.cbm in the working directory.
model.save_model("final_catboost_model.cbm")