In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings('ignore')

# Read the train and test tables from the current working directory
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

# Rewrite the test ids as "<uniqueid> x <country>" (the form used in the submission file)
test['uniqueid'] = test['uniqueid'] + ' x ' + test['country']

# Binary target: 1 where bank_account is 'Yes', 0 otherwise.
# The label column is then removed so it cannot be used as a feature.
y = train['bank_account'].eq('Yes').astype(int)
train = train.drop(columns=['bank_account'])

# Stack train on top of test with a fresh 0..n-1 index, so the first
# len(train) rows are the training rows and the remainder are the test rows.
all_data = pd.concat([train, test], sort=False, ignore_index=True)

# Feature Engineering
# Every feature below is derived row-wise from the raw columns of all_data.

has_phone = all_data['cellphone_access'] == 'Yes'

# NOTE(review): the divisor is the 0/1 phone flag with 0 replaced by 1, so it
# is always 1 and this column is numerically identical to household_size.
# Left unchanged because the intended formula is not clear from the code.
phone_divisor = has_phone.astype(int).replace(0, 1)
all_data['household_per_phone'] = all_data['household_size'] / phone_divisor

# Age buckets (right-inclusive): (0, 25] -> 0, (25, 35] -> 1, (35, 50] -> 2, (50, 100] -> 3.
# Ages outside (0, 100] or missing become NaN, and the astype(int) then raises.
age_edges = [0, 25, 35, 50, 100]
age_labels = [0, 1, 2, 3]
all_data['age_bin'] = pd.cut(
    all_data['age_of_respondent'], bins=age_edges, labels=age_labels
).astype(int)

# Role of the respondent within the household
relationship = all_data['relationship_with_head']
all_data['is_head'] = (relationship == 'Head of Household').astype(int)
all_data['is_spouse'] = (relationship == 'Spouse').astype(int)

# Interactions of the head-of-household flag with location and phone access
head_mask = all_data['is_head'] == 1
all_data['urban_head'] = ((all_data['location_type'] == 'Urban') & head_mask).astype(int)
all_data['has_phone_head'] = (has_phone & head_mask).astype(int)

# Respondent age relative to household size (a household_size of 0 would yield inf)
all_data['age_household_ratio'] = all_data['age_of_respondent'] / all_data['household_size']

# Combined "<education_level>_<job_type>" category, label-encoded later
education_str = all_data['education_level'].astype(str)
job_str = all_data['job_type'].astype(str)
all_data['education_job_interaction'] = education_str + "_" + job_str

# Encode categoricals
# Columns to label-encode. Each one gets an integer-coded "<col>_enc" column.
# NOTE(review): 'cellphone_access' is not in this list, so it only reaches the
# model through the engineered phone features -- confirm this is intended.
cat_cols = ['country', 'location_type', 'gender_of_respondent', 'relationship_with_head',
            'marital_status', 'education_level', 'job_type', 'education_job_interaction']

# The encoders are fitted on train and test rows together (all_data holds both),
# so every category seen in either split receives a code. Each fitted encoder
# is kept in `encoders`, keyed by the source column name.
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Cast to str first so missing values are encoded as the category 'nan'
    all_data[col + '_enc'] = le.fit_transform(all_data[col].astype(str))
    encoders[col] = le

# Features
# Build the encoded column names from cat_cols instead of searching every
# column name for the substring 'enc': a substring search would silently pull
# in any unrelated column whose name happens to contain "enc".
encoded_cols = [col + '_enc' for col in cat_cols]
feature_cols = encoded_cols + [
    'household_size', 'age_of_respondent', 'household_per_phone', 'age_bin',
    'is_head', 'is_spouse', 'urban_head', 'has_phone_head', 'age_household_ratio',
]

# Split
# all_data was built as train rows followed by test rows, so the first
# len(train) rows go back to training and the rest to test.
n_train = len(train)
X_train = all_data.head(n_train)[feature_cols]
X_test = all_data.iloc[n_train:][feature_cols]

# Cross-validation
# Stratified 5-fold splitter with shuffling and a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per training row) and the running
# fold-average of the test-set predictions
oof_preds = np.zeros(X_train.shape[0])
test_preds = np.zeros(X_test.shape[0])

# Model
# Gradient-boosting hyperparameters, gathered in one place so they are easy to
# inspect or log: many trees with a small learning rate, row subsampling, and
# a sqrt-sized feature subset per split.
gb_params = {
    'n_estimators': 1200,
    'learning_rate': 0.007,
    'max_depth': 6,
    'min_samples_split': 25,
    'min_samples_leaf': 20,
    'subsample': 0.82,
    'max_features': 'sqrt',
    'random_state': 42,
}
model = GradientBoostingClassifier(**gb_params)

# Train
# For each fold: fit on the training part, store predictions for the held-out
# part in oof_preds, and add that fold's share of the test prediction to
# test_preds. The same estimator object is re-fitted on every fold.
print("Training model with 5-fold CV...")
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y)):
    X_tr, y_tr = X_train.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y.iloc[val_idx]

    model.fit(X_tr, y_tr)

    # Column 1 of predict_proba is the probability of the positive class
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred

    # Each fold contributes 1/n_splits of the final averaged test prediction
    fold_test_pred = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_pred / skf.n_splits

# Final CV
# Log loss of the out-of-fold probabilities over the whole training set
cv_score = log_loss(y, oof_preds)
print(f"\nFinal CV Log Loss: {cv_score:.9f}")

# Final prediction: convert the averaged test probabilities into 0/1 labels.
# The cut-off is hard-coded; the original note says it was tuned on CV, but
# no tuning code is visible in this cell.
threshold = 0.48
final_pred_binary = (test_preds >= threshold).astype(int)

# Submission: one row per test record, keyed by the reformatted uniqueid
submission = test[['uniqueid']].assign(bank_account=final_pred_binary)

# Save without the index column
submission.to_csv('submission_binary.csv', index=False)

# Show the predictions for a hand-picked set of Kenya ids
target_ids = [
    'uniqueid_7867 x Kenya',
    'uniqueid_6722 x Kenya',
    'uniqueid_6714 x Kenya',
    'uniqueid_8103 x Kenya',
    'uniqueid_8657 x Kenya',
]

print("\nResults for requested Kenya IDs:")
is_requested = submission['uniqueid'].isin(target_ids)
results = submission.loc[is_requested].copy()
print(results.to_string(index=False))

# Run summary: output file, row count, CV score, threshold and positive count
print("\nSubmission saved: submission_binary.csv")
print(f"Total predictions: {len(submission)}")
print(f"CV Log Loss: {cv_score:.9f}")
print(f"Threshold used: {threshold}")
print(f"Predicted 1s: {final_pred_binary.sum()}")

Training model with 5-fold CV...

Final CV Log Loss: 0.284302689

Results for requested Kenya IDs:
             uniqueid  bank_account
uniqueid_6714 x Kenya             1
uniqueid_6722 x Kenya             0
uniqueid_7867 x Kenya             0
uniqueid_8103 x Kenya             1
uniqueid_8657 x Kenya             0

Submission saved: submission_binary.csv
Total predictions: 10086
CV Log Loss: 0.284302689
Threshold used: 0.48
Predicted 1s: 785
