In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import VarianceThreshold

# Load datasets
kenya_train = pd.read_csv('/kaggle/input/trainingsets/Kenya_training.csv')
kenya_test = pd.read_csv('/kaggle/input/testingset/Kenya_testing.csv')

spain_train = pd.read_csv('/kaggle/input/trainingsets/Spain_training.csv')
spain_val = pd.read_csv('/kaggle/input/testingset/Spain_validation.csv')

vnm_train = pd.read_csv('/kaggle/input/trainingsets/VNM_training.csv')
vnm_test = pd.read_csv('/kaggle/input/testingset/VNM_testing.csv')

# Combine training data
full_train = pd.concat([kenya_train, spain_train, vnm_train], ignore_index=True)
X_base = full_train.drop(columns=['ID', 'Lon', 'Lat', 'lon', 'lat', 'TARGET'], errors='ignore')
y = full_train['TARGET'].map({1: 0, 2: 1})

# Prepare test sets
drop_cols = ['ID', 'Lon', 'Lat', 'lon', 'lat']
kenya_X = kenya_test.drop(columns=drop_cols, errors='ignore').copy()
spain_X = spain_val.drop(columns=drop_cols, errors='ignore').copy()
vnm_X = vnm_test.drop(columns=drop_cols, errors='ignore').copy()

kenya_ids = kenya_test['ID']
spain_ids = spain_val['ID']
vnm_ids = vnm_test['ID']

# 1. Add 20 interaction features
cols = X_base.columns
combos = [(cols[i], cols[j]) for i in range(len(cols)) for j in range(i + 1, len(cols))][:20]
for a, b in combos:
    if a in X_base.columns and b in X_base.columns:
        X_base[f'{a}_{b}'] = X_base[a] * X_base[b]
    for df in [kenya_X, spain_X, vnm_X]:
        if a in df.columns and b in df.columns:
            df[f'{a}_{b}'] = df[a] * df[b]

# 2. Top 2 correlated features → mean encoding
correlations = X_base[cols].corrwith(y).abs().sort_values(ascending=False)
top2_cols = correlations.head(2).index
for col in top2_cols:
    mean_map = full_train.groupby(col)['TARGET'].mean()
    if col in X_base.columns:
        X_base[f'{col}_mean'] = X_base[col].map(mean_map)
    for df, raw in zip([kenya_X, spain_X, vnm_X], [kenya_test, spain_val, vnm_test]):
        if col in raw.columns:
            df[f'{col}_mean'] = raw[col].map(mean_map).fillna(mean_map.mean())

# Low variance filter
vt = VarianceThreshold(threshold=0.015)
X = vt.fit_transform(X_base)
kenya_X = vt.transform(kenya_X)
spain_X = vt.transform(spain_X)
vnm_X = vt.transform(vnm_X)

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kenya_scaled = scaler.transform(kenya_X)
spain_scaled = scaler.transform(spain_X)
vnm_scaled = scaler.transform(vnm_X)

# Classifiers
rf = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=0, class_weight='balanced_subsample')
gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.03, max_depth=4, random_state=0)
stack = StackingClassifier(
    estimators=[('rf', rf), ('gb', gb)],
    final_estimator=LogisticRegression(C=0.3, max_iter=1000),
    passthrough=False,
    n_jobs=-1
)

# Cross-validated accuracy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val_preds = cross_val_predict(stack, X_scaled, y, cv=cv, method='predict')
print(" CV Accuracy on combined training data:", round(accuracy_score(y, val_preds), 4))
print(" Classification Report:\n", classification_report(y, val_preds))

# Final model training and predictions
stack.fit(X_scaled, y)

kenya_preds = stack.predict(kenya_scaled)
spain_preds = stack.predict(spain_scaled)
vnm_preds = stack.predict(vnm_scaled)

# Save predictions
pd.DataFrame({'ID': kenya_ids, 'Predicted': kenya_preds + 1}).to_csv("Kenya_predictions.csv", index=False)
pd.DataFrame({'ID': spain_ids, 'Predicted': spain_preds + 1}).to_csv("Spain_predictions.csv", index=False)
pd.DataFrame({'ID': vnm_ids, 'Predicted': vnm_preds + 1}).to_csv("Vietnam_predictions.csv", index=False)
print(" All 3 prediction files saved successfully.")


 CV Accuracy on combined training data: 0.9943
 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1344
           1       1.00      0.99      0.99      1481

    accuracy                           0.99      2825
   macro avg       0.99      0.99      0.99      2825
weighted avg       0.99      0.99      0.99      2825

 All 3 prediction files saved successfully.
