# Remittance to the Philippines â€“ Machine Learning Predictive Modeling

**Dataset Source:**  
https://www.kaggle.com/datasets/joshbuttler/remittance-to-the-philippines

**Input Files:**  
- data/processed/remittance_cleaned.csv  
- (Optional) data/processed/remittance_clustered.csv

**Purpose:**  
Build and evaluate predictive models to:
- Predict remittance amounts
- Compare linear vs tree-based models
- Identify key drivers via feature importance

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

pd.set_option("display.float_format", "{:,.2f}".format)
pd.set_option("display.max_columns", None)

In [None]:
DATA_PATH = "../data/processed/remittance_cleaned.csv"
df = pd.read_csv(DATA_PATH)

df.head()

In [None]:
# Optional: merge clustering features if available
CLUSTER_PATH = "../data/processed/remittance_clustered.csv"

try:
    cluster_df = pd.read_csv(CLUSTER_PATH)
    merge_cols = [c for c in cluster_df.columns if c in df.columns]
    df = df.merge(cluster_df, on=merge_cols, how="left")
    print("Cluster features merged.")
except Exception as e:
    print("Cluster file not found. Proceeding without clusters.")

In [None]:
target_col = "amount" if "amount" in df.columns else df.select_dtypes(np.number).columns[0]

# Time features
if "date" in df.columns:
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month

In [None]:
# Drop obvious leakage columns
drop_cols = ["date"]
X = df.drop(columns=[target_col] + [c for c in drop_cols if c in df.columns])
y = df[target_col]

In [None]:
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

numeric_features, categorical_features

In [None]:
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

In [None]:
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01)
}

results = {}

for name, model in models.items():
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    results[name] = {
        "MAE": mean_absolute_error(y_test, preds),
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "R2": r2_score(y_test, preds)
    }

pd.DataFrame(results).T

In [None]:
tree_models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=300, random_state=42, n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

for name, model in tree_models.items():
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    results[name] = {
        "MAE": mean_absolute_error(y_test, preds),
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "R2": r2_score(y_test, preds)
    }

pd.DataFrame(results).T.sort_values("RMSE")

In [None]:
best_model_name = min(results, key=lambda k: results[k]["RMSE"])
best_model_name

In [None]:
best_model = tree_models.get(best_model_name, models.get(best_model_name))

pipe_best = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", best_model)
])

cv_rmse = -cross_val_score(
    pipe_best,
    X,
    y,
    cv=5,
    scoring="neg_root_mean_squared_error"
)

cv_rmse.mean(), cv_rmse.std()

In [None]:
pipe_best.fit(X_train, y_train)

model = pipe_best.named_steps["model"]
ohe = pipe_best.named_steps["preprocessor"].named_transformers_["cat"]["onehot"]

feature_names = (
    numeric_features +
    list(ohe.get_feature_names_out(categorical_features))
)

importances = model.feature_importances_

fi = (
    pd.DataFrame({
        "feature": feature_names,
        "importance": importances
    })
    .sort_values("importance", ascending=False)
)

fi.head(15)

In [None]:
sns.barplot(
    data=fi.head(15),
    y="feature",
    x="importance"
)
plt.title("Top 15 Feature Importances")
plt.show()

In [None]:
y_pred = pipe_best.predict(X_test)

sns.scatterplot(x=y_test, y=y_pred)
plt.plot([y_test.min(), y_test.max()],
         [y_test.min(), y_test.max()],
         linestyle="--", color="red")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Actual vs Predicted Remittance Amounts")
plt.show()

## Model Interpretation

- Tree-based models outperform linear baselines, capturing non-linear effects.
- Time-related features (year, month) are strong predictors.
- Country and cluster features contribute significant explanatory power.
- The model is suitable for scenario analysis and short-term forecasting.
