In [216]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import tensorflow.keras.backend as K
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [217]:
train_path = "/kaggle/input/african/Train.csv"
test_path = "/kaggle/input/african/Test.csv"
economic_path = "/kaggle/input/economic/economic_indicators.csv"

In [218]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
macro_data = pd.read_csv(economic_path)

In [219]:
def impute_macro_indicators(df):
    df = df[df['Country'] != "Cote d'Ivoire"].copy()
    
    df_long = pd.melt(
        df,
        id_vars=['Country', 'Indicator'],
        value_vars=[col for col in df.columns if col.startswith('YR')],
        var_name='Year',
        value_name='Value'
    )
    
    interest_indicators = [
        'Interest rate spread (lending rate minus deposit rate, %)',
        'Lending interest rate (%)',
        'Real interest rate (%)',
        'Deposit interest rate (%)'
    ]
    
    for indicator in interest_indicators:
        kenya_values = df_long[
            (df_long['Country'] == 'Kenya') & 
            (df_long['Indicator'] == indicator)
        ]['Value'].mean()
        
        mask = (
            (df_long['Country'] == 'Ghana') & 
            (df_long['Indicator'] == indicator) &
            (df_long['Value'].isna())
        )
        df_long.loc[mask, 'Value'] = kenya_values
    
    mask = df_long['Indicator'] == 'Fossil fuel energy consumption (% of total)'
    df_long.loc[mask, 'Value'] = df_long.loc[mask].groupby('Country')['Value'].transform(
        lambda x: x.fillna(method='ffill').fillna(method='bfill')
    )
    
    mask = df_long['Indicator'] == 'Average precipitation in depth (mm per year)'
    df_long.loc[mask, 'Value'] = df_long.loc[mask].groupby('Country')['Value'].transform(
        lambda x: x.fillna(x.mean())
    )
    
    df_wide = df_long.pivot_table(
        index=['Country', 'Indicator'],
        columns='Year',
        values='Value'
    ).reset_index()
    
    print("Missing values after imputation:", df_wide.isna().sum().sum())
    return df_wide

macro_data = impute_macro_indicators(macro_data)

Missing values after imputation: 0


  lambda x: x.fillna(method='ffill').fillna(method='bfill')


In [220]:
def merge_with_nearest_year(data, macro_pivoted):
    data = data.copy()
    
    merged_data = pd.DataFrame()
    
    # Get unique years in macro data
    available_years = macro_pivoted['Year'].unique()
    
    # For each year in the loan data
    for year in data['disbursement_year'].unique():
        # Get the subset of data for this year
        year_data = data[data['disbursement_year'] == year]
        
        # Find the nearest available year in macro data
        nearest_year = min(available_years, key=lambda x: abs(x - year))
        
        # Merge with the nearest year data
        macro_year = macro_pivoted[macro_pivoted['Year'] == nearest_year]
        temp_merged = year_data.merge(
            macro_year,
            left_on=['country_id'],
            right_on=['Country'],
            how='left'
        )
        
        merged_data = pd.concat([merged_data, temp_merged])
    
    # Drop the extra Country and Year columns
    merged_data = merged_data.drop(['Country', 'Year'], axis=1)
    
    return merged_data

In [221]:
def preprocess_data():
    selected_indicators = [
        # 'Inflation, consumer prices (annual %)',
        'Unemployment rate',
        # 'Lending interest rate (%)',
        'Real interest rate (%)'
    ]
    
    macro_long = pd.melt(
        macro_data,
        id_vars=['Country', 'Indicator'],
        value_vars=[col for col in macro_data.columns if col.startswith('YR')],
        var_name='Year',
        value_name='Value'
    )
    
    macro_long['Year'] = macro_long['Year'].str.replace('YR', '').astype(int)
    
    macro_filtered = macro_long[macro_long['Indicator'].isin(selected_indicators)]
    macro_pivoted = macro_filtered.pivot_table(
        index=['Country', 'Year'],
        columns='Indicator',
        values='Value'
    ).reset_index()

    data = pd.concat([train, test]).reset_index(drop=True)
    data['disbursement_date'] = pd.to_datetime(data['disbursement_date'], errors='coerce')
    data['due_date'] = pd.to_datetime(data['due_date'], errors='coerce')
    
    data['disbursement_year'] = data['disbursement_date'].dt.year
    
    data = merge_with_nearest_year(data, macro_pivoted)

    date_cols = ['disbursement_date', 'due_date']
    for col in date_cols:
        data[col] = pd.to_datetime(data[col])
        data[col+'_month'] = data[col].dt.month
        data[col+'_day'] = data[col].dt.day
        data[col+'_year'] = data[col].dt.year
        data[f'loan_term_days'] = (data['due_date'] - data['disbursement_date']).dt.days
        data[f'disbursement_weekday'] = data['disbursement_date'].dt.weekday
        data[f'due_weekday'] = data['due_date'].dt.weekday
    
    data['repayment_ratio'] = data['Total_Amount_to_Repay'] / data['Total_Amount']
    data['log_Total_Amount'] = np.log1p(data['Total_Amount'])
    # data['interest_rate'] = (data['Total_Amount_to_Repay'] - data['Total_Amount']) / data['Total_Amount'] * 100
    
    # # Additional features using macro indicators
    # data['real_vs_loan_interest_spread'] = data['interest_rate'] - data['Real interest rate (%)']

    # data['loan_duration_years'] = (data['due_date'] - data['disbursement_date']).dt.days / 365.25
    # data['inflation_adjusted_amount'] = data['Total_Amount'] / (
    #     (1 + data['Inflation, consumer prices (annual %)'] / 100) ** data['loan_duration_years']
    # )
    
    # Categorical encoding
    cat_cols = data.select_dtypes(include='object').columns
    data = pd.get_dummies(data, columns=['loan_type'], prefix='loan_type', drop_first=False)
    loan_type_cols = [col for col in data.columns if col.startswith('loan_type_')]
    data[loan_type_cols] = data[loan_type_cols].astype(int)
    
    le = LabelEncoder()
    for col in [col for col in cat_cols if col not in ['loan_type', 'ID', 'Country']]:
        data[col] = le.fit_transform(data[col])
    
    # Split back into train and test
    train_df = data[data['ID'].isin(train['ID'].unique())]
    test_df = data[data['ID'].isin(test['ID'].unique())]
    
    # Update features_for_modelling to include new macro features
    features_for_modelling = [col for col in train_df.columns if col not in 
                            date_cols + ['ID', 'target', 'country_id', 'disbursement_year']]
    
    return train_df, test_df, features_for_modelling

In [222]:
def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

def find_optimal_threshold(predictions, y_true):
    thresholds = np.arange(0.1, 0.9, 0.02)
    best_threshold = 0.5
    best_f1 = 0.0
    
    for threshold in thresholds:
        pred_labels = (predictions > threshold).astype(int)
        f1 = f1_score(y_true, pred_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            
    return best_threshold, best_f1

In [223]:
def create_model(input_dim):
    model = models.Sequential([
        # Input layer
        layers.Input(shape=(input_dim,)),
        layers.BatchNormalization(),
        
        # First layer - increased capacity
        layers.Dense(64, 
                    kernel_initializer='he_normal',
                    kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.LeakyReLU(alpha=0.2),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        
        # Second layer
        layers.Dense(32,
                    kernel_initializer='he_normal',
                    kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.LeakyReLU(alpha=0.2),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        
        layers.Dense(1, activation='sigmoid',
                    bias_initializer=tf.keras.initializers.Constant(np.log(956/50534)))
    ])
    return model


def train_with_kfold(X, y, features_for_modelling, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Initialize lists to store results
    models = []
    scalers = []
    thresholds = []
    f1_scores = []
    
    # Convert y to numpy array and ensure it's binary
    y = np.array(y).astype(int)
    
    # For each fold
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nTraining fold {fold + 1}/{n_splits}")
        
        # Split data
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Print class distribution for debugging
        print("Training set class distribution:", np.bincount(y_train))
        print("Validation set class distribution:", np.bincount(y_val))
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        
        # Create and compile model
        model = create_model(len(features_for_modelling))
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
            loss='binary_crossentropy',
            metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
        )
        
        # Calculate class weights
        n_neg = np.sum(y_train == 0)
        n_pos = np.sum(y_train == 1)
        total = len(y_train)
        
        class_weights = {
            0: total / (2.0 * n_neg),
            1: total / (2.0 * n_pos)
        }
        
        print("Class weights:", class_weights)

        early_stopping = callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )
        
        # Train model
        history = model.fit(
            X_train_scaled, y_train,
            validation_data=(X_val_scaled, y_val),
            epochs=60,
            batch_size=128,
            class_weight=class_weights,
            callbacks=[early_stopping],
            verbose=1
        )
        
        # Find optimal threshold for this fold
        val_pred = model.predict(X_val_scaled)
        threshold, f1 = find_optimal_threshold(val_pred, y_val)
        
        print(f"Fold {fold + 1} - Best threshold: {threshold:.3f}, F1-score: {f1:.4f}")
        
        # Store results
        models.append(model)
        scalers.append(scaler)
        thresholds.append(threshold)
        f1_scores.append(f1)
    
    # Print summary
    print("\nCross-validation summary:")
    print(f"Mean F1-score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
    print(f"Mean threshold: {np.mean(thresholds):.4f} ± {np.std(thresholds):.4f}")
    
    return models, scalers, thresholds, f1_scores

In [224]:
train_df, test_df, features_for_modelling = preprocess_data()

In [225]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df[features_for_modelling], 
    train_df['target'], 
    random_state=42
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(51490, 47) (17164, 47) (51490,) (17164,)


In [226]:
models, scalers, thresholds, f1_scores = train_with_kfold(
    X_train, y_train, features_for_modelling
)


Training fold 1/5
Training set class distribution: [40453   739]
Validation set class distribution: [10114   184]
Class weights: {0: 0.5091340568066646, 1: 27.870094722598104}
Epoch 1/60




[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9624 - auc: 0.8692 - loss: 2.5988 - val_accuracy: 0.9467 - val_auc: 0.9768 - val_loss: 1.5826
Epoch 2/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9334 - auc: 0.9722 - loss: 1.5415 - val_accuracy: 0.9440 - val_auc: 0.9795 - val_loss: 1.2502
Epoch 3/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9336 - auc: 0.9763 - loss: 1.2255 - val_accuracy: 0.9380 - val_auc: 0.9816 - val_loss: 1.0204
Epoch 4/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9326 - auc: 0.9807 - loss: 0.9952 - val_accuracy: 0.9263 - val_auc: 0.9820 - val_loss: 0.8489
Epoch 5/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9323 - auc: 0.9804 - loss: 0.8310 - val_accuracy: 0.9381 - val_auc: 0.9813 - val_loss: 0.6932
Epoch 6/60
[1m322/322[0m [32m━━━━━━━━━



[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9509 - auc: 0.7850 - loss: 2.9316 - val_accuracy: 0.9262 - val_auc: 0.9704 - val_loss: 1.7121
Epoch 2/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9283 - auc: 0.9738 - loss: 1.6377 - val_accuracy: 0.9291 - val_auc: 0.9854 - val_loss: 1.3768
Epoch 3/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9267 - auc: 0.9726 - loss: 1.3432 - val_accuracy: 0.9289 - val_auc: 0.9890 - val_loss: 1.1422
Epoch 4/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9290 - auc: 0.9764 - loss: 1.1139 - val_accuracy: 0.9226 - val_auc: 0.9880 - val_loss: 0.9752
Epoch 5/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9227 - auc: 0.9714 - loss: 0.9653 - val_accuracy: 0.9412 - val_auc: 0.9907 - val_loss: 0.7805
Epoch 6/60
[1m322/322[0m [32m━━━━━━━━━



[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9595 - auc: 0.7946 - loss: 2.8495 - val_accuracy: 0.9313 - val_auc: 0.9798 - val_loss: 1.6706
Epoch 2/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9273 - auc: 0.9679 - loss: 1.6362 - val_accuracy: 0.9322 - val_auc: 0.9834 - val_loss: 1.3543
Epoch 3/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9270 - auc: 0.9645 - loss: 1.3374 - val_accuracy: 0.9379 - val_auc: 0.9857 - val_loss: 1.1003
Epoch 4/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9356 - auc: 0.9769 - loss: 1.0905 - val_accuracy: 0.9505 - val_auc: 0.9861 - val_loss: 0.8981
Epoch 5/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9332 - auc: 0.9764 - loss: 0.9144 - val_accuracy: 0.9273 - val_auc: 0.9864 - val_loss: 0.8042
Epoch 6/60
[1m322/322[0m [32m━━━━━━━━━



[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9632 - auc: 0.6864 - loss: 3.2213 - val_accuracy: 0.9312 - val_auc: 0.9744 - val_loss: 1.6667
Epoch 2/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9255 - auc: 0.9650 - loss: 1.6713 - val_accuracy: 0.9254 - val_auc: 0.9808 - val_loss: 1.3818
Epoch 3/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9251 - auc: 0.9753 - loss: 1.3608 - val_accuracy: 0.9273 - val_auc: 0.9814 - val_loss: 1.1523
Epoch 4/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9233 - auc: 0.9741 - loss: 1.1517 - val_accuracy: 0.9346 - val_auc: 0.9824 - val_loss: 0.9761
Epoch 5/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9283 - auc: 0.9781 - loss: 0.9679 - val_accuracy: 0.9363 - val_auc: 0.9834 - val_loss: 0.8298
Epoch 6/60
[1m322/322[0m [32m━━━━━━━━━



[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9581 - auc: 0.7974 - loss: 2.7792 - val_accuracy: 0.9342 - val_auc: 0.9666 - val_loss: 1.6112
Epoch 2/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9278 - auc: 0.9605 - loss: 1.6103 - val_accuracy: 0.9325 - val_auc: 0.9738 - val_loss: 1.3297
Epoch 3/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9251 - auc: 0.9705 - loss: 1.3129 - val_accuracy: 0.9359 - val_auc: 0.9768 - val_loss: 1.0926
Epoch 4/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9283 - auc: 0.9718 - loss: 1.0966 - val_accuracy: 0.9486 - val_auc: 0.9847 - val_loss: 0.8886
Epoch 5/60
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9325 - auc: 0.9797 - loss: 0.8997 - val_accuracy: 0.9480 - val_auc: 0.9885 - val_loss: 0.7521
Epoch 6/60
[1m322/322[0m [32m━━━━━━━━

In [227]:
def make_ensemble_predictions(models, scalers, thresholds, test_data, features_for_modelling):
    all_predictions = []
    
    for model, scaler, threshold in zip(models, scalers, thresholds):
        X_test_scaled = scaler.transform(test_data[features_for_modelling])
        pred_proba = model.predict(X_test_scaled)
        pred_binary = (pred_proba > threshold).astype(int)
        all_predictions.append(pred_binary)
    
    ensemble_predictions = np.round(np.mean(all_predictions, axis=0)).astype(int)
    
    print("\nEnsemble prediction distribution:")
    print(np.bincount(ensemble_predictions.ravel()))
    
    output = pd.DataFrame({
        'ID': test_data['ID'],
        'target': ensemble_predictions.ravel()
    })
    
    return output

In [228]:
output = make_ensemble_predictions(
    models, scalers, thresholds, test_df, features_for_modelling
)

[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Ensemble prediction distribution:
[17900   694]


In [229]:
output.to_csv('Test_predictions.csv', index=False)
print("Predictions saved to Test_predictions.csv")

Predictions saved to Test_predictions.csv


In [230]:
histories = []

histories.append(history)

plt.figure(figsize=(15, 5))

avg_loss = np.mean([h.history['loss'] for h in histories], axis=0)
avg_val_loss = np.mean([h.history['val_loss'] for h in histories], axis=0)
avg_auc = np.mean([h.history['auc'] for h in histories], axis=0)
avg_val_auc = np.mean([h.history['val_auc'] for h in histories], axis=0)

plt.subplot(1, 2, 1)
plt.plot(avg_loss, label='Training Loss')
plt.plot(avg_val_loss, label='Validation Loss')
plt.title('Average Loss Across Folds')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(avg_auc, label='Training AUC')
plt.plot(avg_val_auc, label='Validation AUC')
plt.title('Average AUC Across Folds')
plt.legend()

plt.tight_layout()
plt.show()

for i, f1 in enumerate(f1_scores):
    print(f"Fold {i+1} - F1 Score: {f1:.4f}, Threshold: {thresholds[i]:.4f}")

NameError: name 'history' is not defined