In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in files_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/african-credit-scoring/VariableDefinitions.txt
/kaggle/input/african-credit-scoring/SampleSubmission.csv
/kaggle/input/african-credit-scoring/manifest-5bc184473748f33ea6ca9d341f2737ff20241203-14702-2xdh7n.json
/kaggle/input/african-credit-scoring/Train.csv
/kaggle/input/african-credit-scoring/Test.csv
/kaggle/input/african-credit-scoring/economic_indicators.csv


In [2]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [3]:
# Read the train and test CSV files into separate frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/african-credit-scoring/Train.csv',
        '/kaggle/input/african-credit-scoring/Test.csv',
    )
)

# Stack train on top of test (fresh 0..n-1 index) so that the preprocessing
# steps below are applied to both sets in one pass.
data = pd.concat([train, test], ignore_index=True)

In [4]:
# Parse the two date columns and derive month / day / year features from each.
date_cols = ['disbursement_date', 'due_date']
for date_col in date_cols:
    parsed = pd.to_datetime(data[date_col])
    data[date_col] = parsed
    # Same creation order as before: month, then day, then year.
    for part in ('month', 'day', 'year'):
        data[f'{date_col}_{part}'] = getattr(parsed.dt, part)

# Record the object-dtype columns *before* one-hot encoding; the list is
# consumed by the label-encoding step in the next cell.
cat_cols = data.select_dtypes(include='object').columns

# One-hot encode `loan_type`, keeping every level (no reference level dropped).
data = pd.get_dummies(
    data,
    columns=['loan_type'],
    prefix='loan_type',
    drop_first=False,
)


In [5]:
# Label-encode the remaining object columns. `loan_type` was already one-hot
# encoded and `ID` is kept verbatim because it is written to the submission.
# The single encoder is re-fitted per column, so after the loop `le` only
# holds the classes of the last encoded column.
le = LabelEncoder()
cols_to_encode = [col for col in cat_cols if col not in ('loan_type', 'ID')]
for col in cols_to_encode:
    data[col] = le.fit_transform(data[col])

# Log-transform the loan amount to reduce skewness.
# NOTE: this overwrites the column in place, so re-running this cell without
# re-running the cells above applies log1p a second time.
data['Total_Amount'] = np.log1p(data['Total_Amount'])

# Split the data back into train and test.
# `data` was built as concat([train, test]) with a fresh 0..n-1 index, so the
# first len(train) rows are exactly the training rows. Splitting by position
# is the exact inverse of that concat; matching on ID would mis-assign rows
# if an ID ever occurred in both files.
n_train = len(train)
assert len(data) == n_train + len(test), (
    "`data` no longer has len(train) + len(test) rows; positional split is unsafe"
)
# .copy() gives independent frames so later column assignments do not operate
# on a slice of `data`.
train_df = data.iloc[:n_train].copy()
test_df = data.iloc[n_train:].copy()

In [6]:
# Select modelling features: everything except the raw datetime columns, the
# row identifier, the label and `country_id`.
excluded_cols = set(date_cols) | {'ID', 'target', 'country_id'}
features_for_modelling = [col for col in train_df.columns if col not in excluded_cols]

# Take the labels from the same frame as the features so that rows and labels
# are aligned by construction rather than by relying on `train` and `train_df`
# happening to share row order. Concatenating with the unlabeled test rows
# up-casts the column (the test rows get NaN), so cast back to the dtype it
# has in `train`.
target = train_df['target'].astype(train['target'].dtype)

# Stratified 80/20 split into training and validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df[features_for_modelling],
    target,
    stratify=target,
    shuffle=True,
    test_size=0.2,
    random_state=42
)

In [7]:
# Oversample the training split with SMOTE so the classes are balanced.
# The validation and test sets are left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data only, then apply the same
# transformation to the training, validation and test feature matrices.
scaler = StandardScaler().fit(X_train_smote)
X_train_scaled, X_valid_scaled, test_scaled = (
    scaler.transform(frame)
    for frame in (X_train_smote, X_valid, test_df[features_for_modelling])
)

In [8]:
# Candidate classifiers and the hyper-parameter grid searched for each one.
# Every entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the grid passed to GridSearchCV as `param_grid`
# All models use the same seed (42) so that runs are comparable.

# Use the GPU histogram algorithm for XGBoost only when CUDA is available.
# NOTE(review): 'gpu_hist' and `use_label_encoder` are deprecated in recent
# XGBoost releases - confirm against the installed version before changing.
xgb_tree_method = 'gpu_hist' if torch.cuda.is_available() else 'hist'

classifiers = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5]
        }
    },
    'XGBoost': {
        # random_state added for consistency with the other three models.
        'model': XGBClassifier(tree_method=xgb_tree_method, use_label_encoder=False, random_state=42),
        'params': {
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 6],
            'n_estimators': [100, 200]
        }
    },
    'LightGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'num_leaves': [31, 63],
            'learning_rate': [0.01, 0.1],
            'n_estimators': [100, 200]
        }
    },
    'CatBoost': {
        'model': CatBoostClassifier(silent=True, random_state=42),
        'params': {
            'learning_rate': [0.01, 0.1],
            'depth': [4, 6],
            'iterations': [100, 200]
        }
    }
}

In [None]:
# Train and evaluate each model.
#
# For every classifier: grid-search its hyper-parameters with 3-fold CV
# (scored by F1), report metrics on the held-out validation split, and write
# a submission file with the test-set predictions.
for name, clf_dict in classifiers.items():
    print(f"\nTraining {name}...")

    # SMOTE and scaling are placed *inside* the pipeline and the search is run
    # on the raw training split. That way each CV split fits the sampler and
    # the scaler on its training folds only. Searching on data that was
    # resampled up front lets synthetic points derived from held-out rows leak
    # into the training folds and inflates the CV score.
    # The imblearn pipeline applies the sampler during fit only; predict and
    # predict_proba skip it, so validation and test rows are never resampled.
    pipeline = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('scaler', StandardScaler()),
        ('model', clf_dict['model']),
    ])
    # Route every grid entry to the final 'model' step.
    param_grid = {f"model__{param}": values for param, values in clf_dict['params'].items()}

    grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1', cv=3, verbose=1)
    grid.fit(X_train, y_train)

    # Best pipeline, refit by GridSearchCV on the full training split.
    best_model = grid.best_estimator_
    # Strip the 'model__' prefix so the printed names match the grid above.
    best_params = {param.split('__', 1)[1]: value for param, value in grid.best_params_.items()}
    print(f"Best Parameters for {name}: {best_params}")

    # Predictions on validation data (the pipeline scales the raw features itself)
    y_pred = best_model.predict(X_valid)
    y_pred_proba = best_model.predict_proba(X_valid)[:, 1] if hasattr(best_model, "predict_proba") else None

    # Evaluation Metrics
    f1 = f1_score(y_valid, y_pred)
    roc_auc = roc_auc_score(y_valid, y_pred_proba) if y_pred_proba is not None else "N/A"

    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Classification Report:\n{classification_report(y_valid, y_pred)}")

    # Save predictions for test dataset
    test_predictions = best_model.predict(test_df[features_for_modelling])
    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'target': test_predictions
    })
    submission_file = f"/kaggle/working/{name.replace(' ', '_')}_submission.csv"
    submission.to_csv(submission_file, index=False)
    print(f"Saved predictions to {submission_file}")


Training RandomForest...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
