# A. Library imports and configuration

In [1]:
# Standard libraries
import pandas as pd

In [2]:
class Config:
    """Notebook-wide settings: input locations, output locations and split options."""

    # ---- Input files ----
    # Pseudonymized revenues workbook
    dataset_dir = r"Database\revenues_pseudonymized.xlsx"
    # Pseudonymized enrollee information workbook
    enrollees_dir = r"Database\enrollees_pseudonymized.xlsx"
    # JSON file holding the machine learning model parameters
    parameters_dir = r"MachineLearning\parameters.json"

    # ---- Caching ----
    # Cache directory for a preprocessed dataset, if needed (empty string by default)
    cache_dir = ""
    load_cache = True

    # ---- Outputs ----
    # Folder for transformer results
    results_dir = r"C:\Users\rjbel\Python\Data\Thesis\Results"

    # ---- Modelling ----
    # Column holding the class to predict
    target_feature = 'dtp_bracket'
    # Test split size, written as a fraction (0.3), handed to DataPreparer as test_size
    test_size = 0.3


# Single settings object shared by the rest of the notebook
args = Config()

# B. Loading of datasets

## 1. Revenues

In [3]:
# Read the revenues workbook from the path set in Config.dataset_dir.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which usually means the
# workbook was written together with its index — confirm whether that column is needed.
df_revenues = pd.read_excel(args.dataset_dir)

In [4]:
# Preview the loaded revenues frame (the notebook shows the first and last rows)
df_revenues

Unnamed: 0,entry_number,entry_date,due_date,school_year,student_id_pseudonimized,category_name,discount_refund_applied_to,amount_due,amount_paid,account_name,receivables
0,0,2025-10-13,2025-10-13,2014,9XBPS6GQ,Form 137,,150.0,150.0,G-Cash,0.0
1,1,2016-01-01,2016-01-01,2016,QCNXOF71,Back Account,,3524.0,0.0,Not Applicable,3524.0
2,2,2016-01-01,2016-01-01,2016,UFN5RBCA,Back Account,,9831.0,0.0,Not Applicable,9831.0
3,3,2016-01-01,2016-01-01,2016,CATF26JR,Back Account,,9240.0,0.0,Not Applicable,9240.0
4,4,2016-01-01,2016-01-01,2016,TE11Z2LJ,Back Account,,5886.0,0.0,Not Applicable,5886.0
...,...,...,...,...,...,...,...,...,...,...,...
51498,53704,2026-02-18,2026-12-04,2026,B5DSEMMK,Kn1-C-3rd,,5200.0,0.0,Not Applicable,5200.0
51499,53705,2026-02-18,2027-02-05,2026,B5DSEMMK,Kn1-C-4th,,5200.0,0.0,Not Applicable,5200.0
51500,53706,2026-02-18,2026-08-07,2026,B5DSEMMK,Kn1-OF-1st,,4267.0,4267.0,G-Cash,0.0
51501,53707,2026-02-18,2026-12-04,2026,B5DSEMMK,Kn1-OF-2nd,,4267.0,0.0,Not Applicable,4267.0


## 2. Enrollees

In [5]:
# Read the enrollee workbook from the path set in Config.enrollees_dir
df_enrollees = pd.read_excel(args.enrollees_dir)

## 3. Credit Sales

In [6]:
from FeatureEngineering.credit_sales_machine_learning import CreditSales

# Build the credit-sales frame from the revenues and enrollee tables.
# NOTE(review): the saved output of this cell ends in KeyError: 'amount_due_cum_sum', so
# on that run df_credit_sales was never assigned and the cells below were not executed.
# A later cell selects 'amount_due_cumsum' (no underscore between "cum" and "sum");
# presumably the column naming inside CreditSales is inconsistent — verify in that module.
cs = CreditSales(df_revenues, df_enrollees)
df_credit_sales = cs.show_data()

Single due date records: 10063
Multiple due date records: 254


KeyError: 'amount_due_cum_sum'

In [None]:
# Preview the credit-sales frame returned by cs.show_data()
df_credit_sales

In [None]:
# Frequency of every dtp_bracket value (dtp_bracket is the configured target feature)
counts = df_credit_sales.dtp_bracket.value_counts()

# Share of each value, expressed in percent of all counted rows
percentages = counts.div(counts.sum()).mul(100)

# One table holding both views; percentages are rounded to 2 decimal places
result = counts.to_frame(name='count')
result['percentage'] = percentages.round(2)

print(result)

In [None]:
# Rows must have every dtp_* value and a dtp_bracket label to be kept.
required_columns = ['dtp_1', 'dtp_2', 'dtp_3', 'dtp_4', 'dtp_bracket']

# Indicator columns for the rows to exclude: plans D, E, and not enrolled.
excluded_plan_flags = ['plan_type_Plan - D', 'plan_type_Plan - E', 'plan_type_nan']

# dropna without inplace=True: the frame handed back by cs.show_data() is left untouched,
# so this cell only rebinds the notebook variable instead of mutating an object that
# CreditSales may still hold a reference to.
df_credit_sales = df_credit_sales.dropna(subset=required_columns)

# Keep a row only when none of the excluded-plan indicators equals 1.
df_credit_sales = df_credit_sales[(df_credit_sales[excluded_plan_flags] != 1).all(axis=1)]

In [None]:
# Preview the credit-sales frame after the row filtering above
df_credit_sales

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Plot window in days (bounds inclusive). The title is built from these two values so
# the label always states the range that is actually plotted; previously the comment and
# title said "-300 to +300" while the filter kept -100 to 100.
KDE_MIN_DAYS, KDE_MAX_DAYS = -100, 100

# Clean the column: turn empty strings into NaN, then drop missing values
cleaned_days = df_credit_sales['days_elapsed_until_fully_paid']
cleaned_days = cleaned_days.replace("", np.nan).dropna()

# Keep only the values inside the plot window
filtered_days = cleaned_days[cleaned_days.between(KDE_MIN_DAYS, KDE_MAX_DAYS)]

# KDE plot
sns.kdeplot(
    x=filtered_days,
    fill=False,
    color="steelblue"
)

plt.title(
    f"KDE Plot: Days Elapsed Until Fully Paid ({KDE_MIN_DAYS:+d} to {KDE_MAX_DAYS:+d})"
)
plt.xlabel("Days Elapsed")
plt.ylabel("Density")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rows lacking any of the four dtp_* values are left out of the correlation
df_credit_sales = df_credit_sales.dropna(subset=['dtp_1', 'dtp_2', 'dtp_3', 'dtp_4'])

# Numeric columns that enter the correlation matrix
cols = [
    'days_elapsed_until_fully_paid',
    'dtp_1',
    'dtp_2',
    'dtp_3',
    'dtp_4',
    'dtp_avg',
    'dtp_wavg',
    'dtp_2_trend',
    'dtp_3_trend',
    'days_since_last_payment',
    'credit_sale_amount',
    'amount_due_cumsum',
    'amount_paid_cumsum',
    'opening_balance',
]

# Pairwise correlation between the selected columns
corr = df_credit_sales[cols].corr()

# Annotated heatmap on an explicit 10x8 figure, colours centred on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, fmt=".2f", ax=ax)
ax.set_title("Correlation with Days Elapsed Until Fully Paid")
plt.show()

In [None]:
# Columns removed from the credit-sales frame to form the modelling frame
drop_columns = [
    'school_year',
    'student_id_pseudonimized',
    'category_name',
    'gross_receivables',
    'amount_discounted',
    'adjustments',
    'date_fully_paid',
    'last_payment_date',
    'days_elapsed_until_fully_paid',
    'plan_type_Plan - D',
    'plan_type_Plan - E',
    'plan_type_nan',
]

# New frame without those columns; df_credit_sales itself is not modified
df_data = df_credit_sales.drop(labels=drop_columns, axis='columns')

# C. Machine Learning Pipelines

In [None]:
# List the columns that remain in the modelling frame
df_data.columns

In [None]:
from MachineLearning.Utils.data_preparation import DataPreparer

# Give the preparer the modelling frame plus the target column and test size from Config,
# then run its preprocessing step
preparer = DataPreparer(df_data, args.target_feature, test_size=args.test_size)
preparer.prep_data()

# Pull the four splits off the preparer in one assignment
X_train, X_test, y_train, y_test = (
    preparer.X_train,
    preparer.X_test,
    preparer.y_train,
    preparer.y_test,
)

In [None]:
from MachineLearning import (
    AdaBoostPipeline,
    DecisionTreePipeline,
    GaussianNaiveBayesPipeline,
    KnearestNeighborPipeline,
    RandomForestPipeline,
    XGboostPipeline,
    MultiLayerPerceptronPipeline,
    TransformerPipeline,
)

# Registry of the models to run: name -> pipeline class.
# The training cell iterates over this dict and also passes each name to
# ParameterLoader.get_parameters, so presumably every key must exist in the
# parameters JSON — verify against that file.
models = {
    "ada_boost": AdaBoostPipeline,
    "decision_tree": DecisionTreePipeline,
    "gaussian_naive_bayes": GaussianNaiveBayesPipeline,
    "knn": KnearestNeighborPipeline,
    "random_forest": RandomForestPipeline,
    "xgboost": XGboostPipeline,
    "nn_mlp": MultiLayerPerceptronPipeline,
    # Disabled entries. RecurrentNeuralNetworkPipeline is not imported in this cell, so it
    # would have to be added to the import above before the first one is re-enabled;
    # TransformerPipeline is imported but currently unused.
    #"nn_rnn": RecurrentNeuralNetworkPipeline,
    #"nn_transformer": TransformerPipeline
}

In [None]:
# Intended to silence the warning (a UserWarning, not an error) shown when running knn:
# UserWarning: Could not find the number of physical cores for the following reason:
# [WinError 2]
# NOTE(review): this message looks like the one emitted by joblib/loky, whose own hint
# refers to LOKY_MAX_CPU_COUNT — confirm that OMP_NUM_THREADS really suppresses it, and
# that setting it here, after the MachineLearning package was imported, still has effect.
import os

# Environment values are strings; 16 is hard-coded here
os.environ['OMP_NUM_THREADS'] = '16'

In [None]:
import pandas as pd
from pathlib import Path
from MachineLearning.Utils.load_parameters import ParameterLoader

# Destination of the combined metrics table (relative to the working directory).
results_path = Path("MachineLearning/Results/model_results.xlsx")
# Create the folder before training starts: to_excel cannot write into a directory that
# does not exist, and hitting that only after every model has been trained would lose
# the whole run.
results_path.parent.mkdir(parents=True, exist_ok=True)

# Load parameters from JSON
loader = ParameterLoader(args.parameters_dir)

all_results = []  # one entry per (model, parameter set) run

for model_name, pipeline_class in models.items():
    # Parameter sets to try for this model
    param_list = loader.get_parameters(model_name)

    for param in param_list:
        print(f"Running {model_name} with parameters: {param}")

        pipeline = pipeline_class(X_train, X_test, y_train, y_test,
                                  args,
                                  param)

        # Build, train and evaluate, then take whatever show_results() returns;
        # it must support item assignment because metadata is attached below
        result = pipeline.build_model().train().evaluation().show_results()

        # Add metadata (model name + parameters); parameters are stored as text
        result["model"] = model_name
        result["parameters"] = str(param)

        all_results.append(result)

# One row per run
results_df = pd.DataFrame(all_results)

# Export to Excel
results_df.to_excel(results_path, index=False)
print("All results saved to model_results.xlsx")

In [None]:
# Show all runs ordered by the 'f1_macro' column, highest value first
results_df.sort_values(by='f1_macro', ascending=False)