# A. Importing libraries

In [1]:
# Third-party libraries
import pandas as pd
import numpy as np

# Project-local libraries
from DataLoaders.bad_debts import BadDebtsExpense

from FeatureEngineering.credit_sales import CreditSales
from FeatureEngineering.days_sales_outstanding import DSO
from FeatureEngineering.consecutive_years import get_consecutive_years

from Analysis.enrollment_statistics import enrollment_statistics

# B. Loading of datasets

## 1. Revenues

In [2]:
# Load the pseudonymized revenue ledger. A forward slash is used as the path
# separator because it resolves on Windows, macOS and Linux alike, whereas a
# backslash-separated path only works on Windows.
df_revenues = pd.read_excel("Database/revenues_pseudonymized.xlsx")

In [3]:
# Preview the loaded revenues frame (the notebook renders a truncated view).
df_revenues

Unnamed: 0,entry_number,entry_date,due_date,school_year,student_id_pseudonimized,category_name,discount_refund_applied_to,amount_due,amount_paid,account_name,receivables
0,0,2025-10-13,2025-10-13,2014,9XBPS6GQ,Form 137,,150.0,150.0,G-Cash,0.0
1,1,2016-01-01,2016-01-01,2016,QCNXOF71,Back Account,,3524.0,0.0,Not Applicable,3524.0
2,2,2016-01-01,2016-01-01,2016,UFN5RBCA,Back Account,,9831.0,0.0,Not Applicable,9831.0
3,3,2016-01-01,2016-01-01,2016,CATF26JR,Back Account,,9240.0,0.0,Not Applicable,9240.0
4,4,2016-01-01,2016-01-01,2016,TE11Z2LJ,Back Account,,5886.0,0.0,Not Applicable,5886.0
...,...,...,...,...,...,...,...,...,...,...,...
50149,52878,2026-01-21,2026-01-21,2026,KX9K7HHC,G03-OF,,16400.0,16400.0,Cash,0.0
50150,52879,2026-01-21,2026-07-11,2026,7TIF6WOZ,G04-A-UE,,30800.0,13860.0,Cash,16940.0
50151,52880,2026-01-21,2026-07-11,2026,7TIF6WOZ,Discount - Early Enrollee,G04-A-UE,-3080.0,0.0,Not Applicable,-3080.0
50152,52881,2026-01-21,2026-07-11,2026,7TIF6WOZ,Discount - Pastoral,G04-A-UE,-13860.0,0.0,Not Applicable,-13860.0


## 2. Credit Sales

In [4]:
import time

In [5]:
# Time only the construction of the reference CreditSales object.
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock that can be adjusted and may have coarse
# resolution, which makes sub-second timings unreliable.
start = time.perf_counter()
cs = CreditSales(df_revenues)

end = time.perf_counter()
print(f"Execution time: {end - start:.6f} seconds")

# Aggregate to one row per (school_year, student, category) so the result can
# be compared with the other implementations on the same keys.
df_truth = cs.show_data()
df_truth = df_truth.groupby(['school_year', 'student_id_pseudonimized', 'category_name']).sum(numeric_only=True)

Execution time: 0.799764 seconds


In [6]:
# Implementation from the backup module. It is aliased as CreditSalesBrute so
# the class name matches the `cs_brute` object it builds, and so it does not
# share an alias with the class imported from
# FeatureEngineering.credit_sales_multiple elsewhere in this notebook.
from FeatureEngineering.credit_sales_multiple_backup import CreditSales as CreditSalesBrute

# perf_counter() is the clock intended for measuring short durations;
# time.time() is an adjustable wall clock with possibly coarse resolution.
start = time.perf_counter()

cs_brute = CreditSalesBrute(df_revenues)

end = time.perf_counter()
print(f"Execution time: {end - start:.6f} seconds")

Single due date records: 7371
Multiple due date records: 2748
Execution time: 18.456008 seconds


In [7]:
# Implementation under test, imported from credit_sales_multiple.
from FeatureEngineering.credit_sales_multiple import CreditSales as CreditSalesOptimized

# perf_counter() is the clock intended for measuring short durations;
# time.time() is an adjustable wall clock with possibly coarse resolution.
start = time.perf_counter()

cs_opti = CreditSalesOptimized(df_revenues)

end = time.perf_counter()
print(f"Execution time: {end - start:.6f} seconds")

Single due date records: 9800
Multiple due date records: 261
Execution time: 5.828010 seconds


In [8]:
# Display the reference ("truth") frame for visual inspection.
df_truth

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gross_receivables,amount_discounted,adjustments,credit_sale_amount,prepayments,30_days,60_days,90_days,120_days,150_days,180_days,180_above,total_payments,adjusted_credit_amount,net_receivables
school_year,student_id_pseudonimized,category_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016,CATF26JR,Back Account,9240.0,0.0,0.0,9240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9240.0,9240.0,9240.0,0.0
2016,LI5DFHZ5,Back Account,5650.0,0.0,0.0,5650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5650.0,5650.0,5650.0,0.0
2016,QCNXOF71,Back Account,3524.0,0.0,0.0,3524.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3524.0,3524.0,3524.0,0.0
2016,TE11Z2LJ,Back Account,5886.0,0.0,0.0,5886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5886.0,5886.0,5886.0,0.0
2016,UFN5RBCA,Back Account,9831.0,0.0,0.0,9831.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9831.0,9831.0,9831.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2026,O9S9VFIM,G04-Books,6260.0,-700.0,0.0,5560.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5560.0,5560.0
2026,XO7E9JLP,G09-OF,800.0,0.0,0.0,800.0,800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,800.0,0.0,0.0
2026,YMJ3L6X6,G08-B-2nd,5200.0,0.0,0.0,5200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5200.0,5200.0
2026,YMJ3L6X6,G08-OF-2nd,6267.0,0.0,0.0,6267.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6267.0,6267.0


In [9]:
# Pull the data out of the brute object and sum every numeric column per
# (school_year, student, category) key in a single chained expression.
df_brute = (
    cs_brute.show_data()
    .groupby(['school_year', 'student_id_pseudonimized', 'category_name'])
    .sum(numeric_only=True)
)

In [10]:
# Un-aggregated output of the implementation under test.
df_test_opti = cs_opti.show_data()

# Keep a spreadsheet copy of the un-aggregated rows (written to the current
# working directory, without the index column).
df_test_opti.to_excel("credit_sales_test.xlsx", index=False)

# Replace the frame with its per-key totals of the numeric columns.
df_test_opti = (
    df_test_opti
    .groupby(['school_year', 'student_id_pseudonimized', 'category_name'])
    .sum(numeric_only=True)
)

In [11]:
# Row counts of the three aggregated frames, one per line.
print(
    f"Truth : {len(df_truth)}",
    f"Brute : {len(df_brute)}",
    f"Test  : {len(df_test_opti)}",
    sep="\n",
)

Truth : 9886
Brute : 9884
Test  : 9884


In [12]:
# Grand total of `amount_discounted` in each frame, one per line.
print(
    f"Truth : {df_truth['amount_discounted'].sum()}",
    f"Brute : {df_brute['amount_discounted'].sum()}",
    f"Test  : {df_test_opti['amount_discounted'].sum()}",
    sep="\n",
)

Truth : -664521.25
Brute : -664300.25
Test  : -664300.25


In [13]:
# Grand total of `adjustments` in each frame, one per line.
print(
    f"Truth : {df_truth['adjustments'].sum()}",
    f"Brute : {df_brute['adjustments'].sum()}",
    f"Test  : {df_test_opti['adjustments'].sum()}",
    sep="\n",
)

Truth : -12799.75
Brute : -12799.75
Test  : -12799.75


In [18]:
# Grand total of `total_payments` in each frame, one per line.
print(
    f"Truth : {df_truth['total_payments'].sum()}",
    f"Brute : {df_brute['total_payments'].sum()}",
    f"Test  : {df_test_opti['total_payments'].sum()}",
    sep="\n",
)

Truth : 40521535.25
Brute : 40514835.25
Test  : 40514835.25


In [15]:
# Display the aggregated frame of the implementation under test.
df_test_opti

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gross_receivables,amount_discounted,adjustments,credit_sale_amount,prepayments,30_days,60_days,90_days,120_days,150_days,180_days,180_above,total_payments,adjusted_credit_amount,net_receivables
school_year,student_id_pseudonimized,category_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016,CATF26JR,Back Account,9240.0,0.0,0.0,9240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9240.0,9240.0,9240.0,0.0
2016,LI5DFHZ5,Back Account,5650.0,0.0,0.0,5650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5650.0,5650.0,5650.0,0.0
2016,QCNXOF71,Back Account,3524.0,0.0,0.0,3524.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3524.0,3524.0,3524.0,0.0
2016,TE11Z2LJ,Back Account,5886.0,0.0,0.0,5886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5886.0,5886.0,5886.0,0.0
2016,UFN5RBCA,Back Account,9831.0,0.0,0.0,9831.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9831.0,9831.0,9831.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2026,O9S9VFIM,G04-Books,6260.0,-700.0,0.0,5560.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5560.0,5560.0
2026,XO7E9JLP,G09-OF,800.0,0.0,0.0,800.0,800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,800.0,0.0,0.0
2026,YMJ3L6X6,G08-B-2nd,5200.0,0.0,0.0,5200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5200.0,5200.0
2026,YMJ3L6X6,G08-OF-2nd,6267.0,0.0,0.0,6267.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6267.0,6267.0


In [16]:
import pandas as pd

# Columns / index levels that identify one aggregated credit-sale row.
key_cols = ["school_year", "student_id_pseudonimized", "category_name"]


def _key_index(frame):
    """Return the comparison keys of `frame` as a MultiIndex.

    Works whether the keys are ordinary columns or index levels, so the cell
    behaves the same before and after a frame has been flattened.
    """
    flat = frame if set(key_cols).issubset(frame.columns) else frame.reset_index()
    return pd.MultiIndex.from_frame(flat[key_cols])


# Merge the two DataFrames on the keys. This is an inner join, so only keys
# present in BOTH frames survive into `merged`.
merged = df_truth.merge(
    df_test_opti,
    on=key_cols,
    suffixes=("_truth", "_test")
)

# The inner join silently discards keys that exist in only one frame, so a
# missing row would never appear as a difference below. Report those keys
# explicitly before comparing values.
truth_keys = _key_index(df_truth)
test_keys = _key_index(df_test_opti)
only_in_truth = truth_keys.difference(test_keys)
only_in_test = test_keys.difference(truth_keys)
print(f"Keys only in truth: {len(only_in_truth)}")
for key in only_in_truth:
    print(f"    {key}")
print(f"Keys only in test : {len(only_in_test)}")
for key in only_in_test:
    print(f"    {key}")

# Find rows (among the keys shared by both frames) where net_receivables differ
diff_rows = merged[merged["net_receivables_truth"] != merged["net_receivables_test"]]

diff_rows[['net_receivables_truth', 'net_receivables_test']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,net_receivables_truth,net_receivables_test
school_year,student_id_pseudonimized,category_name,Unnamed: 3_level_1,Unnamed: 4_level_1


In [17]:
# Move the grouping keys from the index back into ordinary columns.
# The MultiIndex guard makes the cell safe to re-run: an unconditional reset
# would insert a stray "index" column on a second run and raise on a third.
# Rebinding (rather than inplace=True) avoids mutating a frame that earlier
# cells have already displayed.
if isinstance(df_truth.index, pd.MultiIndex):
    df_truth = df_truth.reset_index()