In [1]:
import pandas as pd
import numpy as np
import os
import pyfiglet

from colorama import Fore, Style

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier, XGBRegressor

from tqdm import tqdm


In [2]:
import Flexivan_Prediction_Package

In [4]:
# Get filenames from folder
DATA_Folder = './Daily prediction/DATA'

FILENAMES = [f for f in os.listdir(DATA_Folder) if os.path.isfile(os.path.join(DATA_Folder, f))]
selected = "Latest_Test_"
selected2 = '_DETAILED'
FILENAMES = [f for f in FILENAMES if selected in f and selected2 not in f]
DATES = Flexivan_Prediction_Package.extract_datetimes_from_filenames(FILENAMES)
Filenames_DF = pd.DataFrame({
    "Filenames": FILENAMES,
    "Dates": DATES
})
Filenames_DF_Sorted = Filenames_DF.sort_values(by='Dates')
FILENAMES = list(Filenames_DF_Sorted['Filenames'])
DATES = list(Filenames_DF_Sorted['Dates'])

print(f'{len(FILENAMES)} Filenames were found')
print(FILENAMES)


250 Filenames were found
['Latest_Test_2025-02-16.csv', 'Latest_Test_2025-02-23.csv', 'Latest_Test_2025-02-25.csv', 'Latest_Test_2025-03-02.csv', 'Latest_Test_2025-03-03.csv', 'Latest_Test_2025-03-04.csv', 'Latest_Test_2025-03-05.csv', 'Latest_Test_2025-03-06.csv', 'Latest_Test_2025-03-07.csv', 'Latest_Test_2025-03-08.csv', 'Latest_Test_2025-03-09.csv', 'Latest_Test_2025-03-10.csv', 'Latest_Test_2025-03-11.csv', 'Latest_Test_2025-03-12.csv', 'Latest_Test_2025-03-13.csv', 'Latest_Test_2025-03-14.csv', 'Latest_Test_2025-03-15.csv', 'Latest_Test_2025-03-16.csv', 'Latest_Test_2025-03-17.csv', 'Latest_Test_2025-03-18.csv', 'Latest_Test_2025-03-19.csv', 'Latest_Test_2025-03-20.csv', 'Latest_Test_2025-03-21.csv', 'Latest_Test_2025-03-22.csv', 'Latest_Test_2025-03-23.csv', 'Latest_Test_2025-03-24.csv', 'Latest_Test_2025-03-25.csv', 'Latest_Test_2025-03-26.csv', 'Latest_Test_2025-03-27.csv', 'Latest_Test_2025-03-28.csv', 'Latest_Test_2025-03-29.csv', 'Latest_Test_2025-03-30.csv', 'Latest_Test_2

In [None]:
LOT_Transition_Matrix = None
lot_index = None
Test_Ratio = .2
LOT_MODELS = {}                         # Model per LOT
Accuracy_THR_4_Retraining = .8
random_state = 42

Sorting_Field='CHS Pickup Date'
Columns_2_Drop_From_Training = ['CHS ID', 'CTR Trip Id', 'CHS Return Dt', 'CHS Return LOC', 'CHS Pickup Date', 'CTR pick Dt', 'CTR Return Dt']
Enumerated_Columns_LIST = ['CHS Pickup Loc', 'CHS Return Loc', 'CHS pickup MCO', 'CTR Trip MCO', 'O Customer', 'Customer', 'DC Loc', 'CTR Pickup Term', 'CTR Return Term', 
                           'pgkey', 'CTR Trip Loc Type Pattern', 'CTR Trip Pattern']

print(pyfiglet.figlet_format("Building  models..."))

for filename in tqdm(FILENAMES):
    print(f'Reading file:\t' + Fore.YELLOW + f'{filename}' + Fore.RESET + '...', end='')
    File_Analysis_Results_OBJ = Flexivan_Prediction_Package.File_Analysis_Reults(f'{DATA_Folder}/{filename}', Sorting_Field, Columns_2_Drop_From_Training, Enumerated_Columns_LIST)
    
    percentage = int(100*len(File_Analysis_Results_OBJ.DATA)/len(File_Analysis_Results_OBJ.DATA_ORIG))
    print('Samples num:\t' + Fore.YELLOW + f'{Flexivan_Prediction_Package.Comma_Separation_Num_String(len(File_Analysis_Results_OBJ.DATA))}' + Fore.RESET + ' (' + Fore.YELLOW + f' ({percentage}' + Fore.RESET + '% out of original data after cleaning)')

    # Supplumenting (building if necessary) the models
    PU_LOTs_Unique = list(File_Analysis_Results_OBJ.DATA_ORIG['CHS Pickup Loc'].unique())

    for pu_lot in PU_LOTs_Unique:
        DATA = File_Analysis_Results_OBJ.DATA[File_Analysis_Results_OBJ.DATA['CHS Pickup Loc']==pu_lot]
        PU_LOT_COL = np.array(DATA['CHS Pickup Loc'])
        DATA.drop(columns=['CHS Pickup Loc'], inplace=True)
        
        try:
            MODEL = LOT_MODELS[pu_lot]
            print(Fore.GREEN + f'\tModel exists - ' + Fore.RESET, end='')

            # Model exists for that LOT, classify the whole file
            print(f'making predictions for {Flexivan_Prediction_Package.Comma_Separation_Num_String(len(PU_LOT_COL))} samples...', end='')
            DATA = Flexivan_Prediction_Package.align_df_to_model(DATA, MODEL, fill_value=0)
            y_pred = np.array(MODEL.predict(DATA))
            
            # Check results accuracy
            DIFF = np.array(y_pred - PU_LOT_COL)
            Indexes = DIFF[DIFF==0]
            Accuracy = len(Indexes) / len(PU_LOT_COL)
            if Accuracy<Accuracy_THR_4_Retraining:
                print(f'\n\tAccuracy = ' + Fore.RED + f'{Accuracy}' + Fore.RESET)
             
                # Retrain if necessary
                print('\tRe-Training...', end='')
                xgb_params = {'n_estimators': 100, 'random_state': random_state, 'verbosity': 0, 'num_class': len(set(PU_LOTs_Unique))}
                model = XGBClassifier(**xgb_params)

                model.fit(DATA, PU_LOT_COL)
                LOT_MODELS[pu_lot] = model
                print(Fore.GREEN + 'DONE.' + Fore.RESET)
            else:
                print(f'\n\tAccuracy = ' + Fore.GREEN + f'{accuracy}' + Fore.RESET)

        except:
            # Model for LOT does not exist, train from current file (whole)

            print(f'\t{pu_lot} ' + Fore.RED + '\tdoes not ' + Fore.RESET + 'have a LOT prediction model -->> TRAINING...', end='')
            xgb_params = {'n_estimators': 100, 'random_state': random_state, 'verbosity': 0, 'num_class': len(set(PU_LOT_COL))}
            model = XGBClassifier(**xgb_params)

            model.fit(DATA, PU_LOT_COL)
            LOT_MODELS[pu_lot] = model
            print(Fore.GREEN + 'DONE.' + Fore.RESET)

    print('\n\n')



 ____        _ _     _ _                                    _      _           
| __ ) _   _(_) | __| (_)_ __   __ _    _ __ ___   ___   __| | ___| |___       
|  _ \| | | | | |/ _` | | '_ \ / _` |  | '_ ` _ \ / _ \ / _` |/ _ \ / __|      
| |_) | |_| | | | (_| | | | | | (_| |  | | | | | | (_) | (_| |  __/ \__ \_ _ _ 
|____/ \__,_|_|_|\__,_|_|_| |_|\__, |  |_| |_| |_|\___/ \__,_|\___|_|___(_|_|_)
                               |___/                                           



  0%|          | 0/250 [00:00<?, ?it/s]

Reading file:	[33mLatest_Test_2025-02-16.csv[39m...DONE.
Samples num:	[33m14,112[39m ([33m (96[39m% out of original data after cleaning)
	LAXAIM [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	LAXCSN [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	CHIIIC [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	WBCT [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	LAXICE [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	SAVCMP [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	OAKSTE [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	TACTHG [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m





  0%|          | 1/250 [01:02<4:21:02, 62.90s/it]

Reading file:	[33mLatest_Test_2025-02-23.csv[39m...DONE.
Samples num:	[33m124,988[39m ([33m (94[39m% out of original data after cleaning)
[32m	Model exists - [39m	LAXAIM [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
[32m	Model exists - [39m	LAXCSN [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
[32m	Model exists - [39m	OAKSTE [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
[32m	Model exists - [39m	TACTHG [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
	LAXBAC [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
[32m	Model exists - [39m	CHIIIC [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
[32m	Model exists - [39m	WBCT [31m	does not [39mhave a LOT prediction model -->> TRAINING...[32mDONE.[39m
[32m	Model exists - [39m	LAXICE [31m	does not [39mhave a LOT prediction model -->>