In [None]:


import os, csv, time, uuid
from datetime import date, datetime

#LOG_DIR = os.path.join(os.path.dirname(__file__),'.','logs')
LOG_DIR = 'logs'
if not os.path.exists(LOG_DIR):
    os.mkdir(LOG_DIR)

def update_train_log(
    country: str,
    date_range: str,
    metric: str,
    runtime: float,
    version: str,
    prefix: str,
    note: str,
    mode: str = 'prod',
    test: bool = False
) -> None:
    """
    Append a new entry to the training log file with details about the model training run.

    Parameters:
    - country (str): The country associated with the training data.
    - date_range (str): The date range of the training data.
    - metric (str): Performance metric of the model (RMSE & MAPE).
    - runtime (float): Runtime of the training process in seconds.
    - version (str): Model version identifier.
    - prefix (str): Log file prefix.
    - note (str): Additional notes on the training run.
    - mode (str): Mode of operation, e.g., 'production' or 'test'. **Default**: 'production'.
    - test (bool): Flag indicating if this is a test run. **Default**: False.

    Returns:
    None. Writes data to a log file.
    """

    # Create Log Directory if it doesn't exist
    if not os.path.isdir(LOG_DIR):
        os.mkdir(LOG_DIR)

    # Name the logfile and define its path
    today = date.today()
    train_logfile = os.path.join(LOG_DIR, f"{mode}-trained_on_{today.month}_{today.year}.log")

    # Define the header
    header = ['unique_id', 'timestamp', 'date_range', 'country', 'metric', 'model_version', 'runtime', 'mode', 'note']
    write_header = False

    # Write the header if needed
    if not os.path.exists(train_logfile):
        write_header = True

    # Get the current timestamp
    current_timestamp = datetime.fromtimestamp(time.time()).strftime("%H:%M:%S")

    # Generate a random UUID
    unique_id = uuid.uuid4()
    unique_id = str(unique_id)[:13]

    # Write to CSV
    with open(train_logfile, mode='a+', newline='') as csvfile: 
        writer = csv.writer(csvfile, delimiter=',')

        # Write the header if needed
        if write_header:
            writer.writerow(header)

        # Prepare the row for writing
        to_write = map(str, [unique_id, current_timestamp, date_range, country, metric, version, runtime, mode, note])

        # Write the row to the log file
        writer.writerow(to_write)

    print(f"\nLog entry saved to {train_logfile}")


def update_predict_log(country: str, target_date: str, y_pred: str, y_proba: list, 
                       runtime: str, version: str, mode: str) -> None:
    """
    Update the prediction log file with the prediction results.

    Parameters:
    - country (str): The target country for the prediction.
    - target_date (str): The date of the prediction.
    - y_pred (str): The predicted value.
    - y_proba (list): The predicted probabilities.
    - runtime (str): The runtime of the prediction.
    - version (str): The version of the model used.
    - mode (str): The mode of prediction.
    """
    
    # Name the logfile and define its path
    today = date.today()
    predict_logfile = os.path.join(LOG_DIR, f"{mode}-predicted_on_{today.month}_{today.year}.log")
    
    # Get the current timestamp
    current_timestamp = datetime.fromtimestamp(time.time()).strftime("%H:%M:%S")

    # Generate a random UUID
    unique_id = str(uuid.uuid4())[:13]
    
    # Define the header
    header = ['unique_id', 'timestamp', 'mode', 'country', 'target_date', 'y_pred', 'y_proba', 'model_version', 'runtime']
    
    # Write to CSV
    with open(predict_logfile, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        
        # Write the header if needed
        if os.path.getsize(predict_logfile) == 0:  # Check if the file is empty
            writer.writerow(header)
            
        to_write = map(str, [unique_id, current_timestamp, mode, country, target_date, y_pred, y_proba, version, runtime])
        writer.writerow(to_write)
        
        

In [1]:
## Import necessary libraries 
import os, re, time, joblib
from typing import Tuple
from datetime import date
from datetime import datetime
from collections import defaultdict

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error


from data_module import load_json_data, time_series_df
from cleaning_module import data_cleaning_pipeline
from logger import update_predict_log, update_train_log

import warnings
warnings.filterwarnings('ignore')

## model specific variables (iterate the version and note with each change)
NOTE      = "random forest model for time-series"
DATA_DIR = os.path.join(os.path.dirname(__file__), '.', 'data', 'cs-train')
#MODEL_DIR = os.path.join(os.path.dirname(__file__), '.', "models")
MODEL_DIR = "models"
version   = 'v4'
mode      = 'prod'

## Random Forest Regressor model specific variables 
prefix           = 'sl'
#country          = 'all_countries'
#model_scaler     = StandardScaler()
#model            = RandomForestRegressor( random_state = 42)
model_param_grid = { 'model__n_estimators' : [90, 100, 110, 120, 130] ,
                     'model__max_depth'    : [None, 5, 10, 15]        ,
                     'model__criterion'    : ['squared_error']        }


# start timer for runtime
loading_df_time_start = time.time()

## loading the dataframe as loaded_df
loaded_df_original = load_json_data(DATA_DIR)
print(f"\n... Data injested\n")

## Rows count
print(f'\nInjested data contains initialy {len(loaded_df_original):,.0f} entries\n')

# calculate runtime duration
m, s = divmod( time.time() - loading_df_time_start , 60)
h, m = divmod(m, 60)
loading_df_runtime = "%03d:%02d:%02d"%(h, m, s)
print(f'\nData injestion runtime : {loading_df_runtime}\n')

# start timer for runtime
cleaning_time_start = time.time()

# clean the dataframe
loaded_df = data_cleaning_pipeline(loaded_df_original)

# calculate runtime duration
m, s = divmod( time.time() - cleaning_time_start , 60)
h, m = divmod(m, 60)
cleaning_runtime = "%03d:%02d:%02d"%(h, m, s)
print(f'\nCleaning runtime : {cleaning_runtime}\n')


def engineer_features(df: pd.DataFrame, country: str, training: bool) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
    """
    Engineer features for each day to predict the sum of revenue for the next 30 days.

    Parameters:
    df (pd.DataFrame): Input DataFrame with 'date', 'revenue', 'purchases', and 'total_views'.
    country (str): Country to filter data.
    training (bool): If True, trims the last 30 days of data; otherwise, returns all data.

    Returns:
    Tuple[pd.DataFrame, pd.Series, pd.Series]: 
        - DataFrame with engineered features.
        - Target values (sum of next 30 days' revenue).
        - Dates corresponding to each feature row.
    """
    
    engineer_features_time_start = time.time()

    ts_df = time_series_df(df, country=country)
    ts_df = ts_df[['date', 'revenue', 'purchases', 'total_views']]
    ts_df['date'] = pd.to_datetime(ts_df['date'], errors='coerce')

    # Initialize dictionaries to store features and target values
    eng_features = defaultdict(list)
    y = []

    # Define the look-back periods (in days) for feature engineering
    previous_days = [7, 14, 28, 70]

    # Calculate rolling sums for revenue for each period and shift to align with target
    for num_days in previous_days:
        ts_df[f'revenue_{num_days}'] = ts_df['revenue'].rolling(window=num_days, min_periods=1).sum().shift(1)

    # Iterate over each row in the DataFrame
    for idx, row in ts_df.iterrows():
        current_date = row['date']

        # Append engineered features
        for num_days in previous_days:
            eng_features[f'previous_{num_days}'].append(row[f'revenue_{num_days}'])

        # Target: Sum revenue for the next 30 days
        target_sum = ts_df[(ts_df['date'] >= current_date) & (ts_df['date'] < current_date + pd.Timedelta(days=30))]['revenue'].sum()
        y.append(target_sum)
        
        # Previous year revenue for trend analysis
        prev_year_start = current_date - pd.DateOffset(years=1)
        prev_year_revenue = ts_df[(ts_df['date'] >= prev_year_start) & (ts_df['date'] < prev_year_start + pd.DateOffset(days=30))]['revenue'].sum()
        eng_features['previous_year'].append(prev_year_revenue)
        
        # Non-revenue features: Average invoices and views over the last 30 days
        recent_data = ts_df[(ts_df['date'] >= current_date - pd.Timedelta(days=30)) & (ts_df['date'] < current_date)]
        eng_features['recent_views'].append(recent_data['total_views'].mean() if not recent_data.empty else 0)
    
    # Convert the features dictionary to a DataFrame
    X = pd.DataFrame(eng_features)
    y = pd.Series(y, name='target')
    dates = ts_df['date']

    # Remove rows with all zeros (in cases where no data exists for look-back periods)
    X = X[(X != 0).any(axis=1)]
    y = y[X.index]
    dates = dates[X.index]

    # If training, exclude the last 30 days to ensure target reliability
    if training:
        X = X.iloc[:-30]
        y = y.iloc[:-30]
        dates = dates.iloc[:-30]

    # Reset index for neatness
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)
    dates.reset_index(drop=True, inplace=True)

    # Calculate runtime duration
    m, s = divmod(time.time() - engineer_features_time_start, 60)
    h, m = divmod(m, 60)
    engineer_features_runtime = "%02d:%02d:%02d" % (h, m, s)
    
    return X, y, dates


def perform_training(df: pd.DataFrame, country: str, prefix: str, version: str,
                    model, model_param_grid: dict, model_scaler,
                    training: bool, test: bool) -> str:
    """
    Trains a model on the given dataset, tunes hyperparameters, evaluates performance, and saves the model.

    Parameters:
    - df (DataFrame): Input data for training and testing.
    - country (str): Country for which the model is being trained.
    - prefix (str): Prefix for saved model file name.
    - version (str): Version identifier for the model.
    - model: Model to be trained (e.g., RandomForestRegressor).
    - model_param_grid (dict): Hyperparameters grid for tuning the model.
    - model_scaler: Scaler to normalize the data.
    - training (bool): Flag for training data usage.
    - test (bool): Flag for test mode; saves the model with "test" prefix if True. **Default**: False.

    Returns:
    - str: Name of the model used.
    """

    print("\n... Perform training on train set")

    # start timer for runtime
    perform_training_time_start = time.time()
    
    # prepare the data
    X, y, dates = engineer_features(df, country, training)
    
    # Execute this block only if model is RandomForestRegressor
    if isinstance(model, RandomForestRegressor):
        X = X.dropna()  
        y = y[X.index]
        dates = dates[X.index]
    
    # Create Test Subset of Data (if in Test Mode)
    if test:
        subset = X.sample(frac=0.30, replace=False, random_state=42).index
        dates = dates.loc[subset]
        X = X.loc[subset]
        y = y.loc[subset]

    # Define the date range for logging
    max_date = dates.iloc[-1].strftime('%Y-%m-%d')
    min_date = dates.iloc[0].strftime('%Y-%m-%d')
    date_range = f"{min_date}:{max_date}"
    
    # Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20, shuffle=True)

    # Create a pipeline with scaling and a random forest model
    pipe = Pipeline([('model_scaler', model_scaler), ('model', model)])
    
    # Tune the hyperparameter
    print("\n... Tuning the model hyperparameters ")
    grid = GridSearchCV(pipe, param_grid=model_param_grid, cv=5, n_jobs=2)

    # Fit the model on train set
    try:
        grid.fit(X_train, y_train)
    except Exception as e:
        print(f"Error during model training on train set: {e}")
        return None

    # Make predictions on test set 
    y_pred = grid.predict(X_test)
    ## Evaluate the model on test set
    # Calculate RMSE
    eval_rmse = round(mean_squared_error(y_test, y_pred)**0.5)
    # Calculate MAPE
    eval_mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    # Define the date range for logging
    eval_metrics = f"[RMSE={eval_rmse},MAPE={eval_mape:.1f}%]"

    # Retrain using all data
    print("\n... Retraining model on all data")
    try:
        grid.fit(X, y)
    except Exception as e:
        print(f"Error during model retraining on all data: {e}")
        return None

    # Best model
    fitted_model = grid.best_estimator_
    
    # make the model name more system compatible and file-friendly when saving the model
    model_name = fitted_model.named_steps['model'].__class__.__name__

    # Define the file path 
    if test:
        saved_model = os.path.join(MODEL_DIR, f"test-{country}-{model_name}-{version}.joblib")
    else:
        saved_model = os.path.join(MODEL_DIR, f"{prefix}-{country}-{model_name}-{version}.joblib")

    # Save the fitted model
    print(f"\n... Saving model version as {prefix} : {saved_model}\n")
    joblib.dump(fitted_model, saved_model)


    # Calculate runtime duration
    m, s = divmod(time.time() - perform_training_time_start, 60)
    h, m = divmod(m, 60)
    perform_training_runtime = "%02d:%02d:%02d" % (h, m, s)

    # update training log
    update_train_log(country, date_range, eval_metrics, perform_training_runtime, version, prefix, test=False, note=NOTE)
    
    return model_name


def model_train(df: pd.DataFrame, country: str, prefix: str, version: str,
                model, model_param_grid: dict, model_scaler, 
                training: bool = True, test: bool = False) -> str:
    """
    Train the model with the given data.

    Parameters:
    - df (DataFrame): The dataset for training.
    - country (str): Target country for the model.
    - prefix (str): Prefix for naming saved model files.
    - version (str): Version number for the model.
    - model: The machine learning model to be trained.
    - model_param_grid (dict): Hyperparameter grid for tuning.
    - model_scaler: Scaler for data normalization.
    - training (bool): Flag indicating training mode. **Default**: True.
    - test (bool): Flag for test mode. If True, saves model with a "test" prefix. **Default**: False.

    Returns:
    - str: Model name after training.
    """
    
    # Start timer for runtime
    model_train_time_start = time.time()

    # Create model directory 
    if not os.path.isdir(MODEL_DIR):
        os.makedirs(MODEL_DIR)
        print(f"Created model directory: {MODEL_DIR}")

    # Test Mode Notification
    if test:
        print("...... testing")
        print("...... subseting data")

    model_name = perform_training(df, country, prefix, version, model, model_param_grid, model_scaler, training, test)    
    print(f'\nModel "{prefix}-{country}-{model_name}-{version}" trained')



    # Calculate runtime duration
    m, s = divmod(time.time() - model_train_time_start, 60)
    h, m = divmod(m, 60)
    model_train_runtime = "%02d:%02d:%02d" % (h, m, s)
    print(f'\nModel training runtime : {model_train_runtime}\n')



def model_load(country: str, training: bool, df: pd.DataFrame, prefix: str = 'prod') -> tuple:
    """
    Loads trained models for a specified country and prepares features for predictions.
    
    Parameters:
    - country (str): The target country for which models are loaded.
    - training (bool): Flag indicating if the data is for training or evaluation.
    - df (DataFrame): The dataset from which features are engineered.
    - prefix (str): The prefix used to identify models.
    
    Returns:
    - dict: A dictionary of models keyed by model name.
    - dict: A dictionary containing features `X`, target `y`, and `dates`.
    """
    
    # Start timer for runtime
    model_load_time_start = time.time()

    # Model directory path
    model_dir = "./models"

    # Initialize return values
    data_dict = {}
    models_dict = {}

    # Retrieve all model filenames matching the specified prefix and country
    models = [filename for filename in os.listdir(model_dir) if prefix in filename and country in filename]

    # Check if any models were found
    if not models:
        print(f"Models starting with the prefix '{prefix}' for the '{country}' country cannot be found! Had you trained it?")
        return models_dict, data_dict  # Return empty dictionaries

    # Load models into a dictionary, keyed by model name without file extension
    models_dict = {model.replace('.joblib', ''): joblib.load(os.path.join(model_dir, model)) for model in models}

    # Engineer features and target variable from the provided DataFrame
    X, y, dates = engineer_features(df, country, training)

    # Convert dates to string format for consistency
    dates = pd.to_datetime(dates).dt.strftime('%Y-%m-%d').tolist()

    # Compile the data into a dictionary for easy access
    data_dict = {"X": X, "y": y, "dates": dates}

    # Calculate runtime duration
    m, s = divmod(time.time() - model_load_time_start, 60)
    h, m = divmod(m, 60)
    model_load_runtime = "%02d:%02d:%02d" % (h, m, s)
    print(f'Loading model runtime : {model_load_runtime}')

    return models_dict, data_dict



def nearest_date(items: list, pivot: date) -> date:
    """Find the nearest date to the given pivot date."""
    if not items:
        raise ValueError("ERROR: items list is empty.")
    return min(items, key=lambda x: abs(date.fromisoformat(x) - pivot))



def model_predict(df: pd.DataFrame, country: str, year: int, month: int, day: int, 
                  mode: str = 'prod', models_dict: dict = None, training: bool = False) -> dict:
    """
    Make predictions using the trained model for a specified country and date.

    Parameters:
    - df (DataFrame): The dataset for prediction.
    - country (str): The target country for the prediction.
    - year (int): The year of the target date.
    - month (int): The month of the target date.
    - day (int): The day of the target date.
    - mode (str): The mode for prediction (e.g., 'prod'). **Default**: 'prod'.
    - models_dict (dict): A dictionary of models keyed by model name. **Default**: None.
    - training (bool): Flag indicating if the data is for training or evaluation. **Default**: False.

    Returns:
    - dict: A dictionary containing predicted values and probabilities.
    """

    ## Start timer for runtime
    model_predict_time_start = time.time()
    
    ## Load model if needed
    print(f"\n... Loading models")    
    if models_dict is None:
        models_dict, data_dict = model_load(country, training, df, prefix='sl')

    # Input checks   
    if not any(f'{country}' in key for key in models_dict.keys()):
        raise Exception(f"\nERROR (model_predict) - any model for country '{country}' could not be found")

    # Finding the model with the latest version
    latest_model_key = max(models_dict.keys(), key=lambda k: int(k.split('-')[-1][1:]))
    latest_model = models_dict[latest_model_key]
    print(f"\nLatest Model used for {country}: {latest_model_key}")    

    # Validate Date Components
    for d in [year, month, day]:
        if re.search(r"\D", str(d)):  
            raise Exception("ERROR (model_predict) - invalid year, month, or day")
        
    ## Load data
    data = data_dict

    # Convert dates to Datetime Format
    dates = pd.to_datetime(data['dates'])

    # Check the target date
    target_date = f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"
    print(f"\n... Checking if {target_date} is in the range.")

    # Validate Target Date and Find Nearest Date if Out of Range
    if target_date not in dates.astype(str).values:  # Convert to string for comparison
        print(f"ERROR (model_predict) - date {target_date} not in range [ {data['dates'].iloc[0]} - {data['dates'].iloc[-1]} ]")
        target_date = nearest_date(data['dates'], date.fromisoformat(target_date))
        print(f"Nearest target date is {target_date}")
    else:
        print("Target date is in the range.")

    # Get the index of the target_date
    target_date_indx = dates.get_loc(target_date)  # Correct way to get the index

    # Query the corresponding row
    query = data['X'].iloc[[target_date_indx]]  

    ## Make prediction
    y_pred = latest_model.predict(query)

    ## Add a probability to the prediction 
    y_proba = None
    if 'predict_proba' in dir(latest_model) and getattr(latest_model, 'probability', False):
        y_proba = latest_model.predict_proba(query)

    # Calculate runtime duration
    m, s = divmod(time.time() - model_predict_time_start, 60)
    h, m = divmod(m, 60)
    model_predict_runtime = "%02d:%02d:%02d" % (h, m, s)
    print(f'\nPredicting runtime: {model_predict_runtime}\n')

    # Update predict log
    update_predict_log(country, target_date, f'{y_pred[0]:,.0f}', y_proba, model_predict_runtime, latest_model_key, mode)

    print(f'The expected revenue over the next 30 days is {y_pred[0]:,.0f} £')
    
    
    
if __name__ == "__main__":
    
    from argparse import ArgumentParser
    ap = ArgumentParser()
    
    ap.add_argument('-t', '--training', choices=['dev', 'prod'], default='dev',
                    help='Train either a development or production model. Omitting this implies loading an already-trained model')
    
    ap.add_argument('-m', '--model', choices=['rf', 'et'], default='rf',
                    help='(rf) RandomForestRegressor or (et) ExtraTreesRegressor (default)')
    
    ap.add_argument('-s', '--scaler', choices=['ss', 'rs'], default='ss',
                    help='(ss) StandardScaler or (rs) RobustScaler (default)')
    
    ap.add_argument('-c', '--country', default='all_countries',
                    help="The country to predict revenue for (default: 'all_countries')")
    
    args = ap.parse_args()
    
    train  = args.training
    model  = RandomForestRegressor(random_state = 42) if args.model == 'rf' else ExtraTreesRegressor(random_state = 42)
    scaler = StandardScaler() if args.scaler == 'ss' else RobustScaler()
    
    if train == 'dev':
        # train the model - Development
        print("TRAINING MODELS - DEVELOPMENT")
        model_train(loaded_df, country, prefix, version, model, model_param_grid, model_scaler, training= True, test=True)
    elif train == 'prod':
        # train the model - Production
        print("TRAINING MODELS - PRODUCTION")
        model_train(loaded_df, country, prefix, version, model, model_param_grid, model_scaler, training= True, test=False)
    else:
        # load the model
        print("LOADING MODELS")
        all_data, all_models = model_load(training=False, country=country)
        model_load(country, df, training=False, prefix='prod') 
        print("... models loaded: ", ",".join(all_models.keys()))

    # Predicting for:
    year = '2019'
    month = '06'
    day = '05'
    
    result = model_predict(loaded_df, country, year, month, day, mode='prod', models_dict= None, training=False)
    

NameError: name '__file__' is not defined