# Set Environment Variables and Enable Mixed Precision

In [None]:
import os
# TensorFlow-related environment variables, set before TensorFlow is imported
# further down in the notebook.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
    # https://github.com/tensorflow/tensorflow/issues/53519
    'TF_DEVICE_MIN_SYS_MEMORY_IN_MB': '256',
})

In [None]:
from keras import mixed_precision

# Make 'mixed_float16' the global Keras dtype policy for layers created after this point.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Import Necessary Dependencies

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
import keras
import  tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

# Process Dataset for Problem 3

In [None]:
# Raw inputs: the box catalogue, the purchase history and the problem file.
boxes = pd.read_csv('./datasets/boxes.csv')
purchase_df = pd.read_csv("./datasets/purchase.csv")  # Train SET
problem_df = pd.read_csv("./datasets/problem 3.csv")  # Test SET

# Sorted unique MAGIC_KEY values found in each file
train_keys, test_keys = (
    np.unique(frame["MAGIC_KEY"].to_list()) for frame in (purchase_df, problem_df)
)

# Debugging
print(f'Unique keys in train set: {len(train_keys)}')
print(f'Unique keys in test set: {len(test_keys)}')

In [None]:
# Keep only the purchase rows whose MAGIC_KEY also appears in the test set.
# .copy() makes this an independent frame, so the in-place datetime parsing
# applied to it later does not trigger pandas' SettingWithCopyWarning.
# NOTE(review): this selection happens before the BOX_COUNT > 0 cleaning in the
# next cell, so these rows are not cleaned — confirm that is intended.
temp_df = purchase_df[purchase_df['MAGIC_KEY'].isin(test_keys)].copy()

In [None]:
# Data cleaning: keep only rows with a positive BOX_COUNT.
# .copy() detaches the filtered frame from the original so that later column
# assignments on it do not trigger pandas' SettingWithCopyWarning.
purchase_df = purchase_df[purchase_df['BOX_COUNT'] > 0].copy()

In [None]:
# Parse  datetime column and set it as index
def parse_datetime(df):
    """Return a copy of ``df`` indexed by the parsed PURCHASE_DATE column.

    PURCHASE_DATE is parsed with the day/month/year format ``%d/%m/%Y`` and
    becomes the index of the returned frame. The input frame is left untouched.

    If ``df`` no longer has a PURCHASE_DATE column but is already indexed by
    PURCHASE_DATE (for example because this cell was run twice), it is
    returned unchanged instead of raising ``KeyError``.

    Args:
        df: DataFrame with a PURCHASE_DATE column of ``dd/mm/YYYY`` strings.

    Returns:
        DataFrame indexed by PURCHASE_DATE (datetime64).
    """
    if 'PURCHASE_DATE' not in df.columns and df.index.name == 'PURCHASE_DATE':
        # Already converted by an earlier call; nothing left to do.
        return df

    parsed = df.copy()
    parsed['PURCHASE_DATE'] = pd.to_datetime(parsed['PURCHASE_DATE'], format="%d/%m/%Y")
    return parsed.set_index('PURCHASE_DATE')
# Index both the test-key subset and the cleaned purchase frame by purchase date,
# so the date-range queries used later can filter on PURCHASE_DATE.
temp_df = parse_datetime(temp_df)
purchase_df = parse_datetime(purchase_df)

In [None]:
# Create Groups of timescale dataset
def slice_dataframe_by_dates(df):
    """Split ``df`` into ten consecutive half-month slices.

    The slices cover 2018-10-01 through 2019-02-28. Each month is cut into
    days 1-15 and day 16 to month end, with both bounds inclusive. ``df`` must
    expose PURCHASE_DATE to ``DataFrame.query`` (as its index or a column).

    Returns:
        A list of ten DataFrames in chronological order.
    """
    period_bounds = [
        ('2018-10-01', '2018-10-15'),
        ('2018-10-16', '2018-10-31'),
        ('2018-11-01', '2018-11-15'),
        ('2018-11-16', '2018-11-30'),
        ('2018-12-01', '2018-12-15'),
        ('2018-12-16', '2018-12-31'),
        ('2019-01-01', '2019-01-15'),
        ('2019-01-16', '2019-01-31'),
        ('2019-02-01', '2019-02-15'),
        ('2019-02-16', '2019-02-28'),
    ]
    return [
        df.query(f"PURCHASE_DATE >= '{first_day}' and PURCHASE_DATE <= '{last_day}'")
        for first_day, last_day in period_bounds
    ]

# Cut both frames into the ten half-month slices:
# one list for the test-key subset, one for all cleaned purchases.
test_purchase_groups = slice_dataframe_by_dates(temp_df)
all_purchase_groups = slice_dataframe_by_dates(purchase_df)


In [None]:
# Create a dataset from groups based on unique keys
def create_dataset_from_groups(unique_keys_list, days_groups, boxes_df=None):
    """Build one row per MAGIC_KEY with a purchase flag and a meat total per period.

    For the period at position ``i`` in ``days_groups`` two columns are added:

    * ``col_i``: uint8 flag, 1 when the key has at least one row in that
      period, else 0.
    * ``col_i_meat``: sum over the key's rows in that period of
      ``BOX_COUNT * <value at column position 4 of the boxes table>``;
      0 when the key has no rows in the period.

    The totals are computed with one groupby per period instead of filtering
    the period frame once per key. Results match up to floating-point
    summation order.

    Args:
        unique_keys_list: MAGIC_KEY values; the output has one row per entry.
        days_groups: iterable of DataFrames, each with MAGIC_KEY, BOX_ID and
            BOX_COUNT columns.
        boxes_df: box catalogue to look values up in. Defaults to the
            notebook-level ``boxes`` frame.

    Returns:
        DataFrame with columns MAGIC_KEY, col_0, col_0_meat, col_1, ...
    """
    if boxes_df is None:
        boxes_df = boxes

    # NOTE(review): as in the original lookup, BOX_ID is treated as the 1-based
    # row position in the boxes table and column position 4 is taken to be the
    # meat amount per box — confirm against boxes.csv.
    meat_per_box = boxes_df.iloc[:, 4].to_numpy()

    new_dataset = pd.DataFrame({'MAGIC_KEY': unique_keys_list})
    keys = new_dataset['MAGIC_KEY']

    for group_idx, group in enumerate(days_groups):
        # 1 when the key bought anything in this period, else 0
        new_dataset[f'col_{group_idx}'] = keys.isin(group['MAGIC_KEY']).astype(np.uint8)

        if len(group) == 0:
            # Nobody purchased in this period.
            new_dataset[f'col_{group_idx}_meat'] = 0.0
            continue

        # Per-row meat = number of boxes * meat content of that box type
        box_positions = group['BOX_ID'].to_numpy().astype(int) - 1
        row_meat = group['BOX_COUNT'].to_numpy() * meat_per_box[box_positions]

        # Total per key, then aligned to the requested key order (absent keys -> 0)
        meat_by_key = pd.Series(row_meat).groupby(group['MAGIC_KEY'].to_numpy()).sum()
        new_dataset[f'col_{group_idx}_meat'] = (
            meat_by_key.reindex(keys.to_numpy(), fill_value=0.0).to_numpy()
        )

    return new_dataset

# test_purchase_groups, temp_df, test_keys and all_purchase_groups come from the cells above.
# NOTE(review): the two aliases below are not referenced anywhere else in the visible notebook.
days_groups = test_purchase_groups
dataset = temp_df

# Create test and train datasets: one row per MAGIC_KEY, one flag/meat column pair per period
test_df = create_dataset_from_groups(test_keys, test_purchase_groups)
train_df = create_dataset_from_groups(train_keys, all_purchase_groups)

In [None]:
# Renaming for our convenience 
def process_dataset(df):
    """Drop the per-period purchase flags and give the meat columns ordinal names.

    Removes ``col_0`` .. ``col_9`` and renames ``col_0_meat`` .. ``col_9_meat``
    to ``'1st'`` .. ``'10th'``. Returns a new frame; ``df`` itself is not modified.
    """
    ordinal_labels = ('1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th')

    flag_columns = [f'col_{pos}' for pos in range(len(ordinal_labels))]
    meat_renames = {f'col_{pos}_meat': label for pos, label in enumerate(ordinal_labels)}

    return df.drop(flag_columns, axis=1).rename(meat_renames, axis=1)


# Reduce both frames to MAGIC_KEY plus the ten ordinal meat columns ('1st' .. '10th').
test_df = process_dataset(test_df)
train_df = process_dataset(train_df)

In [46]:
# Save the processed datasets. The output folder is created first, because
# to_csv raises OSError when the target directory does not exist.
os.makedirs('data', exist_ok=True)
test_df.to_csv('data/problem-3-test.csv', index=False)
train_df.to_csv('data/problem-3-train.csv', index=False)

# Load Preprocessed dataset and Train Models

In [47]:

# Read the processed frames back from disk: phase 1 uses the train file, phase 2 the test file.
df_phase1 = pd.read_csv("./data/problem-3-train.csv")
df_phase2 = pd.read_csv("./data/problem-3-test.csv")

In [48]:
# Inspect the phase-1 frame: MAGIC_KEY followed by the ten period columns.
df_phase1

Unnamed: 0,MAGIC_KEY,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th
0,249670911D8,0.0,0.0,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,249751FC4DD,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24978027606,0.0,2.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,24979164422,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0
4,2497B8B4FDA,1.8,0.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
95,24AB707A0D3,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,24AB77B25A6,0.0,0.0,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,24AB7C0779C,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,24AB85041D3,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
def get_model(checkpoint_path=None):
    """Build the stacked-LSTM regressor and optionally load saved weights.

    The network takes sequences of shape ``(8, 1)``, passes them through three
    LSTM layers (512, 128, 64 units) and ends in a single linear unit whose
    dtype is pinned to float32.

    Args:
        checkpoint_path: when not None, weights are loaded from this path
            before the model is returned.

    Returns:
        The (uncompiled) Keras model; its summary is printed as a side effect.
    """
    stack = [
        LSTM(512, return_sequences=True, input_shape=(8, 1)),
        LSTM(units=128, return_sequences=True),
        LSTM(units=64),
        # Output layer
        Dense(1, activation='linear', dtype=tf.float32),
    ]
    model = Sequential(stack)

    if checkpoint_path is not None:
        print('Loading Weights')
        model.load_weights(checkpoint_path)

    model.summary()
    return model

def compile_and_fit_model(model, train_data, learning_rate, save_path='model_2.keras'):
    """Compile ``model`` for MSE regression and fit it with checkpointing.

    Training runs for up to 1000 epochs with a batch size of 8192 and holds
    out 10% of the data for validation. The best model is saved to
    ``save_path``, the learning rate is reduced by 10x after 20 epochs without
    improvement, and training stops early after 50 such epochs.

    Args:
        model: Keras model to train.
        train_data: ``(trainX, trainY)`` tuple.
        learning_rate: Adam learning rate.
        save_path: checkpoint path; ``None`` falls back to ``'model_2.keras'``
            so ModelCheckpoint never receives ``None``.

    Returns:
        The ``History`` object returned by ``model.fit`` (previously computed
        but discarded).
    """
    if save_path is None:
        save_path = 'model_2.keras'

    trainX, trainY = train_data

    model.compile(
        loss='mse',
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=[keras.metrics.R2Score()],
    )
    history = model.fit(
        trainX, trainY,
        epochs=1000,
        batch_size=4096*2,
        validation_split=0.1,
        callbacks=[
            keras.callbacks.ModelCheckpoint(save_path, save_best_only=True, save_weights_only=False, verbose=1),
            keras.callbacks.ReduceLROnPlateau(patience=20, factor=0.1),
            keras.callbacks.EarlyStopping(patience=50)
        ])
    return history


In [None]:
# The frame has MAGIC_KEY plus ten period columns, so the previous positional
# slices (1:17 and 17) ran out of bounds. Use the last period as the target and
# the eight periods before it as inputs, matching the model's (8, 1) input shape.
X = df_phase1.iloc[:, -9:-1].to_numpy().astype(np.float32)
Y = df_phase1.iloc[:, -1].to_numpy().astype(np.float32)

In [None]:
def train_phase(df, pretrained_checkpoint=None, save_path=None):
    """Train (or fine-tune) the LSTM on one processed frame.

    The last column of ``df`` is the regression target and the eight columns
    before it are the input sequence. The previous fixed positions (1:17 and
    17) were out of bounds for the 11-column frames produced above and did not
    match the model's 8-step input.

    10% of the rows are held out with a fixed seed. The model is fitted on the
    rest and then scored on the held-out rows, which used to be split off but
    never used.

    Args:
        df: processed frame (MAGIC_KEY followed by the period columns).
        pretrained_checkpoint: optional weights to start from.
        save_path: where the best checkpoint is written; when None the default
            of ``compile_and_fit_model`` is used.
    """
    window = 8  # must match the (8, 1) input_shape hard-coded in get_model

    X = df.iloc[:, -(window + 1):-1].to_numpy().astype(np.float32)
    Y = df.iloc[:, -1].to_numpy().astype(np.float32)

    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.1, random_state=42)

    # LSTM layers expect (samples, timesteps, features)
    trainX = np.expand_dims(trainX, axis=-1)
    testX = np.expand_dims(testX, axis=-1)

    model = get_model(checkpoint_path=pretrained_checkpoint)

    # Only forward save_path when one was given, so None never reaches ModelCheckpoint.
    fit_kwargs = {} if save_path is None else {'save_path': save_path}
    compile_and_fit_model(model, (trainX, trainY), 1e-3, **fit_kwargs)

    # Score the held-out rows. Note this uses the weights at the end of
    # training, not necessarily the best checkpoint written to disk.
    held_out = model.evaluate(testX, testY, verbose=0, return_dict=True)
    print(f'Held-out metrics: {held_out}')


In [None]:
# Phase 1: train from scratch (no pretrained checkpoint) on the training frame,
# checkpointing the best model to 'model.keras'.
train_phase(df_phase1, None, save_path='model.keras')

In [None]:
# Phase 2: fine-tune — start from the weights in 'model.keras', train on the
# phase-2 frame and checkpoint the best model to 'model_finetune.keras'.
train_phase(df_phase2, 'model.keras', 'model_finetune.keras')

In [51]:
# Preview the eight most recent period columns ('3rd' .. '10th') of the phase-2 frame,
# i.e. the slice create_submission feeds to the model.
df_phase2.iloc[:, 3:]

Unnamed: 0,3rd,4th,5th,6th,7th,8th,9th,10th
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.2,0.0,0.0,2.2,0.0,0.0,0.0,0.0
4,0.0,2.2,0.0,0.0,0.0,0.0,2.2,0.0
...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# predict from model and create a submission
def create_submission(checkpoint_path, out_csv_filename='submission.csv'):
    """Predict from the trained model and write the submission CSV.

    The model is rebuilt with the weights at ``checkpoint_path`` and fed the
    eight most recent period columns of the notebook-level ``df_phase2``
    frame. The input is prepared the same way as for training (float32 array
    of shape ``(rows, 8, 1)``) rather than passed as a raw 2-D DataFrame.

    Args:
        checkpoint_path: weights to load after training/fine-tuning is done.
        out_csv_filename: destination CSV with columns MEAT and MAGIC_KEY.
    """
    # Load pretrained weights after training and fine-tuning is done
    model = get_model(checkpoint_path)

    # Last 8 columns == the model's (8, 1) input window
    X = df_phase2.iloc[:, -8:].to_numpy().astype(np.float32)
    X = np.expand_dims(X, axis=-1)

    preds = model.predict(X).reshape(-1,)

    submission = pd.DataFrame()
    submission['MEAT'] = preds
    submission["MAGIC_KEY"] = df_phase2['MAGIC_KEY'].to_numpy()
    submission.to_csv(out_csv_filename, index=False)

In [None]:
# Write submission.csv using the fine-tuned checkpoint.
create_submission('model_finetune.keras', 'submission.csv')