In [77]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model

In [78]:
# Sequence configuration: each sample is a window of `n_steps` weekly values
# with a single feature per step.
n_steps = 8
num_steps = n_steps
num_features = 1

# Fraction of merged_data (taken from the top of the file) kept for prediction.
size = 0.1


In [79]:
# Read the input tables.
#   merged_data       - main merged dataset
#   catalog           - product catalog, minus the unused 'categories' column
#   customer_features - customer feature table, minus two redundant columns
#   test_data         - customer/product pairs that need a prediction
merged_data = pd.read_csv('../processed_data/merged_data.csv')
catalog = pd.read_csv('../data/product_catalog.csv').drop(columns=["categories"])
customer_features = pd.read_csv('../processed_data/customer_features.csv').drop(
    columns=["frequent_product", "parent_category_id"]
)
test_data = pd.read_csv('../data/test.csv')

# Row count of the full merged table, before truncation
print("len merge:", len(merged_data))

# Keep only the leading `size` fraction of the rows (positional slice)
size10 = int(size * len(merged_data))
merged_data = merged_data.iloc[:size10]

# Inner join on the (customer_id, product_id) pair: only rows of the truncated
# table that also appear in the test set survive.
pair_keys = ["customer_id", "product_id"]
selected_data = merged_data.merge(test_data[pair_keys], on=pair_keys, how="inner")

# Row count of the truncated merged_data (the filtered rows live in selected_data)
print("len merge:", len(merged_data))


len merge: 896426
len merge: 89642


In [80]:
# Enrich every selected row with the product attributes from the catalog
# (default inner join keyed on 'product_id').
# NOTE: this rebinds selected_data, so re-running this cell on its own merges
# the catalog columns in again; re-run the loading cell first.
selected_data = selected_data.merge(catalog, on='product_id')

In [81]:
# Restore the persisted scaler (presumably the MinMaxScaler fitted during training).
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
SCALER_PATH = "../scalers/scaler.pkl"
scaler = joblib.load(SCALER_PATH)
print("Scaler loaded successfully!")

# Redefine the function used in the Lambda layer
def extract_column(column_index):
    """
    Build a callable that slices a single column out of its input.

    Parameters:
        column_index (int): Position along the second axis to select.

    Returns:
        Function: A lambda mapping ``x`` to ``x[:, column_index]``.
    """
    # Kept as a lambda so the returned callable is the same kind of object as before.
    column_selector = lambda x: x[:, column_index]
    return column_selector

# Map the name expected by the saved model onto the factory defined in this notebook.
custom_objects = dict(extract_column=extract_column)

# NOTE(review): safe_mode=False allows unsafe Lambda deserialisation, which can
# execute arbitrary code. Only load model files from a trusted source.
model = load_model(
    "../models/best_multiclass_model1.keras",
    custom_objects=custom_objects,
    safe_mode=False,
)
print("Model loaded successfully!")

# Encoders for customer and product IDs, persisted with joblib.
# These are pickles too, so the same trust caveat applies.
customer_encoder = joblib.load("../encoders/customer_encoder.pkl")
product_encoder = joblib.load("../encoders/product_encoder.pkl")

print("Encoders loaded successfully!")


Scaler loaded successfully!
Model loaded successfully!
Encoders loaded successfully!


In [82]:
def prepare_prediction_inputs(data, catalog, customer_catalog, n_steps,
                              customer_id_encoder=None, product_id_encoder=None):
    """
    Prepares input data for predictions by extracting and encoding relevant features.

    The caller's ``data`` frame is NOT modified: IDs are encoded into separate
    arrays, so the function can safely be called again on the same frame
    (previously the ID columns were overwritten with their encoded values,
    which made a second call re-encode already-encoded IDs).

    Parameters:
        data (DataFrame): Input data containing customer and product IDs with weekly sales.
            Weekly columns are recognised by a '/' in the column name.
        catalog (DataFrame): Product catalog with product-related features (needs 'product_id').
        customer_catalog (DataFrame): Customer catalog with customer-related features
            (needs 'customer_id').
        n_steps (int): Number of historical weeks to include in the input.
        customer_id_encoder (optional): Object with a ``transform`` method used to encode
            customer IDs. Defaults to the notebook-level ``customer_encoder``.
        product_id_encoder (optional): Object with a ``transform`` method used to encode
            product IDs. Defaults to the notebook-level ``product_encoder``.

    Returns:
        Tuple: Arrays for time series input (one ``(n_steps, 1)`` window per row),
        customer rows (encoded ID followed by the customer's features) and product
        rows (encoded ID followed by the product's features). All three are empty
        when ``data`` has fewer than ``n_steps`` weekly columns.
    """
    # Fall back to the encoders loaded at notebook level when none are passed in.
    if customer_id_encoder is None:
        customer_id_encoder = customer_encoder
    if product_id_encoder is None:
        product_id_encoder = product_encoder

    # Create dictionaries for customer and product features, keyed by ID
    customer_features_dict = customer_catalog.set_index('customer_id').to_dict(orient='index')
    product_features_dict = catalog.set_index('product_id').to_dict(orient='index')

    # Select columns representing weekly data
    week_columns = [col for col in data.columns if '/' in col]

    # Encode customer and product IDs into standalone arrays (input frame stays as-is)
    encoded_customer_ids = np.asarray(customer_id_encoder.transform(data["customer_id"]))
    encoded_product_ids = np.asarray(product_id_encoder.transform(data["product_id"]))

    # One 2-D extraction instead of a label lookup per row; keep the last n_steps weeks
    week_values = data[week_columns].to_numpy()[:, -n_steps:]

    X_pred, customers_pred, products_pred = [], [], []

    # Only include rows with sufficient data; every row has the same number of
    # weekly columns, so the check is done once for the whole frame.
    if week_values.shape[1] == n_steps:
        for weeks, customer_id, product_id in zip(
            week_values, encoded_customer_ids, encoded_product_ids
        ):
            # NOTE: an ID missing from a catalog yields an empty feature set, which
            # makes that row shorter than the others in the returned array.
            customer_row = customer_features_dict.get(customer_id, {})
            product_row = product_features_dict.get(product_id, {})

            X_pred.append(weeks.reshape(-1, 1))  # Column vector per window for LSTM input
            customers_pred.append([customer_id] + list(customer_row.values()))
            products_pred.append([product_id] + list(product_row.values()))

    # Return prepared inputs as numpy arrays
    return np.array(X_pred), np.array(customers_pred), np.array(products_pred)


In [83]:
def scale_and_reshape(scaler, X, num_steps, num_features):
    """
    Normalise sequence data with a fitted scaler and restore its 3-D layout.

    The scaler is applied to a 2-D ``(-1, num_features)`` view of the input,
    and the result is reshaped to ``(-1, num_steps, num_features)``.

    Parameters:
        scaler: Fitted scaler exposing ``transform`` (e.g. a MinMaxScaler).
        X (numpy.ndarray): Sequence data to scale.
        num_steps (int): Sequence length of each sample.
        num_features (int): Number of features per time step.

    Returns:
        numpy.ndarray: Scaled data of shape ``(-1, num_steps, num_features)``.
    """
    flattened = X.reshape(-1, num_features)
    return scaler.transform(flattened).reshape(-1, num_steps, num_features)


In [84]:
# Build the model inputs: weekly windows plus customer and product feature rows
X_pred, customers_pred, products_pred = prepare_prediction_inputs(
    selected_data, catalog, customer_features, n_steps
)

# Column 0 of customers_pred holds the encoded customer ID; the rest are its features
customers_pred_ids, customers_pred_features = customers_pred[:, 0], customers_pred[:, 1:]

# Normalise the time series and restore the (samples, n_steps, 1) layout
X_pred_scaled = scale_and_reshape(scaler, X_pred, n_steps, 1)

# Inputs are passed in this order: customer IDs, customer features, product rows, scaled series
model_inputs = [customers_pred_ids, customers_pred_features, products_pred, X_pred_scaled]
predictions = model.predict(model_inputs)

# The predicted class is the index of the highest score in each row
predicted_classes = predictions.argmax(axis=1)

print("Predicted Classes:", predicted_classes)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
Predicted Classes: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 

In [85]:
# Compare the sample counts of the two feature arrays and the predictions
print("\n".join([
    f"Length of customers_pred: {len(customers_pred)}",
    f"Length of products_pred: {len(products_pred)}",
    f"Length of predictions: {len(predictions.argmax(axis=1))}",
]))


Length of customers_pred: 978
Length of products_pred: 978
Length of predictions: 978


In [86]:
# Display the product input array (column 0 is the encoded product ID, the rest are catalog features)
products_pred

array([[20664,   408,     4, ...,   284,     0,    66],
       [28231,   193,     4, ...,   468,     3,   108],
       [ 2690,   406,     4, ...,   491,     0,    66],
       ...,
       [23914,   408,     4, ...,   334,     0,    44],
       [11178,   194,    10, ...,   503,     3,    85],
       [32649,   498,     4, ...,   491,     3,    66]])

In [87]:
# Convert prediction results to a DataFrame, decoding the IDs back to their original values
pair_columns = ['customer_id', 'product_id']
prediction_df = pd.DataFrame({
    "customer_id": customer_encoder.inverse_transform(customers_pred_ids.astype(int)),
    "product_id": product_encoder.inverse_transform(products_pred[:, 0].astype(int)),
    "prediction": predictions.argmax(axis=1)  # Predicted class per sample
})

# A left join emits one row per matching right-hand row, so a repeated
# (customer_id, product_id) pair in prediction_df would duplicate test rows
# in the output file. Join against one prediction per pair (first occurrence).
unique_predictions = prediction_df.drop_duplicates(subset=pair_columns)

# Merge the predictions with the test data
test_data = test_data.drop(columns=['prediction'], errors='ignore')  # Makes the cell safe to re-run
n_test_rows = len(test_data)
test_data = test_data.merge(
    unique_predictions,
    on=pair_columns,  # Match predictions with test data by customer and product IDs
    how='left'  # Keep every row of the test data
)
assert len(test_data) == n_test_rows, "merge must not add or drop test rows"

# Rows without a prediction fall back to class 0; cast back to int after the NaN fill
test_data['prediction'] = test_data['prediction'].fillna(0).astype(int)

# Save the prediction results to a CSV file, creating the target directory if needed
output_path = "../predictions/predictions.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
test_data.to_csv(output_path, index=False)
print(f"Prediction results saved to {output_path}")

# Print summary information
print(test_data.head())  # First few rows of the updated test data
print("Customers Prediction Shape:", customers_pred.shape)  # Expected: (number_of_samples, total_number_of_features)
print("Products Prediction Shape:", products_pred.shape)  # Expected: (number_of_samples, total_number_of_features)
print("Scaled Prediction Shape:", X_pred_scaled.shape)  # Expected: (number_of_samples, n_steps, 1)


Prediction results saved to ../predictions/predictions.csv
   id  customer_id  product_id  prediction
0   0            0       20664           2
1   1            0       28231           2
2   2           13        2690           2
3   3           15        1299           2
4   4           15       20968           2
Customers Prediction Shape: (978, 5)
Products Prediction Shape: (978, 7)
Scaled Prediction Shape: (978, 8, 1)


In [88]:
# Preview the first five customer rows and the first five product rows
print(customers_pred[:5], products_pred[:5], sep="\n")


[[ 0.         20.         13.         10.          1.11111111]
 [ 0.         20.         13.         10.          1.11111111]
 [13.         67.         35.         12.          1.34      ]
 [15.         62.         38.         14.          1.26530612]
 [15.         62.         38.         14.          1.26530612]]
[[20664   408     4     0   284     0    66]
 [28231   193     4     3   468     3   108]
 [ 2690   406     4     3   491     0    66]
 [ 1299  1056     4     0   474    -1   108]
 [20968  1315     4     0   444     0   144]]


In [89]:
# Show the first 100 rows of prediction_df (customer_id, product_id, prediction)
prediction_df.head(100)

Unnamed: 0,customer_id,product_id,prediction
0,0,20664,2
1,0,28231,2
2,13,2690,2
3,15,1299,2
4,15,20968,2
...,...,...,...
95,459,28347,2
96,460,31525,2
97,471,8615,2
98,473,18630,2


In [90]:
# Show the first 100 rows of test_data after the predictions were merged in
test_data.head(100)

Unnamed: 0,id,customer_id,product_id,prediction
0,0,0,20664,2
1,1,0,28231,2
2,2,13,2690,2
3,3,15,1299,2
4,4,15,20968,2
...,...,...,...,...
95,95,459,28347,2
96,96,460,31525,2
97,97,471,8615,2
98,98,473,18630,2
