# Step 1: Setup and Imports

In [2]:
import pandas as pd
import numpy as np
import os
import datetime
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping, TensorBoard
import time
# Display the version of TensorFlow
print("TensorFlow version:", tf.__version__)
base_path = 'C:\\Dev'
csv_file = os.path.join(base_path, 'OneHotEncoderDataframe.csv')

TensorFlow version: 2.16.1


# Step 2: Load and Preprocess Data

In [7]:
# Step 2: Load and Preprocess Data in Chunks
image_directory = os.path.join(base_path, 'images_binary')
training_path = os.path.join(base_path, 'training_data')
#ItemId,PrimaryHierarchyId,Volume,ColorId,PlpTagId,SiteRulerId,UnitsOfMeasurmentId,IsUnisex,Ingredients,HierarchyId,AudienceId,ProductTypeId,ProgrammaId,CategoryId,FitId,FitTableType,TagTypeId,IsTagMan,IsTagWoman,IsTagKids,CustomTagId,ImageType,ImageFileName,ImagePath
# Create a function to read the CSV in chunks and process the data
def process_csv_in_chunks(csv_file, chunk_size=10000):
    # Read the first chunk to determine column names and dtypes
    first_chunk = pd.read_csv(csv_file, nrows=1)
    column_names = first_chunk.columns
    # Assume all columns except the last two are numeric, and the last two columns are strings
    dtypes = {col: np.float32 for col in column_names}
    dtypes[column_names[22]] = str  # 'imageName'
    dtypes[column_names[23]] = str  # 'ImagePath'

    for chunk in pd.read_csv(csv_file, chunksize=chunk_size, dtype=dtypes):
        yield chunk

# Use the generator to process the CSV in chunks and split into training and validation sets incrementally
def incremental_train_val_split(generator, test_size=0.2, random_state=42):
    train_data_list = []
    val_data_list = []
    for chunk in generator:
        # Fill missing values in 'ImagePath' with a placeholder string
        chunk['ImageFileName'] = chunk['ImageFileName'].fillna('placeholder.jpg')
        chunk['ImagePath'] = chunk['ImagePath'].fillna('placeholder.jpg')
        
        # Ensure all values in 'ImageFileName' and 'ImagePath' are strings
        chunk['ImageFileName'] = chunk['ImageFileName'].astype(str)
        chunk['ImagePath'] = chunk['ImagePath'].astype(str)

        train_chunk, val_chunk = train_test_split(chunk, test_size=test_size, random_state=random_state)
        train_data_list.append(train_chunk)
        val_data_list.append(val_chunk)
    train_data = pd.concat(train_data_list)
    val_data = pd.concat(val_data_list)
    empty_string_values = train_data[train_data['ImagePath'] == '']
    print(empty_string_values)
    return train_data, val_data

# Create the generator
csv_generator = process_csv_in_chunks(csv_file)

# Split the data incrementally
train_data, val_data = incremental_train_val_split(csv_generator)

exclude_columns = [train_data.columns[22], train_data.columns[23]]  # 'imageName' and 'ImagePath'

# Convert paths and labels for compatibility with ImageDataGenerator
train_paths = train_data[exclude_columns[1]].values  # The 'ImagePath' column
val_paths = val_data[exclude_columns[1]].values  # The 'ImagePath' column

train_labels = train_data.drop(columns=['ImageFileName', 'ImagePath']).values
val_labels = val_data.drop(columns=['ImageFileName', 'ImagePath']).values

# Convert labels to dataframe for compatibility with ImageDataGenerator
train_labels_df = pd.DataFrame(train_labels, columns=train_data.columns)
val_labels_df = pd.DataFrame(val_labels, columns=val_data.columns)

# Create ImageDataGenerators
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

# Combine paths and labels into DataFrames
train_df = pd.concat([pd.Series(train_paths, name='ImagePath'), train_labels_df], axis=1)
val_df = pd.concat([pd.Series(val_paths, name='ImagePath'), val_labels_df], axis=1)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=image_directory,  # The root directory containing all subdirectories
    x_col='ImagePath',
    y_col=train_labels_df.columns.tolist(),
    target_size=(128, 128),  # Resize images to 128x128
    batch_size=32,
    class_mode='raw'
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=image_directory,  # The root directory containing all subdirectories
    x_col='ImagePath',
    y_col=val_labels_df.columns.tolist(),
    target_size=(128, 128),  # Resize images to 128x128
    batch_size=32,
    class_mode='raw'
)

# Optional: Save the processed data
np.save(os.path.join(training_path, 'train_paths.npy'), train_paths)
np.save(os.path.join(training_path, 'val_paths.npy'), val_paths)
np.save(os.path.join(training_path, 'train_labels.npy'), train_labels)
np.save(os.path.join(training_path, 'val_labels.npy'), val_labels)


Empty DataFrame
Columns: [ItemId, PrimaryHierarchyId, Volume, ColorId, PlpTagId, SiteRulerId, UnitsOfMeasurmentId, IsUnisex, Ingredients, HierarchyId, AudienceId, ProductTypeId, ProgrammaId, CategoryId, FitId, FitTableType, TagTypeId, IsTagMan, IsTagWoman, IsTagKids, CustomTagId, ImageType, ImageFileName, ImagePath, AudienceId_1.0, AudienceId_2.0, AudienceId_3.0, AudienceId_nan, ProductTypeId_3.0, ProductTypeId_4.0, ProductTypeId_5.0, ProductTypeId_6.0, ProductTypeId_7.0, ProductTypeId_9.0, ProductTypeId_10.0, ProductTypeId_16.0, ProductTypeId_17.0, ProductTypeId_18.0, ProductTypeId_19.0, ProductTypeId_20.0, ProductTypeId_22.0, ProductTypeId_23.0, ProductTypeId_28.0, ProductTypeId_29.0, ProductTypeId_30.0, ProductTypeId_31.0, ProductTypeId_32.0, ProductTypeId_33.0, ProductTypeId_34.0, ProductTypeId_35.0, ProductTypeId_36.0, ProductTypeId_39.0, ProductTypeId_nan, ProgrammaId_12.0, ProgrammaId_13.0, ProgrammaId_14.0, ProgrammaId_15.0, ProgrammaId_16.0, ProgrammaId_17.0, ProgrammaId_18.0,

TypeError: All values in column x_col=ImagePath must be strings.

# Step 3: Build and Compile the Model

In [None]:
# Define the model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(train_labels_df.shape[1], activation='softmax')  # Use 'sigmoid' for multi-label classification
])
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Step 4: Define Custom Callback for Saving

In [None]:
class TimeBasedModelCheckpoint(Callback):
    def __init__(self, filepath, save_freq='epoch', verbose=0):
        super(TimeBasedModelCheckpoint, self).__init__()
        self.filepath = filepath
        self.save_freq = save_freq
        self.verbose = verbose
        self.start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        if self.save_freq == 'epoch':
            current_time = time.time()
            elapsed_time = current_time - self.start_time
            # Save every hour (3600 seconds)
            if elapsed_time >= 3600:
                timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
                save_path = self.filepath.format(epoch=epoch + 1, timestamp=timestamp)
                if self.verbose > 0:
                    print(f'\nElapsed time: {elapsed_time:.2f}s - Saving model to {save_path}')
                self.model.save(save_path)
                self.start_time = time.time()

# Path where you want to save your model
models_path = os.path.join(base_path,'models')
filepath = os.path.join(models_path,'model_epoch{epoch:02d}_time{timestamp}.h5')

# Instantiate the custom callback
time_based_checkpoint = TimeBasedModelCheckpoint(filepath, save_freq='epoch', verbose=1)

# Step 5: Setup TensorBoard for Visualization

In [None]:
# Directory where TensorBoard logs will be saved
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Step 6: Train the Model

In [None]:
# Early stopping to avoid unnecessary training
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with the custom callback, TensorBoard, and early stopping
history = model.fit(
    train_generator,
    epochs=20,
    validation_data=val_generator,
    callbacks=[time_based_checkpoint, tensorboard_callback, early_stopping]
)

# Step 7: Launch TensorBoard (Run this cell in a separate code block)

In [None]:

# Ensure TensorBoard server is started in a terminal
# !tensorboard --logdir=logs/fit

# Step 8: Make Predictions and Save JSON Output

In [None]:
# Function to convert predictions to JSON
def predictions_to_json(predictions, field_names):
    results = []
    for pred in predictions:
        result = {field_names[i]: float(pred[i]) for i in range(len(field_names))}
        results.append(result)
    return json.dumps(results, indent=4)

# Make predictions
predictions = model.predict(val_generator)
field_names = data.columns[1:]  # Exclude 'ImagePath'
json_output = predictions_to_json(predictions, field_names)

# Save the JSON output
with open('predictions.json', 'w') as json_file:
    json_file.write(json_output)
