# Summary
## 1. Initial Model Training: Train an initial CNN model using your labeled dataset.
## 2. Continuous Data Ingestion: Collect new labeled data using Power Apps or another tool.
## 3. Self-Learning Loop: Periodically retrain the model with the new data and save the updated model.

Specific Recommendations

# Image Classification:
- From Scratch: Typically, 1,000+ images per class.
- Transfer Learning: 100-1,000 images per class can suffice, depending on the task complexity and similarity to the pre-trained model's domain.

In [1]:
import os
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Define directories and parameters
excel_path = "/Users/colleenjung/Downloads/Book.xlsx"  # Path to the Excel file
image_dir = "/Users/colleenjung/Desktop/UChicago/24SummerCorrugated/Transit Damage"  # Replace with the directory where images are stored
image_size = (224, 224)
model_path = 'damage_classification_model.h5'  # Path to save the model
batch_size = 32

# Function to read Excel file and preprocess data
def read_excel(excel_path):
    df = pd.read_excel(excel_path)
    df['file_path'] = df['file_name'].apply(lambda x: os.path.join(image_dir, x))
    return df

# Custom data generator
def custom_data_generator(df, batch_size, image_size, subset):
    datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)
    if subset == 'training':
        subset_df = df[df['subset'] == 'training']
    else:
        subset_df = df[df['subset'] == 'validation']
    
    while True:
        for start in range(0, len(subset_df), batch_size):
            end = min(start + batch_size, len(subset_df))
            batch_df = subset_df[start:end]
            
            images = []
            labels = []
            
            for _, row in batch_df.iterrows():
                img = load_img(row['file_path'], target_size=image_size)
                img = img_to_array(img)
                images.append(img)
                labels.append(row['label'])
            
            images = np.array(images, dtype='float32') / 255.0
            labels = to_categorical(labels, num_classes=df['label'].nunique())
            
            yield images, labels

# Function to build and compile the model
def build_model(num_classes):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    for layer in base_model.layers:
        layer.trainable = False
    
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Initial training of the model
def initial_training():
    df = read_excel(excel_path)
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'])
    train_df['subset'] = 'training'
    val_df['subset'] = 'validation'
    df = pd.concat([train_df, val_df])
    
    num_classes = df['label'].nunique()
    
    train_generator = custom_data_generator(df, batch_size, image_size, 'training')
    validation_generator = custom_data_generator(df, batch_size, image_size, 'validation')
    
    model = build_model(num_classes)
    model.fit(
        train_generator,
        steps_per_epoch=len(train_df) // batch_size,
        validation_data=validation_generator,
        validation_steps=len(val_df) // batch_size,
        epochs=10,
        verbose=1
    )

    model.save(model_path)
    print(f"Model saved at {model_path}")

# Execute initial training
initial_training()


2024-08-06 17:02:01.245376: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


KeyError: 'file_name'

In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

image_dir = "/Users/colleenjung/Desktop/UChicago/24SummerCorrugated/Transit Damage"
excel_path = "/Users/colleenjung/Downloads/Book.xlsx"

def read_excel(excel_path):
    df = pd.read_excel(excel_path)
    print(df.columns)  # Print the columns to debug
    # Use the correct column name
    if 'file_name' in df.columns:
        df['file_path'] = df['file_name'].apply(lambda x: os.path.join(image_dir, x))
    elif 'filename' in df.columns:  # Example alternative column name
        df['file_path'] = df['filename'].apply(lambda x: os.path.join(image_dir, x))
    else:
        raise KeyError("The expected 'file_name' or 'filename' column is not found in the Excel file.")
    return df

def initial_training():
    df = read_excel(excel_path)
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'])
    train_df['subset'] = 'training'
    val_df['subset'] = 'validation'
    # Your training code here...
    print("Training completed.")
    model_path = "path_to_save_model.h5"  # Update this to your desired model save path
    # model.save(model_path)
    print(f"Model saved at {model_path}")

# Execute initial training
initial_training()


Index(['Main Product Type', 'Material Name', 'ID', 'Material Shape',
       'Paper Type', 'Weight in Lbs. Per Sq Inch', 'Width in Inch',
       'Length in Inch', 'Diameter in Inch', 'Core', 'Length in Feet',
       'Weight in Lbs.', 'Moisture', 'Image #', 'Main Damage Reason', 'Notes',
       'Source', 'Damage Dimensions', 'DamageLocation', 'Method', '#Layers'],
      dtype='object')


KeyError: "The expected 'file_name' or 'filename' column is not found in the Excel file."