In [None]:
!pip install opencv-python-headless
!pip install opencv-python --no-binary :all: --compile
!pip install python-xlib

In [None]:
!pip install --upgrade tensorflow

In [None]:

!pip install --upgrade autogluon
!pip install pandas==1.5.3 pytorch-lightning==1.9.5 scikit-learn==1.3.0 torch==1.13.1 torchmetrics==0.11.4 torchvision==0.14.1


In [36]:
import boto3
import cv2
import numpy as np
from skimage import measure
from skimage.filters import threshold_otsu
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import tempfile
import math
import logging
import shutil
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Average
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from sklearn.model_selection import train_test_split

In [37]:

def load_and_preprocess(file_content):
    # Load the image from file content
    image = cv2.imdecode(np.frombuffer(file_content, np.uint8), cv2.IMREAD_GRAYSCALE)
    
    # Apply contrast stretching
    p2, p98 = np.percentile(image, (2, 98))
    image_stretched = np.interp(image, (p2, p98), (0, 255)).astype(np.uint8)
    
    return image_stretched

def segment_teeth(image):
    # Apply Otsu's thresholding
    thresh = threshold_otsu(image)
    binary = image > thresh

    # Label connected regions
    labels = measure.label(binary)

    # Filter regions based on properties (size, shape, etc.)
    properties = measure.regionprops(labels)
    teeth_regions = [prop for prop in properties if prop.area > 1000 and prop.eccentricity < 0.9]

    # Create a mask for teeth
    teeth_mask = np.zeros_like(image, dtype=np.uint8)
    for region in teeth_regions:
        teeth_mask[labels == region.label] = 255

    return teeth_mask

def create_enhanced_model(input_shape, num_classes):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape, padding='same'),
        BatchNormalization(),
        Conv2D(32, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        
        Conv2D(64, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        Conv2D(64, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        
        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        
        Flatten(),
        Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


def process_images(input_bucket, input_prefix, output_bucket, output_prefix):
    s3 = boto3.client('s3')

    # List objects in the input bucket with the specified prefix
    response = s3.list_objects_v2(Bucket=input_bucket, Prefix=input_prefix)

    # Iterate over the objects in the bucket
    for obj in response.get('Contents', []):
        object_key = obj['Key']

        if object_key.endswith(('.png', '.jpg', '.jpeg')):
            # Download the image from S3
            file_obj = s3.get_object(Bucket=input_bucket, Key=object_key)
            file_content = file_obj['Body'].read()

            # Load and preprocess the image
            preprocessed_image = load_and_preprocess(file_content)

            # Segment teeth
            teeth_mask = segment_teeth(preprocessed_image)

            # Apply the mask to the original image
            segmented_image = cv2.bitwise_and(preprocessed_image, preprocessed_image, mask=teeth_mask)

            # Save the segmented image to the output bucket
            output_key = f"{output_prefix}segmented_{object_key.split('/')[-1]}"
            s3.put_object(Bucket=output_bucket, Key=output_key, Body=cv2.imencode('.png', segmented_image)[1].tobytes())

            print(f"Processed and saved: {output_key}")


In [38]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def download_s3_folder(train_bucket, train_prefix, local_dir):
    """
    Download a folder from S3 to a local directory.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    downloaded_files = 0
    for result in paginator.paginate(Bucket=train_bucket, Prefix=train_prefix):
        for obj in result.get('Contents', []):
            key = obj['Key']
            if not key.endswith('/'):  # Ignore empty directories
                relative_path = os.path.relpath(key, train_prefix)
                local_file_path = os.path.join(local_dir, relative_path)
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
                s3.download_file(train_bucket, key, local_file_path)
                downloaded_files += 1
                logger.info(f"Downloaded: {local_file_path}")
    
    logger.info(f"Downloaded {downloaded_files} files from S3")
    if downloaded_files == 0:
        raise ValueError(f"No files found in S3 bucket {train_bucket} with prefix {train_prefix}")

def organize_files(local_dir):
    """
    Organize files into class directories based on their names.
    """
    data_dir = set()
   for root, _, files in os.walk(local_dir):
        for file in files:
            if file.endswith(('.png', '.jpg', '.jpeg')):
                # Extract class name from file name (adjust this based on your file naming convention)
               files_name = file.split('_')[0]  # Assuming class name is the first part of the file name
                data_dir = os.path.join(local_dir, files_name)
                os.makedirs(data_dir, exist_ok=True)
                shutil.move(os.path.join(root, file), os.path.join(data_dir, file))
                data_dirs.add(class_name)
    
    logger.info(f"Organized files into class directories: {class_dirs}")
    return list(data_dirs)

def step_decay(epoch):
    initial_lrate = 0.001
    drop = 0.5
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate

def create_data_generators(data_dir, img_height=128, img_width=128, batch_size=10):
    # List all image files
    image_files = [f for f in os.listdir(data_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]
    
    # Get class labels
    labels = [f.split('_')[0] for f in image_files]  # based on file naming convention
    
    # Get unique classes
    classes = sorted(list(set(labels)))
    num_classes = len(classes)
    
    # Create a mapping of class names to indices
    class_to_index = {cls: idx for idx, cls in enumerate(classes)}
    
    # Convert labels to indices
    label_indices = [class_to_index[label] for label in labels]
    

    
    # Further split train+val into train and validation sets
    train_files, val_files, train_labels, val_labels = train_test_split(
        image_files, labels, test_size=0.2, stratify=labels, random_state=42
    )
    
    # Create data generators
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
        validation_split=0.2        
    )

    val_test_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
        validation_split=0.2       
    )

    # Create generators
    train_generator = train_datagen.flow_from_directory(
        data_dir,
        target_size=(img_height, img_width),
        batch_size=batch_size,
        class_mode='categorical',
        classes=classes,
        shuffle=True,
        color_mode='grayscale'
    )

    validation_generator = val_test_datagen.flow_from_directory(
        data_dir,
        target_size=(img_height, img_width),
        batch_size=batch_size,
        class_mode='categorical',
        classes=classes,
        shuffle=False,
        color_mode='grayscale'
    )

    return train_generator, validation_generator, num_classes

def train_model(train_bucket, train_prefix, model_output_path):
    # Create a temporary local directory for the S3 data
    with tempfile.TemporaryDirectory() as local_data_dir:
        try:
            # Download the data from S3 to the local directory
            download_s3_folder(train_bucket, train_prefix, local_data_dir)

            # Create data generators
            train_generator, validation_generator, num_classes = create_data_generators(local_data_dir)

            # Create the model
            model = create_enhanced_model((128, 128, 1), num_classes)

            # Define callbacks
            lr_scheduler = LearningRateScheduler(step_decay)
            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            # Calculate steps_per_epoch and validation_steps
            steps_per_epoch = math.ceil(train_generator.samples / train_generator.batch_size)
            validation_steps = math.ceil(validation_generator.samples / validation_generator.batch_size)

            # Train the model
            history = model.fit(
                train_generator,
                steps_per_epoch=steps_per_epoch,
                validation_data=validation_generator,
                validation_steps=validation_steps,
                epochs=50,  # Increased epochs, early stopping will prevent overfitting
                callbacks=[lr_scheduler, early_stopping]
            )

            # Evaluate the model on the test set
            test_loss, test_accuracy = model.evaluate(validation_generator)
            print(f"Test accuracy: {test_accuracy:.4f}")

            # Save the model
            with tempfile.NamedTemporaryFile(suffix='.h5', delete=False) as temp_model_file:
                model.save(temp_model_file.name)
                
                # Upload the model to S3
                s3 = boto3.client('s3')
                s3.upload_file(temp_model_file.name, train_bucket, model_output_path)
                print(f"Model saved to s3://{train_bucket}/{model_output_path}")

            # Get validation data for metrics calculation
            validation_data = next(validation_generator)
            validation_images, validation_labels = validation_data[0], validation_data[1]

           

        except Exception as e:
            print(f"An error occurred during model training: {str(e)}")
            raise
    return model, validation_images, validation_labels, history

       

In [39]:

def classify_images(input_bucket, input_prefix, model_output_path, classification_output_file):
    s3 = boto3.client('s3')

    # Download the trained model from S3 to a temporary file
    with tempfile.NamedTemporaryFile(suffix='.h5', delete=False) as temp_model_file:
        s3.download_fileobj(input_bucket, model_output_path, temp_model_file)
        temp_model_file.flush()
        
        # Load the model from the temporary file
        model = load_model(temp_model_file.name)

    # Remove the temporary file
    os.unlink(temp_model_file.name)

    # Prepare a list to store results
    results = []

    # List objects in the input bucket with the specified prefix
    response = s3.list_objects_v2(Bucket=input_bucket, Prefix=input_prefix)

    # Iterate over the objects in the bucket
    for obj in response.get('Contents', []):
        object_key = obj['Key']

        if object_key.endswith(('.png', '.jpg', '.jpeg')):
            # Download the image from S3
            file_obj = s3.get_object(Bucket=input_bucket, Key=object_key)
            file_content = file_obj['Body'].read()

            # Load and preprocess the image
            image = load_and_preprocess(file_content)
            image = cv2.resize(image, (128, 128))  # Resize to match model input
            image = image.reshape((1, 128, 128, 1)) / 255.0  # Normalize

            # Predict
            prediction = model.predict(image)
            class_index = np.argmax(prediction)
            class_names = ['normal', 'cavity', 'fracture', 'missing']  # Adjust based on your classes
            predicted_class = class_names[class_index]

            # Store result
            results.append(f"{object_key}: {predicted_class}")

    # Write results to S3
    output_content = '\n'.join(results).encode('utf-8')
    s3.put_object(Bucket=input_bucket, Key=classification_output_file, Body=output_content)

    print(f"Classification results saved to s3://{input_bucket}/{classification_output_file}")



In [40]:
def calculate_and_plot_metrics(model, validation_images, validation_labels, history, output_bucket, output_prefix):
    # Make predictions
    predictions = model.predict(validation_images)
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(validation_labels, axis=1)

    # Calculate accuracy and F1 score
    accuracy = accuracy_score(true_classes, predicted_classes)
    f1 = f1_score(true_classes, predicted_classes, average='weighted')

    # Calculate ROC curve and AUC for each class
    n_classes = validation_labels.shape[1]
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(validation_labels[:, i], predictions[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Plot ROC curve
    plt.figure(figsize=(10, 8))
    colors = ['blue', 'red', 'green', 'yellow']
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    
    # Save the plot
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
        plt.savefig(temp_file.name)
        s3 = boto3.client('s3')
        s3.upload_file(temp_file.name, output_bucket, f"{output_prefix}roc_curve.png")

    # Plot training history
    plt.figure(figsize=(10, 8))
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Save the plot
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
        plt.savefig(temp_file.name)
        s3.upload_file(temp_file.name, output_bucket, f"{output_prefix}training_history.png")

    # Save metrics to a file
    metrics_content = f"Accuracy: {accuracy}\nF1 Score: {f1}\n"
    for i in range(n_classes):
        metrics_content += f"AUC for class {i}: {roc_auc[i]}\n"
    
    s3.put_object(Bucket=output_bucket, Key=f"{output_prefix}metrics.txt", Body=metrics_content.encode('utf-8'))

    logger.info(f"Metrics and plots saved to s3://{output_bucket}/{output_prefix}")

    return accuracy, f1, roc_auc

In [41]:
def main():
    # Define S3 bucket and prefix
    input_bucket = 'sagemaker-studio-010526272250-hsf94lgtf6'
    output_bucket = input_bucket
    input_prefix = 'Test_Dentex_Images/Panoramic_Dental_Xray_Dataset/'
    segmented_output_prefix = 'Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/'
    train_bucket = input_bucket
    train_prefix = segmented_output_prefix
    model_output_path = 'Test_Dentex_Images/dental_xray_classifier.h5'
    classification_output_file = 'Test_Dentex_Images/classification_results.txt'
    metrics_output_prefix = 'Test_Dentex_Images/model_metrics/'

    # Process and segment images
    process_images(input_bucket, input_prefix, output_bucket, segmented_output_prefix)

    # Train the model
    model, validation_images, validation_labels, history = train_model(train_bucket, train_prefix, model_output_path)

    # Calculate and plot metrics
    accuracy, f1, roc_auc = calculate_and_plot_metrics(model, validation_images, validation_labels, history, output_bucket, metrics_output_prefix)

    logger.info(f"Model Accuracy: {accuracy}")
    logger.info(f"Model F1 Score: {f1}")
    logger.info(f"Model ROC AUC: {roc_auc}")

    # Classify images
    classify_images(input_bucket, input_prefix, model_output_path, classification_output_file)

  

if __name__ == "__main__":
    main()


Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_1.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_10.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_100.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_102.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_103.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_104.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_105.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_106.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_108.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_109.jpg
Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_11.jpg
Processed and 

INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_1.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_10.jpg


Processed and saved: Test_Dentex_Images/Panromic_Xray_Segmented_Dataset/segmented_99.jpg


INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_100.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_102.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_103.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_104.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_105.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_106.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_108.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_109.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_11.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_110.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_111.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_112.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_113.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_114.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_115.jpg
INFO:__main__:Downloaded: /tmp/tmpxhjrbo6b/segmented_116.jpg
INFO:__main__:Downloaded:

Found 0 images belonging to 1 classes.
Found 0 images belonging to 1 classes.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


An error occurred during model training: Must provide at least one structure


ValueError: Must provide at least one structure