In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
class DefectCSV:
    def __init__(self, data_path, target_column):
        self.data_path = data_path
        self.target_column = target_column
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.xgb_model = None
        self.rf_model = None

    def load_data(self):
        data = pd.read_csv(self.data_path)

        # Calculate the percentage of missing values for each column
        missing_percentage = data.isnull().sum() * 100 / len(data)

        # Drop columns with more than 20% missing values
        data = data.drop(columns=missing_percentage[missing_percentage > 20].index)

        # Impute missing values
        for col in data.columns:
            if data[col].dtype.name == 'object':
                data[col] = data[col].fillna(data[col].mode().iloc[0])
            else:
                data[col] = data[col].fillna(data[col].mean())

        self.X = data.drop(columns=[self.target_column])  # Replace 'target_column' with your actual target column name
        self.y = data[self.target_column]

        print('Preprocessing Data ..')
        # Handle categorical and numerical features
        categorical_cols = self.X.select_dtypes(include=['object']).columns
        numerical_cols = self.X.select_dtypes(include=['number']).columns

        # Encode categorical features
        le = LabelEncoder()
        self.X[categorical_cols] = self.X[categorical_cols].apply(lambda col: le.fit_transform(col))

        # Scale numerical features
        scaler = StandardScaler()
        self.X[numerical_cols] = scaler.fit_transform(self.X[numerical_cols])

        # Split data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)

    def train_xgboost(self):
        param_grid = {
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.05, 0.01],
            'n_estimators': [100, 200, 300],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0]
        }
        print('Training XG Boost ..')
        xgb_model = XGBClassifier()
        grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(self.X_train, self.y_train)

        self.xgb_model = grid_search.best_estimator_

    def train_random_forest(self):
        param_dist = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'max_features': ['auto', 'sqrt', 'log2'],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        print('Training Random Forest ..')
        rf_model = RandomForestClassifier()
        random_search = RandomizedSearchCV(rf_model, param_dist, n_iter=10, cv=5, scoring='accuracy')
        random_search.fit(self.X_train, self.y_train)
        self.rf_model = random_search.best_estimator_

    def evaluate_models(self):
        print('Evaluating Models ..')
        xgb_pred = self.xgb_model.predict(self.X_test)
        rf_pred = self.rf_model.predict(self.X_test)

        xgb_accuracy = accuracy_score(self.y_test, xgb_pred)
        xgb_precision = precision_score(self.y_test, xgb_pred)
        xgb_recall = recall_score(self.y_test, xgb_pred)
        xgb_f1 = f1_score(self.y_test, xgb_pred)

        rf_accuracy = accuracy_score(self.y_test, rf_pred)
        rf_precision = precision_score(self.y_test, rf_pred)
        rf_recall = recall_score(self.y_test, rf_pred)
        rf_f1 = f1_score(self.y_test, rf_pred)

        print("XGBoost Metrics:")
        print("Accuracy:", xgb_accuracy)
        print("Precision:", xgb_precision)
        print("Recall:", xgb_recall)
        print("F1-Score:", xgb_f1)

        print("\nRandom Forest Metrics:")
        print("Accuracy:", rf_accuracy)
        print("Precision:", rf_precision)
        print("Recall:", rf_recall)
        print("F1-Score:", rf_f1)

        # Choose the model with the best performance based on your chosen metric
        best_model = self.xgb_model if xgb_f1 > rf_f1 else self.rf_model
        return best_model

In [29]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet152
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input
from tensorflow.keras.callbacks import EarlyStopping

class DefectIMG:
    def __init__(self, train_dir, test_dir, image_size=(224, 224), batch_size=32, grayscale=False):
        self.train_dir = train_dir
        self.test_dir = test_dir
        self.image_size = image_size
        self.batch_size = batch_size
        self.grayscale = grayscale
        self.model = None
        self.class_labels = None
        self.input_shape = (image_size[0], image_size[1], 1) if grayscale else (image_size[0], image_size[1], 3)

    def preprocess_data(self):
        color_mode = 'grayscale' if self.grayscale else 'rgb'

        # Data augmentation for training set
        train_datagen = ImageDataGenerator(
            rescale=1./255,
            rotation_range=30,
            width_shift_range=0.3,
            height_shift_range=0.3,
            shear_range=0.3,
            zoom_range=0.3,
            horizontal_flip=True,
            fill_mode='nearest'
        )

        # Simple preprocessing for validation and test set
        val_datagen = ImageDataGenerator(rescale=1./255)
        
        self.train_generator = train_datagen.flow_from_directory(
            self.train_dir,
            target_size=self.image_size,
            batch_size=self.batch_size,
            class_mode='categorical',
            color_mode=color_mode
        )

        self.test_generator = val_datagen.flow_from_directory(
            self.test_dir,
            target_size=self.image_size,
            batch_size=self.batch_size,
            class_mode='categorical',
            color_mode=color_mode,
            shuffle=False
        )

        # Save class labels for later reference
        self.class_labels = self.train_generator.class_indices

    def build_model(self, num_classes):
        base_model = ResNet152(include_top=False, weights='imagenet', input_shape=self.input_shape)

        # Fine-tune the last few layers
        for layer in base_model.layers[:-20]:
            layer.trainable = False

        # Add custom classification layers
        x = GlobalAveragePooling2D()(base_model.output)
        x = Dense(1024, activation='relu')(x)
        x = Dense(512, activation='relu')(x)
        predictions = Dense(num_classes, activation='softmax')(x)

        self.model = Model(inputs=base_model.input, outputs=predictions)

    def train_model(self, epochs):
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])
        
        # Adding EarlyStopping to avoid overfitting
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        
        self.model.fit(
            self.train_generator,
            steps_per_epoch=len(self.train_generator),
            epochs=epochs,
            callbacks=[early_stopping]
        )

    def predict(self, image_path):
        # Preprocess the image
        img = tf.keras.preprocessing.image.load_img(image_path, target_size=self.image_size, color_mode='grayscale' if self.grayscale else 'rgb')
        x = tf.keras.preprocessing.image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x /= 255.0

        # Make prediction
        prediction = self.model.predict(x)
        predicted_class_index = np.argmax(prediction)

        # Get the actual class name using the saved labels
        predicted_class_name = list(self.class_labels.keys())[predicted_class_index]
        return predicted_class_name

    def evaluate_model(self):
        loss, accuracy = self.model.evaluate(self.test_generator, steps=len(self.test_generator))
        print(f"Test Accuracy: {accuracy * 100:.2f}%")
        return accuracy


In [4]:
class MultimodalClassifier:
    def __init__(self, csv_path, image_dir, target_column):
        self.csv_path = csv_path
        self.image_dir = image_dir
        self.target_column = target_column
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.defect_classifier_img = None
        self.defect_classifier_csv = None
        self.best_model = None

    def load_data(self):
        print('Loading Data ..')
        
        # Load CSV data
        data = pd.read_csv(self.csv_path)

        # Create a DefectIMG instance
        self.defect_classifier_img = DefectIMG(f"{self.image_dir}/train", f"{self.image_dir}/val")
        self.defect_classifier_img.preprocess_data()
        self.defect_classifier_img.build_model(num_classes=2)  # Assuming binary classification for defect
        self.defect_classifier_img.train_model(epochs=10)

        # Predict defect classes for images in the CSV
        image_paths = data['image_path']  # Replace 'image_path' with your actual column name
        defect_predictions = []
        for path in image_paths:
            defect_predictions.append(self.defect_classifier_img.predict(path))
        data['defect_prediction'] = defect_predictions

        self.multi_modal_data = 'multi_modal_data.csv'
        data.to_csv(self.multi_modal_data, index=False)

    def defect_classification(self):
        print('Defect Classification on CSV ..')
        self.defect_classifier_csv = DefectCSV(self.multi_modal_data, self.target_column)
        self.defect_classifier_csv.load_data()
        self.defect_classifier_csv.train_random_forest()
        self.defect_classifier_csv.train_xgboost()
        self.best_model = self.defect_classifier_csv.evaluate_models()

### Testing CSV Data

In [5]:
path = 'manufacturing_defect_dataset.csv'
target_column = 'DefectStatus'
csv_obj = DefectCSV(path, target_column)
csv_obj.load_data()
csv_obj.train_random_forest()
csv_obj.train_xgboost()
best_model = csv_obj.evaluate_models()

Preprocessing Data ..
Training Random Forest ..


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\BhavishyaPandit\Desktop\VSC Projects\FlawFinder\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\BhavishyaPandit\Desktop\VSC Projects\FlawFinder\venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\BhavishyaPandit\Desktop\VSC Projects\FlawFinder\venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\BhavishyaPandit\Desktop\VSC P

Training XG Boost ..
Evaluating Models ..
XGBoost Metrics:
Accuracy: 0.9552469135802469
Precision: 0.9591474245115453
Recall: 0.989010989010989
F1-Score: 0.9738503155996393

Random Forest Metrics:
Accuracy: 0.9552469135802469
Precision: 0.9591474245115453
Recall: 0.989010989010989
F1-Score: 0.9738503155996393


### Testing Image Data

In [31]:
# Define the directories for training, validation, and test data
train_dir = 'Breast Cancer New/Train'
test_dir = 'Breast Cancer New/Test'

# Initialize the DefectIMG class
defect_img_model = DefectIMG(
    train_dir=train_dir,
    test_dir=test_dir,
    image_size=(224, 224),  # Image size as expected by ResNet152
    batch_size=32,          # Batch size
    grayscale=False         # Set to True if you want to convert images to grayscale
)

# Preprocess the data (including data augmentation and normalization)
defect_img_model.preprocess_data()

# Get the number of classes
num_classes = len(defect_img_model.class_labels)

# Build the model with the specified number of classes
defect_img_model.build_model(num_classes=num_classes)

# Train the model with a specified number of epochs
defect_img_model.train_model(epochs=30)

# Evaluate the model on the test set and print accuracy
defect_img_model.evaluate_model()

# Example of predicting a single image
# image_path = 'path/to/single_image.jpg'
# predicted_class = defect_img_model.predict(image_path)
# print(f"The predicted class for the image is: {predicted_class}")

Found 656 images belonging to 2 classes.
Found 164 images belonging to 2 classes.
Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 3s/step - accuracy: 0.5139 - loss: 0.7002
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 3/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 3s/step - accuracy: 0.6610 - loss: 0.6119
Epoch 4/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 977us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 5/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 3s/step - accuracy: 0.7059 - loss: 0.5555
Epoch 6/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 7/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 3s/step - accuracy: 0.6586 - loss: 0.5834
Epoch 8/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

0.5