## Build an image classifier for the Dogs vs. Cats Redux Kaggle competition using both:
### A custom CNN
### A transfer learning model (ResNet50)

## Import libraries and extract the data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shutil

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input
from tensorflow.keras.optimizers import RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping

# Unzip the training archive into 'train' and the testing archive into 'test'.
# A corrupt archive is reported and skipped instead of aborting the cell.
for archive_name in ('train', 'test'):
    archive_path = f'/kaggle/input/dogs-vs-cats-redux-kernels-edition/{archive_name}.zip'
    try:
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(archive_name)
    except zipfile.BadZipFile:
        # You might want to download the file again or try alternative ways to extract it.
        print(f"Error: '{archive_name}.zip' is corrupted or not a valid ZIP file.")

# flow_from_directory needs at least one sub-folder, so any .jpg sitting
# directly inside test/ is pushed down into test/test/.
nested_test_dir = 'test/test'
os.makedirs(nested_test_dir, exist_ok=True)
loose_jpgs = [name for name in os.listdir('test') if name.endswith('.jpg')]
for name in loose_jpgs:
    shutil.move(os.path.join('test', name), os.path.join(nested_test_dir, name))

## Organize training images into cat and dog class folders

In [None]:
import os
import shutil

# Split the extracted training images into one sub-folder per class, which is
# the layout flow_from_directory uses to infer labels.

# Source path (where the extracted images are before this cell runs)
src_folder = os.path.join('train', 'train')  # 'train/train'

# Target folders
cat_dir = os.path.join('train', 'cat')
dog_dir = os.path.join('train', 'dog')
os.makedirs(cat_dir, exist_ok=True)
os.makedirs(dog_dir, exist_ok=True)

# Guard on the source folder so the cell can be re-run safely: after the first
# run 'train/train' has been removed and os.listdir would raise FileNotFoundError.
if os.path.isdir(src_folder):
    # The class is encoded in the file-name prefix ('cat...' / 'dog...').
    for fname in os.listdir(src_folder):
        if fname.startswith('cat'):
            shutil.move(os.path.join(src_folder, fname), os.path.join(cat_dir, fname))
        elif fname.startswith('dog'):
            shutil.move(os.path.join(src_folder, fname), os.path.join(dog_dir, fname))

    # Remove the 'train/train' folder (and anything left in it) so it is not
    # picked up as a third class.
    shutil.rmtree(src_folder)

    print("✅ Moved files into 'cat' and 'dog' folders successfully.")
else:
    print("'train/train' not found - images appear to be organized already; nothing to move.")

In [None]:
import os
# Sanity check: the entries of train/ are what Keras will treat as classes.
class_entries = os.listdir('train')
print("Classes found:", class_entries)

In [None]:
import os

# Report how many files each class sub-folder of train/ contains.
for entry in os.listdir('train'):
    entry_path = os.path.join('train', entry)
    if not os.path.isdir(entry_path):
        continue  # only directories count as classes
    print(f"{entry}: {len(os.listdir(entry_path))} files")

## Data Preparation

In [None]:
train_dir = 'train'
test_dir = 'test'

# Training pipeline: ResNet50 preprocessing plus random augmentation.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2,
    horizontal_flip=True,
    rotation_range=20,
    zoom_range=0.2
)

# Validation pipeline: same preprocessing and the same split fraction, but NO
# augmentation. Building the validation subset from train_datagen would randomly
# flip/rotate/zoom the validation images, making val_loss, early stopping and
# the confusion matrix noisy and non-reproducible.
# The subset membership is expected to depend only on the directory listing and
# validation_split, so both generators should partition the files identically.
val_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2
)

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='training'
)

val_generator = val_datagen.flow_from_directory(
    train_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='validation',
    shuffle=False  # must be False so predictions align with val_generator.classes
)


In [None]:
# Show a 3x3 grid of training samples with their labels.
#
# The batch has already been through ResNet50's preprocess_input, which flips
# RGB -> BGR and subtracts the per-channel ImageNet mean (no scaling). A plain
# min-max rescale would therefore display images with red and blue swapped, so
# the preprocessing is undone explicitly before plotting.
IMAGENET_BGR_MEAN = np.array([103.939, 116.779, 123.68], dtype='float32')

sample_images, sample_labels = next(train_generator)

plt.figure(figsize=(10, 10))
# min(...) keeps the cell working if a batch ever holds fewer than 9 images.
for i in range(min(9, len(sample_images))):
    plt.subplot(3, 3, i + 1)
    bgr = sample_images[i] + IMAGENET_BGR_MEAN      # restore 0-255 pixel range
    img = np.clip(bgr[..., ::-1], 0, 255) / 255.0   # BGR -> RGB, scale to [0, 1] for imshow
    plt.imshow(img)
    # class_mode='binary' with folders 'cat'/'dog' gives cat=0, dog=1
    label = 'Dog' if sample_labels[i] else 'Cat'
    plt.title(label)
    plt.axis('off')
plt.tight_layout()
plt.show()

## Model1: ResNet50

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Backbone: ImageNet-pretrained ResNet50 without its classification top,
# wired to an explicit 224x224 RGB input tensor.
input_tensor = Input(shape=(224, 224, 3))
base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=input_tensor)

# Freeze every backbone layer so that only the new head is trained at first
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

# Classification head: pooled features -> dropout -> dense -> dropout -> sigmoid
pooled = GlobalAveragePooling2D()(base_model.output)
pooled = Dropout(0.3)(pooled)
hidden = Dense(128, activation='relu')(pooled)
hidden = Dropout(0.2)(hidden)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=input_tensor, outputs=output)

# Binary classification setup: single sigmoid unit trained with binary cross-entropy
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

## Train the Model

In [None]:
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Stage 1: train only the new head on top of the frozen backbone.
head_fit_options = dict(validation_data=val_generator, epochs=10, callbacks=[early_stop])
history = model.fit(train_generator, **head_fit_options)

In [None]:
from tensorflow.keras.layers import BatchNormalization

# Stage 2: fine-tune the top of the backbone.
# Unfreeze the last 30 layers, but keep BatchNormalization layers frozen.
# A frozen BN layer runs in inference mode, so its moving statistics stay at
# their pretrained values; letting them update during fine-tuning on small
# batches can quickly degrade the pretrained features.
for layer in base_model.layers[-30:]:
    if not isinstance(layer, BatchNormalization):
        layer.trainable = True

# Re-compile so the trainable changes take effect, with a lower LR to avoid
# large updates to the pretrained weights.
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Fine-tune (reuses the early-stopping callback defined for the first stage)
history_fine = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=5,
    callbacks=[early_stop]
)


## Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Rewind the validation iterator so predictions line up with val_generator.classes
val_generator.reset()
true_classes = val_generator.classes

# Sigmoid outputs -> hard 0/1 predictions at the 0.5 threshold
pred_probs = model.predict(val_generator)
pred_classes = np.reshape((pred_probs > 0.5).astype(int), -1)

cm = confusion_matrix(true_classes, pred_classes)
print("Confusion Matrix:\n", cm)

print("\nClassification Report:\n")
report = classification_report(true_classes, pred_classes, target_names=['cat', 'dog'])
print(report)


In [None]:
# Persist the fine-tuned ResNet50 classifier in HDF5 format.
resnet_model_path = "resnet50_cat_dog_model.h5"
model.save(resnet_model_path)
print(f"✅ Model saved as {resnet_model_path}")


## Test on the testing set

In [None]:
# Test-time pipeline: ResNet50 preprocessing only, no augmentation.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# class_mode=None yields images without labels; shuffle=False keeps the
# prediction order aligned with test_generator.filenames.
test_flow_options = dict(
    target_size=(224, 224),
    batch_size=32,
    class_mode=None,
    shuffle=False,
)
test_generator = test_datagen.flow_from_directory(test_dir, **test_flow_options)

test_generator.reset()
pred_probs = model.predict(test_generator)


## Create the submission file

In [None]:
# Build the Kaggle submission: one row per test image with its numeric id and
# the predicted probability from the model's sigmoid output.
filenames = test_generator.filenames

# File names look like '<subfolder>/<id>.jpg'. Use os.path helpers rather than
# splitting on '/' so the id is extracted correctly whatever path separator
# the platform uses.
ids = [int(os.path.splitext(os.path.basename(f))[0]) for f in filenames]

submission = pd.DataFrame({
    'id': ids,
    'label': pred_probs.ravel()
})

# Generator order is lexicographic by file name; sort numerically by id.
submission = submission.sort_values('id')
submission.to_csv("MSBA.Session1.Charlottewang.csv", index=False)
print("✅ Submission saved.")


## Model 2: Normal CNN

In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam

## Data Preprocessing

In [None]:
# Training pipeline for the custom CNN: scale pixels to [0, 1] plus random augmentation.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    horizontal_flip=True,
    zoom_range=0.2,
    rotation_range=20
)

# Validation pipeline: same rescaling and split fraction, but NO augmentation.
# Drawing the validation subset from train_datagen would randomly transform the
# validation images, making val_loss, early stopping and the evaluation report
# noisy and non-reproducible.
# The subset membership is expected to depend only on the directory listing and
# validation_split, so both generators should partition the files identically.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

train_generator = train_datagen.flow_from_directory(
    'train',
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    subset='training'
)

val_generator = val_datagen.flow_from_directory(
    'train',
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    subset='validation',
    shuffle=False  # for correct label alignment
)



## Build the Model

In [None]:

# Baseline CNN: four conv/max-pool stages followed by a dropout-regularised
# dense classifier with a single sigmoid output.
model = Sequential([
    # Feature extractor on 150x150 RGB inputs
    Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    # Classifier head
    Flatten(),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup: sigmoid output trained with binary cross-entropy
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

model.summary()

## Train the Model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train the custom CNN for up to 10 epochs.
cnn_fit_options = dict(validation_data=val_generator, epochs=10, callbacks=[early_stop])
history = model.fit(train_generator, **cnn_fit_options)

## Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Rewind the validation iterator so predictions line up with val_generator.classes
val_generator.reset()
true_classes = val_generator.classes

# Sigmoid outputs -> hard 0/1 predictions at the 0.5 threshold
predictions = model.predict(val_generator)
predicted_classes = np.ravel((predictions > 0.5).astype(int))

# Confusion Matrix
cm = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:\n", cm)

# Classification Report
print("\nClassification Report:\n")
report = classification_report(true_classes, predicted_classes, target_names=['cat', 'dog'])
print(report)

In [None]:
# Persist the trained custom CNN in HDF5 format.
cnn_model_path = "cnn_cat_dog_model.h5"
model.save(cnn_model_path)
print(f"✅ Model saved as {cnn_model_path}")


## Testing on the test dataset

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import pandas as pd

# Load the saved custom CNN
model = load_model("cnn_cat_dog_model.h5")
print("✅ Model loaded!")

# Test data generator: only the 1/255 rescaling used for training, no augmentation
test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_directory(
    'test',                # Folder should contain test/test/ with test images
    target_size=(150, 150),
    batch_size=32,
    class_mode=None,       # no labels for the test set
    shuffle=False          # keep predictions aligned with test_generator.filenames
)

# Make predictions
test_generator.reset()
pred_probs = model.predict(test_generator)
# Hard 0/1 labels are kept for inspection only; they are NOT what gets submitted.
pred_labels = (pred_probs > 0.5).astype(int).ravel()

# Format for Kaggle submission.
# Use os.path helpers rather than splitting on '/' so the id is extracted
# correctly whatever path separator the platform uses.
filenames = test_generator.filenames
ids = [int(os.path.splitext(os.path.basename(f))[0]) for f in filenames]

# Submit probabilities, not thresholded labels: the competition is scored with
# log loss, which heavily penalises confident 0/1 answers that are wrong. This
# also matches the ResNet50 submission, which writes pred_probs.
submission = pd.DataFrame({
    'id': ids,
    'label': pred_probs.ravel()
})

# Generator order is lexicographic by file name; sort numerically by id.
submission = submission.sort_values('id')
submission.to_csv("test.csv", index=False)
print("✅ Submission file saved.")

Summary: The custom CNN works well as a learning baseline but lacks depth for generalization.

ResNet50, with proper preprocessing and fine-tuning, achieved high validation accuracy and strong leaderboard score.