<a href="https://colab.research.google.com/github/carolinehagood/ds4002-project3/blob/main/project3_loadingdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from PIL import Image

In [2]:

# Standard-library modules needed by the cells below
import os
import zipfile

# Colab helper used to expose Google Drive inside the runtime
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount("/content/drive", force_remount=True)



Mounted at /content/drive


In [3]:
# Archive stored in Google Drive, and the Colab-local folder it is unpacked into
zip_path = '/content/drive/MyDrive/dogimages.zip'
extract_path = '/content/Dogs'

# Unpack the archive. Every failure is reported with a message instead of
# raising, so the cell itself always finishes.
try:
    with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
        zip_ref.extractall(path=extract_path)
except FileNotFoundError:
    print("Zip file not found at the specified path.")
except zipfile.BadZipFile:
    print("The file is not a zip file or is corrupted.")
except Exception as err:
    print(f"An error occurred: {err}")
else:
    print("Files unzipped successfully.")


Files unzipped successfully.


In [4]:
# Folder inside the extraction directory that is scanned for one sub-folder per breed
images_dir = '/content/Dogs/dog-images'


In [5]:
#function to load images and create dataframe
import re
from sklearn.preprocessing import LabelEncoder

def load_images_with_labels(images_dir, img_size=(128, 128)):
    """Load the RGB images under ``images_dir`` into a flat, normalized table.

    Every immediate sub-directory of ``images_dir`` is treated as one breed and
    its name is used as the label for the images it contains. Only files ending
    in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are considered.
    Images that cannot be opened are reported with a printed message and
    skipped; images that do not have exactly three channels after resizing are
    skipped without a message.

    Args:
        images_dir: Directory whose sub-directories are the breed folders.
        img_size: Target ``(width, height)`` handed to ``PIL.Image.resize``.

    Returns:
        A 4-tuple ``(data_df, image_data, encoded_labels, label_encoder)``:

        * ``data_df`` - DataFrame with one column per flattened pixel value,
          plus ``breed`` (encoded label) and ``breed_name`` (folder name).
        * ``image_data`` - 2-D array, one row per image, values divided by 255.
        * ``encoded_labels`` - result of ``LabelEncoder.fit_transform`` on the
          breed folder names.
        * ``label_encoder`` - the fitted ``LabelEncoder``.
    """
    target_width, target_height = img_size
    # PIL sizes are (width, height) while numpy image arrays are
    # (height, width, channels); comparing in PIL order would reject every
    # image whenever img_size is not square.
    expected_shape = (target_height, target_width, 3)
    n_features = target_height * target_width * 3
    image_extensions = ('.png', '.jpg', '.jpeg')

    image_data = []
    breed_labels = []

    # Get list of breed folders
    for breed in sorted(os.listdir(images_dir)):
        breed_folder_path = os.path.join(images_dir, breed)

        # Only proceed if it's a directory
        if not os.path.isdir(breed_folder_path):
            continue

        # Sorted so the row order does not depend on file-system listing order
        for image_file in sorted(os.listdir(breed_folder_path)):
            image_path = os.path.join(breed_folder_path, image_file)

            # Skip anything that is not a regular file with an image extension
            if not (os.path.isfile(image_path) and image_file.lower().endswith(image_extensions)):
                continue

            try:
                # The context manager closes the underlying file handle once
                # the resized pixels have been copied into the array
                with Image.open(image_path) as img:
                    pixels = np.array(img.resize(img_size))
            except Exception as e:
                print(f"Could not load image {image_path}: {e}")
                continue

            # Keep only images with three color channels (RGB)
            if pixels.shape == expected_shape:
                # Flatten the image (height x width x 3 -> 1D array)
                image_data.append(pixels.flatten())
                breed_labels.append(breed)

    # Stack into a 2-D array; the explicit reshape keeps the
    # (n_images, n_features) layout even when no image was loaded
    image_data = np.array(image_data).reshape(len(image_data), n_features)

    # Normalize image data (scaling pixel values between 0 and 1)
    image_data = image_data / 255.0

    # Apply LabelEncoder to convert breed names to numeric labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(breed_labels)

    # Create a DataFrame for organized data
    data_df = pd.DataFrame(image_data)

    # Add the encoded labels as a new column
    data_df['breed'] = encoded_labels

    # The collected folder names are exactly what inverse_transform would
    # return for each row, so use them directly instead of decoding per row
    data_df['breed_name'] = breed_labels

    return data_df, image_data, encoded_labels, label_encoder


# Path to extracted images directory (same value as assigned in the earlier cell)
images_dir = '/content/Dogs/dog-images'

# Load images and labels: unpacks the DataFrame, the normalized pixel matrix,
# the encoded breed labels and the fitted LabelEncoder returned by the loader
data_df, image_data, encoded_labels, label_encoder = load_images_with_labels(images_dir)


In [13]:
# Preview the first row: the flattened pixel columns followed by breed and breed_name
print(data_df.head(1))


         0         1         2         3         4         5         6  \
0  0.07451  0.070588  0.054902  0.066667  0.062745  0.047059  0.070588   

          7        8        9  ...    49144     49145     49146    49147  \
0  0.066667  0.05098  0.12549  ...  0.32549  0.301961  0.368627  0.34902   

      49148     49149     49150     49151  breed             breed_name  
0  0.321569  0.380392  0.364706  0.321569      0  n02085936-Maltese_dog  

[1 rows x 49154 columns]


In [None]:
# Sanity check: the pixel matrix has one row per loaded image and one column per flattened pixel value

print(f"Image data shape: {image_data.shape}")


Image data shape: (541, 49152)


In [7]:
# Number of rows in the DataFrame (one row per loaded image)
len(data_df)

541

In [11]:
def split_by_breed_and_save(data_df, breed_column='breed_name', output_dir='.'):
    """Write one gzip-compressed CSV per distinct value in ``breed_column``.

    Each file is named ``<breed>.csv.gz``, where spaces and ``/`` in the breed
    value are replaced by ``_``, and holds every row of ``data_df`` whose
    ``breed_column`` equals that breed. The index is not written. A
    ``Saved <file name>`` line is printed for each file.

    Args:
        data_df: DataFrame containing ``breed_column``.
        breed_column: Name of the column to split on.
        output_dir: Directory the files are written to; it is created if it
            does not exist. Defaults to the current working directory, which
            is where the files were always written before this parameter
            existed.

    Returns:
        None.
    """
    # An empty string means "current directory" for os.path.join, but
    # os.makedirs('') raises, so only create a directory when one is named
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Get unique breeds
    for breed in data_df[breed_column].unique():
        # Filter the DataFrame by the current breed
        breed_df = data_df[data_df[breed_column] == breed]

        # Replace spaces or slashes in breed name to avoid filename issues
        safe_breed_name = str(breed).replace(" ", "_").replace("/", "_")
        file_name = f'{safe_breed_name}.csv.gz'

        # Save the filtered DataFrame as a compressed CSV file
        breed_df.to_csv(os.path.join(output_dir, file_name), index=False, compression='gzip')
        print(f'Saved {file_name}')

# Write one compressed CSV per breed, splitting on the breed_name column (the breed folder names)
split_by_breed_and_save(data_df, breed_column='breed_name')

Saved n02085936-Maltese_dog.csv.gz
Saved n02088364-beagle.csv.gz
Saved n02099601-golden_retriever.csv.gz
Saved n02106662-German_shepherd.csv.gz
Saved n02110958-pug.csv.gz


In [12]:
from google.colab import files

# Names of the per-breed archives to download; these match the
# "Saved ..." lines printed by split_by_breed_and_save() in the previous cell
file_names = [
    "n02085936-Maltese_dog.csv.gz",
    "n02088364-beagle.csv.gz",
    "n02099601-golden_retriever.csv.gz",
    "n02106662-German_shepherd.csv.gz",
    "n02110958-pug.csv.gz"
]

# Request a download for each archive in turn
for file_name in file_names:
    files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>