In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree lazily and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
from itertools import chain

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# Load the training labels table and preview its first rows
labels_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/planets-dataset/planet/planet/train_classes.csv'
)
labels_df.head(n=5)

In [None]:
# Print all unique tags
from itertools import chain
# Flatten the space-separated tag strings into one list with an entry per tag occurrence
labels_list = [tag for tags in labels_df['tags'].values for tag in tags.split(" ")]
# Distinct tags across the whole table
labels_set = set(labels_list)
print("There is {} unique labels including {}".format(len(labels_set), labels_set))


In [None]:
# Histogram of label instances
labels_s = pd.Series(labels_list).value_counts()  # value_counts sorts by descending count
fig, ax = plt.subplots(figsize=(16, 8))
# Draw on the axes created above rather than on pyplot's implicit "current axes",
# and label the figure so it can be read on its own.
sns.barplot(x=labels_s.values, y=labels_s.index, orient='h', ax=ax)
ax.set(title='Label frequency in the training set', xlabel='Count', ylabel='Label');


In [None]:
# Show one example image per label.
# Iterate the labels in sorted order: plain set iteration order for strings can
# change between kernel sessions, which would reshuffle the grid on every run.
sample_labels = sorted(labels_set)

# Split each tag string once so membership is tested against whole tags.
# A substring/regex test (str.contains) would also match longer tags that merely
# contain the label, e.g. a label 'cloudy' would match rows tagged 'partly_cloudy'.
tags_per_image = labels_df['tags'].str.split(' ')

images_title = []
for i, label in enumerate(sample_labels):
    has_label = [label in tags for tags in tags_per_image]
    matching_names = labels_df.loc[has_label, 'image_name']
    # Take the i-th match so different labels tend to show different images,
    # clamped so a label with fewer than i + 1 images cannot raise IndexError.
    images_title.append(matching_names.iloc[min(i, len(matching_names) - 1)] + '.jpg')

plt.rc('axes', grid=False)
fig, axs = plt.subplots(5, 4, sharex='col', sharey='row', figsize=(15, 20))
axs = axs.ravel()

for ax, image_name, label in zip(axs, images_title, sample_labels):
    img = mpimg.imread(os.path.join('/kaggle/input/planets-dataset/planet/planet/train-jpg', image_name))
    ax.imshow(img)
    ax.set_title('{} - {}'.format(image_name, label))

# Hide the grid cells that received no image (the grid has more cells than labels)
for ax in axs[len(images_title):]:
    ax.axis('off')


In [12]:
# Input parameters

# --- Preprocessing / training settings ---
batch_size = 32                 # examples per batch in every dataset
validation_split = 0.2          # fraction of labelled rows held out for validation
img_resize = (128, 128)         # Desired image size
AUTOTUNE = tf.data.AUTOTUNE     # passed to tf.data for parallel-call and prefetch sizing

# --- Labelled training data ---
train_csv_file = '/kaggle/input/planets-dataset/planet/planet/train_classes.csv'
train_jpeg_dir = '/kaggle/input/planets-dataset/planet/planet/train-jpg'

# --- Unlabelled test images (two separate folders) ---
test_jpeg_dir = '/kaggle/input/planets-dataset/planet/planet/test-jpg'
test_additional_jpeg_dir = '/kaggle/input/planets-dataset/test-jpg-additional/test-jpg-additional'

In [13]:
# Load the training labels table
labels_df = pd.read_csv(train_csv_file)

# Collect every distinct tag, then sort so each tag gets a stable position
unique_tags = set()
for tags in labels_df['tags'].values:
    unique_tags.update(tags.split(" "))
labels = sorted(unique_tags)

# Tag name -> position in the target vector
labels_map = dict(zip(labels, range(len(labels))))
num_classes = len(labels_map)

In [14]:
# Function to encode a tag string as a multi-hot target vector
def encode_tags(tags_str):
    """Turn a space-separated tag string into a float32 indicator vector.

    The vector has ``num_classes`` entries; the entry at ``labels_map[tag]`` is
    1.0 for every tag present in ``tags_str`` and 0.0 elsewhere. Several entries
    can be set at once, since one string may carry several tags.

    Raises:
        KeyError: if a tag is not a key of ``labels_map``.
    """
    hot_positions = [labels_map[tag] for tag in tags_str.split(' ')]
    targets = np.zeros(num_classes, dtype='float32')
    targets[hot_positions] = 1.0
    return targets

# Store one target vector per row alongside the original tag string
labels_df['targets'] = labels_df['tags'].apply(encode_tags)

In [15]:
# Hold out `validation_split` of the rows for validation (fixed seed for a repeatable split),
# giving each part a fresh 0..n-1 index in the same step
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(labels_df, test_size=validation_split, random_state=42)
)

In [16]:
# Function to load and preprocess images
def load_and_preprocess_image(path, label):
    """Read a JPEG from ``path`` and return ``(image, label)``.

    The file is decoded to 3 channels, resized to ``img_resize`` and rescaled
    with ``x / 127.5 - 1``. ``label`` is passed through unchanged.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, img_resize)
    # Map 8-bit pixel values onto [-1, 1]
    scaled = (resized / 127.5) - 1.0
    return scaled, label

# Function to create a dataset from the DataFrame
def create_dataset(df, training=True):
    """Build a batched, prefetched ``tf.data.Dataset`` of (image, target) pairs.

    Args:
        df: DataFrame with an ``image_name`` column (file name without the
            ``.jpg`` extension, resolved inside ``train_jpeg_dir``) and a
            ``targets`` column holding one equal-length vector per row.
        training: If True, the examples are shuffled; otherwise they keep the
            row order of ``df``.

    Returns:
        A dataset yielding ``(images, targets)`` batches of up to
        ``batch_size`` examples.
    """
    # Construct full file paths
    image_paths = [os.path.join(train_jpeg_dir, f"{name}.jpg") for name in df['image_name']]
    # Stack the per-row vectors into one (rows, classes) array.
    # Named `targets` so it does not shadow the notebook-level `labels` list.
    targets = np.stack(df['targets'].values)

    dataset = tf.data.Dataset.from_tensor_slices((image_paths, targets))
    if training:
        # Shuffle the lightweight (path, target) pairs *before* decoding. The buffer
        # can then cover the whole split (a uniform shuffle) without holding
        # decoded images in memory, unlike a small buffer applied after the map.
        dataset = dataset.shuffle(buffer_size=len(image_paths))
    # Decode / resize / rescale in parallel
    dataset = dataset.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
    # Batch and prefetch the dataset
    return dataset.batch(batch_size).prefetch(AUTOTUNE)

In [17]:
# Build both input pipelines: training (training=True) and validation (training=False)
train_dataset, val_dataset = (
    create_dataset(split_df, training=is_training)
    for split_df, is_training in ((train_df, True), (val_df, False))
)

In [18]:
# Get test file paths
# os.listdir returns entries in arbitrary order; sort each folder's listing so the
# file order (and so the order of anything computed from it) is the same on every run.
test_files = [os.path.join(test_jpeg_dir, f) for f in sorted(os.listdir(test_jpeg_dir))]
test_files += [
    os.path.join(test_additional_jpeg_dir, f)
    for f in sorted(os.listdir(test_additional_jpeg_dir))
]

# Function to load and preprocess test images
def load_and_preprocess_test_image(path):
    """Read an unlabelled JPEG from ``path`` and return the preprocessed image.

    Delegates to ``load_and_preprocess_image`` so test images always go through
    exactly the same decode / resize / rescale steps as training images.
    """
    # There is no label for test images; the helper just passes it through.
    image, _ = load_and_preprocess_image(path, None)
    return image

# Test pipeline: file paths -> preprocessed images -> batches, in file-list order
test_dataset = (
    tf.data.Dataset.from_tensor_slices(test_files)
    .map(load_and_preprocess_test_image, num_parallel_calls=AUTOTUNE)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)

In [19]:
# Example: Display shapes of batches
# Batch-specific loop names keep this check from overwriting the notebook-level
# `labels` list (the sorted tag names) with a tensor.
for image_batch, label_batch in train_dataset.take(1):
    print(image_batch.shape)  # Expected: (batch_size, img_resize[0], img_resize[1], 3)
    print(label_batch.shape)  # Expected: (batch_size, num_classes)

(32, 128, 128, 3)
(32, 17)


In [21]:
# Display the dataset repr to check its element spec (shape and dtype of the image and label batches)
train_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 128, 128, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 17), dtype=tf.float32, name=None))>