In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import shutil
from PIL import Image
from matplotlib.image import imread
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping

# Root folder of the Kaggle dataset; listing it shows what the dataset ships with.
path = '../input/planets-dataset/'
os.listdir(path)

In [None]:
# Locations of the label CSV files and the image folders inside the dataset.
# CSV files:
train_path = "../input/planets-dataset/planet/planet/train_classes.csv"
test_path = "../input/planets-dataset/planet/planet/sample_submission.csv"
# Image folders:
train_images = "../input/planets-dataset/planet/planet/train-jpg"
test_images = "../input/planets-dataset/planet/planet/test-jpg"

In [None]:
# Load the training labels from the CSV location configured in `train_path`
# instead of repeating a second hard-coded copy of the same path.
train_df = pd.read_csv(train_path)
print(train_df.shape)
train_df.head()

In [None]:
# Load the sample submission (the list of test image names) from the CSV
# location configured in `test_path` rather than a duplicated literal path.
test_df = pd.read_csv(test_path)
print(test_df.shape)
test_df.head()

In [None]:
# Preview the first nine training images in a 3x3 grid
folder = train_images
fig, axes = plt.subplots(3, 3, figsize=(20, 20))
for i, ax in enumerate(axes.flat):
    # image files are named train_<i>.jpg inside the training image folder
    filename = folder + "/" + 'train_' + str(i) + '.jpg'
    # read the pixels and draw them on this grid cell
    image = imread(filename)
    ax.imshow(image)
plt.show()

In [None]:
# One row per image, so the row count is the number of training images
print(f'Number of images: {len(train_df)}')

In [None]:
# Number of distinct values in the `tags` column. Each value is a whole
# space-separated tag string, so this counts unique tag *combinations*,
# not individual labels.
train_df['tags'].nunique(dropna=True)

In [None]:
# Flatten every image's space-separated tag string into one long list of tags
tags = [tag for row in train_df['tags'] for tag in row.split(' ')]
# Frequency of each individual tag, most common first
tag_counts = pd.Series(tags).value_counts()

# Bar chart of how often each tag occurs
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(tag_counts.index, tag_counts.values, alpha=0.8)
ax.set_title('Tag counts')
ax.set_ylabel('Number of occurrences', fontsize=12)
ax.set_xlabel('Tags', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Accumulator for every distinct tag seen in the training labels
labels = set()


def splitting_tags(tags):
    '''
    Split one space-separated tag string and add each tag to the
    module-level `labels` set.

    Parameters
    ----------
    tags : str
        Whitespace-separated tags of a single image.

    Returns
    -------
    None
        The function works purely through its side effect on `labels`.
    '''
    labels.update(tags.split())


# Create a copy of `train_df`
train_df1 = train_df.copy()
train_df1['tags'].apply(splitting_tags)
# Sort so the label order (and therefore the one-hot column order and the
# meaning of each model output) is the same in every kernel session; the
# iteration order of a set of strings can change between interpreter runs.
labels = sorted(labels)
print(labels)

In [None]:
# One-hot encode the labels: add one 0/1 column per label that says whether
# the image's tag string contains that label
for tag in labels:
    train_df1[tag] = train_df1['tags'].apply(lambda x: int(tag in x.split()))

# Append the .jpg extension to `image_name` so the values match the names of
# the image files on disk
train_df1['image_name'] = train_df1['image_name'].map('{}.jpg'.format)
train_df1.head()

In [None]:
# Target columns: everything after the first two columns of `train_df1`
# (`image_name` and `tags`), i.e. the one-hot label columns
columns = train_df1.columns[2:].tolist()
columns

In [None]:
# Training images: rescale to [0, 1] and apply random augmentation
train_datagen = ImageDataGenerator(
    rescale = 1./255.,
    validation_split = 0.2,
    rotation_range=40,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation images: rescale only. Augmenting the validation subset would
# make validation metrics noisy and not comparable to test-time behaviour.
# The same `validation_split` is used so that both generators cut the
# dataframe at the same position and the two subsets do not overlap.
val_datagen = ImageDataGenerator(
    rescale = 1./255.,
    validation_split = 0.2
)

# Training generator (the 80% 'training' subset), shuffled every epoch
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df1,
    directory =train_images,
    x_col='image_name',
    y_col=columns,
    subset='training',
    batch_size=64,
    seed=42,
    shuffle=True,
    class_mode='raw',
    target_size=(256,256)
)

# Validation generator (the 20% 'validation' subset); no shuffling is needed
# because the data is only evaluated, never trained on
val_generator = val_datagen.flow_from_dataframe(
    dataframe=train_df1,
    directory =train_images,
    x_col='image_name',
    y_col=columns,
    subset='validation',
    batch_size=32,
    seed=42,
    shuffle=False,
    class_mode='raw',
    target_size=(256,256)
)

In [None]:
# Batches needed to see every sample once (ceiling division), for the
# training and validation generators respectively
step_train_size = -(-train_generator.samples // train_generator.batch_size)
step_val_size = -(-val_generator.samples // val_generator.batch_size)
print(step_train_size , step_val_size)

In [None]:
# Baseline CNN for multi-label classification of 256x256 RGB images
model = Sequential([
    # Convolutional feature extractor: two conv/conv/pool stages
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256, 3)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(512, (3,3), activation='relu'),
    # Flatten the feature maps into a vector for the classifier
    layers.Flatten(),
    # One independent probability per label: an image can carry several tags
    # at once, so sigmoid is used instead of softmax (softmax would force
    # the outputs to compete and sum to 1)
    layers.Dense(len(columns), activation='sigmoid'),
])

In [None]:
# Compile the model for multi-label training.
# Targets are multi-hot vectors, so each output is an independent yes/no
# decision: binary cross-entropy is the matching loss, whereas categorical
# cross-entropy assumes exactly one active class per sample.
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=[
        # Logged as 'accuracy' / 'val_accuracy'; an explicit BinaryAccuracy
        # avoids relying on how the 'accuracy' shortcut is resolved
        keras.metrics.BinaryAccuracy(name='accuracy'),
        # Thresholded, micro-averaged F-beta gives a single scalar per epoch.
        # NOTE(review): beta=2 assumes the target metric is F2 - confirm.
        keras.metrics.FBetaScore(average='micro', beta=2.0, threshold=0.5, name='f2_score'),
    ]
)

In [None]:
# Early stopping on validation accuracy: stop once it has not improved for
# 3 consecutive epochs and roll back to the best weights seen
call_backs = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Fit the baseline model.
# `call_backs` (early stopping) has to be passed explicitly, otherwise it is
# never used; `validation_steps` bounds each validation pass to one sweep
# over the validation generator.
history = model.fit(
    x = train_generator,
    validation_data = val_generator,
    steps_per_epoch = step_train_size,
    validation_steps = step_val_size,
    epochs = 10,
    callbacks = [call_backs],
    verbose = 1
)

In [None]:
# Import from tensorflow.keras so the base model comes from the same Keras
# implementation as `layers` / `Sequential` used in the rest of the notebook
from tensorflow.keras.applications import ResNet50

# Pre-trained ResNet50 backbone without its ImageNet classification head.
# - the keyword is `weights` (plural)
# - `input_shape` must include the channel dimension
# NOTE(review): the ImageNet weights were trained with
# keras.applications.resnet50.preprocess_input, while the generators here
# rescale by 1/255 - confirm which preprocessing is intended.
new_model = ResNet50(
    weights = "imagenet",
    include_top = False,
    input_shape = (256, 256, 3)
)

# Freeze the backbone so only the new classification head is trained
new_model.trainable = False

In [None]:
# Transfer-learning model: frozen ResNet50 backbone + small classifier head.
# With a 256x256 input the backbone already reduces the image to a small
# 8x8 feature map, so stacking further valid-padded 3x3 convolutions and
# poolings shrinks it below the kernel size and the model cannot be built.
# Global average pooling collapses the map to one vector per image instead.
final_model = Sequential([
    new_model,
    layers.GlobalAveragePooling2D(),
    # Fully connected head
    layers.Dense(256, activation='relu'),
    # One independent sigmoid probability per label (multi-label output)
    layers.Dense(len(columns), activation='sigmoid'),
])

# The model must be compiled before it can be fitted; multi-hot targets call
# for binary cross-entropy
final_model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=[
        # Logged as 'accuracy' / 'val_accuracy'
        keras.metrics.BinaryAccuracy(name='accuracy'),
        # NOTE(review): beta=2 assumes the target metric is F2 - confirm.
        keras.metrics.FBetaScore(average='micro', beta=2.0, threshold=0.5, name='f2_score'),
    ]
)

In [None]:
# Fit the transfer-learning model. This overwrites `history`, so later plots
# show this run. Early stopping is passed explicitly and validation is
# bounded to one sweep over the validation generator.
history = final_model.fit(
    x = train_generator,
    validation_data = val_generator,
    steps_per_epoch = step_train_size,
    validation_steps = step_val_size,
    epochs = 10,
    callbacks = [call_backs],
    verbose = 1
)

In [None]:
# Read the sample submission and build a copy whose `image_name` values carry
# the .jpg extension, so they match the test image file names
sample_submission = pd.read_csv('../input/planets-dataset/planet/planet/sample_submission.csv')
sample_submission1 = sample_submission.assign(
    image_name=sample_submission['image_name'].map('{}.jpg'.format)
)
sample_submission1.head()

In [None]:
# Training curves of the most recent `history`. Accuracy and loss get their
# own axes: drawn on a single axes they overlap and the second title/legend
# overwrite the first.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: accuracy per epoch
ax_acc.plot(history.history['accuracy'])
ax_acc.plot(history.history['val_accuracy'])
ax_acc.set_title('Model Accuracy')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.legend(['training', 'validation'], loc='upper left')

# Right panel: loss per epoch
ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('Model Loss')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(['training', 'validation'], loc='upper left')

plt.show()

In [None]:
# Divide the sample submission file into two splits:
# test_df1 holds the image names of the first 40669 rows, re-indexed from 0
test_df1 = sample_submission1.iloc[:40669][['image_name']].reset_index(drop=True)
test_df1.head()

In [None]:
# Test images only get the 1/255 rescaling; no augmentation at inference time
test_datagen = ImageDataGenerator(rescale = 1/255)

# Generator over the first batch of test images.
# The test set has no labels, so `class_mode=None` makes the generator yield
# images only; 'categorical' requires a label column and cannot be combined
# with `y_col=None`. `shuffle=False` keeps predictions in filename order.
test_gen = test_datagen.flow_from_dataframe(dataframe=test_df1,
                                            directory='/kaggle/input/planets-dataset/planet/planet/test-jpg/',
                                            x_col="image_name",
                                            y_col=None,
                                            batch_size=32,
                                            seed=42,
                                            shuffle=False,
                                            class_mode=None,
                                            target_size=(256,256))

# Number of batches needed to cover every test image once
step_test_size1 = int(np.ceil(test_gen.samples/test_gen.batch_size))

In [None]:
# Rewind the generator to its first batch, then predict over the whole
# first test split
test_gen.reset()
pred = final_model.predict(
    test_gen,
    steps=step_test_size1,
    verbose=1,
)

In [None]:
# File names in the order the generator yielded them
file_names = test_gen.filenames

# For every prediction row keep the labels whose probability exceeds 0.5 and
# join them into one space-separated tag string
label_array = np.array(labels)
pred_tags = pd.Series(
    [' '.join(label_array[row > 0.5]) for row in pd.DataFrame(pred).to_numpy()],
    index=pd.RangeIndex(len(pred)),
    dtype=object,
)

# Pair each file name with its predicted tag string
result1 = pd.DataFrame({'image_name': file_names, 'tags': pred_tags})
result1.head()

In [None]:
# Second split of the test set: every row from position 40669 onwards,
# re-indexed from 0
additional_df = sample_submission1.iloc[40669:][['image_name']].reset_index(drop=True)
additional_df.head()

In [None]:
# Generator over the second batch of test images (the "additional" folder);
# unlabeled and unshuffled so predictions stay in filename order
test_gen1 = test_datagen.flow_from_dataframe(
    dataframe=additional_df,
    directory='../input/planets-dataset/test-jpg-additional/test-jpg-additional',
    x_col='image_name',
    y_col=None,
    class_mode=None,
    target_size=(256,256),
    batch_size=500,
    shuffle=False,
)

# Number of batches needed to cover every image once (ceiling division)
step_test_size2 = -(-test_gen1.samples // test_gen1.batch_size)

In [None]:
# reset() rewinds the generator to its first batch so the predictions line up
# with `test_gen1.filenames`. Predict with `final_model`, the same model used
# for the first test split (`model1` is not defined anywhere in the notebook).
test_gen1.reset()
pred1 = final_model.predict(test_gen1, steps = step_test_size2, verbose = 1)

In [None]:
# File names in the order the generator yielded them
file_names1 = test_gen1.filenames

# Convert the predicted values to a dataframe and, per image, keep the labels
# whose probability exceeds 0.5. The labels are joined with a single space
# (not an empty string) so the tags stay separable, exactly as for `result1`.
pred_tags1 = pd.DataFrame(pred1)
pred_tags1 = pred_tags1.apply(lambda x: ' '.join(np.array(labels)[x>0.5]), axis = 1)

result2 = pd.DataFrame({'image_name': file_names1, 'tags': pred_tags1})
result2.head()

In [None]:
# Stack the two partial results, first split first, and renumber the rows
# 0..n-1 so the combined frame has one continuous index
final_df = pd.concat([result1, result2], ignore_index=True)

print(final_df.shape)
final_df.head()

In [None]:
# Drop the last four characters (the .jpg extension) from every image name
final_df['image_name'] = final_df['image_name'].map(lambda name: name[:-4])
final_df.head()

In [None]:
# Write the predictions to disk as the submission file; the row index is
# left out so the CSV only contains the dataframe's own columns
final_df.to_csv(path_or_buf='submission2.csv', index=False)