<a href="https://colab.research.google.com/github/abdullateefogundipe/Hamoye/blob/main/Ogundipe_Abdullateef_Stage_D_Tag_along_project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import io 
from google.colab import files

In [2]:
from google.colab import files
data_to_load = files.upload()

Saving train_v2.csv to train_v2.csv


In [3]:
train_label = pd.read_csv(io.BytesIO(data_to_load['train_v2.csv']))
train_label.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [4]:
labels = set()
def splitting_tags(tags):
    '''
    Takes in tags column, splits the tags and store as a set
    '''
    [labels.add(tag) for tag in tags.split()]
    
# Create a copy of train_label
train = train_label.copy()
train['tags'].apply(splitting_tags)
labels = list(labels)
print(labels)

['artisinal_mine', 'primary', 'water', 'haze', 'bare_ground', 'agriculture', 'clear', 'conventional_mine', 'selective_logging', 'cloudy', 'blooming', 'blow_down', 'cultivation', 'road', 'slash_burn', 'partly_cloudy', 'habitation']


In [6]:
##One hot encoding is performed on the labels in train classes 

for tag in labels:
    train[tag] = train['tags'].apply(lambda x: 1 if tag in x.split() else 0)
    
## adding .jpg extension to the column image_name so as to have same name format as the image files
train['image_name'] = train['image_name'].apply(lambda x: '{}.jpg'.format(x))
train.head()

Unnamed: 0,image_name,tags,artisinal_mine,primary,water,haze,bare_ground,agriculture,clear,conventional_mine,selective_logging,cloudy,blooming,blow_down,cultivation,road,slash_burn,partly_cloudy,habitation
0,train_0.jpg,haze primary,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,train_1.jpg,agriculture clear primary water,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2,train_2.jpg,clear primary,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,train_3.jpg,clear primary,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,train_4.jpg,agriculture clear habitation primary road,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1


In [7]:
#importing libraries for training
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dropout, Flatten
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [8]:
# Defining the columns,i.e the labels that were newly added to the train_classes via hot encoding.
columns = list(train.columns[2:])

In [9]:
columns

['artisinal_mine',
 'primary',
 'water',
 'haze',
 'bare_ground',
 'agriculture',
 'clear',
 'conventional_mine',
 'selective_logging',
 'cloudy',
 'blooming',
 'blow_down',
 'cultivation',
 'road',
 'slash_burn',
 'partly_cloudy',
 'habitation']

In [10]:
def fbeta(y_true, y_pred, beta = 2, epsilon = 1e-4):
    '''
    Set y_true and y_pred

    Args:
        y_true: correct target values
        Y_pred: predicted values returned by the classifer
        beta = 2
        epsilon= 1e-4
        
    Returns:
        fbeta score
    '''
    
    beta_squared = beta**2
    
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)
    
    tp = tf.reduce_sum(y_true * y_pred, axis = 1)
    fp = tf.reduce_sum(y_pred, axis = 1) - tp
    fn = tf.reduce_sum(y_true, axis = 1) - tp
    
    precision = tp/(tp+fp+epsilon)
    recall = tp/(tp+fn+epsilon)
    
    fb = (1+beta_squared)*precision*recall / (beta_squared*precision+recall+epsilon)
    return fb

In [11]:
def multi_label_acc(y_true, y_pred, epsilon = 1e-4):
    '''
    Retuns accuracy value for multi_label classification
    
    Set y_true and y_pred

    Args:
        y_true: correct target values
        Y_pred: predicted values returned by the classifer
        epsilon= 1e-4
        
    Returns:
        Accuracy score
    '''
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)
    
    tp = tf.reduce_sum(y_true * y_pred, axis = 1)
    fp = tf.reduce_sum(y_pred, axis = 1) - tp
    fn = tf.reduce_sum(y_true, axis = 1) - tp
    
    y_true = tf.cast(y_true, tf.bool)
    y_pred = tf.cast(y_pred, tf.bool)
        
    tn = tf.reduce_sum(tf.cast(tf.logical_not(y_true), tf.float32)
                       * tf.cast(tf.logical_not(y_pred), tf.float32), axis = 1)
    
    return (tp+tn)/(tp+tn+fp+fn+epsilon)

In [12]:
#defining our model
def build_model():
    model = Sequential()
    model.add(BatchNormalization(input_shape=(128, 128, 3)))
    model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(256, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(256, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(17, activation='sigmoid'))

    opt = Adam(lr=1e-4)
    
    # We need binary here, since categorical_crossentropy l1 norms the output before calculating loss.
    model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=[multi_label_acc, fbeta])

    return model

In [13]:
#modelcheckpoint is set to monitor the model using validation fbeta score and save the best only
save_best_check_point = ModelCheckpoint(filepath = 'best_model.hdf5', 
                                        monitor = 'val_fbeta',
                                        mode = 'max',
                                        save_best_only = True,
                                        save_weights_only = True)

##Getting the dataset from kaggle

In [17]:
data_to_load = files.upload()

Saving kaggle.json to kaggle.json


In [18]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
! mkdir ~/.kaggle

In [20]:
! cp kaggle.json ~/.kaggle/

In [21]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle competitions download <name-of-competition>

In [22]:
!kaggle datasets download -d nikitarom/planets-dataset

Downloading planets-dataset.zip to /content
100% 1.49G/1.50G [00:34<00:00, 64.2MB/s]
100% 1.50G/1.50G [00:34<00:00, 47.2MB/s]


In [25]:
!unzip /content/planets-dataset.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: test-jpg-additional/test-jpg-additional/file_5499.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_55.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_550.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5500.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5501.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5502.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5503.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5504.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5505.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5506.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5507.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5508.jpg  
  inflating: test-jpg-additional/test-jpg-additional/file_5509.jpg  
  inflating: test-jpg-additional/test-jpg

In [26]:
#initializing imagedatagenerator with a validation split of 0.2
train_image_gen = ImageDataGenerator(rescale = 1/255, validation_split = 0.2)

#generating train data generator which is 80% of the train dataset
#note that a generator contains both features and target of the data
train_generator = train_image_gen.flow_from_dataframe(dataframe=train,
                                                directory ="/content/planet/planet/train-jpg",  
                                                x_col="image_name", y_col=columns, subset="training", 
                                                batch_size=16,seed=2021, shuffle=True, 
                                                class_mode="raw", target_size=(128,128))

#generating validation data which is expected to be 20% of the train dataset since validation split is 0.2
val_generator = train_image_gen.flow_from_dataframe(dataframe=train,
                                                directory ="/content/planet/planet/train-jpg",  
                                                x_col="image_name", y_col=columns, subset="validation", 
                                                batch_size=16,seed=2021, shuffle=True, 
                                                class_mode="raw", target_size=(128,128))

Found 32384 validated image filenames.
Found 8095 validated image filenames.


In [27]:
#setting up step size for training and validation image data
step_train_size = int(np.ceil(train_generator.samples / train_generator.batch_size))
step_val_size = int(np.ceil(val_generator.samples / val_generator.batch_size))

In [29]:
#initialize the model
model1 = build_model()

In [30]:
# Preview the model architecture
model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_1 (Batc  (None, 128, 128, 3)      12        
 hNormalization)                                                 
                                                                 
 conv2d_8 (Conv2D)           (None, 128, 128, 32)      896       
                                                                 
 conv2d_9 (Conv2D)           (None, 126, 126, 32)      9248      
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 63, 63, 32)       0         
 2D)                                                             
                                                                 
 dropout_5 (Dropout)         (None, 63, 63, 32)        0         
                                                                 
 conv2d_10 (Conv2D)          (None, 63, 63, 64)       

In [31]:
#fitting our model using the parameters already defined 
model1.fit(x = train_generator, 
           steps_per_epoch = step_train_size, 
           validation_data = val_generator, 
           validation_steps = step_val_size,epochs = 25, 
           callbacks = [save_best_check_point])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fe2b066b5d0>

In [32]:
#initializing a second model to make predictions
model2 = build_model()

In [34]:
#loading in the weights of the trained model
model2.load_weights('best_model.hdf5')

In [None]:
##adding .jpg extension to image name in the sample submission file
sample_submission = pd.read_csv('/content/planet/planet/sample_submission.csv')
sample_submission1 = sample_submission.copy()
sample_submission1['image_name'] = sample_submission1['image_name'].apply(lambda x: '{}.jpg'.format(x))
sample_submission1.head()

In [None]:
# Divide the sample submission file into two splits,
# first test_df which contains the first 40669 images 
test_df = sample_submission1.iloc[:40669]['image_name'].reset_index().drop('index', axis =1)
test_df.head()

In [36]:
#initialize imagedatagenerator for the test images and also rescaling
test_image_gen = ImageDataGenerator(rescale = 1/255)


#creating a generator for the images found in the first test image files
test_generator = test_image_gen.flow_from_dataframe(dataframe=test_df, 
                                                directory="/content/planet/planet/test-jpg", 
                                                x_col="image_name", 
                                                y_col=None, 
                                                batch_size=16, 
                                                shuffle=False, 
                                                class_mode=None, 
                                                target_size=(128,128))

step_test_size = int(np.ceil(test_generator.samples/test_generator.batch_size))

Found 40669 validated image filenames.


In [37]:
#first, we reset the test generator to avoid shuffling of index as we want it to be orderly
test_generator.reset()
pred = model2.predict(test_generator, steps = step_test_size, verbose = 1)



In [38]:
#this is to get the filenames in the generator using the attribute .filenames
file_names = test_generator.filenames

#convert the predicted values to a dataframe and join two labels together if the probability of occurrance 
#of the label is greater than 0.5 
pred_tags = pd.DataFrame(pred)
pred_tags = pred_tags.apply(lambda x: ' '.join(np.array(labels)[x>0.5]), axis = 1)

#then the result should look like this 
result1 = pd.DataFrame({'image_name': file_names, 'tags': pred_tags1})
result1.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear
1,test_1.jpg,primary clear
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,primary clear
4,test_4.jpg,primary partly_cloudy


In [None]:
#additional test dataset
add_test_df = sample_submission.iloc[40669:]['image_name'].reset_index().drop('index', axis =1)
add_test_df.head()

In [41]:
#creating a generator for the additional test image files
add_test_generator = test_image_gen.flow_from_dataframe(dataframe= add_test_df, 
                                                directory="/content/test-jpg-additional/test-jpg-additional", 
                                                x_col="image_name", 
                                                y_col=None, 
                                                batch_size=16, 
                                                shuffle=False, 
                                                class_mode=None, 
                                                target_size=(128,128))

step_test_size2 = int(np.ceil(add_test_generator.samples/add_test_generator.batch_size))

Found 20522 validated image filenames.


In [42]:
#we reset the generator to avoid shuffling, then make prediction on the generator
add_test_generator.reset()
add_pred = model2.predict(add_test_generator, steps = step_test_size2, verbose = 1)



In [43]:
#this is to get the filenames in the generator using the attribute .filenames
file_names = test_generator.filenames

#convert the predicted values to a dataframe and join two labels together if the probability of occurrance 
#of the label is greater than 0.5
add_pred_tags = pd.DataFrame(add_pred)
add_pred_tags = add_pred_tags.apply(lambda x: ''.join(np.array(labels)[x>0.5]), axis = 1)

#then the result should look like this
result2 = pd.DataFrame({'image_name': file_names, 'tags': add_pred_tags})
result2.head()

Unnamed: 0,image_name,tags
0,file_0.jpg,primaryclear
1,file_1.jpg,primaryagriculturepartly_cloudy
2,file_10.jpg,primarywateragricultureroad
3,file_100.jpg,primarywaterclear
4,file_1000.jpg,primaryclear


In [44]:
#for the final result of the predicted tags for the test images,
# we need to concat the first and second results in 
#that order to avoid shuffling the index
last_result = pd.concat([result1, result2])

last_result = last_result.reset_index().drop('index', axis =1)

print(last_result.shape)
#print the final result
last_result.head()

(61191, 2)


Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear
1,test_1.jpg,primary clear
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,primary clear
4,test_4.jpg,primary partly_cloudy


In [45]:
#we need to remove the .jpg extension from the image_name of the
# last_result because from the sample submission the 
#extension was not there, we added it for easy manipulation of the data.
last_result['image_name'] = last_result['image_name'].apply(lambda x: x[:-4])
last_result.head()

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear
2,test_2,primary partly_cloudy
3,test_3,primary clear
4,test_4,primary partly_cloudy


In [46]:
# Finally, we save the result to a csv file using the .to_csv() 
# method and setting the index to false.
last_result.to_csv('submission1.csv', index = False)