In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read-only Kaggle input folder that holds the competition data.
root_dir = "/kaggle/input/airbus-ship-detection/"
# List the top-level entries so the folder and CSV names used below can be checked.
root_entries = os.listdir(root_dir)
print(root_entries)

In [None]:
# Image folders inside the competition data.
train_dir = os.path.join(root_dir, "train_v2")
test_dir = os.path.join(root_dir, "test_v2")

# Count directory entries; nothing is filtered by extension at this point.
n_train_entries = len(os.listdir(train_dir))
n_test_entries = len(os.listdir(test_dir))
print(f"Files in train dir: {n_train_entries}\nFiles in test dir: {n_test_entries}")

In [None]:
def _names_without_jpg_suffix(directory):
    """Return the entries of `directory` whose name does not end with '.jpg' (case-sensitive)."""
    return [name for name in os.listdir(directory) if not name.endswith('.jpg')]


# Sanity check: both image folders are expected to contain nothing but .jpg files.
not_jpg_train_files = _names_without_jpg_suffix(train_dir)
not_jpg_test_files = _names_without_jpg_suffix(test_dir)

n_not_jpg_train = len(not_jpg_train_files)
n_not_jpg_test = len(not_jpg_test_files)
print(f"""Checking if in train and test are only photos:
Not jpg files in train: {n_not_jpg_train}
Not jpg files in test: {n_not_jpg_test}""")

In [None]:
# Load the RLE-encoded annotations, indexed by image file name.
train_csv_path = os.path.join(root_dir, "train_ship_segmentations_v2.csv")
train_df = pd.read_csv(train_csv_path, index_col='ImageId')
print(f"Shape with duplicates:{train_df.shape}")

# Keep only the first row seen for every ImageId.
# NOTE(review): if the CSV stores one row per ship, this discards the masks of all
# additional ships in multi-ship images, so the statistics below only see the first
# annotation of each image -- confirm this is intended.
repeated_image_id = train_df.index.duplicated(keep='first')
train_df = train_df.loc[~repeated_image_id]
print(f"Shape without duplicates:{train_df.shape} (the same amount we have in train dir)")
train_df.head()

As you can see, images with ships have non-NaN `EncodedPixels` values, while images without ships have NaN.

In [None]:
#Decoder for Run Length Encodings
def _is_missing_rle(value):
    """Return True for the placeholders that stand for "no annotation" (None or a float such as NaN)."""
    return value is None or isinstance(value, (float, np.floating))


def _rle_to_mask(rle_string, shape):
    """Decode one 'start length start length ...' string into a uint8 mask of `shape`."""
    tokens = rle_string.split()
    # Even-positioned tokens are 1-based start pixels, odd-positioned tokens are run lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1
    lengths = np.asarray(tokens[1::2], dtype=int)
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, starts + lengths):
        flat[lo:hi] = 1
    # Pixels are numbered down each column first, so unflatten in column-major
    # (Fortran) order. For square shapes this equals `reshape(shape).T`, and it also
    # yields the requested (height, width) for non-square shapes.
    return flat.reshape(shape, order='F')


def rle_decode(mask_rle, shape=(768, 768)):
    '''
    Decode run-length encoded mask(s) into a binary image.

    Reference: https://www.kaggle.com/code/inversion/run-length-decoding-quick-start

    mask_rle: one of
        - a string formatted as "start length start length ..." (1-based starts),
        - an iterable (list, tuple, array, Series) of such strings, whose masks are
          merged; None/NaN entries inside the iterable are skipped,
        - None, a float (e.g. NaN) or 0, meaning "no mask".
    shape: (height, width) of the array to return
    Returns a uint8 numpy array of that shape, 1 - mask, 0 - background.
    Overlapping runs or masks stay 1 (they are not summed).
    '''
    if isinstance(mask_rle, str):
        return _rle_to_mask(mask_rle, shape)

    combined = np.zeros(shape, dtype=np.uint8)
    if _is_missing_rle(mask_rle):
        return combined
    # Guard with np.isscalar so array-like inputs never hit an ambiguous `== 0` truth test.
    if np.isscalar(mask_rle) and mask_rle == 0:
        return combined

    for single_rle in mask_rle:
        if _is_missing_rle(single_rle):
            continue
        # Element-wise maximum is a logical OR for 0/1 masks, keeping the result binary.
        np.maximum(combined, _rle_to_mask(single_rle, shape), out=combined)
    return combined
        
        
    

The dataset contains some corrupted images, which we will discard during training.

In [None]:
def _file_size_kb(c_img_id):
    """Size on disk of one training image, in units of 1024 bytes."""
    return os.stat(os.path.join(train_dir, c_img_id)).st_size / 1024


# Record each image's file size next to its annotation.
train_df['file_size_kb'] = train_df.index.map(_file_size_kb)

# Files below 50 KB are counted as corrupted (threshold is a heuristic chosen by the author).
is_below_threshold = train_df['file_size_kb'] < 50
print(f"amount of corrupted images: {int(is_below_threshold.sum())}")

In [None]:
#Next I will show small samples with ships and its corresponding RLE target mask so for this I prep df with only ships
# Filter on the mask column only, so NaNs in any other column cannot drop ship rows,
# and take an explicit copy so that adding columns to `temp` later does not trigger
# pandas' SettingWithCopyWarning or depend on view-vs-copy semantics.
temp = train_df.dropna(subset=['EncodedPixels']).copy()
temp.shape[0]#actual number of photos with ships

In [None]:
import cv2

#Show sample of images with target mask 
random_row = np.random.choice(temp.index, size=5)
rle = temp.loc[random_row,'EncodedPixels']
fig, axarr = plt.subplots(1, 5, figsize=(20, 45))
for i, m in enumerate(rle):
    path = os.path.join(train_dir, random_row[i])
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None when a file cannot be read or decoded;
        # label the panel instead of crashing the whole cell.
        axarr[i].set_title(f"unreadable: {random_row[i]}")
        axarr[i].axis("off")
        continue
    mask = rle_decode(m)
    # OpenCV loads channels as BGR while matplotlib expects RGB; convert so the
    # colours are not swapped in the preview.
    axarr[i].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    axarr[i].imshow(mask, alpha=0.5)
plt.show()



In [None]:
import seaborn as sns

# Turn the mask column into a 0/1 flag: 1 when the image has an RLE string, 0 when it was NaN.
trgt01 = train_df.fillna(0)
trgt01['EncodedPixels'] = (trgt01['EncodedPixels'] !=0).astype("uint8")

# Build the counts in a fixed [no ship, ship] order. value_counts() sorts by frequency,
# so relying on its order would silently swap the bar labels (and the ratio printed
# below) whenever images with ships happened to be the larger class.
n_ship_images = int(trgt01['EncodedPixels'].sum())
value = [len(trgt01) - n_ship_images, n_ship_images]

sns.barplot(x =  ["Non Ships", "Ships"], y = value)
print(f"As we can see images with no ships are a majority class (ratio {value[0]/np.sum(value)})")

In [None]:
def _mask_pixel_count(encoded_pixels):
    """Number of foreground pixels in the mask decoded from one RLE entry."""
    return np.sum(rle_decode(encoded_pixels))


# Ship size = area (in pixels) of the decoded mask stored in each row.
temp["size_of_ship"] = temp['EncodedPixels'].apply(_mask_pixel_count)

In [None]:
#biggest ships
# Every row whose mask area equals the maximum is shown (ties included).
largest_ship_size = temp["size_of_ship"].max()
temp.loc[temp["size_of_ship"] == largest_ship_size]

In [None]:
# RLE string(s) of the row(s) with the largest mask area.
rle = temp.loc[temp["size_of_ship"] == temp["size_of_ship"].max(), 'EncodedPixels']
# The Series is indexed by ImageId strings, so take the first entry by position with
# .iloc; `rle[0]` relied on the deprecated integer-key positional fallback.
all_mask = rle_decode(rle.iloc[0])
plt.imshow(all_mask) #Show the biggest ship mask

In [None]:
# cv2.imread returns pixels in BGR order; convert to RGB so matplotlib shows true colours.
plt.imshow(cv2.cvtColor(cv2.imread(os.path.join(train_dir,'a129c36b3.jpg')), cv2.COLOR_BGR2RGB))

**The dataset also contains replicated images**

In [None]:
# cv2.imread returns pixels in BGR order; convert to RGB so matplotlib shows true colours.
plt.imshow(cv2.cvtColor(cv2.imread(os.path.join(train_dir,'eba27cc8a.jpg')), cv2.COLOR_BGR2RGB))

Let's look at the distribution of ship sizes.

In [None]:
# Distribution of mask areas (in pixels) across the annotated images, in 15 bins.
sns.histplot(data=temp, x='size_of_ship', bins=15)

**The histogram has a long tail: there are many small ships compared with the few ships close to the maximum size**

The dataset has a class imbalance, so let's see how many pixels actually belong to the positive class.

In [None]:
# Each image is decoded at 768 x 768 pixels (the default mask shape used above).
pixels_per_image = 768 * 768

# Foreground pixels summed over the rows of `temp`, versus every pixel in those images.
postive_class_pixels = temp["size_of_ship"].sum()
all_pixels = len(temp) * pixels_per_image
positive_share_percent = postive_class_pixels/all_pixels * 100
print(f"""pixels percentage of positive class:{positive_share_percent} | calculated only for images with ships
So when we have all ~190000 images it would be even smaller""")

Not so many :)

As the EDA shows, the dataset is heavily imbalanced: the positive-class masks are very small, and some images are replicated, corrupted, or badly labeled (not every ship has a corresponding pixel area).
It is therefore worth experimenting with discarding part of the images that contain only the negative class (no ship) when training the segmentation model, and with building a simple binary classifier that detects whether an image contains a ship at all. That classifier would filter out ship-free images, so the segmentation model has to process fewer background-only photos.
