In [63]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [64]:
# Read dataframe
full_df = pd.read_csv("./dataset/train_ship_segmentations_v2.csv")
full_df.head()

Unnamed: 0,ImageId,EncodedPixels
0,00003e153.jpg,
1,0001124c7.jpg,
2,000155de5.jpg,264661 17 265429 33 266197 33 266965 33 267733...
3,000194a2d.jpg,360486 1 361252 4 362019 5 362785 8 363552 10 ...
4,000194a2d.jpg,51834 9 52602 9 53370 9 54138 9 54906 9 55674 ...


In [65]:
# Count amount of ships on each single image
full_df["ships"] = full_df["EncodedPixels"].map(lambda c_row: 1 if isinstance(c_row, str) else 0)
grouped_full_df = full_df.groupby("ImageId").agg({"ships": "sum"}).reset_index()
grouped_full_df["has_ship"] = grouped_full_df["ships"].map(lambda x: 1.0 if x>0 else 0.0)
grouped_full_df["has_ship_vec"] = grouped_full_df["has_ship"].map(lambda x: [x])

grouped_full_df.head()

Unnamed: 0,ImageId,ships,has_ship,has_ship_vec
0,00003e153.jpg,0,0.0,[0.0]
1,0001124c7.jpg,0,0.0,[0.0]
2,000155de5.jpg,1,1.0,[1.0]
3,000194a2d.jpg,5,1.0,[1.0]
4,0001b1832.jpg,0,0.0,[0.0]


In [66]:
# Balance unbalanced df
# Sample images using amount of ships on it, take all if there are fewer images than SAMPLES_PER_GROUP (min(images_amount, SAMPLES_PER_GROUP))
SAMPLES_PER_GROUP = 3000
balanced_full_df = grouped_full_df.groupby("ships").apply(lambda x: x.sample(SAMPLES_PER_GROUP) if len(x) > SAMPLES_PER_GROUP else x)

balanced_full_df.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,ImageId,ships,has_ship,has_ship_vec
ships,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,181778,f1c202ded.jpg,3,1.0,[1.0]
3,81525,6c4978481.jpg,3,1.0,[1.0]
2,20857,1bbde0436.jpg,2,1.0,[1.0]
0,159060,d3a94e9b2.jpg,0,0.0,[0.0]
0,79043,69038bc40.jpg,0,0.0,[0.0]
0,144094,bf97306b5.jpg,0,0.0,[0.0]
6,90716,787b491df.jpg,6,1.0,[1.0]
1,92687,7b2222397.jpg,1,1.0,[1.0]
3,87228,73d86f282.jpg,3,1.0,[1.0]
10,22623,1e0b06edb.jpg,10,1.0,[1.0]


In [67]:
# Split into train and validation parts
train_ids, val_ids = train_test_split(balanced_full_df,
                                        test_size = 0.25,
                                        stratify = balanced_full_df["ships"])
train_df = pd.merge(full_df, train_ids.drop(columns=["ships"]))[["ImageId", "EncodedPixels"]]
val_df = pd.merge(full_df, val_ids.drop(columns=["ships"]))[["ImageId", "EncodedPixels"]]

print(f"training images: {train_df.shape[0]} validation images: {val_df.shape[0]}")

training images: 38455 validation images: 12816


In [68]:
train_df.to_csv("train_df.csv", index=False)
val_df.to_csv("val_df.csv", index=False)