In [24]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [10]:
df = pd.read_csv('datasets/train.csv')
df

Unnamed: 0,Image,Id
0,00022e1a.jpg,w_e15442c
1,000466c4.jpg,w_1287fbc
2,00087b01.jpg,w_da2efe0
3,001296d5.jpg,w_19e5482
4,0014cfdf.jpg,w_f22f3e3
...,...,...
9845,ffe5c306.jpg,w_2ceab05
9846,ffeaa7a4.jpg,w_b067417
9847,ffecec63.jpg,w_8b56cb1
9848,fff04277.jpg,w_2dcbf82


In [19]:
ID_MAPPER = {whale_id: i for i, whale_id in enumerate(df['Id'].sort_values().drop_duplicates().to_list())}
df['whale_id_index'] = df['Id'].apply(lambda whale_id: ID_MAPPER[whale_id])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Image,Id,whale_id_index
1,000466c4.jpg,w_1287fbc,17
2,00087b01.jpg,w_da2efe0,269
7,0031c258.jpg,new_whale,0
8,0035632e.jpg,w_3d0bc7a,68
13,00467ae9.jpg,w_fd1cb9d,307
...,...,...,...
9836,ffa78ccc.jpg,w_89d9c03,161
9837,ffb71ac2.jpg,new_whale,0
9840,ffc0b437.jpg,w_ace8c54,208
9844,ffd1e7aa.jpg,new_whale,0


From EDA, we saw this dataset is unbalance. We can't split data to train set and validation set with balanced whale ID if dataset contains a lot of small amount of image of any whale ID.

To fix this problem, we have to discard some of whale ID that have small amount of image (below 4). With this method, we will keep balance of whale ID on both set with validation ratio = 20% .

In [20]:
counts = df['Id'].value_counts()

non_single_id = counts[counts > 4].index
df = df[df['Id'].isin(non_single_id)]

In [21]:
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['Id'], random_state=42)

In [22]:
len(train_df), len(val_df)

(2654, 664)

In [23]:
train_df.set_index('Image').to_csv('datasets/train_sets.csv')
val_df.set_index('Image').to_csv('datasets/val_sets.csv')