# Data preprocessing
## Import libraries and configuration

In [None]:
import sys
import os
import importlib
import requests
import io
import urllib.parse
from types import SimpleNamespace
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import imagesize
import cv2


In [None]:
# Remote tar archive that is downloaded and extracted into IMAGE_DIR.
SOURCE_URL = 'https://storage.googleapis.com/dm-turtle-recall/images.tar'
# Base URL that the CSV file names are joined onto in read_csv_from_web.
BASE_URL = 'https://storage.googleapis.com/dm-turtle-recall/'
# Local directory where the downloaded CSV files and the fold files are written.
DATA_PATH = '../input'
# Local directory the image archive is extracted into.
IMAGE_DIR = '../input/images'
# Temporary location of the downloaded archive (removed after extraction).
TAR_PATH = os.path.join(IMAGE_DIR, os.path.basename(SOURCE_URL))
# Number of entries expected in IMAGE_DIR; the download is skipped when it matches.
EXPECTED_IMAGE_COUNT = 13891

## Download data

In [None]:

# Make sure the image directory exists before listing it.
%sx mkdir --parents "{IMAGE_DIR}"
# Download and unpack the archive only when the directory does not already
# hold the expected number of entries.
# NOTE(review): preprocess_test_images later adds *_OLD.JPG backups to this
# directory, so on a re-run the count no longer matches and the archive is
# downloaded again - confirm whether that is intended.
if len(os.listdir(IMAGE_DIR)) != EXPECTED_IMAGE_COUNT:
  # NOTE: --no-check-certificate disables TLS certificate verification.
  %sx wget --no-check-certificate -O "{TAR_PATH}" "{SOURCE_URL}"
  %sx tar --extract --file="{TAR_PATH}" --directory="{IMAGE_DIR}"
  # The archive is no longer needed once its content has been extracted.
  %sx rm "{TAR_PATH}"

print(f'The total number of images is: {len(os.listdir(IMAGE_DIR))}')

The total number of images is: 13891


In [None]:

def read_csv_from_web(file_name):
    """Download a CSV file from BASE_URL, save it under DATA_PATH and return it.

    For ``train.csv`` and ``test.csv`` the ``image_location`` column is
    lower-cased and then checked to contain exactly the values ``left``,
    ``right`` and ``top``.

    Args:
        file_name: Name of the CSV file, resolved relative to ``BASE_URL``.

    Returns:
        The parsed ``pandas.DataFrame`` (also written to
        ``DATA_PATH/file_name`` without the index).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If ``image_location`` holds an unexpected set of values.
    """
    url = urllib.parse.urljoin(BASE_URL, file_name)
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of parsing an error page as CSV.
    response.raise_for_status()
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    if file_name in ('train.csv', 'test.csv'):
        df['image_location'] = df['image_location'].str.lower()
        found = set(df['image_location'].unique())
        expected = {'left', 'right', 'top'}
        # Explicit check rather than `assert`, which is stripped under -O.
        if found != expected:
            raise ValueError(
                f"Unexpected image_location values in {file_name}: {found!r}"
            )
    csv_path = os.path.join(DATA_PATH, file_name)
    df.to_csv(csv_path, index=False)
    return df

# Fetch the four metadata tables in one pass; each is also saved under DATA_PATH.
train, test, extra, sample_submission = (
    read_csv_from_web(csv_name)
    for csv_name in (
        'train.csv',
        'test.csv',
        'extra_images.csv',
        'sample_submission.csv',
    )
)



In [None]:
# Display the training metadata as a quick visual check.
train

Unnamed: 0,image_id,image_location,turtle_id
0,ID_2RK4WLN8,top,t_id_VP2NW7aV
1,ID_VVW0QXLX,left,t_id_qZ0iZYsC
2,ID_RVATH2HZ,right,t_id_3b65X5Lw
3,ID_2GB90GPS,left,t_id_YjXYTCGC
4,ID_LM6S0B1M,top,t_id_d6aYXtor
...,...,...,...
2140,ID_BDMVQH6G,left,t_id_SwQZGIpa
2141,ID_JD58AF27,left,t_id_QqeoI5F3
2142,ID_Y2LJOVUQ,left,t_id_Lhp87PBX
2143,ID_GH1RFB6Z,right,t_id_FBsGDJhU


## Preprocessing
After a visual analysis of the images, I found that the dataset contains 112 noisy images, 2 of them from the test set. The 110 outliers from the train / extra sets were simply dropped, while the 2 test-set outliers were cropped to remove the noisy part of the image.

In [None]:
# IDs of the noisy train / extra images; they are passed to
# create_purged_folds, which excludes them from the fold file.
img_drop=['ID_IMAQGTIN',
 'ID_P5VVGPBU',
 'ID_0TN13JTG',
 'ID_L97ZNL2K',
 'ID_WGV55GDN',
 'ID_S6JST5WY',
 'ID_7L41HEU6',
 'ID_HVM9ORN6',
 'ID_BTDA7Q8H',
 'ID_7WAV935V',
 'ID_S05YLC07',
 'ID_MWPMHK5H',
 'ID_YS82019U',
 'ID_XGAC2ZEL',
 'ID_3KZ7R7P4',
 'ID_QEK39KD8',
 'ID_WPT3J1B2',
 'ID_8NI03IIC',
 'ID_FNMID14I',
 'ID_VO9F35W0',
 'ID_BP8W8O5R',
 'ID_T8O3U5PP',
 'ID_I2T5XT7A',
 'ID_GCRJLJZW',
 'ID_K71C8RVN',
 'ID_LWXY80LD',
 'ID_U9YJYSDV',
 'ID_5GNR7EMS',
 'ID_6QL1VU2K',
 'ID_49KUI8CD',
 'ID_WR6FUMSZ',
 'ID_7GMOEQH9',
 'ID_LC9JUG8W',
 'ID_3M8121WW',
 'ID_JBRT4503',
 'ID_4PPAIAXY',
 'ID_4HTVIGBZ',
 'ID_V8HIJWCK',
 'ID_XPFD4K5X',
 'ID_K9B8QJ46',
 'ID_HWF8Q419',
 'ID_PBH3NGK2',
 'ID_41DWNJ1J',
 'ID_XSNPD945',
 'ID_NNELLNTI',
 'ID_X0XAIS75',
 'ID_5PFE7YYF',
 'ID_TNVPSLBN',
 'ID_2Y1LHSRF',
 'ID_N1PAA2YA',
 'ID_3EEVOO7W',
 'ID_5MOA0OFW',
 'ID_LOUHONAG',
 'ID_AT60344N',
 'ID_86QYLVNG',
 'ID_0RNNI62X',
 'ID_T1D1QH16',
 'ID_FTQ0NSBW',
 'ID_ZHMYWIEL',
 'ID_695SSVSP',
 'ID_5TUGX36G',
 'ID_UYH1TYUJ',
 'ID_MY3GH6GS',
 'ID_W0NX51ZP',
 'ID_RZ014Y1Y',
 'ID_GJ81LS0L',
 'ID_Z3R5DIHH',
 'ID_NR0AFWOE',
 'ID_ZPOSNRIO',
 'ID_Y2J0HJLP',
 'ID_UKUR1AIT',
 'ID_EJ4WAIK8',
 'ID_N504W2WM',
 'ID_RAHATO8R',
 'ID_4QTHJUFV',
 'ID_6UCQE56F',
 'ID_YZOQKOUX',
 'ID_6SNGAL79',
 'ID_7RIYI4GK',
 'ID_O75CMOVM',
 'ID_IZVXC8V9',
 'ID_EOYCZDW2',
 'ID_11TAWVBT',
 'ID_PANMA3I6',
 'ID_RROL3HGG',
 'ID_ACMP57DU',
 'ID_ZCVMBSPC',
 'ID_SPKAD3NP',
 'ID_3QG1H2FB',
 'ID_MVL2I75E',
 'ID_VI265LB9',
 'ID_DX7YXJ5S',
 'ID_BFZXPZWY',
 'ID_V0YCO3PQ',
 'ID_T7VG4HOD',
 'ID_NY6LT42V',
 'ID_OHE4WBNS',
 'ID_4D1ALRKR',
 'ID_V469PCE1',
 'ID_A2YRN4BI',
 'ID_K815EDWU',
 'ID_9TVJQ5D7',
 'ID_YALWO1Y0',
 'ID_8ESGYE0F',
 'ID_P8M3XAXU',
 'ID_I2N4YL81',
 'ID_9DK7OBSE',
 'ID_KZUAHRRK',
 'ID_JYMW34U9',
 'ID_3FU2RS4H']
# Show how many IDs are listed, as a quick check of the list's size.
len(img_drop)

110

In [None]:
# Crop rectangles for the noisy test images: image id -> [x, y, w, h], where
# x / y index the columns / rows of the top-left corner of the kept region.
crop_coord = {
    'ID_XSU34ZXA': [9, 2280, 1340, 1172],
    'ID_YZ3HZD7O': [3374, 2443, 1232, 1013],
}


def preprocess_test_images(crop_dict):
    """Crop the given images in place, keeping the original as ``<id>_OLD.JPG``.

    The function is safe to run more than once: when a ``_OLD`` backup already
    exists, the crop is recomputed from that backup, so the image is never
    cropped twice and the backup of the original is never overwritten.

    Args:
        crop_dict: Mapping of image id to ``[x, y, w, h]``; the region
            ``image[y:y + h, x:x + w]`` is kept.

    Raises:
        FileNotFoundError: If the source image cannot be read.
        OSError: If the cropped image cannot be written.
    """
    for img_id, (x, y, w, h) in crop_dict.items():
        image_path = f"{IMAGE_DIR}/{img_id}.JPG"
        backup_path = f"{IMAGE_DIR}/{img_id}_OLD.JPG"

        # Always crop from the untouched original: the backup when a previous
        # run already created it, otherwise the image itself.
        already_backed_up = os.path.exists(backup_path)
        source_path = backup_path if already_backed_up else image_path

        image = cv2.imread(source_path)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {source_path}")
        cropped = image[y:y + h, x:x + w]

        if not already_backed_up:
            os.rename(image_path, backup_path)
        # cv2.imwrite reports failure through its boolean return value.
        if not cv2.imwrite(image_path, cropped):
            raise OSError(f"Could not write cropped image: {image_path}")


In [None]:
# Crop the noisy test images in place; each original is kept as <id>_OLD.JPG.
preprocess_test_images(crop_coord)

## Validation set up
Two datasets were created: the larger one is the concatenation of the train and extra images (without the dropped outliers), while the smaller one keeps only the rows whose turtle ID also appears in train.csv.
Only turtles with at least 5 images are assigned to validation folds; images of turtles with fewer instances keep fold = -1.

In [None]:
def create_purged_folds(images_drop):
    """Build the fold file for the train + extra images.

    Reads ``train.csv`` and ``extra_images.csv`` from ``DATA_PATH``, shuffles
    the combined rows, removes the images listed in ``images_drop`` and adds
    an ``image_path``, a per-turtle ``count`` and a ``fold`` column. Rows of
    turtles with at least 5 images get a stratified fold in 0-4; all other
    rows keep fold -1. The result is written to
    ``DATA_PATH/train_extra_purged_folds.csv``.

    Args:
        images_drop: Collection of image ids to exclude.
    """
    id_columns = ["image_id", "turtle_id"]
    train_meta = pd.read_csv(f"{DATA_PATH}/train.csv")[id_columns]
    extra_meta = pd.read_csv(f"{DATA_PATH}/extra_images.csv")[id_columns]

    # Shuffle once with a fixed seed so the row order is reproducible.
    combined = pd.concat([train_meta, extra_meta], axis=0)
    combined = combined.sample(frac=1, random_state=7).reset_index(drop=True)
    combined['image_path'] = IMAGE_DIR + '/' + combined['image_id'] + '.JPG'

    # `images_drop` is referenced by name inside the query expression.
    kept = combined.query("image_id not in @images_drop").reset_index(drop=True)

    # Attach the number of remaining images per turtle to every row.
    per_turtle = kept.groupby("turtle_id")["image_id"].agg("count").reset_index()
    per_turtle.columns = ["turtle_id", "count"]
    kept = kept.merge(per_turtle, how="left", on="turtle_id")
    kept.loc[:, "fold"] = -1

    enough_images = kept["count"] >= 5
    too_few_images = kept["count"] < 5
    frequent = kept[enough_images].reset_index(drop=True)
    rare = kept[too_few_images].reset_index(drop=True)

    # Only turtles with at least 5 images are spread over the validation folds.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
    labels = frequent["turtle_id"]
    for fold_number, (_, holdout_rows) in enumerate(splitter.split(frequent, labels)):
        frequent.loc[holdout_rows, "fold"] = fold_number

    result = pd.concat([frequent, rare], axis=0).reset_index(drop=True)
    result.to_csv(f"{DATA_PATH}/train_extra_purged_folds.csv", index=False)

In [None]:
# Write train_extra_purged_folds.csv under DATA_PATH, excluding the images in img_drop.
create_purged_folds(img_drop)

In [None]:
def create_purged_folds_subset():
    """Write the subset of the fold file restricted to turtles from train.csv.

    Reads ``train_extra_purged_folds.csv`` and ``train.csv`` from
    ``DATA_PATH``, keeps the fold rows whose ``turtle_id`` occurs in
    ``train.csv`` and saves them as ``DATA_PATH/train_purged_folds.csv``.
    """
    all_folds = pd.read_csv(f"{DATA_PATH}/train_extra_purged_folds.csv")
    train_meta = pd.read_csv(f"{DATA_PATH}/train.csv")

    # `t` is referenced by name inside the query expression below.
    t = train_meta.turtle_id.unique()
    train_turtle_rows = all_folds.query("turtle_id in @t")

    subset = train_turtle_rows.reset_index(drop=True)
    subset.to_csv(f"{DATA_PATH}/train_purged_folds.csv", index=False)

In [None]:
# Write train_purged_folds.csv from the fold file produced by create_purged_folds.
create_purged_folds_subset()