### Norwegian dataset splitting and augmentation
#### Imports

In [1]:
import cv2 as cv
import augmentation
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
from tqdm import tqdm
import os
import augment_dataset


#### Read data

In [2]:
# Load the label index for the trimmed word images: one row per image,
# with the file name in 'Image' and its transcription in 'Word'.
df = pd.read_csv(filepath_or_buffer='../image_data/norwegian_data/trim.csv')
print(df)


                                    Image      Word
0       no-nb_digimanus_101261_0001_0.jpg       Til
1       no-nb_digimanus_101261_0001_1.jpg  assessor
2      no-nb_digimanus_101261_0001_10.jpg     takke
3      no-nb_digimanus_101261_0001_11.jpg       Dig
4      no-nb_digimanus_101261_0001_12.jpg        på
...                                   ...       ...
46586   no-nb_digimanus_81638_0001_60.jpg  hengivne
46587   no-nb_digimanus_81638_0001_61.jpg   Harriet
46588    no-nb_digimanus_81638_0001_7.jpg       Tak
46589    no-nb_digimanus_81638_0001_8.jpg       for
46590    no-nb_digimanus_81638_0001_9.jpg      hvad

[46591 rows x 2 columns]


#### Stratified K-fold split

In [3]:
def copy_split_to_folder(df: pd.DataFrame, src_path: str, dest_path: str):
    """Preprocess every image listed in ``df`` and write it into ``dest_path``.

    Each file named in ``df['Image']`` is read from ``src_path``, passed
    through ``augmentation.resize_img``, ``augmentation.gray_scale_img`` and
    ``augmentation.threshold_image`` (in that order), and saved under the
    same file name in ``dest_path``.

    Args:
        df: Frame with an 'Image' column holding image file names.
        src_path: Directory that contains the source images.
        dest_path: Directory to create and fill with the processed images.

    Raises:
        FileExistsError: If ``dest_path`` already exists. This is kept
            deliberately so a re-run cannot silently mix the images of two
            different splits in one folder.
        FileNotFoundError: If a listed image cannot be read from ``src_path``.
        OSError: If a processed image cannot be written to ``dest_path``.
    """
    os.makedirs(dest_path)

    for img_name in tqdm(df['Image']):
        img_path = os.path.join(src_path, img_name)
        new_img_path = os.path.join(dest_path, img_name)

        img = cv.imread(img_path)
        # cv.imread signals a missing/unreadable file by returning None
        # instead of raising; fail here with the offending path rather than
        # with an opaque error inside the preprocessing helpers.
        if img is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')

        img = augmentation.resize_img(img)
        img = augmentation.gray_scale_img(img)
        img = augmentation.threshold_image(img)

        # cv.imwrite reports failure through its boolean return value.
        if not cv.imwrite(new_img_path, img):
            raise OSError(f'Could not write image: {new_img_path}')


# Build the cross-validation folds: every fold gets a train, validation and
# test split, each written as a CSV plus a folder of preprocessed images.
SEED = 42        # fixed seed so Restart & Run All reproduces the same splits
N_SPLITS = 5     # number of stratified folds
VALID_SIZE = 0.1  # share of each fold's training part held out for validation
DATA_DIR = '../image_data/norwegian_data'
TRIM_DIR = f'{DATA_DIR}/trim'

k_folder = StratifiedKFold(N_SPLITS, shuffle=True, random_state=SEED)

# Stratify on the transcription so the folds are balanced over 'Word'.
split_index = k_folder.split(df['Image'], df['Word'])

for c, (train_index, test_index) in enumerate(split_index, start=1):
    train_fold = df.iloc[train_index]
    test_fold = df.iloc[test_index]

    # Carve the validation set out of the fold's training part.
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_fold['Image'], train_fold['Word'],
        test_size=VALID_SIZE, random_state=SEED)

    df_train = pd.concat([X_train, y_train], axis=1)
    df_valid = pd.concat([X_valid, y_valid], axis=1)
    df_test = test_fold

    fold_splits = (('train', df_train), ('valid', df_valid), ('test', df_test))

    # Write all three label files first, then the image folders.
    for split_name, split_df in fold_splits:
        split_df.to_csv(
            f'{DATA_DIR}/{split_name}_threshold_split{c}.csv', index=False)

    for split_name, split_df in fold_splits:
        copy_split_to_folder(split_df, TRIM_DIR,
                             f'{DATA_DIR}/{split_name}_threshold_split{c}')


100%|██████████| 33544/33544 [10:57<00:00, 51.02it/s]
100%|██████████| 3728/3728 [01:11<00:00, 51.86it/s]
100%|██████████| 9319/9319 [03:01<00:00, 51.31it/s]
100%|██████████| 33545/33545 [10:49<00:00, 51.65it/s] 
100%|██████████| 3728/3728 [01:13<00:00, 50.80it/s]
100%|██████████| 9318/9318 [03:03<00:00, 50.65it/s]
100%|██████████| 33545/33545 [10:41<00:00, 52.29it/s]
100%|██████████| 3728/3728 [01:15<00:00, 49.11it/s]
100%|██████████| 9318/9318 [03:04<00:00, 50.52it/s]
100%|██████████| 33545/33545 [10:59<00:00, 50.86it/s]
100%|██████████| 3728/3728 [01:15<00:00, 49.21it/s]
100%|██████████| 9318/9318 [03:10<00:00, 48.83it/s]
100%|██████████| 33545/33545 [10:44<00:00, 52.07it/s]
100%|██████████| 3728/3728 [01:13<00:00, 50.89it/s]
100%|██████████| 9318/9318 [03:01<00:00, 51.25it/s]


#### Augmentation of train splits (use augment_dataset.py)

In [4]:
# for i in range(1, 6):
#     src_path = f'image_data/norwegian_data/train_split{i}'
#     df = pd.read_csv(f'../image_data/norwegian_data/test_split{i}.csv')

#     words_count = augment_dataset.count_words(df)

#     print(len(words_count))

    # for word in tqdm(words_count.keys()):
    #     df = augment_dataset.augment_word_class(
    #         df, word, words_count[word], src_path, src_path, 100)
