In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from pathlib import Path

In [2]:
data_root = Path(r'D:\data\SJJ\SingleOCR\mixed_passed11000_filtered1000')
assert data_root.is_dir()

labels = [str(i).zfill(2) for i in range(17)]
assert all((data_root / label).is_dir() for label in labels)

In [3]:
dataset = {'path': [], 'label': []}

for label in labels:
    label_dir = data_root / label
    assert label_dir.is_dir()
    for img_path in label_dir.glob('*.jpg'):
        rel_path = f'./{label}/{img_path.name}'
        dataset['path'].append(rel_path)
        dataset['label'].append(int(label))

In [4]:
dataset_df = pd.DataFrame(dataset)
dataset_df.head()

Unnamed: 0,path,label
0,./00/120312_400055_1201.jpg,0
1,./00/180818_310027_0602.jpg,0
2,./00/180818_310041_1201.jpg,0
3,./00/180818_310046_4501.jpg,0
4,./00/180818_310048_1101.jpg,0


In [5]:
dataset_df.tail()

Unnamed: 0,path,label
183056,./16/210409_300020_2902.jpg,16
183057,./16/210409_300021_4501.jpg,16
183058,./16/210409_300021_4502.jpg,16
183059,./16/210409_300112_3002.jpg,16
183060,./16/210409_300137_2506.jpg,16


In [6]:
dataset_df.shape

(183061, 2)

In [7]:
for label, label_df in dataset_df.groupby('label'):
    print(label, len(label_df))

0 12000
1 12000
2 12000
3 12000
4 12000
5 12000
6 12000
7 12000
8 12000
9 12000
10 12000
11 12000
12 507
13 12000
14 12000
15 12000
16 2554


## ラベル12, 13, 16が少ないので同じ画像を繰り返してデータ数を調整する

In [8]:
# label9_df = dataset_df[dataset_df['label'] == 9]
label12_df = dataset_df[dataset_df['label'] == 12]
# label13_df = dataset_df[dataset_df['label'] == 13]
# label14_df = dataset_df[dataset_df['label'] == 14]
label16_df = dataset_df[dataset_df['label'] == 16]

In [9]:
nb_class_data = 12000
ratios = {l: round(nb_class_data / len(dataset_df[dataset_df['label'] == l])) for l in [12,16]}
ratios

{12: 24, 16: 5}

In [10]:
# samples = {l: nb_class_data - len(dataset_df[dataset_df['label'] == l]) for l in [9, 14]}
# samples

{9: 3070, 14: 2674}

In [10]:
augmented_12_df = pd.concat([label12_df] * ratios[12], ignore_index=True)

In [11]:
augmented_12_df.shape

(12168, 2)

In [12]:
augmented_16_df = pd.concat([label16_df] * ratios[16], ignore_index=True)
augmented_16_df.shape

(12770, 2)

In [None]:
# augmented_13_df = pd.concat([label13_df] * ratios[13], ignore_index=True)
# augmented_13_df.shape

In [18]:
sample13_df = label13_df.sample(n=(nb_class_data - len(augmented_13_df)), random_state=10)
sample13_df.shape

(1354, 2)

In [19]:
augmented_13_df = pd.concat([label13_df, label13_df, sample13_df], ignore_index=True)
augmented_13_df.shape

(8000, 2)

In [20]:
sample9_df = label9_df.sample(n=samples[9], random_state=10)
sample9_df.shape

(3070, 2)

In [21]:
augmented_9_df = pd.concat([label9_df, sample9_df], ignore_index=True)
augmented_9_df.shape

(8000, 2)

In [22]:
sample14_df = label14_df.sample(n=samples[14], random_state=10)
sample14_df.shape

(2674, 2)

In [23]:
augmented_14_df = pd.concat([label14_df, sample14_df], ignore_index=True)
augmented_14_df.shape

(8000, 2)

In [14]:
augmented_df = pd.concat([dataset_df[~dataset_df['label'].isin([12, 16])],
#                           augmented_9_df,
                          augmented_12_df,
#                           augmented_13_df,
#                           augmented_14_df,
                          augmented_16_df], ignore_index=True)

In [15]:
for label, label_df in augmented_df.groupby('label'):
    print(label, len(label_df))

0 12000
1 12000
2 12000
3 12000
4 12000
5 12000
6 12000
7 12000
8 12000
9 12000
10 12000
11 12000
12 12168
13 12000
14 12000
15 12000
16 12770


In [16]:
augmented_df = augmented_df.rename(columns={'path': 'x', 'label': 'y'})

In [17]:
csv_path = data_root / 'all_ai_ocr_numeric_v7_0_0.csv'
augmented_df.to_csv(csv_path, index=False)

## train test split

In [18]:
test_size = 0.2

In [19]:
version = 'v7_0_0'
dataset_name = 'ai_ocr_numeric_' + version

csv_path = data_root / f'all_{dataset_name}.csv'
assert csv_path.is_file()
augmented_df = pd.read_csv(csv_path)

train_path = data_root / f'train_{dataset_name}.csv'
val_path = data_root / f'val_{dataset_name}.csv'

In [20]:
train_df, test_df = train_test_split(augmented_df, test_size=test_size, stratify=augmented_df['y'])

In [21]:
train_df['y'].value_counts()

16    10216
12     9734
7      9600
1      9600
2      9600
3      9600
4      9600
5      9600
6      9600
8      9600
15     9600
9      9600
10     9600
11     9600
13     9600
14     9600
0      9600
Name: y, dtype: int64

In [22]:
test_df['y'].value_counts()

16    2554
12    2434
7     2400
1     2400
2     2400
3     2400
4     2400
5     2400
6     2400
8     2400
15    2400
9     2400
10    2400
11    2400
13    2400
14    2400
0     2400
Name: y, dtype: int64

In [23]:
train_df.to_csv(train_path, index=False)
test_df.to_csv(val_path, index=False)