In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_raw = pd.read_csv('../data/train.csv')
train_df = train_raw.copy()
uni_raw = pd.read_csv('../data/unicode_translation.csv')
uni_df = uni_raw.copy()

In [3]:
train_df.head()

Unnamed: 0,image_id,labels
0,100241706_00004_2,U+306F 1231 3465 133 53 U+304C 275 1652 84 69 ...
1,100241706_00005_1,U+306F 1087 2018 103 65 U+304B 1456 1832 40 73...
2,100241706_00005_2,U+306F 572 1376 125 57 U+306E 1551 2080 69 68 ...
3,100241706_00006_1,U+3082 1455 3009 65 44 U+516B 1654 1528 141 75...
4,100241706_00007_2,U+309D 1201 2949 27 33 U+309D 1196 1539 27 36 ...


In [4]:
uni_df.head()

Unnamed: 0,Unicode,char
0,U+0031,1
1,U+0032,2
2,U+0034,4
3,U+0036,6
4,U+0039,9


## Transform

In [5]:
train_df.iloc[0,1]

'U+306F 1231 3465 133 53 U+304C 275 1652 84 69 U+3044 1495 1218 143 69 U+3051 220 3331 53 91 U+306B 911 1452 61 92 U+306B 927 3445 71 92 U+306E 904 2879 95 92 U+5DE5 1168 1396 187 95 U+3053 289 3166 69 97 U+4E09 897 3034 121 107 U+306E 547 1912 141 108 U+3084 1489 2675 151 109 U+3068 1561 2979 55 116 U+5DF1 1513 2500 127 117 U+3082 1213 1523 72 119 U+3055 1219 3266 95 124 U+306E 259 2230 68 125 U+306E 1184 2423 169 125 U+4E16 849 2236 163 127 U+7D30 1144 1212 200 128 U+305D 316 3287 57 133 U+4EBA 217 2044 183 135 U+3051 277 2974 112 137 U+308C 201 3423 181 137 U+3060 243 2830 159 143 U+5F37 1479 2034 163 145 U+306E 1497 1567 123 152 U+305F 1164 952 145 153 U+3066 552 1199 97 155 U+4FF3 537 2095 176 155 U+6839 203 1439 184 156 U+304B 1188 2606 156 157 U+8AE7 549 2328 156 159 U+308C 1495 2784 168 159 U+5B50 891 1255 100 164 U+3092 584 2546 117 164 U+53CA 849 1588 151 164 U+8005 1192 2198 133 169 U+305A 889 1763 103 171 U+907F 513 945 181 171 U+6B63 539 1439 136 172 U+6587 192 2382 216 17

In [6]:
train_df['labels'] = train_df['labels'].apply(lambda x: x.split())

In [7]:
train_df['image_id'].value_counts(ascending = False)

100241706_00004_2    1
200021925-00004_2    1
200022050-00001      1
200022050-00002_1    1
200022050-00002_2    1
                    ..
200014740-00040_2    1
200014740-00041_1    1
200014740-00041_2    1
200014740-00042_1    1
umgy012-042          1
Name: image_id, Length: 3605, dtype: int64

In [8]:
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
train_df = train_df.explode('labels').reset_index(drop = True)

In [9]:
train_df.head()

Unnamed: 0,image_id,labels
0,100241706_00004_2,U+306F
1,100241706_00004_2,1231
2,100241706_00004_2,3465
3,100241706_00004_2,133
4,100241706_00004_2,53


In [10]:
train_df_len = len(train_df)//5

In [11]:
type_values = ['Uni', 'X', 'Y', 'W', 'H']*train_df_len

In [12]:
train_df['type'] = type_values

In [13]:
train_df.describe()

Unnamed: 0,image_id,labels,type
count,3417320,3417320,3417320
unique,3605,8293,5
top,200014685-00010_1,U+306B,Uni
freq,3070,24685,683464


In [14]:
train_df.value_counts(ascending = False)

image_id           labels  type
brsk001-022        U+3068  Uni     44
200022050-00012_1  U+306B  Uni     41
brsk005-003        U+3044  Uni     38
brsk005-041        U+306B  Uni     37
brsk004-021        U+3068  Uni     37
                                   ..
200015779_00045_1  2257    Y        1
                   2247    Y        1
                   2244    Y        1
                   2238    Y        1
umgy012-042        U+9063  Uni      1
Length: 2079130, dtype: int64

In [15]:
train_df.isnull().sum()

image_id    0
labels      0
type        0
dtype: int64

In [28]:
train_df.head()

Unnamed: 0,image_id,labels,type
0,100241706_00004_2,U+306F,Uni
1,100241706_00004_2,1231,X
2,100241706_00004_2,3465,Y
3,100241706_00004_2,133,W
4,100241706_00004_2,53,H


In [17]:
t_df = train_df.reset_index()

In [18]:
# https://stackoverflow.com/questions/71519254/transpose-dataframe-at-certain-number-of-rows
t_df = t_df.pivot(['index', 'image_id'], 
               values = 'labels', 
               columns = 'type').add_prefix('label_').rename_axis(columns=[None]).reset_index()

In [19]:
t_df.head()

Unnamed: 0,index,image_id,label_H,label_Uni,label_W,label_X,label_Y
0,0,100241706_00004_2,,U+306F,,,
1,1,100241706_00004_2,,,,1231.0,
2,2,100241706_00004_2,,,,,3465.0
3,3,100241706_00004_2,,,133.0,,
4,4,100241706_00004_2,53.0,,,,


In [29]:
t_df.drop(columns = 'index', inplace = True)