In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
from fastai.text import *
from fastai.vision import *
# import textwrap

In [None]:
PATH = Path('data/IAM_handwriting')
TMP_PATH = PATH/'tmp'

In [None]:
device = torch.device('cpu')#torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Helpers

In [None]:
def standardize_imgs(imgs, baseheight, resample=None):
    """Resize every image to a common height while preserving its aspect ratio.

    Args:
        imgs: iterable of PIL-style images (objects exposing ``size`` as
            ``(width, height)`` and a ``resize`` method).
        baseheight: target height in pixels applied to every image.
        resample: resampling filter forwarded to ``resize``.  Defaults to
            ``PIL.Image.LANCZOS``, the filter formerly exposed as ``ANTIALIAS``.

    Returns:
        A new list holding the resized images, in input order.
    """
    if resample is None:
        # ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter under its current name.
        resample = PIL.Image.LANCZOS
    resized_imgs = []
    for img in imgs:
        width, height = img.size
        # Scale the width by the same factor as the height.  Clamp to 1 px so a very
        # narrow image cannot truncate to a zero width, which resize() rejects.
        wsize = max(1, int(width * (baseheight / float(height))))
        resized_imgs.append(img.resize((wsize, baseheight), resample))
    return resized_imgs

In [None]:
def resize_max(im, size=1000, resample=None):
    """Resize an image so that its largest dimension equals `size`.

    Args:
        im: PIL-style image exposing ``size`` as ``(width, height)`` and ``resize``.
        size: length in pixels of the longest side of the result.
        resample: resampling filter forwarded to ``resize``.  Defaults to
            ``PIL.Image.LANCZOS``, the filter formerly exposed as ``ANTIALIAS``.

    Returns:
        The resized image; the aspect ratio is preserved up to integer truncation.
    """
    if resample is None:
        # ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter under its current name.
        resample = PIL.Image.LANCZOS
    width, height = im.size
    ratio = size / max(width, height)
    # Clamp to 1 px: an extreme aspect ratio would otherwise truncate the short side to 0.
    new_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
    return im.resize(new_size, resample)

In [None]:
def square_max(im, top_left=False, size=None):
    '''
    Pad an image with white so that it becomes square.

    Args:
        im: PIL image to place on the square canvas.
        top_left: if True the image is pasted at the top-left corner; otherwise it is
            placed at a random offset along every axis that has spare room.
        size: side length of the square.  Ignored unless it is larger than the
            image's longest side, in which case the longest side is used.

    Returns:
        A new white RGB image of side `sz` with `im` pasted onto it.
    '''
    width, height = im.size          # PIL reports size as (width, height)
    longest = max(width, height)
    sz = size if (size is not None and size > longest) else longest

    # PIL.Image is referenced explicitly, matching the other PIL calls in this notebook.
    new_im = PIL.Image.new('RGB', (sz, sz), color=(255,255,255))  # new white image

    # Paste position: random only along the axes where the canvas exceeds the image.
    if top_left:
        box = (0, 0)
    elif sz == width:
        box = (0, random.randint(0, sz-height))
    elif sz == height:
        box = (random.randint(0, sz-width), 0)
    else:
        box = (random.randint(0, sz-width), random.randint(0, sz-height))

    new_im.paste(im, box=box)
    return new_im

In [None]:
def resize_dir(fn, src, targ=None):
    """Apply `fn` to every image file directly inside `src` and save the results.

    Args:
        fn: callable taking an opened PIL image and returning the image to save.
        src: source directory (a ``Path``); sub-directories are skipped.
        targ: destination directory (a ``Path``).  Defaults to `src`, in which case
            the files are overwritten in place.
    """
    if targ is None: targ = src
    os.makedirs(targ, exist_ok=True)             # the destination may not exist yet
    for item in tqdm(os.listdir(src)):
        if os.path.isdir(src/item): continue     # skip sub-directories of src
        # Context manager / finally release both handles even if fn or save raises.
        with PIL.Image.open(src/item) as im:
            rsz = fn(im)
            try:
                rsz.save(targ/item)
            finally:
                rsz.close()

In [None]:
def resize_to_square(src_dir, targ_dir, size):
    """Resize and square all images in src_dir and save in targ_dir.

    First pass: scale every image so its longest side is `size` and write it to
    `targ_dir`.  Second pass: pad the files in `targ_dir` in place to a square.
    """
    shrink = partial(resize_max, size=size)
    resize_dir(shrink, src_dir, targ_dir)
    resize_dir(square_max, targ_dir)

In [None]:
def show_sample(df, path, row=2, col=2):
    """Plot the first `row * col` images listed in `df` with their labels as titles.

    Args:
        df: DataFrame with a `filename` column (relative to `path`) and a `labels` column.
        path: directory (a ``Path``) containing the image files.
        row: number of subplot rows.
        col: number of subplot columns.
    """
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so `.flat` always works.
    fig, axes = plt.subplots(row, col, figsize=(20, 10), squeeze=False)
    for i, ax in enumerate(axes.flat):
        if i >= len(df):
            ax.axis('off')       # fewer samples than subplots: leave the rest blank
            continue
        rec = df.iloc[i]         # distinct name: `row` is the grid-size parameter
        with PIL.Image.open(path/rec.filename) as im:
            ax.imshow(im)
        ax.set_title(rec.labels)

    plt.tight_layout(pad=0.2)

# Synthesize Lines

## From DataFrame

In [None]:
# Read columns 0, 1 and 8 of the whitespace-separated lines.txt, skipping its first 23 rows.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv(PATH/'ascii'/'lines.txt', names=['filename','result','value'], escapechar='\\',
                 sep=r'\s+', skiprows=23, header=None, usecols=[0,1,8])
# The transcription separates tokens with '|'; turn them into spaces (literal, not regex).
df['text'] = df['value'].str.replace('|', ' ', regex=False)

In [None]:
def cleanup(x):
    "Re-attach punctuation marks and contraction suffixes that are preceded by a space."
    # Applied in this order; each pattern is replaced by itself minus the leading space.
    detached = (' .', ' ,', " 'll", ' !', ' :', ' ;', ' ?', " 'd", " 're", " 's", " 't")
    for token in detached:
        x = x.replace(token, token[1:])
    return x

# Normalize the spacing of every transcription.
df['text'] = df['text'].map(cleanup)

In [None]:
# Character count of every cleaned line, stored as int32.
lgts = df['text'].map(len)
df['text_len'] = lgts.astype('int32')

df.head()

In [None]:
# Keep rows whose `result` is not 'err' and whose text is longer than 20 characters.
keep = (df['result'] != 'err') & (df['text_len'] > 20)
df = df.loc[keep]
len(df)

In [None]:
df.text_len.min(), df.text_len.median(), df.text_len.max()

## Remove lines that belong to the paragraph validation set

In [None]:
# get paragraph val idxs
# Sample 15% of the paragraph rows with a fixed seed (42) and keep their index labels.
# NOTE(review): presumably this has to mirror the split used when training on
# paragraphs — confirm the fraction and seed against that notebook.
PG_CSV = PATH/'paragraph_chars.csv'
pg_csv = pd.read_csv(PG_CSV)
val_idxs = np.array(pg_csv.sample(frac=0.15, random_state=42).index)

In [None]:
# File names of the sampled validation paragraphs with the last 4 characters removed
# (presumably the '.png' extension — verify against paragraph_chars.csv).
names = pg_csv.filename[val_idxs].values
names = [name[:-4] for name in names]

In [None]:
# Line ids with the last 3 characters removed (presumably the '-NN' line-number suffix,
# leaving the id of the parent form — verify against lines.txt).
fnames = df.filename.values
fnames = [name[:-3] for name in fnames]

In [None]:
# Positions (within the current df) of the lines whose truncated id matches a
# validation paragraph name; a set makes each membership test O(1).
st = set(names)
vals = [pos for pos, line_id in enumerate(fnames) if line_id in st]
len(vals), len(fnames)

In [None]:
# remove vals from df
# Rebind instead of dropping in place: df is itself a filtered selection of an earlier
# frame, and mutating such a selection in place can trigger SettingWithCopyWarning.
df = df.drop(df.index[vals])
len(df)

## Create Data

In [None]:
def create_img(imgs, targ_path, num_lines, max_size, pad=30):
    """Stack line images vertically on a white canvas and save the result.

    This is the line-synthesis variant; the notebook redefines `create_img` again
    further down for word grids.

    Args:
        imgs: opened PIL images, one per text line, in top-to-bottom order.
        targ_path: full path (including the file name) of the image to write.
        num_lines: number of lines the canvas height is sized for.
        max_size: if truthy, the canvas is passed through `resize_max` so that its
            longest side equals this value before saving; otherwise it is saved as is.
        pad: blank margin in pixels around and between the lines.
    """
    # Bring every line to the median height so the rows are evenly spaced.
    median_height = int(np.median([im.size[1] for im in imgs]))
    stzd_imgs = standardize_imgs(imgs, median_height)

    # One image per row: the canvas is as wide as the widest line plus a margin on each side.
    total_width = int(max(im.size[0] for im in stzd_imgs)) + 2*pad
    total_height = (median_height * num_lines) + (pad * (num_lines + 1))

    new_im = PIL.Image.new('RGB', (total_width, total_height), color=(255,255,255))

    y_offset = pad
    for line_im in stzd_imgs:
        new_im.paste(line_im, (pad, y_offset))
        y_offset += median_height + pad

    # targ_path already names the output file.  The former else-branch appended an
    # undefined `fname` to it and could not have written to the intended location.
    out = resize_max(new_im, max_size) if max_size else new_im
    out.save(targ_path)

In [None]:
# number of words/image
def create_synth_data(num, num_lines, src_path, targ_path, max_size=1000, offset=0):
    """Create `num` synthetic multi-line images from randomly sampled text lines.

    Rows are sampled from the notebook-level DataFrame `df`, which must provide
    `filename` (image name without the '.png' extension) and `text` columns.

    Args:
        num: number of images to generate.
        num_lines: number of text lines stacked in each image.
        src_path: directory (a ``Path``) holding the single-line images.
        targ_path: directory (a ``Path``) the composed images are written to.
        max_size: forwarded to `create_img` (longest side of the saved image).
        offset: added to the running index used in the output file names.

    Returns:
        dict mapping each generated file name to its label; the label holds the
        sampled lines joined with '\\n'.
    """
    os.makedirs(targ_path, exist_ok=True)   # saving fails if the directory is missing
    d = {}
    for i in tqdm(range(num)):
        samp = df.sample(num_lines)
        fname = str(num_lines)+'_'+'{:04d}'.format(i+offset)+'.png'

        imgs = []
        try:
            for name in samp.filename.values:
                imgs.append(PIL.Image.open(src_path/(name+'.png')))
            create_img(imgs, targ_path/fname, num_lines, max_size)
        finally:
            # release the file handles even when composing the image fails
            for im in imgs: im.close()

        # exactly num_lines texts were sampled, so each one is its own row of the label
        d[fname] = '\n'.join(samp.text.values)
    return d

In [None]:
src_path = PATH/'lines'
targ_path = PATH/'cat_lines'

In [None]:
#single
num_lines = 3
d = create_synth_data(10, num_lines, src_path, targ_path)
len(d)

In [None]:
synth = pd.DataFrame({'filename': list(d.keys()), 'labels': list(d.values())})
synth.head()

In [None]:
show_sample(synth, targ_path)

In [None]:
#multi
# Load the character vocabulary here so this cell does not depend on a later cell having
# been run first (the Numericalize section loads the same file).
with open(TMP_PATH/'char_itos.pkl', 'rb') as pkl_file:
    itos = pickle.load(pkl_file)
# Characters missing from the vocabulary map to id 82.  Built once: it does not depend on the loop.
stoi = collections.defaultdict(lambda: 82, {v:k for k,v in enumerate(itos)})

for i in tqdm(range(3,15)):   #tqdm([11,12,13,14]):   #tqdm([7,8,9,10]):
    num_lines = i
    d = create_synth_data(1000, num_lines, src_path, targ_path)
    synth = pd.DataFrame({'filename': list(d.keys()), 'labels': list(d.values())})

    joined_labels = list(synth.labels)

    # One id per character followed by the terminating id 3.  Labels differ in length,
    # so plain lists are used: np.array() on ragged rows raises on NumPy >= 1.24.
    ids = [[stoi[letter] for letter in label] + [3] for label in joined_labels]

    # convert to strings (as labels)
    str_ids = [' '.join(str(l) for l in w) for w in ids]
    synth['char_ids'] = str_ids

    CSV = str(targ_path)+'_'+str(num_lines)+'.csv'
    synth.to_csv(CSV, columns=['filename', 'char_ids'], index=False)

In [None]:
# Show the most recently generated batch.  `a` is only defined further down, and that
# frame is read from a CSV without the `labels` column that show_sample needs.
show_sample(synth, targ_path)

# Synthesize Words

### From pd.read_csv

In [None]:
# Read columns 0, 1 and 8 of the whitespace-separated words.txt.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
# NOTE(review): skiprows=23 matches the lines.txt cell — confirm words.txt has the same
# number of header rows.
df = pd.read_csv(PATH/'ascii'/'words.txt', names=['filename','result','value'], escapechar='\\',
                 sep=r'\s+', skiprows=23, header=None, usecols=[0,1,8])
df = df.rename(columns={'value': 'word'})
len(df)

In [None]:
# remove errors
df = df[df.result != 'err']
len(df)

In [None]:
# calculate character lengths
lgts = df.word.apply(len)  
df['char_len'] = lgts.astype('int32')

In [None]:
# lots of errors from pd.read_csv
# only keep rows w/ word length < 20
df = df[df.char_len < 20]

In [None]:
df = df.loc[df['char_len'] > 3]

In [None]:
df.head()

### via manually created DF

In [None]:
maxTextLen = 32
samples = []
chars = set()

# The context manager closes the transcription file; the bare open() left it open.
with open(PATH/'ascii'/'words.txt') as f:
    for line in f:
        line = line.strip()
        # ignore blank and comment lines
        if not line or line[0]=='#':
            continue

        lineSplit = line.split(' ')
        assert len(lineSplit) >= 9, f'malformed words.txt line: {line!r}'

        fileName = lineSplit[0]

        # ground-truth text is everything from the 9th field on, capped at maxTextLen
        gtText = ''.join(lineSplit[8:])[:maxTextLen]
        char_len = len(gtText)
        chars.update(gtText)

        # put sample into list
        samples.append([fileName, gtText, char_len])

# Build the frame straight from the list.  Going through np.stack turned every value,
# including char_len, into a string; here char_len stays an integer.
df = pd.DataFrame(samples, columns=['filename', 'word', 'char_len'])
del samples

In [None]:
# Keep words whose length is strictly between 3 and 20 characters (4..19 inclusive).
df['char_len'] = df['char_len'].astype('int32')
df = df.loc[df['char_len'].between(4, 19)]
df.head()

## Number of words per line

In [None]:
def create_img(src_path, targ_path, files, fname, sz=None, pad=30):
    """Paste word images into a grid on a white canvas and save it.

    This is the word-grid variant; it replaces the `create_img` defined earlier in
    the notebook.

    Args:
        src_path: directory (a ``Path``) containing the word images.
        targ_path: directory (a ``Path``) the composed image is written to.
        files: image file names relative to `src_path`, in reading order.
        fname: file name of the composed image.
        sz: ``(rows, words_per_row)`` of the grid.  Defaults to one row holding every file.
        pad: blank margin in pixels around and between the words.
    """
    if sz is None: sz = (1, len(files))
    h, w = sz                      # rows, words per row

    imgs = []
    try:
        for f in files:
            imgs.append(PIL.Image.open(src_path/f))
        # Bring every word to the median height so the rows are evenly spaced.
        median_height = int(np.median([im.size[1] for im in imgs]))
        stzd_imgs = standardize_imgs(imgs, median_height)
    finally:
        # resize() returns new images, so the source files can be released right away
        for im in imgs: im.close()

    lines = [stzd_imgs[i:i + w] for i in range(0, len(stzd_imgs), w)]

    # Canvas width: the widest row plus a margin before, between and after its words.
    total_width = int(max(sum(word.size[0] for word in line) for line in lines)) + (pad*(w+1))
    total_height = (median_height * h) + (pad*(h+1))

    new_im = PIL.Image.new('RGB', (total_width, total_height), color=(255,255,255))

    y_offset = pad
    for line in lines:
        x_offset = pad
        for word in line:
            new_im.paste(word, (x_offset,y_offset))
            x_offset += word.size[0] + pad
        y_offset += median_height + pad

    new_im.save(targ_path/fname)

In [None]:
# number of words/image
def create_synth_data(src_path, targ_path, num, sz, offset=0, randomize=False, pre=''):
    """Create `num` synthetic images, each a grid of randomly sampled words.

    Rows are sampled from the notebook-level DataFrame `df`, which must provide
    `filename` (image name without the '.png' extension) and `word` columns.

    Args:
        src_path: directory (a ``Path``) containing the word images.
        targ_path: directory (a ``Path``) the composed images are written to.
        num: number of images to generate.
        sz: ``(rows, words_per_row)``; the exact grid, or its upper bounds when
            `randomize` is True.
        offset: added to the running index used in the output file names.
        randomize: if True, draw rows in ``1..sz[0]`` and words per row in ``1..sz[1]``
            independently for every image.
        pre: prefix for the output file names.

    Returns:
        dict mapping each generated file name to its label; words are joined with
        spaces within a row and rows are joined with '\\n'.
    """
    d = {}
    for i in tqdm(range(num)):
        if randomize:
            r = random.randint(1, sz[0])
            c = random.randint(1, sz[1])
        else:
            r, c = sz

        # np.product was removed in NumPy 2.0; a plain product of the two ints is equivalent.
        res = df.sample(r * c)
        files = [name + '.png' for name in res.filename.values]

        # split into rows with \n
        label = '\n'.join(' '.join(row) for row in np.array_split(res.word.values, r))

        fname = pre+'{:04d}'.format(i+offset)+'.png'
        create_img(src_path, targ_path, files, fname, (r,c))
        d[fname] = label
    return d

In [None]:
import shutil

src_path = PATH/'words'
synth_path = PATH/'small_synth_words'
# Start from an empty output directory.  shutil.rmtree replaces the `!rm -rf` shell call,
# so the cell needs no POSIX shell and is safe for paths containing spaces.
shutil.rmtree(synth_path, ignore_errors=True)

os.makedirs(synth_path, exist_ok=True)

In [None]:
d = create_synth_data(src_path, synth_path, 20000, (4,3), randomize=True, pre='sm_')
len(d)

In [None]:
synth = pd.DataFrame({'filename': list(d.keys()), 'labels': list(d.values())})
synth.head()

In [None]:
# resize_dir(partial(resize_max, size=512), src_path)
resize_dir(partial(square_max, size=1000), synth_path)

In [None]:
show_sample(synth, synth_path)

## size of image

In [None]:
import shutil

synth_path = PATH/'large_synth_words_test'
# Start from an empty output directory.  shutil.rmtree replaces the `!rm -rf` shell call,
# so the cell needs no POSIX shell and is safe for paths containing spaces.
shutil.rmtree(synth_path, ignore_errors=True)

os.makedirs(synth_path, exist_ok=True)

In [None]:
def create_img(sz, fname, pad=30, median_height=None):
    """Fill a white `sz` x `sz` canvas with randomly sampled words and save it.

    This is the fixed-image-size variant; it replaces the `create_img` defined earlier
    in the notebook.  It reads the notebook-level `df` (columns `filename`, `word`),
    `PATH` and `synth_path`.

    Args:
        sz: side length in pixels of the square canvas.
        fname: file name of the image written to `synth_path`.
        pad: blank margin in pixels around and between the words.
        median_height: height every word is resized to; defaults to the median height
            of the sampled word images.

    Returns:
        The label text: words of a row joined by spaces, rows joined by '\\n'.  A row
        that received no word contributes an empty string.
    """
    # TODO: randomize padding
    new_im = PIL.Image.new('RGB', (sz,sz), color=(255,255,255))

    res   = df.sample(50)
    lbls  = res.word.values.tolist()
    imgs  = []
    try:
        for name in res.filename.values:
            imgs.append(PIL.Image.open(PATH/'words'/(name+'.png')))

        if median_height is None:
            median_height = int(np.median([im.size[1] for im in imgs]))   # TODO: randomize this between mean/std

        stzd_imgs = standardize_imgs(imgs, median_height)
    finally:
        # resize() returns new images, so the source files can be released right away
        for im in imgs: im.close()

    # Fill the canvas row by row until the next row would no longer fit vertically.
    labels = []
    y_offset = pad
    while y_offset+median_height+pad < sz:
        x_offset = pad

        # Lazy scan for words that still fit in the remaining width.  The condition reads
        # the current x_offset each time it is evaluated.  Because items are popped while
        # the list is being enumerated, the element that slides into a freed slot is not
        # examined again until the next row.
        gen = (i for i,x in enumerate(stzd_imgs) if x.size[0]+x_offset+pad <= sz)
        lines = []
        for idx in gen:
            word = stzd_imgs.pop(idx)
            lines.append(lbls.pop(idx))      # lbls is popped in step with stzd_imgs
            new_im.paste(word, (x_offset,y_offset))
            x_offset += word.size[0] + pad
        y_offset += median_height+pad
        labels.append(' '.join(lines))

    new_im.save(synth_path/fname)
    return '\n'.join(labels)

In [None]:
# size of image
def create_synth_data(qty, sz, fname_offset=0):
    """Generate `qty` fixed-size word images and return ``{file name: label}``.

    Each image gets its own random padding (10-20 px) and word height (25-35 px);
    `fname_offset` is added to the running index used in the file names.
    """
    d = {}
    for n in tqdm(range(qty)):
        fname = f'{n + fname_offset:04d}.png'
        p = random.randint(10, 20)
        h = random.randint(25, 35)
        d[fname] = create_img(sz, fname, pad=p, median_height=h)
    return d

d = create_synth_data(20000, 512)
len(d)
# ~25min to create 5000 1000x1000 images

In [None]:
synth = pd.DataFrame({'filename': list(d.keys()), 'labels': list(d.values())})
synth.head()

In [None]:
show_sample(synth, synth_path)

# Numericalize

## Chars

In [None]:
# same as used in single word / multi-word
# The context manager closes the pickle file once the vocabulary has been read.
with open(TMP_PATH/'char_itos.pkl', 'rb') as pkl_file:
    itos = pickle.load(pkl_file)
len(itos)

In [None]:
joined_labels = list(synth.labels)

# Characters missing from the vocabulary map to id 82.
stoi = collections.defaultdict(lambda: 82, {v:k for k,v in enumerate(itos)})
# One id per character followed by the terminating id 3.  Labels differ in length, so
# plain lists are used: np.array() on ragged rows raises on NumPy >= 1.24.
ids = [[stoi[letter] for letter in word] + [3] for word in joined_labels]

# convert to strings (as labels)
str_ids = [' '.join(str(l) for l in w) for w in ids]

In [None]:
synth['char_ids'] = str_ids
synth.head()

In [None]:
CSV = str(synth_path) + '.csv'
synth.to_csv(CSV, columns=['filename', 'char_ids'], index=False)

In [None]:
# # multi-line
# CSV = str(targ_path)+'_'+str(num_lines)+'.csv'
# synth.to_csv(CSV, columns=['filename', 'char_ids'], index=False)

### Add to existing CSV

In [None]:
CSV = PATH/'large_synth_words_10000.csv'
csv = pd.read_csv(CSV)
len(csv)

In [None]:
# CSV = PATH/'synth_words_50000.csv'
CSV = PATH/'large_synth_words_50000.csv'

# Append the newly numericalized rows to the previously loaded `csv` frame and write the
# combined table under the new file name; the index is renumbered from 0.
new = pd.concat([csv, synth[['filename', 'char_ids']]], ignore_index=True)
new.to_csv(CSV, columns=['filename', 'char_ids'], index=False)

In [None]:
len(new)

## Concatenate CSVs

In [None]:
# Label files for the 11- to 14-line images, one frame per line count.
a, b, c, d = [pd.read_csv(PATH/f'cat_lines_{n}.csv') for n in (11, 12, 13, 14)]

In [None]:
new = pd.concat([a,b,c,d], ignore_index=True)
len(new)

In [None]:
new.to_csv(PATH/'cat_lines_11-14.csv', index=False)

In [None]:
def to_string(row):
    "Decode a space-separated string of vocabulary ids into text using the notebook-level `itos`."
    tokens = (itos[int(idx)] for idx in row.split(' '))
    return ''.join(tokens)


# Spot-check: first four images of `a` with their labels decoded from `char_ids`.
fig, axes = plt.subplots(2,2, figsize=(20, 20))
for i,ax in enumerate(axes.flat):
    row = a.iloc[i]
    # PIL.Image is referenced explicitly, matching the other PIL calls in this notebook;
    # the context manager closes the file once it has been drawn.
    with PIL.Image.open(targ_path/row.filename) as im:
        ax.imshow(im)
    ax.set_title(to_string(row.char_ids))

plt.tight_layout(pad=0.2)

## Words

In [None]:
# Word-level vocabulary; the context manager closes the pickle file after reading.
with open(TMP_PATH/'synth_word_itos.pkl', 'rb') as pkl_file:
    itos = pickle.load(pkl_file)
len(itos)

In [None]:
joined_labels = list(synth.labels)

# Words missing from the vocabulary map to id 2.
stoi = collections.defaultdict(lambda: 2, {v:k for k,v in enumerate(itos)})
# NOTE(review): labels separate rows with '\n', so splitting on ' ' alone leaves tokens
# such as 'end\nstart', which fall back to id 2 — confirm whether that is intended.
# Plain lists are used because np.array() on ragged rows raises on NumPy >= 1.24.
ids = [[stoi[word] for word in line.split(' ')] + [3] for line in joined_labels]

# convert to strings (as labels)
str_ids = [' '.join(str(l) for l in w) for w in ids]

In [None]:
synth['word_ids'] = str_ids
synth.head()

### Add to existing CSV

In [None]:
CSV = PATH/'large_synth_word_ids_10000.csv'
csv = pd.read_csv(CSV)

In [None]:
# CSV = PATH/'synth_words_50000.csv'
CSV = PATH/'large_synth_word_ids_50000.csv'

new = pd.concat([csv, synth[['filename', 'word_ids']]], ignore_index=True)
new.to_csv(CSV, columns=['filename', 'word_ids'], index=False)

In [None]:
len(new)

### Modify csv/itos to match previous versions

In [None]:
# Context managers close both pickle files after reading.
with open(TMP_PATH/'synth_word_itos.pkl', 'rb') as pkl_file:
    itos_old = pickle.load(pkl_file)

# same as used in single word / multi-word
with open(TMP_PATH/'char_itos.pkl', 'rb') as pkl_file:
    itos = pickle.load(pkl_file)

In [None]:
# Decode every row of ids back to text with the old vocabulary (tokens are concatenated
# without a separator).
# NOTE(review): `csv` is whichever frame was loaded last under that name, and this cell
# needs it to have a `char_ids` column — confirm which CSV is expected here.
res = [''.join([itos_old[int(c)] for c in line.split(' ')]) for line in csv.char_ids]
csv['words'] = res
csv.head()

In [None]:
joined_labels = list(csv.words)

# Characters missing from the vocabulary map to id 2.
# NOTE(review): the character numericalization earlier in this notebook uses 82 as the
# fallback id for the same vocabulary file — confirm which value is correct.
stoi = collections.defaultdict(lambda: 2, {v:k for k,v in enumerate(itos)})
# Plain lists are used because np.array() on ragged rows raises on NumPy >= 1.24.
ids = [[stoi[letter] for letter in word] + [3] for word in joined_labels]

In [None]:
# convert to strings (as labels)
str_ids = np.array([' '.join(str(l) for l in w) for w in ids]).reshape(-1,1)

In [None]:
csv['char_ids'] = str_ids
csv = csv[['filename', 'char_ids']]
csv.head()

### Verify

In [None]:
def label_text(pred):
    "Map a sequence of vocabulary ids to text using the notebook-level `itos`."
    decoded = []
    for i in pred:
        decoded.append(itos[int(i)])
    return ''.join(decoded)

In [None]:
# Spot-check: first five synthetic images with their labels decoded from `char_ids`.
fig, axes = plt.subplots(5,1, figsize=(10, 10))
for i,ax in enumerate(axes.flat):
    row = synth.iloc[i]
    # PIL.Image is referenced explicitly, matching the other PIL calls in this notebook;
    # the context manager closes the file once it has been drawn.
    with PIL.Image.open(synth_path/row.filename) as im:
        ax.imshow(im)
    ax.set_title(label_text(row.char_ids.split(' ')))

plt.tight_layout(pad=0.2)

# Batch Resize

In [None]:
targ_path = PATH/'test_resize'
os.makedirs(targ_path, exist_ok=True)

In [None]:
src_path = PATH/'cat_lines'
# targ_path = PATH/'resized_cat_lines'
# os.makedirs(targ_path, exist_ok=True)

In [None]:
resize_to_square(src_path, targ_path, 1000)

In [None]:
# resize_dir(partial(resize_max, size=512), src_path)
resize_dir(partial(square_max, size=1000), src_path, targ_path)

# v1 Batch Resize

In [None]:
from fastai.vision import *

path = untar_data(URLs.PETS)
path_hr = path/'images'
path_lr = path/'small-96'
path_mr = path/'small-256'

il = ImageList.from_folder(path_hr)

def resize_one(fn, i, path, size):
    """Write a resized RGB copy of image `fn` under `path`, mirroring its location below `path_hr`.

    Args:
        fn: path of the source image; must be located under the notebook-level `path_hr`.
        i: unused here; accepted so the function can be passed to `parallel`.
        path: root directory of the resized image set.
        size: target size handed to `resize_to` (called with ``use_min=True``).
    """
    dest = path/fn.relative_to(path_hr)
    dest.parent.mkdir(parents=True, exist_ok=True)
    # The context manager closes the source file as soon as the resized copy exists.
    with PIL.Image.open(fn) as img:
        targ_sz = resize_to(img, size, use_min=True)
        resized = img.resize(targ_sz, resample=PIL.Image.BILINEAR).convert('RGB')
    resized.save(dest, quality=75)

# create smaller image sets the first time this nb is run
sets = [(path_lr, 96), (path_mr, 256)]
for p,size in sets:
    if not p.exists(): 
        print(f"resizing to {size} into {p}")
        parallel(partial(resize_one, path=p, size=size), il.items)

# Convert datasets to new itos.pkl

In [None]:
# Old character vocabulary; the context manager closes the pickle file after reading.
with open(TMP_PATH/'char_itos.pkl', 'rb') as pkl_file:
    itos = pickle.load(pkl_file)
len(itos)

In [None]:
# New vocabulary the datasets are converted to; the context manager closes the file after reading.
with open(TMP_PATH/'itos.pkl', 'rb') as pkl_file:
    voc = pickle.load(pkl_file)
len(voc)

In [None]:
def convert_char_itos(old_itos, new_itos, path, old_fname, new_fname):
    """Re-encode the `char_ids` column of a CSV from one character vocabulary to another.

    Args:
        old_itos: sequence mapping old ids to tokens (index -> string).
        new_itos: sequence mapping new ids to tokens (index -> string).
        path: directory (a ``Path``) containing the CSV files.
        old_fname: name of the CSV to read; needs `filename` and `char_ids` columns,
            where `char_ids` holds space-separated ids ending with an end-of-sequence id.
        new_fname: name of the CSV to write (columns `filename`, `char_ids`).
    """
    old_csv = pd.read_csv(path/old_fname)
    # xxunk: 3, xxeos: 2
    stoi = collections.defaultdict(lambda: 3, {v:k for k,v in enumerate(new_itos)})

    # Rows are processed as plain lists: their lengths differ, and np.array() on ragged
    # rows raises on NumPy >= 1.24.
    converted = []
    for line in old_csv.char_ids.astype(str):
        # decode with the old vocabulary, dropping the trailing end-of-sequence id
        text = ''.join(old_itos[int(c)] for c in line.split(' ')[:-1])
        # re-encode character by character and append the new end-of-sequence id
        new_ids = [stoi[letter] for letter in text] + [2]
        converted.append(' '.join(str(i) for i in new_ids))

    old_csv['char_ids'] = converted
    old_csv.to_csv(path/new_fname, columns=['filename', 'char_ids'], index=False)

In [None]:
# old,new = 'small_synth_words.csv','sm_synth.csv'
# old,new = 'multi_synth_words.csv','3x2_synth.csv'
# old,new = 'paragraphs.csv','pg.csv'
# old,new = 'mix_words_dl.csv','full_mix.csv'
# old,new = 'mix_words.csv','mix.csv'
old,new = 'downloaded_images.csv', 'dl.csv'

In [None]:
convert_char_itos(itos, voc, PATH, old, new)

## Verify

In [None]:
csv = pd.read_csv(PATH/new)

# Decode each row (minus its trailing end-of-sequence id) with the new vocabulary so the
# converted labels can be checked by eye.
res = []
for line in csv.char_ids:
    body_ids = line.split(' ')[:-1]
    res.append(''.join(voc[int(c)] for c in body_ids))
csv['text'] = res
csv.head()