# Prelim

In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
from fastai import *
from fastai.vision import *

In [None]:
import pyperclip

In [None]:
PATH = Path('data/IAM_handwriting')

In [None]:
# shouldn't be on gpu...

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device

# Create/Edit PG dataset

## Lines from edited_lines.txt

In [None]:
# Load the pipe-delimited line annotations, keeping only fields 0, 1 and 8 of each
# record and naming them `filename`, `result` and `text`.
# A backslash is the escape character, so an escaped character is read as field data.
# NOTE(review): the meaning of the skipped fields 2-7 is not visible here — confirm
# against the annotation file format before using them.
lines = pd.read_csv(f'{PATH}/ascii/edited_lines.txt', names=['filename','result','text'],
                    escapechar='\\', delimiter='|', header=None, usecols=[0,1,8])

In [None]:
lines.head()

In [None]:
len(lines[lines.result == 'err'])

## combine lines into paragraphs

In [None]:
# Paragraph ids: each line filename minus its trailing 3-character line suffix,
# de-duplicated and sorted.
pgs = sorted({name[:-3] for name in lines.filename})
pgs[:10]

In [None]:
# Build one record per paragraph by gathering that paragraph's line rows in file order.
paragraphs = []
for fname in pgs:
    # The trailing '-' stops a paragraph id that is a prefix of another id from
    # also picking up the other paragraph's lines.
    rows = lines[lines.filename.str.startswith(fname+'-')]
    paragraphs.append({
        'filename': fname+'.png',
        'text': '\n'.join(rows.text),             # one transcription per text line
        'nums': rows.filename.str[-2:].tolist(),  # last 2 characters of each line filename
    })

In [None]:
len(paragraphs), len(pgs)

In [None]:
pg = pd.DataFrame(paragraphs)
pg.head()

## fix quotation marks

In [None]:
def despace_double_quotes(m):
    """Return the text of regex match `m` with spaces hugging a double quote removed.

    A space directly after a `"` and a space directly before a `"` are dropped,
    so `" word "` becomes `"word"`.
    """
    quoted = m.group(0)   # entire matched string, quotes included
    return quoted.replace('" ', '"').replace(' "', '"')


def clean_double_quotes(x):
    """Tighten every double-quoted span in the string `x`.

    Spans are matched non-greedily from one `"` to the next one; `re.DOTALL`
    lets a quotation continue across newline characters.
    """
    return re.sub(r'"(.+?)"', despace_double_quotes, x, flags=re.DOTALL)

In [None]:
# Normalise quote spacing in every paragraph by mapping over the text column directly.
pg['text'] = pg['text'].map(clean_double_quotes)

In [None]:
pg.to_csv(PATH/'pg_tmp1.csv', columns=['filename','text','nums'], index=False)

## Manually editing bad data

In [None]:
bad = pg[pg.text.str.contains('" But, "he continues," the greatest is')]

In [None]:
bad = pg[pg.text.str.contains(' " ')]
len(bad)

In [None]:
rp = bad.iterrows()

In [None]:
i,row = next(rp)
#print(i)
pyperclip.copy(row.text)
# pg.iloc[i].text = r.text+'.'
print(row.text)
PIL.Image.open(PATH/f"paragraphs/{row.filename}").resize((600,400))

In [None]:
pg.at[i,'text'] = '''From being an assembler in an aircraft
factory to becoming a paint sprayer in
a ceramic factory, he can be qualified
for a completely new job in less time
than it takes to say "Tolpuddle
Martyrs!" "CRIKET," says the Oxford
Dictionary, is "an open air game played
with ball, bats and wickets between
two sides consisting of eleven players
each." Not so, dear Oxford Dictionary.'''

In [None]:
print(pg.iloc[i].text)

In [None]:
bad_fnames = res[res.result=='mod'].filename.values

In [None]:
# Drop the last 7 characters of each line filename and append '.png'.
# presumably the 7 characters are the '-NN.png' line suffix, which would leave the
# paragraph image name — verify against the filenames in `res`.
bf = [f[:-7]+'.png' for f in bad_fnames]

In [None]:
l = list(set(bf))
len(l)

In [None]:
bad = pg[pg.filename.isin(l)]

# Remove Test Data

In [None]:
len(pg)

In [None]:
test = pg.sample(15, random_state=42)
test

In [None]:
pg = pg.drop(test.index)
len(pg)

In [None]:
test.reset_index(drop=True, inplace=True)
test.to_csv(PATH/'test_pg.csv', columns=['filename', 'text'], index=False)

In [None]:
pg.reset_index(drop=True, inplace=True)
pg.to_csv(PATH/'edited_pg.csv', columns=['filename', 'text'], index=False)

In [None]:
len(pg)

# Break edited_pg back into edited_lines

In [None]:
pg = pd.read_csv(PATH/'edited_pg.csv')
pg.head()

In [None]:
import ast

# Explode each paragraph back into one record per line image.
# NOTE: this rebinds `lines` (a DataFrame in the earlier cells) to a plain list.
lines = []
for _, row in pg.iterrows():
    if 'nums' not in row.index:
        # edited_pg.csv is saved with only filename/text; the line suffixes live in
        # the in-memory frame or in a file saved with the column (e.g. pg_tmp1.csv).
        raise KeyError("pg has no 'nums' column; use a frame that still carries it")
    nums = row.nums
    if isinstance(nums, str):
        # A CSV round trip stores the list as its repr, e.g. "['00', '01']";
        # zipping over that string would pair text lines with single characters.
        nums = ast.literal_eval(nums)
    for num, line_text in zip(nums, row.text.split('\n')):
        lines.append({'filename': f"{row.filename[:-4]}-{num}.png", 'text': line_text})

In [None]:
edited_lines = pd.DataFrame(lines)
edited_lines.head()

In [None]:
edited_lines.sample(10)

In [None]:
edited_lines.to_csv(PATH/'edited_lines.csv', columns=['filename', 'text'], index=False)

## fix line errors

In [None]:
# Append the image extension so these names can be merged with `edited_lines.filename`.
# Names that already end in '.png' are left alone, so re-running the cell does not
# produce '.png.png'.
lines["filename"] = lines.filename.apply(lambda x: x if x.endswith('.png') else x+'.png')

In [None]:
res = edited_lines.merge(lines[['filename','result']], on='filename')

In [None]:
res.to_csv(PATH/'res_edited_lines.csv', columns=['filename', 'text', 'result'], index=False)

In [None]:
res = pd.read_csv(PATH/'res_edited_lines.csv')

In [None]:
bad = res[res.result=='err']
len(bad)

In [None]:
rl = bad.iterrows()

In [None]:
# fix a previous one
i = 12404
row = res.iloc[i]

In [None]:
i,row = next(rl)
print(i)
pyperclip.copy(row.text)
res.at[i,'result'] = 'fix'
print(row.text)
PIL.Image.open(PATH/f"lines/{row.filename}").resize((800,100))

In [None]:
#modify text
res.at[i,'text'] = 'overlooking, the sweeping brown tides'
res.at[i,'result'] = 'mod'

In [None]:
#remove final '.'
res.at[i,'text'] = row.text[:-1]
res.at[i,'result'] = 'mod'

In [None]:
#keep result as 'err'
res.at[i,'result'] = 'err'

In [None]:
res.iloc[i].text

In [None]:
res.iloc[i].result

# Synthesize new data

In [None]:
edited_lines = pd.read_csv(PATH/'edited_lines.csv')
len(edited_lines)

## Helpers

In [None]:
def standardize_imgs(imgs, baseheight):
    """Return resized copies of `imgs` that all have height `baseheight`.

    imgs: iterable of PIL images.
    baseheight: target height in pixels.
    Each width is scaled by the same factor as the height and truncated to an int.
    """
    resized_imgs = []
    for img in imgs:
        width, height = img.size
        scale = baseheight / float(height)
        # `Image.ANTIALIAS` was removed in Pillow 10; `LANCZOS` names the same filter.
        resized_imgs.append(img.resize((int(width * scale), baseheight), PIL.Image.LANCZOS))
    return resized_imgs

In [None]:
def resize_max(im, size=1000):
    """Return a copy of `im` scaled by `size / max(width, height)`.

    The largest dimension therefore ends up at (about) `size` pixels and the aspect
    ratio is kept; both dimensions are truncated to ints.
    """
    width, height = im.size   # PIL reports size as (width, height)
    ratio = size / max(width, height)
    # `Image.ANTIALIAS` was removed in Pillow 10; `LANCZOS` names the same filter.
    return im.resize((int(width*ratio), int(height*ratio)), PIL.Image.LANCZOS)

In [None]:
def show_sample(df, path, row=2, col=2, show_files=False):
    """Plot a random `row` x `col` grid of images from `df`, titled with their labels.

    df: frame whose first column is an image filename relative to `path` and whose
        second column is the label text.
    path: directory (Path) holding the images.
    show_files: when True, the third column's value is appended to each title.
    """
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so `.flat` always works.
    fig, axes = plt.subplots(row, col, figsize=(20, 20), squeeze=False)
    samp = df.sample(row*col).values  # array of rows, e.g. [[filename, label], ...]
    for ax, rec in zip(axes.flat, samp):
        ax.imshow(PIL.Image.open(path/rec[0]))
        # NOTE(review): rec[2] needs a third column; with a [filename, label] frame
        # show_files=True raises IndexError — confirm which column was intended.
        title = rec[1]+f"\n{rec[2]}" if show_files else rec[1]
        ax.set_title(title)

#     plt.tight_layout(pad=0.2)

## Synth Lines

In [None]:
def create_img(imgs, targ_path, num_lines, max_size=None, pad=50):
    """Stack `imgs` vertically on a white canvas and save the result to `targ_path`.

    imgs: non-empty sequence of PIL images, pasted top to bottom, one per row.
    targ_path: destination file for the composed image.
    num_lines: number of rows the canvas height is sized for.
    max_size: if truthy, the composed image goes through `resize_max` with this
        size before it is saved.
    pad: margin in pixels around the canvas and between rows.

    Raises ValueError if `imgs` is empty.
    """
    if len(imgs) == 0:
        raise ValueError("create_img needs at least one image")

    # Rows are spaced by the median image height; images are pasted unscaled, so an
    # image taller than the median reaches past its own row slot.
    median_height = int(np.median([im.size[1] for im in imgs]))

    # Canvas: widest image plus a margin on each side; `num_lines` rows plus margins.
    total_width = int(max(im.size[0] for im in imgs)) + 2*pad
    total_height = median_height*num_lines + pad*(num_lines + 1)

    new_im = PIL.Image.new('RGB', (total_width, total_height), color=(255,255,255))

    y_offset = pad
    for im in imgs:
        new_im.paste(im, (pad, y_offset))
        y_offset += median_height + pad

    if max_size:
        new_im = resize_max(new_im, max_size)
    new_im.save(targ_path)

In [None]:
# each synthetic image stacks `num_lines` randomly sampled line images
def create_synth_data(df, num, num_lines, src_path, targ_path, max_size=None, offset=0):
    """Generate `num` synthetic multi-line images and return their labels.

    Each image stacks `num_lines` line images sampled at random from `df`; its
    label is the sampled texts joined by newlines, in the same order.

    df: frame with `filename` (relative to `src_path`) and `text` columns.
    num: number of images to create.
    num_lines: number of line images per synthetic image.
    src_path / targ_path: source and destination directories (Path objects).
    max_size: forwarded to `create_img`.
    offset: added to the running index used in the output file names.

    Returns a DataFrame with `filename` and `label` columns, one row per image;
    files are named `{num_lines}_{index:04d}.png`.
    """
    fnames, labels = [], []
    for i in progress_bar(range(num)):
        samp = df.sample(num_lines)
        fname = f"{num_lines}_{i+offset:04d}.png"

        imgs = []
        try:
            for f in samp.filename.values:
                imgs.append(PIL.Image.open(src_path/f))
            create_img(imgs, targ_path/fname, num_lines, max_size)
        finally:
            # Close every opened source image even if composing or saving fails.
            for im in imgs:
                im.close()

        fnames.append(fname)
        # One sampled text per row of the image.
        labels.append('\n'.join(samp.text.values))
    return pd.DataFrame({'filename': fnames, 'label': labels})

In [None]:
src_path = PATH/'lines'
synth_path = PATH/'cat_lines'

!rm -rf {synth_path}
os.makedirs(synth_path, exist_ok=True)

### single

In [None]:
num_lines = 5
synth = create_synth_data(edited_lines, 10, num_lines, src_path, synth_path)

In [None]:
show_sample(synth, synth_path)

In [None]:
synth.head()

### multi

In [None]:
# One batch of 2000 synthetic images for every line count from 3 to 13.
for num_lines in progress_bar(range(3, 14)):
    synth = create_synth_data(edited_lines, 2000, num_lines, src_path, synth_path)
    # Labels go next to the image folder, named '<synth_path>_<num_lines>.csv'.
    CSV = f"{synth_path}_{num_lines}.csv"
    synth.to_csv(CSV, columns=['filename', 'label'], index=False)

In [None]:
# Load the per-line-count label files cat_lines_3.csv ... cat_lines_13.csv.
a, b, c, d, e, f, g, h, i, j, k = (
    pd.read_csv(PATH/f'cat_lines_{n}.csv') for n in range(3, 14)
)

In [None]:
new = pd.concat([a,b,c,d,e,f,g,h,i,j,k], ignore_index=True)
len(new)

In [None]:
new.head()

In [None]:
CSV = str(synth_path) + '_22k.csv'
new.to_csv(CSV, columns=['filename', 'label'], index=False)

In [None]:
show_sample(new, synth_path, 3,2)

# Create Mix Dataset

In [None]:
combo = pd.read_csv(PATH/'combo.csv')

In [None]:
combo_145k = pd.read_csv(PATH/'combo_145k.csv') 

In [None]:
def show_random(df, im_path):
    """Print the label of one random row of `df` and return that row's image.

    df: frame with `filename` and `label` columns.
    im_path: image folder, relative to the module-level `PATH`.
    """
    # Draw a position, not an index label: `iloc` is positional, so choosing from
    # `df.index` was only correct for a default 0..n-1 index.
    pos = random.choice(range(len(df)))
    row = df.iloc[pos]
    print(row.label)
    return PIL.Image.open(PATH/f'{im_path}/{row.filename}')

In [None]:
show_random(combo, 'combo')

In [None]:
show_random(combo_145k, 'combo_cat')

In [None]:
# dl = pd.read_csv(PATH/'downloaded_images.csv')
# combo = pd.read_csv(PATH/'combo.csv')
# cat = pd.read_csv(PATH/'cat_lines_22k.csv')
# pg = pd.read_csv(PATH/'edited_pg.csv')

In [None]:
# mix = pd.concat([dl,sm,cat,pg], ignore_index=True)
# len(mix)

In [None]:
# /fonts_resize: 74083 -> edited_font.csv ('arthur', 'forster', 'imdb', 'james', 'shaw', 'wharton', 'zane')
# older version - no backgrounds, up to 20 lines

In [None]:
# /combo: 29906 -> combo.csv (adrift,age,american,knot,room,zane)

In [None]:
### /combo_cat: 145216 -> combo_145k.csv
# [wiki(50k), imdb(50k), downloaded_images(1771), paragraphs(1539), edited_cat_lines(12k), combo(29906)]