In [47]:
import csv
import numpy as np
import random
from tqdm import tqdm
from pathlib import Path

import pandas as pd
from datasets import load_dataset, load_metric, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from ast import literal_eval

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import set_seed, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers.utils import check_min_version


# Raises an error if the minimum required version of Transformers is not installed. Remove at your own risk.
check_min_version("4.6.0.dev0")

def generate_examples(row):
    """Turn one aligned pair of readings into an (id, tagged input, target) triple.

    ``row`` is indexed by the keys ``hid1/hid2``, ``loss1/loss2``, ``diff1/diff2``
    and ``ctx1/ctx2``. Each ``ctx`` is a token sequence and each ``diff`` holds the
    start/end token indices of the differing span inside that context.

    The side with the higher (or equal) loss supplies the returned id and the
    tagged text, in which the differing span is wrapped in ``<ocr> ... </ocr>``.
    The other side's span, joined by spaces, is the target; an empty span
    becomes the literal ``"<blank>"``.

    Returns:
        tuple: ``(hid, tagged_text, target)``.
    """
    def _tag_span(tokens, span):
        # Split the context around the span and wrap the span in the OCR markers.
        begin, end = span[0], span[1]
        region = tokens[begin:end]
        tagged = (
            ' '.join(tokens[:begin])
            + ' <ocr> ' + ' '.join(region) + ' </ocr> '
            + ' '.join(tokens[end:])
        )
        return region, tagged

    first_id, second_id = row['hid1'], row['hid2']
    first_loss, second_loss = row['loss1'], row['loss2']
    first_span, second_span = row['diff1'], row['diff2']
    first_ctx, second_ctx = row['ctx1'], row['ctx2']

    first_region, first_tagged = _tag_span(first_ctx, first_span)
    second_region, second_tagged = _tag_span(second_ctx, second_span)

    if first_loss < second_loss:
        # Reading 1 has the lower loss: it provides the target, reading 2 the input.
        reference, noisy_id, noisy_text = first_region, second_id, second_tagged
    else:
        # Ties fall through to this branch, exactly like a plain `else`.
        reference, noisy_id, noisy_text = second_region, first_id, first_tagged

    target = ' '.join(reference) if reference else "<blank>"
    return noisy_id, noisy_text, target

def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq use.

    Reads the ``'orig'`` column as model inputs and the ``'corrected'`` column as
    targets, tokenizes both with padding and truncation using the module-level
    ``tokenizer``, and returns the input encoding with an added ``"labels"``
    entry. Padding positions in the labels are replaced by -100 so they are
    ignored by the loss.
    """
    sources = examples['orig']
    targets = examples['corrected']
    sources = list(sources)
    model_inputs = tokenizer(sources, padding=True, truncation=True)

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding=True, truncation=True)

    # Swap every padding id in the labels for -100 (the value ignored by the loss).
    masked_labels = []
    for label_ids in labels["input_ids"]:
        masked_labels.append(
            [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in label_ids]
        )
    labels["input_ids"] = masked_labels

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [48]:
# Fix the random seed before anything stochastic runs.
seed = 1729
set_seed(seed)
model_name = "ocr_correction_model"

print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Marker tokens used in the model inputs/targets; registered both as added
# tokens and as additional special tokens.
ocr_markers = ["<ocr>", "</ocr>", "<blank>"]
tokenizer.add_tokens(list(ocr_markers), special_tokens=True)
tokenizer.add_special_tokens({"additional_special_tokens": list(ocr_markers)})




Loading tokenizer


0

In [49]:
print("Loading test")
num_samples = 2000
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
testp = Path('/home/allekim/ocr-detection/ocr_data/test.csv')

# The context/diff columns are indexed and sliced downstream, so they are parsed
# from their textual form into Python objects here. `literal_eval` only accepts
# Python literals, whereas `eval` would execute any expression found in the CSV.
# (Assumes the cells hold plain list/tuple literals — confirm against the data.)
literal_columns = ['ctx1', 'ctx2', 'diff1', 'diff2']
df = pd.read_csv(
    testp,
    converters={column: literal_eval for column in literal_columns},
    nrows=num_samples,
)

# Build the (id, tagged input, target) columns from each row.
df[['hid', 'orig','corrected']] = df.apply(generate_examples, axis=1, result_type="expand")
test_dataset = Dataset.from_pandas(df[['hid', 'orig', 'corrected']])

# Tokenize inputs and targets in batches.
test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
)

Loading test


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [50]:
print("Loading model")
# Load the weights from the same checkpoint name the tokenizer was loaded from,
# instead of repeating the literal string.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Make the embedding matrix match the tokenizer's vocabulary size (marker tokens included).
model.resize_token_embeddings(len(tokenizer))
# Switch to inference mode (disables dropout).
model.eval()

Loading model


T5ForConditionalGeneration(
  (shared): Embedding(32103, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32103, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [51]:
test_dataset.column_names

['attention_mask', 'corrected', 'hid', 'input_ids', 'labels', 'orig']

In [35]:
# test_dataset

Dataset({
    features: ['attention_mask', 'corrected', 'hid', 'input_ids', 'labels', 'orig'],
    num_rows: 200
})

In [28]:
# def generate_result(example):
#     input_ids = torch.tensor(example['input_ids'])
#     attention_mask = torch.tensor(example['attention_mask'])
#     print(input_ids)
#     result = model.generate(input_ids=input_ids, attention_mask=attention_mask, output_scores=True, return_dict_in_generate=True)
#     return {'sequences': result.sequences, 'scores': result.scores}

In [29]:
# encoded_dataset = test_dataset.map(generate_result, batched=True)

In [None]:
# test_dataset[:10]['labels']

In [None]:
# x = torch.tensor(test_dataset[0:32]['input_ids'])
# y = torch.tensor(test_dataset[0:32]['attention_mask'])

In [None]:
# result = model.generate(input_ids=x, attention_mask=y, output_scores=True, return_dict_in_generate=True)

In [36]:
test_dataset

Dataset({
    features: ['attention_mask', 'corrected', 'hid', 'input_ids', 'labels', 'orig'],
    num_rows: 200
})

In [56]:
# Distinct tokenized input lengths present in the dataset.
{len(token_ids) for token_ids in test_dataset['input_ids']}

{260, 448}

In [63]:
x['hid']

['uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.$b753568',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.$b753568',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.$b753568',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386',
 'uc1.b3370386',
 'uc1.$b753568',
 'uc1.b3370386

In [64]:
generated

NameError: name 'generated' is not defined

In [65]:
test_dataset

Dataset({
    features: ['attention_mask', 'corrected', 'hid', 'input_ids', 'labels', 'orig'],
    num_rows: 2000
})

In [87]:
# Number of positions in `generated` whose value equals 2.
# NOTE(review): 2 is presumably a specific token id — confirm which token it denotes.
len(np.where(generated==2)[0])

0

In [71]:
result

GreedySearchEncoderDecoderOutput(sequences=tensor([[    0, 32102,     1,     0,     0,     0,     0,     0,     0,     0],
        [    0,  2147,    18, 19161,  1765,   260,  6577,     1,     0,     0],
        [    0,     3,    15,     7,  1169,    60,     1,     0,     0,     0],
        [    0,    27,     1,     0,     0,     0,     0,     0,     0,     0],
        [    0,   326,     1,     0,     0,     0,     0,     0,     0,     0],
        [    0,  5376,    77,     1,     0,     0,     0,     0,     0,     0],
        [    0,     3,  1489,    89,     3,    55,     1,     0,     0,     0],
        [    0,     3,   184,     3,  8270,     3,   117,     1,     0,     0],
        [    0,  3963,     1,     0,     0,     0,     0,     0,     0,     0],
        [    0, 14246,     1,     0,     0,     0,     0,     0,     0,     0],
        [    0,  1551,     1,     0,     0,     0,     0,     0,     0,     0],
        [    0, 32102,     1,     0,     0,     0,     0,     0,     0,     0

In [None]:
# Run generation over the test set in batches and collect, per example:
# (hid, tagged input text, reference correction, generated string, per-step scores).
results = []
batch_size = 64
pad_token_id = tokenizer.pad_token_id
eos_token_id = tokenizer.eos_token_id
for i in tqdm(range(0,len(test_dataset), batch_size)):
    # Slicing the dataset returns a mapping from column name to a list of values,
    # so the number of examples is the length of a column, not of `x` itself.
    x = test_dataset[i:i+batch_size]
    num_examples = len(x['input_ids'])

    # Rows tokenized in different `map` batches may be padded to different lengths;
    # re-pad to the longest row in this slice so the batch forms a rectangular tensor.
    max_len = max(len(ids) for ids in x['input_ids'])
    input_ids = torch.tensor(
        [list(ids) + [pad_token_id] * (max_len - len(ids)) for ids in x['input_ids']]
    )
    attention_mask = torch.tensor(
        [list(mask) + [0] * (max_len - len(mask)) for mask in x['attention_mask']]
    )

    result = model.generate(input_ids=input_ids, attention_mask=attention_mask, output_scores=True, return_dict_in_generate=True)
    for j in range(num_examples):
        # One score vector per generation step for example j.
        scores = np.array([y[j].numpy() for y in result.scores])
        generated = result.sequences[j].numpy()
        end_idx = np.where(generated==eos_token_id)[0]
        # Decode up to the first end-of-sequence token; if generation stopped
        # without emitting one, keep the whole sequence instead of dropping the row.
        stop = end_idx[0] if len(end_idx) > 0 else len(generated)
        outtoks = tokenizer.convert_ids_to_tokens(generated)
        # Position 0 is the decoder start token and is skipped.
        final_string = tokenizer.convert_tokens_to_string(outtoks[1:stop])
        results.append((x['hid'][j], x['orig'][j], x['corrected'][j], final_string, scores))

  3%|▎         | 1/32 [00:45<23:19, 45.16s/it]

In [None]:
results

In [None]:
df = pd.DataFrame(results)

In [None]:
df

In [None]:
# Each entry of `results` is a 5-tuple (hid, input text, reference, generation, scores),
# so five column names are required; the first one is the example id.
df = pd.DataFrame(results, columns=['hid', 'sent', 'truth', 'gen', 'scores'])

In [None]:
# Lift the display limits on column width and row count.
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
df.to_csv('new_results.csv')

In [None]:
# Reload previously saved results from disk.
# NOTE(review): this reads 'results.csv', while the export cell writes 'new_results.csv' — confirm which file is intended.
df = pd.read_csv('results.csv')

In [None]:
# Keep only the text columns needed for the comparison. Take an explicit copy so
# that adding columns to `results` later does not write into a slice of `df`.
results = df[['sent', 'truth', 'gen']].copy()

In [None]:
results

In [None]:
# Flag rows whose generation equals the reference exactly. The comparison is
# vectorised, and `assign` builds a new frame rather than mutating a possible slice.
results = results.assign(match=results['truth'] == results['gen'])

In [None]:
results

In [None]:
# Show full cell contents. `None` is the supported "no limit" value (the -1
# sentinel is deprecated), and the fully qualified option name avoids ambiguity.
pd.set_option('display.max_colwidth', None)

In [None]:
# Mismatching rows with a non-blank reference whose generation contains '<unk>'.
# `~` is the boolean negation for a Series; `.str.contains(..., na=False)` treats
# missing generations as "no match" instead of raising on non-string values.
results[
    ~results['match'].astype(bool)
    & (results['truth'] != '<blank>')
    & results['gen'].str.contains('<unk>', regex=False, na=False)
]

In [None]:
# Rows whose reference correction is not the '<blank>' placeholder.
has_reference_text = results['truth'] != '<blank>'
results[has_reference_text]

In [None]:
# NOTE(review): hard-coded ratio; presumably (matching rows) / (total rows) copied
# from an earlier run — confirm, and consider computing it from `results` instead.
7975 / 14414