# Reconocimiento de caracteres de PDF protegido del Servel
### Por Alvaro Jeria M alvaro.jeria@uv.cl
### Profesor. U Valpo

El reconocimiento óptico de caracteres (OCR) es una tarea muy popular. Tesseract es el software de código abierto más difundido para OCR. Fue desarrollado inicialmente por HP como una herramienta en C++. Desde 2006 es desarrollado por Google. El software original está disponible como una herramienta de línea de comandos para Windows. Vivimos en un mundo Python. Debido a su popularidad, la herramienta también está disponible en Python (pytesseract), desarrollada y mantenida como un proyecto de código abierto.

### Paso 1. Instalar Tesseract-OCR y el paquete de idioma español en Google Colab.

In [None]:
# Install the Tesseract OCR engine, then the tesseract-ocr-spa language package
# (the Tesseract back-end below is called with lang='spa').
!sudo apt install tesseract-ocr
!sudo apt-get install tesseract-ocr-spa


Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 40 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 2s (3,041 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl

### Paso 2. Instalar librerías para manejar PDF y OCR


In [None]:
# Python packages used by the code below: pytesseract (imported as `pytesseract`),
# PyMuPDF (imported as `fitz`) and easyocr.
!pip install pytesseract
!pip install PyMuPDF
!pip install easyocr

Collecting pytesseract
  Downloading pytesseract-0.3.8.tar.gz (14 kB)
Building wheels for collected packages: pytesseract
  Building wheel for pytesseract (setup.py) ... [?25l[?25hdone
  Created wheel for pytesseract: filename=pytesseract-0.3.8-py2.py3-none-any.whl size=14071 sha256=6c9db1819a92bce3e605e4c16910542711e4b02b4eb0146017d4c540d8643848
  Stored in directory: /root/.cache/pip/wheels/a4/89/b9/3f11250225d0f90e5454fcc30fd1b7208db226850715aa9ace
Successfully built pytesseract
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.8
Collecting PyMuPDF
  Downloading PyMuPDF-1.18.15-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 7.1 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.18.15
Collecting easyocr
  Downloading easyocr-1.4-py3-none-any.whl (63.6 MB)
[K     |████████████████████████████████| 63.6 MB 21 kB/s 
Collecting python-bidi
  Downloadin

### Paso 3. Subir archivos (fuentes tipográficas y PDF a trabajar)

In [None]:
# Colab file-upload helper.
from google.colab import files

# Upload the files the later cells read from the working directory:
# arial.ttf (loaded with ImageFont.truetype) and the PDF to process.
uploaded = files.upload()

Saving arial.ttf to arial.ttf


### Paso 4. Importar librerías y código

In [None]:
import fitz
import pytesseract
import io
from PIL import Image, ImageDraw, ImageFont
import easyocr
import numpy as np
import time
from pytesseract import Output
import cv2

class OCRHandler:
    """Common base class for the OCR back-ends defined in this notebook.

    Subclasses override ``get_text_from_image``; the base version is a no-op.
    """

    def __init__(self):
        """Set up nothing: the base class keeps no state."""

    def get_text_from_image(self, img):
        """Placeholder hook: accept an image, do nothing and return ``None``."""
        return None

class TesseractOCR(OCRHandler):
    """OCR back-end that runs Tesseract through ``pytesseract`` with lang='spa'."""

    # Tesseract only treats ``name=value`` as a variable assignment when it is
    # introduced by ``-c``; without it the token is taken as a config-file name
    # and the whitelist is never applied.
    # NOTE(review): the whitelist has no 'Ñ' or accented vowels, so those
    # characters can no longer be produced once it is honored; extend it if
    # the source documents need them.
    # NOTE(review): Tesseract 4.0's LSTM engine reportedly ignores character
    # whitelists (honored again from 4.1) -- confirm against the installed version.
    TESSERACT_CONFIG = (
        '--psm 4 '
        '-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+.'
    )

    def get_text_from_image(self, img):
        """Recognize the text in ``img`` and store it in ``self.textohoja``.

        Args:
            img: image passed straight to ``pytesseract.image_to_string``.

        Returns:
            The recognized text; the same value is kept in ``self.textohoja``
            for callers that read the attribute.
        """
        self.textohoja = pytesseract.image_to_string(
            img, lang='spa', config=self.TESSERACT_CONFIG
        )
        return self.textohoja

class EasyOCR(OCRHandler):
    """OCR back-end built on an ``easyocr.Reader`` created for the 'es' language."""

    def __init__(self):
        """Create the reader once so every page reuses the same instance."""
        languages = ['es']
        self.reader = easyocr.Reader(languages)

    def get_text_from_image(self, img):
        """Run the reader on ``img`` and keep the result in ``self.textohoja``.

        The image is converted with ``np.array`` before being handed to
        ``readtext``, which is called with ``detail=0``. Nothing is returned;
        callers read ``self.textohoja`` afterwards.
        """
        pixels = np.array(img)
        self.textohoja = self.reader.readtext(pixels, detail=0)
    

class PdfHandler:
    """Open a PDF with PyMuPDF and render a fixed region of its pages as images."""

    def __init__(self, input_pdf_file):
        """Open ``input_pdf_file`` with ``fitz.open`` and keep the document handle.

        Args:
            input_pdf_file: path of the PDF to read.
        """
        self.input_pdf = input_pdf_file
        self.original_image = None
        self.pix = None  # last pixmap rendered by get_image_from_page
        self.doc = fitz.open(self.input_pdf)

    def get_image_from_page(self, num_page):
        """Render the clipped area of page ``num_page`` as a grayscale PIL image.

        The page is rendered at 3x zoom on both axes, without alpha channel and
        without annotations, restricted to the rectangle (15, 78)-(535, 590)
        given in page coordinates.

        Args:
            num_page: page index passed to ``Document.load_page``.

        Returns:
            A ``PIL.Image`` opened from the PNG encoding of the rendered pixmap.
            The pixmap itself stays available as ``self.pix``.
        """
        zoom_x = 3.0  # horizontal zoom
        zoom_y = 3.0  # vertical zoom
        # snake_case API: the camelCase aliases (loadPage, getPixmap,
        # getPNGdata) are deprecated and missing from current PyMuPDF releases.
        page = self.doc.load_page(num_page)
        self.pix = page.get_pixmap(
            matrix=fitz.Matrix(zoom_x, zoom_y),
            colorspace=fitz.csGRAY,
            clip=fitz.IRect(15, 78, 535, 590),
            alpha=False,
            annots=False,
        )
        return Image.open(io.BytesIO(self.pix.tobytes("png")))


class Outputhandler:
    """Write OCR results to ``output.txt`` (UTF-8, truncated on creation)."""

    def __init__(self):
        """Open ``output.txt`` in the current directory in ``w+`` mode."""
        self.filename = 'output.txt'
        self.file = open(self.filename, "w+", encoding='utf-8')

    def write(self, procesador):
        """Write the text held in ``procesador.textohoja`` to the file.

        Args:
            procesador: object exposing ``textohoja``. A ``str`` is written
                after collapsing double newlines and dropping form feeds
                (``\\x0C``); any other iterable is treated as a sequence of
                strings and written one per line.
        """
        texto = procesador.textohoja
        if isinstance(texto, str):
            self.file.write(texto.replace('\n\n', '\n').replace('\x0C', ''))
        else:
            self.file.writelines(ele + '\n' for ele in texto)
        # Flush on every call: in a notebook the handler usually outlives the
        # processing loop, so waiting for __del__ would leave the file
        # incomplete on disk.
        self.file.flush()

    def close(self):
        """Close the output file; safe to call more than once."""
        archivo = getattr(self, 'file', None)
        if archivo is not None and not archivo.closed:
            archivo.close()

    def __enter__(self):
        """Allow ``with Outputhandler() as out:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file when leaving the ``with`` block."""
        self.close()

    def __del__(self):
        # Last-resort cleanup; close() tolerates a missing attribute, which
        # happens when open() failed inside __init__.
        self.close()

class ImageHandler:
    """Clean a rendered page, cut it into column strips and rebuild it for OCR."""

    def __init__(self):
        """Start with no cached separator and no images loaded."""
        self.img_separator = None   # '+' column strip, built on first use and reused
        self.original_image = None  # last image given to treat_image
        self.treated_image = None   # last image produced by reorderimage

    def get_clean_image2(self, dirty_image):
        """Return a copy of ``dirty_image`` with every value above 175 set to 255.

        Values of 175 or less are left unchanged.
        """
        def funclimp(p):
            return 255 if p > 175 else p

        return Image.eval(dirty_image, funclimp)

    def get_concat_h(self, input_imglist):
        """Join the images side by side, each followed by a column of '+' signs.

        Every image is first cleaned with ``get_clean_image2``. The separator
        strip (15 px wide, white, with a '+' drawn every 18 px) is created the
        first time it is needed, using the height of that first image, and is
        reused afterwards.

        Args:
            input_imglist: non-empty list of images; it is not modified.
                An empty list raises ``IndexError``.

        Returns:
            A new mode 'L' image as tall as the first image and as wide as the
            sum of all pieces and separators.
        """
        def merge(imgmergelist):
            # Paste the pieces left to right on a canvas of the combined width.
            total_width = sum(piece.width for piece in imgmergelist)
            dst = Image.new('L', (total_width, imgmergelist[0].height))
            offset = 0
            for piece in imgmergelist:
                dst.paste(piece, (offset, 0))
                offset += piece.width
            return dst

        def putsymbol(sourceimage):  # TODO: think about another alternative
            if self.img_separator is None:
                # The font is only needed while building the cached strip.
                fnt = ImageFont.truetype("arial.ttf", 15)
                separator = Image.new('L', (15, sourceimage.height), 255)
                draw = ImageDraw.Draw(separator)
                # One '+' every 18 px down the whole strip, starting at y=1.
                for top in range(1, sourceimage.height, 18):
                    draw.text((2, top), "+", font=fnt, fill=0)
                self.img_separator = separator
            return merge([self.get_clean_image2(sourceimage), self.img_separator])

        # Build a new list so the caller's list is left untouched.
        return merge([putsymbol(img) for img in input_imglist])

    def reorderimage(self):
        """Cut ``self.original_image`` into four strips and rebuild it reordered.

        The strips are full-height crops at x 10-500 (nombre), 720-830 (rut),
        835-915 (sexo) and 920-right edge (domicilio); the area between x=500
        and x=720 is discarded. They are joined in the order rut, nombre,
        sexo, domicilio, stored in ``self.treated_image`` and also saved as
        ``<YYYYmmdd-HHMMSS>.png`` in the working directory.
        """
        timestr = time.strftime("%Y%m%d-%H%M%S")
        height_img = self.original_image.height
        width_img = self.original_image.width
        imagen_nombre = self.original_image.crop((10, 0, 500, height_img))
        imagen_rut = self.original_image.crop((720, 0, 830, height_img))
        imagen_sexo = self.original_image.crop((835, 0, 915, height_img))
        imagen_domicilio = self.original_image.crop((920, 0, width_img, height_img))
        self.treated_image = self.get_concat_h(
            [imagen_rut, imagen_nombre, imagen_sexo, imagen_domicilio]
        )
        self.treated_image.save(timestr + '.png')

    def treat_image(self, input_image):
        """Store ``input_image``, run ``reorderimage`` and return the result."""
        self.original_image = input_image
        self.reorderimage()
        return self.treated_image


class PdfToFile:
    """Pipeline that renders PDF pages, prepares them, runs OCR and writes the text."""

    def __init__(self, inputpdffile, hojas=None):
        """Create the collaborators used by ``process``.

        Args:
            inputpdffile: path of the PDF to read.
            hojas: number of pages to process, starting at page 0. When
                omitted (``None``) every page of the document is processed.
        """
        self.pdffile = inputpdffile
        self.imagehandler = ImageHandler()
        self.OCR = EasyOCR()
        self.PDF = PdfHandler(self.pdffile)
        self.File = Outputhandler()
        # len() of a PyMuPDF document is its page count.
        self.numhojas = len(self.PDF.doc) if hojas is None else hojas

    def process(self):
        """Run render -> treat -> OCR -> write for pages 0..numhojas-1.

        Prints ``Hoja<n>`` after each page is written.
        """
        for hoja in range(self.numhojas):
            imagen_hoja = self.PDF.get_image_from_page(hoja)
            imagen_hoja_tratada = self.imagehandler.treat_image(imagen_hoja)
            self.OCR.get_text_from_image(imagen_hoja_tratada)
            self.File.write(self.OCR)
            print(f'Hoja{hoja}')

## Paso 5. Extracción de texto desde el PDF
#### Hay que definir el número de hojas

In [None]:
# Process the first 3 pages of A02103.pdf: the recognized text goes to
# output.txt and one "Hoja<n>" line is printed per page.
prueba = PdfToFile('A02103.pdf',3)
prueba.process()


Hoja0
Hoja1
Hoja2


### Paso extra. Averiguar qué tarjeta (GPU) se asignó

In [None]:
# Print the NVIDIA driver/GPU status table for this runtime.
!nvidia-smi

Thu Jul 29 02:10:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    34W /  70W |   2802MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces