Skip to content

PHP Tesseract OCR is a C++ extension of PHP for character recognition and OCR learning in PHP environment.

License

Notifications You must be signed in to change notification settings

Albert-Zhan/php-tesseract-ocr

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

21 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

English | 中文

PHP Tesseract OCR

License

PHP Tesseract OCR is a C++ extension for PHP that provides character recognition (OCR) and OCR training in a PHP environment.

Installation

This article only introduces the installation of PHP-CPP and PHP TesseractOCR. For Tesseract installation and other configuration, please refer to this article: http://www.5lazy.cn/post-141.html

Requirements

  • Linux or OS X (Windows is not supported for the time being)
  • PHP7 or later
  • Tesseract 4.0.0 or later
  • PHP-CPP2.1.2 or later
  • GCC 4.8 or later

⚠ Before installation, add the php-config of the current PHP environment to your PATH environment variable.

1. Install PHP-CPP

git clone https://github.com/CopernicaMarketingSoftware/PHP-CPP.git
cd PHP-CPP
make
sudo make install

If there are multiple warnings in OSX compilation, please ignore them

⚠ Before this step, make sure that Tesseract version 4.0.0 or above is installed.

2. Install PHP Tesseract OCR

git clone https://github.com/2654709623/php-tesseract-ocr.git
cd php-tesseract-ocr
make
sudo make install

If there are multiple warnings in OSX compilation, please ignore them

Example

Because the functions of PHP Tesseract OCR are not part of standard PHP, an IDE cannot auto-complete them. To make development easier, you can click here to download the IDE helper, which enables auto-completion in your IDE.

1. Simple Character Recognition

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$text=$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setImage(__DIR__.'/img/1.png')
->getUTF8Text();
echo $text;

2. Search for text blocks

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setImage(__DIR__.'/img/1.png');
$tesseract->getComponentImages('RIL_WORD',function ($x,$y,$w,$h,$text){
    echo "Result:{$text}X:{$x}Y:{$y}Width:{$w}Height:{$h}";
    echo '<br>';
});

3. Get result iterator

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setImage(__DIR__.'/img/1.png')->recognize(0);
$tesseract->getIterator('RIL_TEXTLINE',function ($text,$x1,$y1,$x2,$y2){
    echo "Text:{$text}X1:{$x1}Y1:{$y1}X2:{$x2}Y2:{$y2}";
    echo '<br>';
});
echo $tesseract->getUTF8Text();

4. Setting image recognition area

Helps to improve recognition speed.

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$text=$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setImage(__DIR__.'/img/1.png')
->setRectangle(100,100,100,100)
->getUTF8Text();
echo $text;

5. Setting Page Segmentation Mode

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setPageSegMode('PSM_AUTO')
->setImage(__DIR__.'/img/1.png')
->recognize(0)
->analyseLayout();
echo $tesseract->getUTF8Text();

API

setVariable($name,$value)

Setting additional parameters

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
//Example1
$tesseract->setVariable('save_blob_choices','T');
//Example2
$tesseract->setVariable('tessedit_char_whitelist','0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ');
//Example3
$tesseract->setVariable('tessedit_char_blacklist','xyz');

setVariable Options Reference:http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version

init($dir,$lang,$mod='OEM_DEFAULT')

Tesseract initialization

Traineddata download:https://github.com/tesseract-ocr/tessdata

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
//The traineddata directory path must end with /
$tesseract->setVariable('save_blob_choices','T')->init(__DIR__.'/traineddata/tessdata-fast/','eng');
//Multiple languages
$tesseract->setVariable('save_blob_choices','T')->init(__DIR__.'/traineddata/tessdata-fast/','eng+chi_sim');
//Setting Engine Mode
$tesseract->setVariable('save_blob_choices','T')->init(__DIR__.'/traineddata/tessdata-raw/','eng','OEM_TESSERACT_LSTM_COMBINED');

Engine Mode Options:

  • OEM_DEFAULT(Default, based on what is available.)
  • OEM_LSTM_ONLY(Neural nets LSTM engine only.)
  • OEM_TESSERACT_LSTM_COMBINED(Legacy + LSTM engines.)
  • OEM_TESSERACT_ONLY(Legacy engine only.)

setPageSegMode($name)

Setting Page Segmentation Mode

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->setVariable('save_blob_choices','T')
->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setPageSegMode('PSM_AUTO');

PageSegMode Options Reference:https://rmtheis.github.io/tess-two/javadoc/com/googlecode/tesseract/android/TessBaseAPI.PageSegMode.html

setImage($path)

Setting the image to recognize

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
//Support png, jpg, jpeg, tif, webp format
$tesseract->setVariable('save_blob_choices','T')
->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setPageSegMode('PSM_AUTO')
->setImage(__DIR__.'/img/1.png');

setRectangle($left,$top,$width,$height)

Setting image recognition area

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->setVariable('save_blob_choices','T')
->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setPageSegMode('PSM_AUTO')
->setImage(__DIR__.'/img/1.png')
->setRectangle(100,100,100,100);

recognize($monitor)

After Recognize, the output is kept internally until the next SetImage

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->setVariable('save_blob_choices','T')
->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setPageSegMode('PSM_AUTO')
->setImage(__DIR__.'/img/1.png')
->setRectangle(100,100,100,100)
//For the time being, only 0 or null is supported.
->recognize(0);

analyseLayout()

Apply page layout analysis

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->setVariable('save_blob_choices','T')
->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setPageSegMode('PSM_AUTO')
->setImage(__DIR__.'/img/1.png')
->setRectangle(100,100,100,100)
->recognize(0)
->analyseLayout();

orientation(&$orientation,&$writingDirection,&$textlineOrder,&$deskewAngle)

Get page layout analysis

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->setVariable('save_blob_choices','T')
->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setPageSegMode('PSM_AUTO')
->setImage(__DIR__.'/img/1.png')
->setRectangle(100,100,100,100)
->recognize(0)
->analyseLayout()
->orientation($orientation,$writingDirection,$textlineOrder,$deskewAngle);

getComponentImages($level,$callable)

Search for text blocks

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setImage(__DIR__.'/img/1.png');
$tesseract->getComponentImages('RIL_WORD',function ($x,$y,$w,$h,$text){
    echo "Result:{$text}X:{$x}Y:{$y}Width:{$w}Height:{$h}";
    echo '<br>';
});

PageIteratorLevel Options:

  • RIL_BLOCK(Block of text/image/separator line.)
  • RIL_PARA(Paragraph within a block.)
  • RIL_TEXTLINE(Line within a paragraph.)
  • RIL_WORD(Word within a textline.)
  • RIL_SYMBOL(Symbol/character within a word.)

getIterator($level,$callable)

Get result iterator

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setImage(__DIR__.'/img/1.png')->recognize(0);
$tesseract->getIterator('RIL_TEXTLINE',function ($text,$x1,$y1,$x2,$y2){
    echo "Text:{$text}X1:{$x1}Y1:{$y1}X2:{$x2}Y2:{$y2}";
    echo '<br>';
});

See getComponentImages for parameters

getUTF8Text()

Get UTF8 characters

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$text=$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng')
->setImage(__DIR__.'/img/1.png')
->getUTF8Text();
echo $text;

clear()

Free up recognition results and any stored image data

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
$tesseract->init(__DIR__.'/traineddata/tessdata-fast/','eng');
//Three images were recognized normally.
for($i=1;$i<=3;$i++){
    $tesseract->setImage(__DIR__.'/img/'.$i.'.png');
    echo $tesseract->getUTF8Text();
}
//Only one can be identified.
for($i=1;$i<=3;$i++){
   $tesseract->setImage(__DIR__.'/img/'.$i.'.png');
   echo $tesseract->getUTF8Text();
   $tesseract->clear();
}

version()

Get php tesseract version

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
echo $tesseract->version();

tesseract()

Get tesseract version

use tesseract_ocr\Tesseract;
$tesseract=new Tesseract();
echo $tesseract->tesseract();

License

Apache License Version 2.0 see http://www.apache.org/licenses/LICENSE-2.0.html