diff --git a/classes/enrich/base/base_enrich.php b/classes/enrich/base/base_enrich.php index 22d8c39..8ffa2b0 100644 --- a/classes/enrich/base/base_enrich.php +++ b/classes/enrich/base/base_enrich.php @@ -33,6 +33,12 @@ */ abstract class base_enrich { + /** Config object. + * + * @var mixed + */ + protected $config; + /** * The constructor for the class, will be overwritten in most cases. * diff --git a/classes/enrich/image/rekognition.php b/classes/enrich/image/rekognition.php index dd269c8..c932886 100644 --- a/classes/enrich/image/rekognition.php +++ b/classes/enrich/image/rekognition.php @@ -45,7 +45,7 @@ class rekognition extends base_enrich { * * @var array */ - protected $acceptedmime = array( + protected $supported = array( 'image/jpeg', 'image/png' ); diff --git a/classes/enrich/text/elastic.php b/classes/enrich/text/elastic.php new file mode 100644 index 0000000..6895547 --- /dev/null +++ b/classes/enrich/text/elastic.php @@ -0,0 +1,206 @@ +. + +namespace search_elastic\enrich\text; + +use search_elastic\enrich\base\base_enrich; +use search_elastic\esrequest; + +/** + * Extract text from files by indexing them through an Elasticsearch pipeline. + * + * @package search_elastic + * @copyright Dmitrii Metelkin + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ +class elastic extends base_enrich { + + /** + * Array of file mimetypes that this enrichment class supports. 
+ * + * @var array + */ + protected $acceptedmime = [ + 'application/pdf', + 'text/html', + 'application/msword', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.template', + 'application/vnd.ms-word.document.macroEnabled.12', + 'application/vnd.ms-excel', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.template', + 'application/vnd.ms-excel.sheet.macroEnabled.12', + 'application/vnd.ms-excel.template.macroEnabled.12', + 'application/vnd.ms-excel.addin.macroEnabled.12', + 'application/vnd.ms-excel.sheet.binary.macroEnabled.12', + 'application/vnd.ms-powerpoint', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'application/vnd.openxmlformats-officedocument.presentationml.template', + 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', + 'application/vnd.ms-powerpoint.addin.macroEnabled.12', + 'application/vnd.ms-powerpoint.presentation.macroEnabled.12', + 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12', + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation', + 'application/epub+zip' + ]; + + /** + * Returns the step name. + * + * @return string human readable step name. + */ + public static function get_enrich_name() { + return get_string('elastic', 'search_elastic'); + } + + + /** + * Check if the Elastic endpoint is configured and answers a GET request with HTTP 200. + * + * @return boolean + */ + private function elastic_server_ready() { + $returnval = false; + + // Only build the URL when both the API key and the hostname are configured. 
+ if (!empty($this->config->elasticapikey) && !empty($this->config->elastichostname)) { + $url = trim($this->config->elastichostname, "/"); + } + + if (!empty($url)) { + $response = $this->get_client()->get($url); + if ($response->getStatusCode() == 200) { + $returnval = true; + } + } + + return $returnval; + } + + /** + * Checks if the supplied file can be analyzed by this enrichment class. + * + * @param \stored_file $file File to check. + * @return boolean + */ + public function can_analyze($file) { + $cananalyze = parent::can_analyze($file); + + // If we can analyze this type of file + // check if the Elastic endpoint is configured and available. + if ($cananalyze) { + $cananalyze = $this->elastic_server_ready(); + } + + return $cananalyze; + } + + /** + * Index the file through the configured pipeline and read back the extracted text. + * + * @param \stored_file $file File whose content is sent, base64 encoded, for extraction. + * @param \search_elastic\esrequest $client Client used for the PUT and GET requests. + * @return string Extracted text with tags stripped, or an empty string if nothing was extracted. + */ + public function extract_text($file, $client) { + $extractedtext = ''; + + $hostname = trim($this->config->elastichostname, "/"); + $id = $file->get_contenthash(); + $url = $hostname . '/' . $this->config->elasticindex . '/_doc/' . $id . '?pipeline=' . $this->config->elasticpipeline; + + $data = [ + $this->config->elasticfield => base64_encode($file->get_content()), + ]; + + $data = json_encode($data); + $response = $client->put($url, $data); + + if ($response->getStatusCode() == 200 || $response->getStatusCode() == 201) { + $url = $hostname . '/' . $this->config->elasticindex . '/_doc/' . $id; + $response = $client->get($url); + if ($response->getStatusCode() == 200) { + $jsoncontent = json_decode($response->getBody()); + + if (!empty($jsoncontent->found) && !empty($jsoncontent->_source->attachment->content)) { + $extractedtext .= strip_tags($jsoncontent->_source->attachment->content); + + } + } + } + + return $extractedtext; + } + + /** + * Analyse file and return results. + * + * @param \stored_file $file The file to analyze. 
+ * @return string Text extracted from the file, as returned by extract_text(). + */ + public function analyze_file($file) { + return $this->extract_text($file, $this->get_client()); + } + + /** + * A callback to add fields to the enrich form, specific to enrichment class. + * + * @param \moodleform $form + * @param \MoodleQuickForm $mform Form the Elastic text fields and help buttons are added to. + * @param mixed $customdata Passed through to set_default(). + * @param mixed $config Passed through to set_default(). + */ + public static function form_definition_extra($form, $mform, $customdata, $config) { + $mform->addElement('text', 'elastichostname', get_string ('elastichostname', 'search_elastic')); + $mform->setType('elastichostname', PARAM_URL); + $mform->addHelpButton('elastichostname', 'elastichostname', 'search_elastic'); + self::set_default('elastichostname', 'https://127.0.0.1', $mform, $customdata, $config); + + $mform->addElement('text', 'elasticapikey', get_string ('elasticapikey', 'search_elastic')); + $mform->setType('elasticapikey', PARAM_RAW_TRIMMED); + $mform->addHelpButton('elasticapikey', 'elasticapikey', 'search_elastic'); + self::set_default('elasticapikey', '', $mform, $customdata, $config); + + $mform->addElement('text', 'elasticpipeline', get_string ('elasticpipeline', 'search_elastic')); + $mform->setType('elasticpipeline', PARAM_RAW_TRIMMED); + $mform->addHelpButton('elasticpipeline', 'elasticpipeline', 'search_elastic'); + self::set_default('elasticpipeline', 'attachment', $mform, $customdata, $config); + + $mform->addElement('text', 'elasticfield', get_string ('elasticfield', 'search_elastic')); + $mform->setType('elasticfield', PARAM_RAW_TRIMMED); + $mform->addHelpButton('elasticfield', 'elasticfield', 'search_elastic'); + self::set_default('elasticfield', 'data', $mform, $customdata, $config); + + $mform->addElement('text', 'elasticindex', get_string ('elasticindex', 'search_elastic')); + $mform->setType('elasticindex', PARAM_RAW_TRIMMED); + $mform->addHelpButton('elasticindex', 'elasticindex', 'search_elastic'); + self::set_default('elasticindex', 'my-index', $mform, 
$customdata, $config); + } + + /** + * Get a new client. + * + * @return \search_elastic\esrequest + */ + protected function get_client(): esrequest { + return new esrequest(false, (object) [ + 'apikey' => $this->config->elasticapikey + ]); + } +} diff --git a/classes/esrequest.php b/classes/esrequest.php index 66078c3..eb273f2 100644 --- a/classes/esrequest.php +++ b/classes/esrequest.php @@ -58,8 +58,13 @@ class esrequest { * @param \GuzzleHttp\HandlerStack $handler Optional custom Guzzle handler stack * @return void */ - public function __construct($handler = false) { - $this->config = get_config('search_elastic'); + public function __construct($handler = false, $config = false) { + if (empty($config) || !($config instanceof \stdClass)) { + $this->config = get_config('search_elastic'); + } else { + $this->config = $config; + } + $this->signing = (isset($this->config->signing) ? (bool)$this->config->signing : false); // Allow the caller to instantiate the Guzzle client with a custom handler. @@ -95,9 +100,7 @@ private function signrequest($request) { $signer = new \Aws\Signature\SignatureV4('es', $this->config->region); // Sign your request. - $signedrequest = $signer->signRequest($request, $credentials); - - return $signedrequest; + return $signer->signRequest($request, $credentials); } /** @@ -140,20 +143,17 @@ public function get($url) { $psr7request = $this->signrequest($psr7request); } - $response = $this->http_action($psr7request); - - return $response; - + return $this->http_action($psr7request); } /** * Process PUT requests to Elasticsearch. 
* * @param string $url - * @param array $params + * @param mixed $params * @return \GuzzleHttp\Psr7\Response */ - public function put($url, $params=null) { + public function put($url, $params = null) { $headers = $this->get_authorization_header(); $headers['content-type'] = 'application/json'; @@ -163,19 +163,16 @@ public function put($url, $params=null) { $psr7request = $this->signrequest($psr7request); } - $response = $this->http_action($psr7request); - - return $response; - + return $this->http_action($psr7request); } /** * Creates post API requests. * @param string $url - * @param unknown $params + * @param mixed $params * @return \Psr\Http\Message\ResponseInterface|NULL */ - public function post($url, $params) { + public function post($url, $params = null) { $headers = $this->get_authorization_header(); $headers['content-type'] = 'application/json'; @@ -185,10 +182,7 @@ public function post($url, $params) { $psr7request = $this->signrequest($psr7request); } - $response = $this->http_action($psr7request); - - return $response; - + return $this->http_action($psr7request); } /** @@ -211,16 +205,13 @@ public function postfile($url, $file) { $psr7request = new \GuzzleHttp\Psr7\Request('POST', $url, $headers, $multipart); - $response = $this->http_action($psr7request); - - return $response; - + return $this->http_action($psr7request); } /** * Creates delete API requests. * - * @param unknown $url + * @param string $url * @return \Psr\Http\Message\ResponseInterface|NULL */ public function delete($url) { @@ -232,10 +223,7 @@ public function delete($url) { $psr7request = $this->signrequest($psr7request); } - $response = $this->http_action($psr7request); - - return $response; - + return $this->http_action($psr7request); } /** diff --git a/cli/tika-test.txt b/cli/tika-test.txt new file mode 100644 index 0000000..d63654b --- /dev/null +++ b/cli/tika-test.txt @@ -0,0 +1 @@ +Tika test. 
\ No newline at end of file diff --git a/cli/tika_config_tester.php b/cli/tika_config_tester.php new file mode 100644 index 0000000..e173184 --- /dev/null +++ b/cli/tika_config_tester.php @@ -0,0 +1,127 @@ +. + +/** + * CLI config tester + * + * @package search + * @copyright 2023 David Castro + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ +define('CLI_SCRIPT', true); + +require(__DIR__.'/../../../../config.php'); +require_once($CFG->libdir.'/clilib.php'); // Cli only functions. + +list($options, $unrecognized) = cli_get_params([ + 'help' => false, + 'testfileid' => '', +], [ + 'h' => 'help', + 't' => 'testfileid', +]); + +if ($unrecognized) { + $unrecognized = implode("\n ", $unrecognized); + cli_error(get_string('cliunknowoption', 'admin', $unrecognized)); +} + +if ($options['help']) { + $help = " +Run Tika diagnostics. + +Options: +-h, --help Print out this help +-t, --testfileid (Optional) PDF or accepted file id to send to tika for analysis + +Examples: +\$ sudo -u www-data /usr/bin/php search/engine/elastic/cli/tika_config_tester.php -t= +"; + + echo $help; + die; +} + +/** + * Inspired by \search_elastic\enrich\text\tika::tika_server_ready. + * Outputs cli messages on error. + */ +function tika_server_ready() { + $tikahostname = get_config('search_elastic', 'tikahostname'); + $tikaport = get_config('search_elastic', 'tikaport'); + + $returnval = false; + $client = new \search_elastic\esrequest(); + $url = ''; + // Check if we have a valid set of config. + if (!empty($tikahostname) && !empty($tikaport)) { + $port = $tikaport; + $hostname = rtrim($tikahostname, "/"); + $url = $hostname . ':' . $port; + } else { + cli_writeln('tikahostname or tikaport are not set in elasticsearch config'); + } + + // Check we can reach Tika server. 
+ if ($url !== '') { + $response = $client->get($url); + $responsecode = $response->getStatusCode(); + + if ($responsecode == 200) { + $returnval = true; + } else { + $error = 'Undetermined'; + if (method_exists($response, 'getBody')) { + // This might be transformed into a guzzleexception. + // We need to check if it is still a response. + $error = $response->getBody(); + } + cli_error("Making a GET request to $url resulted in error:\nHTTP Code: $responsecode\nResponse: $error"); + } + } + + return $returnval; +} + +//$canusetika = tika_server_ready(); +//if (!$canusetika) { +// cli_error("Tika cannot be used. Please verify plugin configuration."); +//} +cli_writeln('Connection to tika was successful!'); + +//$fileid = $options['testfileid']; +//if (empty($fileid)) { +// cli_writeln('No file id specified, exiting.'); +// exit(0); +//} + +$tika = new \search_elastic\enrich\text\elastic(get_config('search_elastic')); +$fs = get_file_storage(); + +$record = new \stdClass(); +$record->contextid = context_system::instance()->id; +$record->component = 'phpunit'; +$record->filearea = 'test'; +$record->itemid = 0; +$record->filepath = '/'; +$record->filename = 'tika-test-file.txt'; + +$fs = get_file_storage(); +$file = $fs->create_file_from_string($record, 'Tika test.'); +$text = $tika->analyze_file($file); +$file->delete(); + +cli_writeln('Text found in file ' . $file->get_filename() . ': ' . $text); diff --git a/enrich.php b/enrich.php index d0ff8dc..01a05c8 100644 --- a/enrich.php +++ b/enrich.php @@ -49,6 +49,35 @@ // Build the page output. 
echo $OUTPUT->header(); + +if (!empty($config->fileindexing) && !empty($config->fileindexselect)) { + $classname = $config->fileindexselect; + $processor = new $classname($config); + + $fs = get_file_storage(); + + $record = new \stdClass(); + $record->contextid = context_system::instance()->id; + $record->component = 'phpunit'; + $record->filearea = 'test'; + $record->itemid = 0; + $record->filepath = '/'; + $record->filename = 'test.txt'; + + $fs = get_file_storage(); + $file = $fs->create_file_from_string($record, 'Tika test.'); + $text = $processor->analyze_file($file); + + $file->delete(); + + if ($text == 'Tika test.') { + echo $OUTPUT->notification($processor->get_enrich_name() . ' Text extraction is working correctly.', 'success'); + } else { + echo $OUTPUT->notification($processor->get_enrich_name() . ' Text extraction is not working correctly.', 'error'); + + } +} + echo $OUTPUT->heading(get_string('enrichsettings', 'search_elastic')); echo html_writer::div(get_string('enrichdesc', 'search_elastic'), 'enrich_description'); echo html_writer::div($form->render(), 'form_container'); diff --git a/lang/en/search_elastic.php b/lang/en/search_elastic.php index 27e3208..7c25354 100644 --- a/lang/en/search_elastic.php +++ b/lang/en/search_elastic.php @@ -44,6 +44,17 @@
For more information, follow this link: {$a}'; $string['complexhelpurl'] = 'https://lucene.apache.org/core/2_9_4/queryparsersyntax.html'; +$string['elastic'] = 'Elastic cloud'; +$string['elastichostname'] = 'Elastic endpoint URL'; +$string['elastichostname_help'] = 'URL of Elastic cloud API end point'; +$string['elasticapikey'] = 'API key'; +$string['elasticapikey_help'] = 'Elastic API key for authorising requests'; +$string['elasticpipeline'] = 'Pipeline name'; +$string['elasticpipeline_help'] = 'Name of pipeline for text extraction'; +$string['elasticfield'] = 'Field name'; +$string['elasticfield_help'] = 'Configured field name for storing file content information'; +$string['elasticindex'] = 'Index name'; +$string['elasticindex_help'] = 'An index to store file processing results in'; $string['enrichdesc'] = 'Global Search can enrich the indexed data used in search by extracting text and other data from files. The data extracted from files in Moodle is controlled by the following groups of settings.'; $string['enrichsettings'] = 'Data enrichment settings'; diff --git a/tests/fixtures/test2.pdf b/tests/fixtures/test2.pdf new file mode 100644 index 0000000..397be43 Binary files /dev/null and b/tests/fixtures/test2.pdf differ