This repository has been archived by the owner on Sep 7, 2023. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
143 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
""" | ||
Duden | ||
@website https://www.duden.de | ||
@provide-api no | ||
@using-api no | ||
@results HTML (using search portal) | ||
@stable no (HTML can change) | ||
@parse url, title, content | ||
""" | ||
|
||
from lxml import html, etree | ||
import re | ||
from searx.engines.xpath import extract_text | ||
from searx.url_utils import quote | ||
from searx import logger | ||
|
||
# Engine settings as module-level names.
# NOTE(review): presumably read by the searx engine loader — confirm.
categories = ['general']
paging = True
language_support = False

# search-url
# request() fills {query} with the quoted search terms and {offset} with
# the requested page number minus one.
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?page={offset}'
|
||
|
||
def request(query, params):
    '''pre-request callback

    Builds the Duden search URL and stores it in ``params['url']``.

    query: search terms; passed through ``quote`` before being placed in
           the URL
    params<dict>:
      pageno : 1-based number of the requested page (read)
      url    : target URL of the outgoing request (written)

    returns the same ``params`` dict that was passed in
    '''
    # pageno starts at 1, while the value placed in ``?page=`` starts at 0
    page_index = params['pageno'] - 1
    escaped_query = quote(query)

    params['url'] = search_url.format(query=escaped_query, offset=page_index)
    return params
|
||
|
||
def response(resp):
    '''post-response callback

    Parses the HTML in ``resp.text`` into a list of result dicts.

    resp: requests response object

    returns: list that optionally starts with a
             ``{'number_of_results': int}`` entry, followed by one
             ``{'url', 'title', 'content'}`` dict for every result
             section that could be parsed
    '''
    results = []

    dom = html.fromstring(resp.text)

    # The hit count is taken from the <span> inside the anchor with class
    # "active" that links to /suchen/dudenonline. It is optional: a missing
    # node (IndexError) or a label without digits (ValueError) is only
    # logged and never aborts result parsing.
    try:
        count_label = dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        number_of_results_string = re.sub('[^0-9]', '', count_label)
        results.append({'number_of_results': int(number_of_results_string)})
    except (IndexError, ValueError):
        logger.debug("Couldn't read number of results.")

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        try:
            # lazy logger arguments: the message is only built when debug
            # logging is enabled
            logger.debug("running for %s", result)
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
        except Exception:
            # Best effort: one malformed section must not drop the others.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
        else:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    return results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from collections import defaultdict | ||
import mock | ||
from searx.engines import duden | ||
from searx.testing import SearxTestCase | ||
from datetime import datetime | ||
|
||
|
||
class TestDudenEngine(SearxTestCase):
    '''Unit tests for the request/response callbacks of the duden engine.'''

    def test_request(self):
        '''request() must put a duden.de URL containing the query into params.'''
        query = 'Haus'
        dic = defaultdict(dict)
        dic['pageno'] = 1
        params = duden.request(query, dic)
        # assertIn reports both operands on failure, unlike assertTrue(a in b)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('duden.de', params['url'])

    def test_response(self):
        '''response() must return [] for an empty page and parse one section.'''
        resp = mock.Mock(text='<html></html>')
        self.assertEqual(duden.response(resp), [])

        html = """
<section class="wide">
<h2><a href="https://this.is.the.url/" class="hidden-link"><strong>This is the title</strong> also here</a></h2>
<p>This is the <strong>content</strong></p>
<a href="https://this.is.the.url/">Zum vollständigen Artikel</a>
</section>
"""

        resp = mock.Mock(text=html)
        results = duden.response(resp)

        # check the container type first so a wrong type is reported as such
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)

        # testing result (dictionary entry)
        r = results[0]
        self.assertEqual(r['url'], 'https://this.is.the.url/')
        self.assertEqual(r['title'], 'This is the title also here')
        self.assertEqual(r['content'], 'This is the content')