In [1]:
import lucene
from lupyne import engine
import json
import unicodedata
import os
import re

In [2]:
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7f0b0082a410>

In [3]:
def str2ascii(string):
    """Return ``string`` with diacritics and other non-ASCII characters removed.

    The text is decomposed with Unicode NFD, so an accented letter becomes a
    base letter followed by combining marks. Encoding to ASCII with errors
    ignored then drops those marks, along with any other character that has
    no ASCII representation.

    Args:
        string: The text to convert.

    Returns:
        A ``str`` containing only ASCII characters.
    """
    decomposed = unicodedata.normalize('NFD', string)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    # ASCII bytes are always valid UTF-8, so this decode cannot fail.
    return ascii_bytes.decode("utf-8")

def get_stop_words_re_patterns(file_path='./stop_words.txt'):
    """Compile one case-insensitive regex per stop word listed in a text file.

    Each line of the file is converted with ``str2ascii`` and stripped of
    trailing whitespace before being compiled.

    Args:
        file_path: Path of a UTF-8 text file with one stop word per line.
            Defaults to ``'./stop_words.txt'``.

    Returns:
        A list of compiled patterns, in file order. Each pattern matches a
        run of one or more occurrences of its stop word where every
        occurrence has a single space on both sides (``' w '``, ``' w w '``,
        ...). Replacing a match with ``' '`` therefore removes the word(s)
        and leaves one separating space. A stop word at the very start or
        end of a text, or next to punctuation, is not matched.
    """
    # `with` closes the file even if reading or decoding a line raises.
    with open(file_path, 'r', encoding='UTF-8') as f:
        words = [str2ascii(line).rstrip() for line in f]

    patterns = []
    for stop_word in words:
        if not stop_word:
            # A blank line is not a stop word; compiling it would yield a
            # pattern that matches any two consecutive spaces.
            continue
        # re.escape keeps regex metacharacters in a stop word literal.
        # The repeated group matches back-to-back repeats (' w w ') in one
        # go: with the plain ' w ' form the shared space is consumed by the
        # first match, so every second repeat would survive substitution.
        w = re.compile(f' (?:{re.escape(stop_word)} )+', re.IGNORECASE)
        patterns.append(w)
    return patterns

In [4]:
# Index handle used by the cells below; 'lucene_index' is the index location
# passed to lupyne.
indexer = engine.Indexer('lucene_index')

# Metadata fields: stored so they can be read back from a hit. No field class
# is passed, so lupyne's default field type applies.
for stored_field in ('file_path', 'url'):
    indexer.set(stored_field, stored=True)

# Full-text fields: stored and analysed as text. The last call stays a bare
# expression so the cell still displays the resulting Field settings.
indexer.set('title', engine.Field.Text, stored=True)
indexer.set('content', engine.Field.Text, stored=True)

<Field: stored,indexed,tokenized>

In [5]:
def prepare_text(text, patterns):
    """Convert ``text`` to ASCII and remove every stop-word match from it.

    Args:
        text: The raw text (for example an article title or body).
        patterns: Iterable of compiled regular expressions; each match is
            replaced by a single space. The patterns are expected to match
            stop words delimited by spaces, as produced by
            ``get_stop_words_re_patterns``.

    Returns:
        The ASCII text with the stop words removed.
    """
    text = str2ascii(text)
    # The stop-word patterns require a space on both sides of the word, so a
    # stop word at the very beginning or end of the text would never match.
    # Pad with one space on each side so those occurrences are removed too.
    text = f' {text} '
    for p in patterns:
        text = p.sub(' ', text)
    # Remove the padding again (at most one space per side), so text that
    # contains no boundary stop word comes back unchanged.
    if text.startswith(' '):
        text = text[1:]
    if text.endswith(' '):
        text = text[:-1]
    return text

# Stop-word patterns are compiled once and reused for every article.
patterns = get_stop_words_re_patterns()

# The article dump is split into parts 1..6; each part is loaded completely,
# then indexed, then committed.
for base_path_n in range(1, 7):
    base_path = f'/usr/src//HTML/articles_output_part_{base_path_n}/'

    # Phase 1: read every JSON document of this part, in sorted file order.
    articles = []
    for current_file_path in sorted(os.listdir(base_path)):
        # Skip '_SUCCESS' marker entries and hidden (dot-prefixed) files.
        if '_SUCCESS' in current_file_path or str(current_file_path).startswith('.'):
            continue
        with open(base_path + current_file_path, 'r', encoding='utf-8') as json_file:
            doc = json.load(json_file)
        # Remember the path relative to the HTML root so a search hit can be
        # mapped back to its source file.
        doc['file_path'] = f'articles_output_part_{base_path_n}/' + current_file_path
        articles.append(doc)

    # Phase 2: add the documents; title and content are ASCII-folded and
    # stripped of stop words, file_path and url are passed through unchanged.
    for article in articles:
        indexer.add(
            file_path=article['file_path'],
            url=article['url'],
            title=prepare_text(article['title'], patterns),
            content=prepare_text(article['content'], patterns),
        )

    # One commit per part.
    indexer.commit()

In [6]:
# Query the 'content' field for the ASCII-folded term, asking for up to 20 hits.
hits = indexer.search('vichrica', field='content', count=20)
for hit in hits:
    # The index holds the ASCII-folded text, so the original URL and title
    # are read back from the source JSON file the hit points to.
    source_path = '/usr/src//HTML/' + hit['file_path']
    with open(source_path, 'r', encoding='utf-8') as json_file:
        doc = json.load(json_file)
    print(doc['url'])
    print(doc['title'])

https://www.aktuality.sk/clanok/868385/kauzy-burka-a-vichrica-maju-noveho-dozorujuceho-prokuratora/
Kauzy Búrka a Víchrica majú nového dozorujúceho prokurátora
https://www.aktuality.sk/clanok/625993/tyzden-sa-zacne-studenym-a-veternych-pocasim/
Týždeň sa začne studeným a veterných počasím
https://www.aktuality.sk/clanok/835492/ministerka-kolikova-pozastavila-vykon-funkcie-dvom-sudcom-z-akcie-vichrica/
Ministerka Kolíková pozastavila výkon funkcie dvom sudcom z akcie Víchrica
https://www.aktuality.sk/clanok/534376/vichrica-a-sneh-potrapia-slovensko-aj-dnes/
Víchrica a sneh potrápia Slovensko aj dnes
https://www.aktuality.sk/clanok/766352/cez-vikend-zasiahne-slovensko-silny-vietor/
Cez víkend zasiahne Slovensko silný vietor
https://www.aktuality.sk/clanok/139007/taliansko-burky-a-vichrica-si-vyziadali-jednu-obet/
Taliansko: Búrky a víchrica si vyžiadali jednu obeť
https://www.aktuality.sk/clanok/730727/na-severe-slovenska-moze-dnes-sprchnut/
Na severe Slovenska môže dnes spŕchnuť
https:/