### Preparation

In [1]:
from elasticsearch import Elasticsearch, helpers
from nlp_common.acts_reader import ActsReader
import json
from elasticsearch_dsl import Search, Q
import numpy as np

# Name of the Elasticsearch index that stores the bills; every cell below reads it.
INDEX_NAME = 'polish-bills'

In [2]:
def create_index(client: Elasticsearch):
  """Create the ``INDEX_NAME`` index with its default analyzer, unless it already exists.

  The index is created with one shard and no replicas. Its ``default``
  analyzer chains: the ``kodeks_synonyms`` mapping char filter, the
  ``standard`` tokenizer, then the ``morfologik_stem`` and ``lowercase``
  token filters.

  Args:
    client: Elasticsearch client used to check for and create the index.

  Returns:
    None. Progress is reported with ``print``.
  """
  # Keyword form of the index argument works with both older and newer
  # elasticsearch-py releases (positional API arguments were dropped later).
  if client.indices.exists(index=INDEX_NAME):
    print(f'Index {INDEX_NAME} already exists. Skipping...')
    return

  print(f'Index {INDEX_NAME} doesn\'t exist. Creating...')
  client.indices.create(
      index=INDEX_NAME,
      body={
        'settings': {
          'number_of_shards': 1,
          'number_of_replicas': 0,
          'analysis': {
            'analyzer': {
              # Named 'default' so it applies to every text field and query
              # that does not specify an analyzer explicitly.
              'default': {
                'type': 'custom',
                'tokenizer': 'standard',
                'char_filter': [
                  'kodeks_synonyms'
                ],
                'filter': [
                  'morfologik_stem',
                  'lowercase'
                ]
              }
            },
            'char_filter': {
              # NOTE(review): a `mapping` char filter rewrites raw character
              # sequences before tokenization, so these keys presumably also
              # match inside longer words and only in lower case -- confirm
              # whether a token-level synonym filter was intended instead.
              'kodeks_synonyms': {
                'type': 'mapping',
                'mappings': [
                  'kpk => kodeks postępowania karnego',
                  'kpc => kodeks postępowania cywilnego',
                  'kk => kodeks karny',
                  'kc => kodeks cywilny'
                ]
              }
            },
          }
        }
      },
    )


def index_bills(client: Elasticsearch):
  """Bulk-index every act from ``../ustawy`` into ``INDEX_NAME`` if the index is empty.

  Each act becomes one document with a ``name`` field and a ``text`` field.
  Nothing is indexed when the index already reports at least one document.

  Args:
    client: Elasticsearch client used for the stats, bulk and refresh calls.

  Returns:
    None. Progress is reported with ``print``.
  """
  stats = client.indices.stats(index=INDEX_NAME)
  if stats['_all']['primaries']['docs']['count'] > 0:
    print('Some docs were already indexed. Skipping...')
    return

  print('Indexing bills...')
  reader = ActsReader('../ustawy')

  # A generator lets helpers.bulk stream the actions instead of holding the
  # text of every act in memory at the same time.
  actions = (
    {
      '_index': INDEX_NAME,
      '_source': {
        'name': name,
        'text': bill
      }
    }
    for name, _, bill in reader.all_acts()
  )

  helpers.bulk(client, actions)

  # Make the new documents searchable right away; without an explicit refresh
  # the count/search cells that follow may run before the periodic refresh
  # and under-report on a freshly created index.
  client.indices.refresh(index=INDEX_NAME)

  print('Indexing completed.')
  

In [3]:
# Connect to the local node, show what it reports about itself, then make
# sure the index exists and holds the bills.
client = Elasticsearch("http://localhost:9200")

resp = client.info()
print("Elastic info", resp, sep="\n")

create_index(client)
index_bills(client)

Elastic info
{'name': '85ef8b70e41e', 'cluster_name': 'docker-cluster', 'cluster_uuid': '_woZQSdVTj6ABS-nYRXEgg', 'version': {'number': '7.10.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '1c34507e66d7db1211f66f3513706fdf548736aa', 'build_date': '2020-12-05T01:00:33.671820Z', 'build_snapshot': False, 'lucene_version': '8.7.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}
Index polish-bills already exists. Skipping...
Some docs were already indexed. Skipping...


### Number of acts containing word 'ustawa'

In [4]:
# Count the documents whose `text` field matches 'ustawa'.
s = Search(using=client, index=INDEX_NAME).query(Q("match", text="ustawa"))
total = s.count()
total

1178

#### Which acts do not contain the word 'ustawa'?

In [5]:
# Ask Elasticsearch directly for the complement of the 'ustawa' match query
# instead of downloading every matching id and sending the list back in an
# `ids` query.
s_not = Search(using=client, index=INDEX_NAME)\
    .query('bool', must_not=[Q('match', text='ustawa')])

# scan() iterates over every matching document; a plain execute() would only
# return the first page of hits and silently drop the rest.
for hit in s_not.scan():
    print(hit.name)
    print(hit.text)

1996_400.txt





Brak tekstu w postaci elektronicznej 



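Indeed, this act does not contain the word 'ustawa': its only content is the placeholder "Brak tekstu w postaci elektronicznej" (no electronic text available).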

### Number of occurrences of the word 'ustawa'

In [8]:
def occurence_counter(base_word, batch_size=100):
    """Sum the term frequency of ``base_word`` in the ``text`` field over all documents.

    All document ids in ``INDEX_NAME`` are fetched first, then their term
    vectors are requested in batches through the multi term vectors API and
    the ``term_freq`` of ``base_word`` is accumulated.

    Args:
        base_word: Term to look up in the term vectors. It is compared against
            the indexed terms as-is, so it has to be in the form the analyzer
            stores.
        batch_size: Number of documents per ``mtermvectors`` request.

    Returns:
        Total number of occurrences of ``base_word`` across the index.
    """
    # Only the ids are needed here, so the document sources are excluded.
    s = Search(using=client, index=INDEX_NAME)\
        .source(excludes=['*'])
    total = s.count()
    all_ids = [hit.meta.id for hit in s[0:total].execute()]

    total_count = 0
    # Stepping by batch_size avoids the float-based ceiling division and the
    # separately maintained offset counter.
    for offset in range(0, total, batch_size):
        print(f'Analyzing batch {offset} - {offset + batch_size}')
        ids = all_ids[offset:offset + batch_size]

        response = client.mtermvectors(index=INDEX_NAME, ids=ids, fields='text')
        for bill in response['docs']:
            # A document may come back without term vectors for `text`;
            # treat that as "no occurrences" instead of raising KeyError.
            terms = bill.get('term_vectors', {}).get('text', {}).get('terms', {})
            if base_word in terms:
                total_count += terms[base_word]['term_freq']

    return total_count

In [9]:
# Total term frequency of the indexed term 'ustawa' across all documents.
res = occurence_counter('ustawa')
res

Analyzing batch 0 - 100
Analyzing batch 100 - 200
Analyzing batch 200 - 300
Analyzing batch 300 - 400
Analyzing batch 400 - 500
Analyzing batch 500 - 600
Analyzing batch 600 - 700
Analyzing batch 700 - 800
Analyzing batch 800 - 900
Analyzing batch 900 - 1000
Analyzing batch 1000 - 1100
Analyzing batch 1100 - 1200


24934

### Number of occurrences of the word 'ustaw'

In [10]:
# The counter compares against indexed terms verbatim; the recorded run returned 0,
# presumably because the analyzer never stores the inflected form 'ustaw' -- TODO confirm.
res = occurence_counter('ustaw')
res

Analyzing batch 0 - 100
Analyzing batch 100 - 200
Analyzing batch 200 - 300
Analyzing batch 300 - 400
Analyzing batch 400 - 500
Analyzing batch 500 - 600
Analyzing batch 600 - 700
Analyzing batch 700 - 800
Analyzing batch 800 - 900
Analyzing batch 900 - 1000
Analyzing batch 1000 - 1100
Analyzing batch 1100 - 1200
0


In [11]:
# Same count for the term 'ustawić' -- presumably another base form the stemmer
# assigns to 'ustaw'; verify against the analyzer output.
res = occurence_counter('ustawić')
res

Analyzing batch 0 - 100
Analyzing batch 100 - 200
Analyzing batch 200 - 300
Analyzing batch 300 - 400
Analyzing batch 400 - 500
Analyzing batch 500 - 600
Analyzing batch 600 - 700
Analyzing batch 700 - 800
Analyzing batch 800 - 900
Analyzing batch 900 - 1000
Analyzing batch 1000 - 1100
Analyzing batch 1100 - 1200


913

### Number of acts containing the phrase 'kodeks postępowania cywilnego'

In [9]:
# The stray `Q()` before "match" made this cell a syntax error; removed.
# NOTE(review): a plain `match` query matches documents containing any of the
# analyzed tokens; if the three words must appear together and in order,
# `match_phrase` is presumably what is wanted -- confirm against the task.
s = Search(using=client, index=INDEX_NAME)\
        .query("match", text="kodeks postępowania cywilnego")
total = s.count()
total

654

### Number of acts containing the phrase 'wchodzi w życie'

In [19]:
# Count documents matching the phrase with up to two positions of slop.
# NOTE(review): `match_phrase_prefix` treats the last term as a prefix; if
# only whole words should match, `match_phrase` may be intended -- confirm.
s = Search(using=client, index=INDEX_NAME).query(
    Q("match_phrase_prefix", text={'query': "wchodzi w życie", 'slop': 2})
)
total = s.count()
total

1174

### Most relevant docs for word 'konstytucja'

In [18]:
# Run the 'konstytucja' match query and list each returned hit with its score.
s = Search(using=client, index=INDEX_NAME).query(Q("match", text="konstytucja"))
response = s.execute()

for hit in response:
    print(hit.meta.score, hit.name)

6.8693404 1997_629.txt
6.664133 2000_443.txt
6.6332603 1997_604.txt
6.628133 1996_350.txt
6.2529135 1997_642.txt
6.0585814 2001_23.txt
5.9285665 1996_199.txt
5.8504066 1999_688.txt
5.4670467 1997_681.txt
5.412781 2001_1082.txt


### Excerpts containing the word 'konstytucja'

In [19]:
# Same 'konstytucja' match query as before, now with highlighting on `text`.
# The unused fuzzy query that used to be built here was dead code and has
# been removed.
s = Search(using=client, index=INDEX_NAME)\
        .query("match", text="konstytucja")\
        .highlight('text', fragment_size=15)

response = s.execute()

for hit in response:
    # Show at most three highlighted fragments per document.
    print(hit.meta.score, hit.meta.highlight.text[:3])

6.8693404 ['i uchwalenia <em>Konstytucji</em>', 'i \nuchwalenia <em>Konstytucji</em>', 'projektu nowej <em>Konstytucji</em>']
6.664133 ['1 i art. 90 <em>Konstytucji</em>', '1 lub art. 90 <em>Konstytucji</em>', '1 lub art. 90 <em>Konstytucji</em>']
6.6332603 ['do zgodności z <em>Konstytucją</em>', 'politycznej z <em>Konstytucją</em>', 'sprzeczności z <em>Konstytucją</em>']
6.628133 ['Za naruszenie <em>Konstytucji</em>', 'za naruszenie <em>Konstytucji</em>', 'za naruszenie <em>Konstytucji</em>']
6.2529135 ['<em>Konstytucją</em>', 'organy państwowe, z <em>Konstytucją</em>', 'stwierdzenie zgodności z <em>Konstytucją</em>']
6.0585814 ['ogłasza się:\n   1) <em>Konstytucję</em>', 'zatwierdzającego zmianę <em>Konstytucji</em>', 'na określone w <em>Konstytucji</em>']
5.9285665 ['przygotowania i uchwalenia <em>Konstytucji</em>', 'przygotowania i uchwalenia <em>Konstytucji</em>', 'zarządza poddanie <em>Konstytucji</em>']
5.8504066 ['art. 118 ust. 2 <em>Konstytucji</em>', 'spraw, dla których <em>K