# Lab 2 - FTS

### Adam Księżyk

#### 28.10.2022

In [1]:
import functools
import requests
import os
from elasticsearch import Elasticsearch

from utils import read_documents

# Connect to Elasticsearch

In [2]:
HOST = "http://localhost:9200"
es = Elasticsearch(HOST)
es.info().body

{'name': 'N-20L6PF1MBSVT',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': '5ZKs25L6TviNkrMXcgbdlw',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

## Elasticsearch index

An index is created for all documents.

```
index/_doc/...
```

An analyzer is created per index

```
index/_analyzer
```

Searching

```
index/_search
```

In [3]:
ES_INDEX = "polish_bills_index"
ES_ANALYZER = "polish_bills_analyzer"
ES_DOC = "doc"

ES_SYNONYMS = [
    "kpk => kodeks postępowania karnego",
    "kpc => kodeks postępowania cywilnego",
    "kk => kodeks karny",
    "kc => kodeks cywilny"
]


In [4]:
if es.indices.exists(index=ES_INDEX):
    es.indices.delete(index=ES_INDEX)

res = es.indices.create(
    index=ES_INDEX,
    settings={
        'analysis': {
            'analyzer': {
                f'{ES_ANALYZER}': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        "lowercase",
                        "synonym",
                        "morfologik_stem"
                    ]
                }
            },
            'filter': {
                'synonym': {
                    'type': "synonym",
                    'expand': True,  # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
                    'synonyms': ES_SYNONYMS
                }
            }
        }
    },
    mappings={
        'properties': {
            'title': {
                'type': "text",
                'analyzer': f'{ES_ANALYZER}',
            }
        }
    }
)
res

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'polish_bills_index'})

# Upload documents

In [5]:
def map_document_to_action(doc_name: str) -> dict:
    return {
        'index': {
            '_id': doc_name,
        }
    }


def map_document_to_source(doc_name: str, doc_text: str) -> dict:
    return {
        'name': doc_name,
        'text': doc_text
    }


In [6]:
documents = read_documents()

In [7]:
bulk_documents = functools.reduce(
    lambda acc, x: acc + [map_document_to_action(x[0])] + [map_document_to_source(*x)],
    documents.items(),
    []
)


In [8]:
res = es.bulk(index=ES_INDEX, operations=bulk_documents)
print(f"Took: {res['took']}, errors: {res['errors']}")

Took: 2765, errors: False


# Determine the number of legislative acts containing the word ustawa (in any form)

In [9]:
res = es.count(index=ES_INDEX, query={'match': {'text': {'query': "ustawa"}}})
count_ustawa = res['count']
print(f'Documents containing the word "ustawa": {count_ustawa}')

Documents containing the word "ustawa": 1178


# Determine the number of occurrences of the word **ustawa** by searching for this particular form, including the other inflectional forms

In [10]:
ret = es.termvectors(
    index=ES_INDEX,
    id="1993_599.txt",
    fields=["text"],
    filter_path=["term_vectors.text.terms.ustawa.ttf"],
    term_statistics=True
)
ttf_ustawa = ret['term_vectors']['text']['terms']['ustawa']['ttf']
print(f'Number of occurrences of the word "ustawa": {ttf_ustawa}')

Number of occurrences of the word "ustawa": 3235


# Determine the number of occurrences of the word **ustaw** by searching for this particular form, including the other inflectional forms

In [11]:
ret = es.termvectors(
    index=ES_INDEX,
    id="1993_599.txt",
    fields=["text"],
    filter_path=["term_vectors.text.terms.ustaw.ttf"],
    term_statistics=True
)
ttf_ustaw = ret['term_vectors']['text']['terms']['ustaw']['ttf']
print(f'Number of occurrences of the word "ustaw": {ttf_ustaw}')

Number of occurrences of the word "ustaw": 909


# Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflection form

In [12]:
res = es.count(index=ES_INDEX, query={'match_phrase': {'text': "kodeks postępowania cywilnego"}})
count_kpc = res['count']
print(f'Number of documents containing the phrase "kodeks postępowania cywilnego" in any form but in specified order: {count_kpc}')

Number of documents containing the phrase "kodeks postępowania cywilnego" in any form but in specified order: 44


# Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase

In [13]:
res = es.count(index=ES_INDEX, query={'match_phrase': {'text': {'query': "wchodzi w życie", 'slop': 2}}})
count_wwz = res['count']
print(f'Number of documents containing the phrase "wchodzi w życie": {count_wwz}')

Number of documents containing the phrase "wchodzi w życie": 1174


# Determine the 10 documents that are the most relevant for the phrase konstytucja

In [14]:
res = es.search(
    index=ES_INDEX,
    query={'match': {'text': "konstytucja"}},
    filter_path=["hits.hits._id", "hits.hits._score"],
    size=10
)
res['hits']['hits']


[{'_id': '1997_629.txt', '_score': 9.632944},
 {'_id': '1999_688.txt', '_score': 7.644756}]

# Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task

In [15]:
res = es.search(
    index=ES_INDEX,
    query={'match': {'text': "konstytucja"}},
    highlight={'fields': {'text': {'number_of_fragments': 3}}},
    filter_path=["hits.hits._id", "hits.hits.highlight"],
    size=10
)
res['hits']['hits']

[{'_id': '1997_629.txt',
  'highlight': {'text': ['Zasady, na których opierać się ma <em>Konstytucja</em> mogą\n                być poddane pod referendum.']}},
 {'_id': '1999_688.txt',
  'highlight': {'text': ['Projekt ustawy nie może dotyczyć spraw, dla których <em>Konstytucja</em>\nRzeczypospolitej Polskiej zastrzega wyłączną']}}]