# Lab 2 - FTS

### Adam Księżyk

#### 28.10.2022

In [1]:
import functools
import itertools
from elasticsearch import Elasticsearch

from utils import read_documents

# Connect to Elasticsearch

In [3]:
HOST = "http://localhost:9200"
es = Elasticsearch(HOST)
es.info().body

{'name': 'N-20L6PF1MBSVT',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': '5ZKs25L6TviNkrMXcgbdlw',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

# Create index

## Elasticsearch index

An index is created for all documents.

```
index/_doc/...
```

An analyzer is created per index

```
index/_analyzer
```

Searching

```
index/_search
```

In [4]:
ES_INDEX = "polish_bills_index"
ES_ANALYZER = "polish_bills_analyzer"
ES_DOC = "doc"

ES_SYNONYMS = [
    "kpk => kodeks postępowania karnego",
    "kpc => kodeks postępowania cywilnego",
    "kk => kodeks karny",
    "kc => kodeks cywilny"
]


In [4]:
if es.indices.exists(index=ES_INDEX):
    es.indices.delete(index=ES_INDEX)

res = es.indices.create(
    index=ES_INDEX,
    settings={
        'analysis': {
            'analyzer': {
                f'{ES_ANALYZER}': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        "lowercase",
                        "synonym",
                        "morfologik_stem"
                    ]
                }
            },
            'filter': {
                'synonym': {
                    'type': "synonym",
                    'expand': True,  # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
                    'synonyms': ES_SYNONYMS
                }
            }
        }
    },
    mappings={
        'properties': {
            'text': {
                'type': "text",
                'analyzer': f'{ES_ANALYZER}',
            }
        }
    }
)
res

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'polish_bills_index'})

# Upload documents

In [5]:
def map_document_to_action(doc_name: str) -> dict:
    return {
        'index': {
            '_id': doc_name,
        }
    }


def map_document_to_source(doc_name: str, doc_text: str) -> dict:
    return {
        'name': doc_name,
        'text': doc_text
    }


In [6]:
documents = list(read_documents().items())

In [7]:
def bulk_documents(documents: list[tuple[str, str]]) -> list[dict]:
    return functools.reduce(
        lambda acc, x: acc + [map_document_to_action(x[0])] + [map_document_to_source(*x)],
        documents,
        []
    )


In [8]:
n = 300
documents_chunked = [documents[i:i + n] for i in range(0, len(documents), n)]
n_chunks = len(documents_chunked)
for i, docs in enumerate(documents_chunked, 1):
    print(f"Uploading chunk {i}/{n_chunks}")
    res = es.bulk(index=ES_INDEX, operations=bulk_documents(docs))
    print(f"Took: {res['took']}, errors: {res['errors']}")

Uploading chunk 1/4
Took: 5016, errors: False
Uploading chunk 2/4
Took: 5678, errors: False
Uploading chunk 3/4
Took: 5640, errors: False
Uploading chunk 4/4
Took: 4982, errors: False


# Determine the number of legislative acts containing the word ustawa (in any form)

In [9]:
res = es.count(index=ES_INDEX, query={'match': {'text': {'query': "ustawa"}}})
count_ustawa = res['count']
print(f'Documents containing the word "ustawa": {count_ustawa}')

Documents containing the word "ustawa": 1178


# Determine the number of occurrences of the word **ustawa** by searching for this particular form, including the other inflectional forms

In [10]:
res = es.termvectors(
    index=ES_INDEX,
    id="1993_599.txt",
    fields=["text"],
    filter_path=["term_vectors.text.terms.ustawa.ttf"],
    term_statistics=True
)
ttf_ustawa = res['term_vectors']['text']['terms']['ustawa']['ttf']
print(f'Number of occurrences of the word "ustawa": {ttf_ustawa}')

Number of occurrences of the word "ustawa": 24934


# Determine the number of occurrences of the word **ustaw** by searching for this particular form, including the other inflectional forms

In [8]:
res = es.indices.analyze(index=ES_INDEX, analyzer=ES_ANALYZER, text="ustaw")
words_ustaw = [t['token'] for t in res['tokens']]
words_ustaw

['ustawa', 'ustawić']

In [10]:
ttf_ustaw = 0
for word in words_ustaw:
    res = es.termvectors(
        index=ES_INDEX,
        id="1993_599.txt",
        fields=["text"],
        filter_path=["term_vectors.text.terms.ustawić.ttf"],
        term_statistics=True
    )
    ttf_ustaw += res['term_vectors']['text']['terms']['ustawić']['ttf']
print(f'Number of occurrences of the word "ustaw": {ttf_ustaw}')

Number of occurrences of the word "ustaw": 1826


# Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflection form

In [12]:
res = es.count(index=ES_INDEX, query={'match_phrase': {'text': "kodeks postępowania cywilnego"}})
count_kpc = res['count']
print(f'Number of documents containing the phrase "kodeks postępowania cywilnego" in any form but in specified order: {count_kpc}')

Number of documents containing the phrase "kodeks postępowania cywilnego" in any form but in specified order: 99


# Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase

In [13]:
res = es.count(index=ES_INDEX, query={'match_phrase': {'text': {'query': "wchodzi w życie", 'slop': 2}}})
count_wwz = res['count']
print(f'Number of documents containing the phrase "wchodzi w życie": {count_wwz}')

Number of documents containing the phrase "wchodzi w życie": 1174


# Determine the 10 documents that are the most relevant for the phrase konstytucja

In [14]:
res = es.search(
    index=ES_INDEX,
    query={'match': {'text': "konstytucja"}},
    filter_path=["hits.hits._id", "hits.hits._score"],
    size=10
)
res['hits']['hits']


[{'_id': '1997_629.txt', '_score': 6.867635},
 {'_id': '2000_443.txt', '_score': 6.662749},
 {'_id': '1997_604.txt', '_score': 6.6320543},
 {'_id': '1996_350.txt', '_score': 6.626803},
 {'_id': '1997_642.txt', '_score': 6.251624},
 {'_id': '2001_23.txt', '_score': 6.0579295},
 {'_id': '1996_199.txt', '_score': 5.928016},
 {'_id': '1999_688.txt', '_score': 5.8496947},
 {'_id': '1997_681.txt', '_score': 5.466536},
 {'_id': '2001_1082.txt', '_score': 5.466536}]

# Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task

In [15]:
res = es.search(
    index=ES_INDEX,
    query={'match': {'text': "konstytucja"}},
    highlight={'fields': {'text': {'number_of_fragments': 3}}},
    filter_path=["hits.hits._id", "hits.hits.highlight"],
    size=10
)
res['hits']['hits']

[{'_id': '1997_629.txt',
  'highlight': {'text': ['o zmianie ustawy konstytucyjnej o trybie przygotowania\n           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej',
    'W ustawie  konstytucyjnej z  dnia 23 kwietnia 1992 r. o trybie przygotowania i \nuchwalenia <em>Konstytucji</em>',
    'Do zgłoszenia projektu <em>Konstytucji</em> załącza się wykaz \n                obywateli popierających zgłoszenie']}},
 {'_id': '2000_443.txt',
  'highlight': {'text': ['umowy międzynarodowej i nie wypełnia przesłanek określonych w art. 89\n     ust. 1 lub art. 90 <em>Konstytucji</em>',
    'międzynarodowej lub załącznika nie\n     wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>',
    'co do zasadności wyboru\n  trybu ratyfikacji umowy międzynarodowej, o którym mowa w art. 89 ust. 2\n  <em>Konstytucji</em>']}},
 {'_id': '1997_604.txt',
  'highlight': {'text': ['Jeżeli Trybunał Konstytucyjny wyda orzeczenie o sprzeczności celów partii \n   politycznej z <em>Kons