In [1]:
# !pip install -U Elasticsearch
# !pip install csv2es

In [2]:
# pip install csv2es
import csv
import time

import csv2es
from elasticsearch import helpers, Elasticsearch

In [3]:
from pyspark.sql import SparkSession
# Local Spark session; getOrCreate() reuses an already-active session if one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# Load the source CSV as a Spark DataFrame; header=True takes column names from the first row.
df = spark.read.csv(
    path='handson_spark/DNPBA2017.csv',
    header=True,
)

In [5]:
# Keep the four columns used below, cap the sample at 20 rows and collect it to pandas.
# NOTE: this rebinds `df` from a Spark DataFrame to a pandas DataFrame, so the
# cell cannot be re-run without first re-running the cell that reads the CSV.
df = (
    df
    .select(['NUMERODN', 'CODESTAB', 'IDADEMAE', 'ESCMAEAGR1'])
    .limit(20)
    .toPandas()
)

In [6]:
# df.to_csv('DNPBA2017_es.csv', index=False)

In [28]:
# Elasticsearch client for the node at elk:9200, with HTTP compression enabled.
es = Elasticsearch(hosts=['http://elk:9200/'], http_compress=True)

In [8]:
# Send every row of the pandas frame to "test-index" as its own document
# (one request per row), echoing the JSON payload that is sent.
for index, rec in df.iterrows():
    doc_json = rec.to_json()  # serialise once; the same string is printed and indexed
    print(doc_json)
    res = es.index(index="test-index", doc_type='nasc', body=doc_json)

{"NUMERODN":"72390242","CODESTAB":"2786095","IDADEMAE":"25","ESCMAEAGR1":"01"}
{"NUMERODN":"72396320","CODESTAB":"NA","IDADEMAE":"20","ESCMAEAGR1":"02"}
{"NUMERODN":"72374657","CODESTAB":"NA","IDADEMAE":"20","ESCMAEAGR1":"04"}
{"NUMERODN":"72376065","CODESTAB":"2755157","IDADEMAE":"28","ESCMAEAGR1":"06"}
{"NUMERODN":"72392109","CODESTAB":"7373120","IDADEMAE":"34","ESCMAEAGR1":"12"}
{"NUMERODN":"72391035","CODESTAB":"2755157","IDADEMAE":"23","ESCMAEAGR1":"12"}
{"NUMERODN":"72389213","CODESTAB":"2755157","IDADEMAE":"30","ESCMAEAGR1":"04"}
{"NUMERODN":"75447854","CODESTAB":"2755157","IDADEMAE":"16","ESCMAEAGR1":"04"}
{"NUMERODN":"69747564","CODESTAB":"2364816","IDADEMAE":"17","ESCMAEAGR1":"02"}
{"NUMERODN":"69726416","CODESTAB":"2777770","IDADEMAE":"33","ESCMAEAGR1":"08"}
{"NUMERODN":"69726448","CODESTAB":"2777770","IDADEMAE":"33","ESCMAEAGR1":"06"}
{"NUMERODN":"69726554","CODESTAB":"2777770","IDADEMAE":"28","ESCMAEAGR1":"06"}
{"NUMERODN":"69726562","CODESTAB":"2777770","IDADEMAE":"18","E

In [10]:
# Query body: exact-term lookup on CODESTAB.
content = {"query": {"term": {"CODESTAB": "2786095"}}}

In [12]:
# Query body: IDADEMAE in the half-open interval [20, 30).
# NOTE(review): the indexing cell sends IDADEMAE as a JSON string; confirm the
# field mapping so this numeric range is not evaluated lexicographically.
content = {
    "query": {
        "range": {"IDADEMAE": {"gte": 20, "lt": 30}},
    },
}

In [14]:
# Query body: boolean AND of two term clauses (IDADEMAE and ESCMAEAGR1).
content = {
    "query": {
        "bool": {
            "must": [
                {"term": {"IDADEMAE": 21}},
                {"term": {"ESCMAEAGR1": "06"}},
            ],
        },
    },
}

In [29]:
# Run whichever `content` query body was assigned most recently against "test-index".
es.search(body=content, index="test-index")

{'took': 0,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 3.6686769,
  'hits': [{'_index': 'test-index',
    '_type': 'nasc',
    '_id': '1gVuzW8BYV5JYeU8EmXr',
    '_score': 3.6686769,
    '_source': {'NUMERODN': '69726563',
     'CODESTAB': '2777770',
     'IDADEMAE': '21',
     'ESCMAEAGR1': '06'}}]}}

In [23]:
def buscaExata(numerodn, codestab, idademae, escmaeagr1, startId=0,
               max_attempts=5, retry_delay=0.5):
    """Exact search in "test-index": every one of the four fields must match.

    Builds a ``bool``/``must`` query with ``match_phrase`` clauses on NUMERODN
    and CODESTAB (each value wrapped in double quotes) and ``match`` clauses on
    IDADEMAE and ESCMAEAGR1, limited to 30 hits, and runs it through the
    module-level ``es`` client.

    Parameters
    ----------
    numerodn, codestab
        Values for the two ``match_phrase`` clauses. They are formatted into a
        string, so non-string values (e.g. ints) are accepted.
    idademae, escmaeagr1
        Values passed unchanged to the two ``match`` clauses.
    startId
        Unused; kept so existing callers keep working.
    max_attempts : int
        Total number of times the search is attempted before giving up.
    retry_delay : float
        Seconds to wait between two attempts.

    Returns
    -------
    list
        The ``res['hits']['hits']`` list of the search response.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    Exception
        The error raised by the last attempt once all attempts have failed.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    content = {
        'size': 30,
        'query': {
            'bool': {
                'must': [
                    {'match_phrase': {'NUMERODN': '"{}"'.format(numerodn)}},
                    {'match_phrase': {'CODESTAB': '"{}"'.format(codestab)}},
                    {'match': {'IDADEMAE': idademae}},
                    {'match': {'ESCMAEAGR1': escmaeagr1}}
                ]
            }
        }
    }

    # Bounded retry: the previous `while True / except: pass` loop never ended
    # on a permanent failure and also swallowed KeyboardInterrupt.
    for attempt in range(1, max_attempts + 1):
        try:
            res = es.search(index="test-index", body=content)
        except Exception:
            if attempt == max_attempts:
                raise  # out of attempts: surface the real error to the caller
            time.sleep(retry_delay)
        else:
            return res['hits']['hits']

In [24]:
def buscaAproximada(numerodn, codestab, idademae, escmaeagr1, startId=0,
                    max_attempts=5, retry_delay=0.5):
    """Approximate search in "test-index" using fuzzy ``match`` clauses.

    Builds a ``bool``/``should`` query with one ``match`` clause per field,
    each with ``fuzziness`` "AUTO" and operator "or". NUMERODN and CODESTAB
    carry boost "2", IDADEMAE carries boost "0.5" and ESCMAEAGR1 has no
    explicit boost. At most 100 hits are requested. The query is run through
    the module-level ``es`` client.

    Parameters
    ----------
    numerodn, codestab, idademae, escmaeagr1
        Query values for the four ``match`` clauses, passed unchanged.
    startId
        Unused; kept so existing callers keep working.
    max_attempts : int
        Total number of times the search is attempted before giving up.
    retry_delay : float
        Seconds to wait between two attempts.

    Returns
    -------
    list
        The ``res['hits']['hits']`` list of the search response.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    Exception
        The error raised by the last attempt once all attempts have failed.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    content = {
        'size': 100,
        'query': {
            'bool': {
                'should': [
                    {'match': {'NUMERODN': {'query': numerodn, 'fuzziness':'AUTO', 'operator':'or', 'boost':'2'}}},
                    {'match': {'CODESTAB': {'query': codestab, 'fuzziness':'AUTO', 'operator':'or', 'boost':'2'}}},
                    {'match': {'IDADEMAE': {'query': idademae, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'ESCMAEAGR1': {'query': escmaeagr1, 'fuzziness':'AUTO', 'operator':'or'}}}
                ]
            }
        }
    }

    # Bounded retry: the previous `while True / except: pass` loop never ended
    # on a permanent failure and also swallowed KeyboardInterrupt.
    for attempt in range(1, max_attempts + 1):
        try:
            res = es.search(index="test-index", body=content)
        except Exception:
            if attempt == max_attempts:
                raise  # out of attempts: surface the real error to the caller
            time.sleep(retry_delay)
        else:
            return res['hits']['hits']

In [25]:
# Exact search with the values of an indexed record.
buscaExata(numerodn="72390242", codestab="2786095", idademae="25", escmaeagr1="01")

[{'_index': 'test-index',
  '_type': 'nasc',
  '_id': 'yQVuzW8BYV5JYeU8D2WH',
  '_score': 10.55623,
  '_source': {'NUMERODN': '72390242',
   'CODESTAB': '2786095',
   'IDADEMAE': '25',
   'ESCMAEAGR1': '01'}}]

In [19]:
# Same identifiers, but IDADEMAE changed from "25" to "33".
buscaExata(numerodn="72390242", codestab="2786095", idademae="33", escmaeagr1="01")

[]

In [20]:
# Approximate search with the values of an indexed record.
buscaAproximada(numerodn="72390242", codestab="2786095", idademae="25", escmaeagr1="01")

[{'_index': 'my-index',
  '_type': 'my-type',
  '_id': '3QVuzW8BYV5JYeU8GWXV',
  '_score': 14.514815,
  '_source': {'NUMERODN': '72390242',
   'CODESTAB': '2786095',
   'IDADEMAE': '25',
   'ESCMAEAGR1': '01'}}]

In [21]:
# Approximate search with IDADEMAE changed from "25" to "33".
buscaAproximada(numerodn="72390242", codestab="2786095", idademae="33", escmaeagr1="01")

[{'_index': 'my-index',
  '_type': 'my-type',
  '_id': '3QVuzW8BYV5JYeU8GWXV',
  '_score': 13.195287,
  '_source': {'NUMERODN': '72390242',
   'CODESTAB': '2786095',
   'IDADEMAE': '25',
   'ESCMAEAGR1': '01'}},
 {'_index': 'my-index',
  '_type': 'my-type',
  '_id': '5gVuzW8BYV5JYeU8GWXV',
  '_score': 1.0641159,
  '_source': {'NUMERODN': '69726416',
   'CODESTAB': '2777770',
   'IDADEMAE': '33',
   'ESCMAEAGR1': '08'}},
 {'_index': 'my-index',
  '_type': 'my-type',
  '_id': '5wVuzW8BYV5JYeU8GWXV',
  '_score': 1.0641159,
  '_source': {'NUMERODN': '69726448',
   'CODESTAB': '2777770',
   'IDADEMAE': '33',
   'ESCMAEAGR1': '06'}}]