# Sample Documents

In [134]:
import getpass
import os

# Directory holding the downloaded Google News JSON files.
# Built from the current user's home directory instead of a hard-coded
# '/Users/<name>' prefix, so the notebook is not tied to the macOS layout.
_dir = os.path.join(os.path.expanduser('~'), 'Downloads', 'google_news', 'Olympics')
print(_dir)

/Users/bage/Downloads/google_news/Olympics


In [135]:
import json
import os

doc_list = []

# Sort the listing so the order (and therefore doc_list[0]) is deterministic;
# os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(_dir)):
    # Skip anything that is not a JSON document (e.g. macOS '.DS_Store'),
    # which would otherwise make json.load() fail.
    if not filename.endswith('.json'):
        continue
    print(filename)
    # Read explicitly as UTF-8 rather than relying on the platform default.
    with open(os.path.join(_dir, filename), encoding='utf-8') as f:
        doc = json.load(f)
    # Rename the 'body' key to 'content' and record its whitespace-token count.
    doc['content'] = doc.pop('body')
    doc['content_length'] = len(doc['content'].split())
    doc_list.append(doc)

12_local_athletes_heading_to_Rio_2016_Olympics.json
2016_Rio_Olympics_womens_soccer_Brazil_players_give_hosts_a_winning_start_to_Olympics.json
Archery_adopts_new_scoring_system_and_floodlights_at.json
Beyond_2016_where_to_try_the_five_new_Olympic_sports_in_Rio_de_Janeiro.json
Elite_Japanese_Gymnast_Reportedly_Owed_5K_in_Data_Overages_After_Playing_Pokemon_Go_in_Rio_Before_Olympics.json
Gender_divides_in_the_language_of_sport.json
Google_Maps_is_adding_ofteninvisible_favelas_to_their_cartography_of_Rio_ahead_of_the_Olympics.json
Mexico_vs_Germany_Olympics_2016_Prediction_Preview_Squads_For_Soccer_Group_Match.json
Michael_Phelps_Selected_As_’s_Flag_Bearer_For_Rio_2016_Olympic_Games.json
Nadal_confirms_he_will_play_at__as_Djokovic_and_Murray_hit_the_courts.json
Olympic_refugee_team_arrives_in_athletes_village_to_rapturous_welcome.json
Olympics_2016_Somali_athletes_hard_road_to_Rio.json
Olympics_2016_When_is_the_gymnastics_and_how_to_watch_it.json
Rio_2016_Andy_Murray_to_be_Team_GB_flag_be

In [136]:
import pprint

# Pretty-print the first loaded document to inspect its keys and values.
pprint.PrettyPrinter().pprint(doc_list[0])

{'category': 'Olympics',
 'content': 'With the opening ceremonies for the 31st Olympiad kicking off on '
            'Friday, excitement is building for the 2016 Olympic Games in Rio '
            'de Janeiro, Brazil.\n'
            '\n'
            'Locally, we are represented across a wide range of sports '
            'including rowing, boxing, and golf.\n'
            '\n'
            'Here is a list of our local athletes who will be competing over '
            'the next couple of weeks:',
 'content_length': 58,
 'title': '12 local athletes heading to Rio 2016 Olympics'}


# Connection

In [137]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl.connections import connections

# Register the global connection in elasticsearch_dsl's connection registry.
connections.create_connection(hosts='localhost', timeout=30)

# Separate low-level client, used for direct API calls such as es.index().
es = Elasticsearch(hosts='localhost', timeout=30)

print(connections, es)

<elasticsearch_dsl.connections.Connections object at 0x106aa9828> <Elasticsearch([{'host': 'localhost'}])>


# Type Mapping (Table Schema)

In [138]:
# noinspection PyUnresolvedReferences
import traceback

from elasticsearch_dsl import Index
from elasticsearch_dsl import analysis, DocType, String, Date, Integer, analyzer


class GoogleNews(DocType):
    """Document mapping for Google News articles.

    Declares the document fields and the custom analyzer applied to
    ``content``. Documents target index ``google_news_v01`` and type
    ``google_news``.
    """

    # Custom 'html_strip' char filter with no escaped tags, and a 'stop' token
    # filter configured with an empty stopword list.
    __google_news_char_filter = analysis.char_filter('google_news_char_filter', 'html_strip', escaped_tags=[])
    __google_news_token_filter = analysis.token_filter('google_news_token_filter', 'stop', stopwords=[])

    # Analyzer for `content`: strip HTML, split on whitespace, then lowercase.
    google_news_token_analyzer = analyzer(
        'google_news_token_analyzer',
        type='custom',
        char_filter=['html_strip', __google_news_char_filter],
        tokenizer='whitespace',  # tokens by whitespace
        filter=['lowercase', __google_news_token_filter],
    )

    index = 'google_news_v01'  # for searching
    doc_type = 'google_news'

    title = String()
    category = String(index='not_analyzed')
    content = String(
        analyzer=google_news_token_analyzer,  # whitespace-separated, lowercased tokens
        # term_vector='with_positions',
        # fields={'raw': String(index='not_analyzed')}
    )
    content_length = Integer(index='not_analyzed')

    def __init__(self, **kwargs):
        """Build a document from field values given as keyword arguments.

        The base class is always initialised, so an instance created without
        a ``title`` is still a valid (empty) document. When ``title`` is
        given, the document is bound to ``cls.index``, the title becomes the
        document id, and ``content_length`` is set to the number of
        whitespace-separated tokens in ``content`` (if ``content`` is given).
        """
        super().__init__(**kwargs)
        if 'title' in kwargs:
            self.meta.index = self.index
            self.meta.id = kwargs['title']
            content = kwargs.get('content')
            if content is not None:
                self.content_length = len(content.split())

    @classmethod
    def recreate_index(cls):
        """Drop the index if it exists and create it again with this mapping.

        Both steps are best-effort: a failure is printed as a traceback and
        not re-raised. Only ``Exception`` subclasses are caught, so
        ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
        """
        new_index = Index(cls.index)
        try:
            new_index.doc_type(cls)
            new_index.delete(ignore=404)  # a missing index (404) is not an error
        except Exception:
            print(traceback.format_exc())
        try:
            new_index.create()
        except Exception:
            print(traceback.format_exc())


# Indexing (python)
https://elasticsearch-py.readthedocs.io/en/master/

In [139]:
# Start from a fresh index, then index every loaded document one at a time.
GoogleNews.recreate_index()
for doc in doc_list:
    res = es.index(
        index=GoogleNews.index,
        doc_type=GoogleNews.doc_type,
        body=doc,
    )

# Manage with ES HQ (Web admin plugin)
http://localhost:9200/_plugin/hq/

# Searching (python)

In [140]:
import logging
class Searcher(object):
    """Small helper that runs ``field:term`` query-string searches against one index/type."""

    def __init__(self, index=GoogleNews.index, doc_type=GoogleNews.doc_type):
        """Create a client for ``localhost`` and remember the target index and doc type.

        Also raises the level of the HTTP/Elasticsearch loggers to CRITICAL.
        """
        for logger_name in ('urllib3', 'requests', 'elasticsearch', 'elasticsearch.trace'):
            logging.getLogger(logger_name).setLevel(logging.CRITICAL)

        self.es = Elasticsearch(hosts='localhost', timeout=300)
        self.index = index
        self.doc_type = doc_type

    def escape(self, keyword):
        """Return ``keyword`` with underscores turned into spaces and special characters backslash-escaped.

        Backslashes already present in ``keyword`` are not escaped.
        """
        keyword = keyword.replace('_', ' ')
        for c in ['.', '%', '+', '-', '=', '&&', '||', '>', '<', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*',
                  '?', ':', '/']:  # '\\' is intentionally not in this list
            keyword = keyword.replace(c, r'\%s' % c)
        return keyword

    @staticmethod
    def _field_query(field, q):
        """Build the ``field:term`` query string shared by search() and search_all()."""
        return '%s:%s' % (field, q)

    def search(self, q, field='content'):
        """Run a single query-string search and return the raw response (first page only)."""
        query = self._field_query(field, q)
        return self.es.search(index=self.index, doc_type=self.doc_type, q=query)  # q=Query String Query

    def search_all(self, q, field='content'):
        """Run the search and keep paging with ``from_`` until all hits are collected.

        Returns the first response with ``hits.hits`` replaced by the combined
        hit list and ``took`` summed over all requests. Paging stops early if a
        page comes back empty, so a stale ``total`` cannot cause an endless loop.
        """
        query = self._field_query(field, q)
        r = self.es.search(index=self.index, doc_type=self.doc_type, q=query)  # q=Query String Query
        hits = list(r['hits']['hits'])

        while r['hits']['total'] > len(hits):  # need more hits
            page = self.es.search(index=self.index, doc_type=self.doc_type, q=query, from_=len(hits))
            r['took'] += page['took']  # sum elapsed time.
            page_hits = page['hits']['hits']
            if not page_hits:
                # No progress is possible; stop instead of looping forever.
                break
            hits.extend(page_hits)

        r['hits']['hits'] = hits
        return r
    
# Shared searcher instance used by the query cells below.
searcher = Searcher(
    index=GoogleNews.index,
    doc_type=GoogleNews.doc_type,
)

In [141]:
# Query the `content` field. It is analyzed with the custom whitespace/lowercase
# analyzer (it is not a not_analyzed field), so 'Games' finds the same documents as 'games'.
searcher.search('Games', field='content')

{'_shards': {'failed': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': 'AVZVn-gSqzY1t1OkhrzC',
    '_index': 'google_news_v01',
    '_score': 0.3675009,
    '_source': {'category': 'Olympics',
     'content': 'Lagoa Stadium, set on a lake in the heart of the city, will provide a unique environment for Olympic Games athletes and spectators',
     'content_length': 23,
     'title': "'s one-of-a-kind rowing venue provides challenges and opportunities"},
    '_type': 'google_news'},
   {'_id': 'AVZVn-gZqzY1t1OkhrzG',
    '_index': 'google_news_v01',
    '_score': 0.35625812,
    '_source': {'category': 'Olympics',
     'content': 'Superstar Jamaican sprinter, who has been keeping a low profile since arriving in Rio, posts homage to Olympic Games online',
     'content_length': 20,
     'title': 'Usain Bolt gets dressed up in colours'},
    '_type': 'google_news'},
   {'_id': 'AVZVn-fDqzY1t1OkhryU',
    '_index': 'google_news_v01',
    '_score': 0.21960393,
    '_source': {'cat

In [142]:
# Query the `title` field (declared as String() without index='not_analyzed')
# and show only the 'hits' part of the response.
searcher.search('games', field='title')['hits']

{'hits': [{'_id': 'AVZVn-f8qzY1t1Okhryx',
   '_index': 'google_news_v01',
   '_score': 0.96232545,
   '_source': {'category': 'Olympics',
    'content': 'Rio 2016 officially gets under way with the opening ceremony on Friday - but the action kicks off on two days before that with the football.\n\nThe first group games in the women\'s tournament take place on Wednesday - so-called day minus 2 - with Sweden facing South Africa at 17:00 BST.\n\nThe men start on Thursday, when Iraq play Denmark (17:00 BST).\n\nHere\'s all you need to know...\n\nIt\'s simply for scheduling reasons, as it was at London 2012.\n\nWith 16 men\'s and 12 women\'s teams, there are 58 matches to get through, with only six rest days.\n\nBarcelona forward Neymar will captain a talented Brazil squad aiming to win its first Olympic gold medal.\n\nHe is joined by Lazio midfielder Felipe Anderson, Barca club-mate Rafinha and Paris Saint-Germain defender Marquinhos.\n\nTottenham forward Son Heung-min will represent South 

In [143]:
# Print the stored document (_source) of each hit for 'games' in `content`.
content_hits = searcher.search('games', field='content')['hits']['hits']
for row in content_hits:
    print(row['_source'])

{'category': 'Olympics', 'content_length': 23, 'title': "'s one-of-a-kind rowing venue provides challenges and opportunities", 'content': 'Lagoa Stadium, set on a lake in the heart of the city, will provide a unique environment for Olympic Games athletes and spectators'}
{'category': 'Olympics', 'content_length': 20, 'title': 'Usain Bolt gets dressed up in colours', 'content': 'Superstar Jamaican sprinter, who has been keeping a low profile since arriving in Rio, posts homage to Olympic Games online'}
{'category': 'Olympics', 'content_length': 58, 'title': '12 local athletes heading to Rio 2016 Olympics', 'content': 'With the opening ceremonies for the 31st Olympiad kicking off on Friday, excitement is building for the 2016 Olympic Games in Rio de Janeiro, Brazil.\n\nLocally, we are represented across a wide range of sports including rowing, boxing, and golf.\n\nHere is a list of our local athletes who will be competing over the next couple of weeks:'}
{'category': 'Olympics', 'content