In [1]:
import json
import elasticsearch
from elasticsearch.helpers import bulk

In [2]:
# Client for the local Elasticsearch node. The URL carries an explicit scheme:
# elasticsearch-py 8.x rejects bare "host:port" strings, while 7.x accepts both forms.
client = elasticsearch.Elasticsearch('http://localhost:9200')

In [3]:
# Name of the Elasticsearch index that this notebook drops and recreates.
INDEX_NAME = "tweets2021"

In [4]:
# Index definition (settings + explicit field mappings) sent as the request
# body when the index is created.
INDEX_MAPPING = {
    "settings": {
        "number_of_shards": 3,
        "analysis": {
            "normalizer": {
                # Lowercases and ASCII-folds keyword values; referenced below by
                # the "cashtags", "hashtags" and "username" fields.
                "hashtag_normalizer": {
                    "type": "custom",
                    "char_filter": [],
                    "filter": ["lowercase", "asciifolding"],
                }
            },
            "analyzer": {
                # Analyzer attached to the "tweet" text field.
                "tweet_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "asciifolding",
                        "lowercase",
                        "stop_pt",
                        "stop_custom",
                        # Portuguese stemming. The previous "porter_stem" filter
                        # implements the English Porter algorithm, which does
                        # not fit a chain built around Portuguese stopwords.
                        "stem_pt",
                    ],
                }
            },
            "filter": {
                "stop_pt": {
                    "type": "stop",
                    "stopwords": "_portuguese_",
                },
                # Extra stopwords, spelled without accents because
                # "asciifolding" runs earlier in the analyzer chain.
                "stop_custom": {
                    "type": "stop",
                    "stopwords": [
                        "nao", "vc", "pq", "ai", "q", "ta", "http", "pra",
                        "t.co", "sao", "to", "voce", "so",
                    ],
                },
                # Same stemmer the built-in Elasticsearch "portuguese"
                # analyzer uses.
                "stem_pt": {
                    "type": "stemmer",
                    "language": "light_portuguese",
                },
            },
        },
    },
    "mappings": {
        # Fields not listed under "properties" are not added to the mapping.
        "dynamic": False,
        "properties": {
            "cashtags": {
                "type": "keyword",
                "normalizer": "hashtag_normalizer",
            },
            "conversation_id": {"type": "long"},
            "created_at": {"type": "text"},
            "date": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss",
            },
            "day": {"type": "integer"},
            "essid": {"type": "keyword"},
            "geo_near": {"type": "geo_point"},
            "geo_tweet": {"type": "geo_point"},
            "hashtags": {
                "type": "keyword",
                "normalizer": "hashtag_normalizer",
            },
            "hour": {"type": "integer"},
            "id": {"type": "long"},
            "lang": {"type": "keyword"},
            "language": {
                "type": "text",
                "fields": {
                    "keyword": {"type": "keyword", "ignore_above": 256},
                },
            },
            "link": {"type": "text"},
            "location": {"type": "keyword"},
            "mentions": {
                "type": "nested",
                "properties": {
                    "id": {"type": "long"},
                    "name": {"type": "text"},
                    "screen_name": {"type": "text"},
                },
            },
            "name": {"type": "text"},
            "near": {"type": "text"},
            "nlikes": {"type": "integer"},
            "nreplies": {"type": "integer"},
            "nretweets": {"type": "integer"},
            "photos": {"type": "text"},
            "profile_image_url": {"type": "text"},
            "quote_url": {"type": "text"},
            "reply_to": {
                "type": "nested",
                "properties": {
                    "id": {
                        "type": "text",
                        "fields": {
                            "keyword": {"type": "keyword", "ignore_above": 256},
                        },
                    },
                    "name": {
                        "type": "text",
                        "fields": {
                            "keyword": {"type": "keyword", "ignore_above": 256},
                        },
                    },
                    "screen_name": {
                        "type": "text",
                        "fields": {
                            "keyword": {"type": "keyword", "ignore_above": 256},
                        },
                    },
                    "user_id": {"type": "keyword"},
                    "username": {"type": "keyword"},
                },
            },
            "retweet": {"type": "text"},
            "retweet_date": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss",
                # Values that do not parse with the format above are ignored
                # instead of failing the whole document.
                "ignore_malformed": True,
            },
            "retweet_id": {"type": "keyword"},
            "search": {"type": "text"},
            "source": {"type": "keyword"},
            "thumbnail": {"type": "text"},
            "timezone": {"type": "keyword"},
            "trans_dest": {"type": "keyword"},
            "trans_src": {"type": "keyword"},
            "translate": {"type": "text"},
            "tweet": {
                "type": "text",
                "analyzer": "tweet_analyzer",
                # fielddata is enabled on this analyzed text field; the
                # frequency filter limits which terms are loaded.
                "fielddata": True,
                "fielddata_frequency_filter": {
                    "min": 0.01,
                    "min_segment_size": 10,
                },
            },
            "urls": {"type": "keyword"},
            "user_id_str": {"type": "keyword"},
            "user_rt": {"type": "keyword"},
            "user_rt_id": {"type": "keyword"},
            "username": {
                "type": "keyword",
                "normalizer": "hashtag_normalizer",
            },
            "video": {"type": "integer"},
        },
    },
}

In [8]:
# Delete the index if it already exists, so it can be recreated from scratch.
# `index` is passed by keyword: elasticsearch-py 8.x no longer accepts API
# parameters positionally, and 7.x accepts the keyword form as well.
if client.indices.exists(index=INDEX_NAME):
    client.indices.delete(index=INDEX_NAME)


In [9]:
# Create the index with the settings and mappings defined in INDEX_MAPPING.
# `index` is passed by keyword so the call does not rely on positional API
# arguments, which elasticsearch-py 8.x rejects.
client.indices.create(index=INDEX_NAME, body=INDEX_MAPPING)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'tweets2021'}