Revert "refactor: Update ES index maps to use same maps of amundsen-common (#385)"

This reverts commit 20c2fd2.
amundsen-io · Oct 20, 2020 · 960db70 · 960db70
1 parent 9595866
commit 960db70
Show file tree

Hide file tree

Showing 7 changed files with 253 additions and 10 deletions.
diff --git a/databuilder/publisher/elasticsearch_constants.py b/databuilder/publisher/elasticsearch_constants.py
@@ -0,0 +1,247 @@
+# Copyright Contributors to the Amundsen project.
+# SPDX-License-Identifier: Apache-2.0
+
+import textwrap
+
+# Documentation: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html
+# Setting type to "text" for all fields that would be used in search
+# Using Simple Analyzer to convert all text into search terms
+# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simple-analyzer.html
+# Standard Analyzer is used for all text fields that don't explicitly specify an analyzer
+# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html
+# TODO: replace these with the index maps from amundsen-common
+# (amundsen_common.models.index_map) when this project is updated to py3
+# Index mapping (JSON string) for the "table" document type. Searchable text
+# fields use the "simple" analyzer; name, schema, column_names, cluster and
+# database additionally expose a "raw" keyword sub-field.
+TABLE_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent(
+    """
+    {
+    "mappings":{
+        "table":{
+          "properties": {
+            "name": {
+              "type":"text",
+              "analyzer": "simple",
+              "fields": {
+                "raw": {
+                  "type": "keyword"
+                }
+              }
+            },
+            "schema": {
+              "type":"text",
+              "analyzer": "simple",
+              "fields": {
+                "raw": {
+                  "type": "keyword"
+                }
+              }
+            },
+            "display_name": {
+              "type": "keyword"
+            },
+            "last_updated_timestamp": {
+              "type": "date",
+              "format": "epoch_second"
+            },
+            "description": {
+              "type": "text",
+              "analyzer": "simple"
+            },
+            "column_names": {
+              "type":"text",
+              "analyzer": "simple",
+              "fields": {
+                "raw": {
+                  "type": "keyword"
+                }
+              }
+            },
+            "column_descriptions": {
+              "type": "text",
+              "analyzer": "simple"
+            },
+            "tags": {
+              "type": "keyword"
+            },
+            "badges": {
+              "type": "keyword"
+            },
+            "cluster": {
+              "type": "text",
+              "analyzer": "simple",
+              "fields": {
+                "raw": {
+                  "type": "keyword"
+                }
+              }
+            },
+            "database": {
+              "type": "text",
+              "analyzer": "simple",
+              "fields": {
+                "raw": {
+                  "type": "keyword"
+                }
+              }
+            },
+            "key": {
+              "type": "keyword"
+            },
+            "total_usage":{
+              "type": "long"
+            },
+            "unique_usage": {
+              "type": "long"
+            },
+            "programmatic_descriptions": {
+              "type": "text",
+              "analyzer": "simple"
+            }
+          }
+        }
+      }
+    }
+    """
+)
+
+# Index mapping (JSON string) for the "dashboard" document type. Also defines
+# a custom "lowercase_normalizer" (lowercase + asciifolding filters) that is
+# applied to the "raw" keyword sub-fields of group_name and name.
+DASHBOARD_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent(
+    """
+    {
+        "settings": {
+          "analysis": {
+            "normalizer": {
+              "lowercase_normalizer": {
+                "type": "custom",
+                "char_filter": [],
+                "filter": ["lowercase", "asciifolding"]
+              }
+            }
+          }
+        },
+        "mappings":{
+            "dashboard":{
+              "properties": {
+                "group_name": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword",
+                      "normalizer": "lowercase_normalizer"
+                    }
+                  }
+                },
+                "name": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword",
+                      "normalizer": "lowercase_normalizer"
+                    }
+                  }
+                },
+                "description": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "group_description": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "query_names": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "chart_names": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "tags": {
+                  "type": "keyword"
+                },
+                "badges": {
+                  "type": "keyword"
+                }
+              }
+            }
+          }
+        }
+    """
+)
+
+# Index mapping (JSON string) for the "user" document type: email and the
+# name fields are "simple"-analyzed text with a "raw" keyword sub-field; the
+# total_read / total_own / total_follow fields are mapped as "long".
+USER_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent(
+    """
+    {
+        "mappings":{
+            "user":{
+              "properties": {
+                "email": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "first_name": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "last_name": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "full_name": {
+                  "type":"text",
+                  "analyzer": "simple",
+                  "fields": {
+                    "raw": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "total_read":{
+                  "type": "long"
+                },
+                "total_own": {
+                  "type": "long"
+                },
+                "total_follow": {
+                  "type": "long"
+                }
+              }
+            }
+          }
+        }
+    """
+)
diff --git a/databuilder/publisher/elasticsearch_publisher.py b/databuilder/publisher/elasticsearch_publisher.py
@@ -8,9 +8,8 @@
 from pyhocon import ConfigTree
 from typing import List
 
-from amundsen_common.models.index_map import TABLE_INDEX_MAP as TABLE_ELASTICSEARCH_INDEX_MAPPING
-
 from databuilder.publisher.base_publisher import Publisher
+from databuilder.publisher.elasticsearch_constants import TABLE_ELASTICSEARCH_INDEX_MAPPING
 
 LOGGER = logging.getLogger(__name__)
 

diff --git a/docs/dashboard_ingestion_guide.md b/docs/dashboard_ingestion_guide.md
@@ -111,7 +111,7 @@ job = DefaultJob(conf=job_config,
 job.launch()
 ```
 
-*Note that `DASHBOARD_ELASTICSEARCH_INDEX_MAPPING` is defined [here](https://github.com/amundsen-io/amundsencommon/blob/master/amundsen_common/models/index_map.py).  
+*Note that `DASHBOARD_ELASTICSEARCH_INDEX_MAPPING` is defined [here](../databuilder/publisher/elasticsearch_constants.py).  
 
 
 ### 4. Remove stale data

diff --git a/example/scripts/sample_data_loader.py b/example/scripts/sample_data_loader.py
@@ -29,15 +29,15 @@
 from elasticsearch import Elasticsearch
 from pyhocon import ConfigFactory
 from sqlalchemy.ext.declarative import declarative_base
-from amundsen_common.models.index_map import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING
-from amundsen_common.models.index_map import USER_INDEX_MAP as USER_ELASTICSEARCH_INDEX_MAPPING
 
 from databuilder.extractor.csv_extractor import CsvTableColumnExtractor, CsvExtractor
 from databuilder.extractor.neo4j_es_last_updated_extractor import Neo4jEsLastUpdatedExtractor
 from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor
 from databuilder.job.job import DefaultJob
 from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
 from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
+from databuilder.publisher.elasticsearch_constants import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING, \
+    USER_ELASTICSEARCH_INDEX_MAPPING
 from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
 from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
 from databuilder.task.task import DefaultTask

diff --git a/example/scripts/sample_tableau_data_loader.py b/example/scripts/sample_tableau_data_loader.py
@@ -23,12 +23,12 @@
 from elasticsearch import Elasticsearch
 from pyhocon import ConfigFactory
 from sqlalchemy.ext.declarative import declarative_base
-from amundsen_common.models.index_map import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING
 
 from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor
 from databuilder.job.job import DefaultJob
 from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
 from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
+from databuilder.publisher.elasticsearch_constants import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING
 from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
 from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
 from databuilder.task.task import DefaultTask

diff --git a/requirements.txt b/requirements.txt
@@ -38,10 +38,6 @@ typing==3.6.4
 # Upstream url: https://pypi.org/project/elasticsearch/
 elasticsearch>=6.2.0,<7.0
 
-# A common package that holds the models deifnition and schemas that are used
-# accross different amundsen repositories.
-amundsen-common>=0.5.6,<1.0
-
 atomicwrites==1.1.5
 more-itertools==4.2.0
 pluggy>=0.6.0

diff --git a/setup.py b/setup.py
@@ -70,6 +70,7 @@
     install_requires=requirements,
     python_requires='>=3.6,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*',
     extras_require={
+        ':python_version=="2.7"': ['typing>=3.6'],  # allow typehinting PY2
         'all': all_deps,
         'kafka': kafka,  # To use with Kafka source extractor
         'cassandra': cassandra,