
BITMAKER-3113: Allow data download via estela-web (#199)
* Allow data download via estela-web
* Increase estela-cli download chunk size to 2MB
* Allow downloading data in JSON, CSV, and TSV formats
* Add restart policy for Kafka in docker-compose
mgonnav committed Jun 27, 2023
1 parent 0ea7c39 commit 25f5fc5
Showing 12 changed files with 6,004 additions and 6,434 deletions.
5 changes: 5 additions & 0 deletions database_adapters/db_adapters.py
@@ -94,6 +94,11 @@ def delete_collection_data(self, database_name, collection_name):
            print(ex)
        return count

+    def get_collection_data(self, database_name, collection_name, limit=10000):
+        collection = self.client[database_name][collection_name]
+        result = collection.find({}, {"_id": False}).limit(limit)
+        return list(result)
+
    def get_all_collection_data(self, database_name, collection_name):
        collection = self.client[database_name][collection_name]
        result = collection.find({}, {"_id": False})
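For context, the new get_collection_data bounds reads with Mongo's cursor limit. A minimal usage sketch, assuming a pymongo-backed adapter; the class name, connection URI, and collection names below are illustrative, not estela's actual wiring:

    # Illustrative sketch only: a pymongo-backed adapter exposing the new method.
    from pymongo import MongoClient


    class MongoAdapter:
        def __init__(self, connection_uri):
            self.client = MongoClient(connection_uri)

        def get_collection_data(self, database_name, collection_name, limit=10000):
            collection = self.client[database_name][collection_name]
            # Project out Mongo's internal _id so results serialize cleanly to JSON.
            result = collection.find({}, {"_id": False}).limit(limit)
            return list(result)


    adapter = MongoAdapter("mongodb://localhost:27017")  # assumed URI
    items = adapter.get_collection_data("project-db", "sid-jid-job_items", limit=500)
    print(len(items))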
89 changes: 74 additions & 15 deletions estela-api/api/views/job_data.py
@@ -1,10 +1,10 @@
from django.conf import settings
from drf_yasg import openapi
from drf_yasg.utils import swagger_auto_schema
-from rest_framework import status, mixins
-from rest_framework.response import Response
-from rest_framework.exceptions import ParseError
+from rest_framework import mixins, status
+from rest_framework.decorators import action
+from rest_framework.exceptions import ParseError
+from rest_framework.response import Response
from rest_framework.utils.urls import replace_query_param

from api import errors
@@ -93,18 +93,7 @@ def list(self, request, *args, **kwargs):
            raise DataBaseError({"error": errors.UNABLE_CONNECT_DB})

        job = SpiderJob.objects.filter(jid=kwargs["jid"]).get()
-        if (
-            job.cronjob is not None
-            and job.cronjob.unique_collection
-            and data_type == "items"
-        ):
-            job_collection_name = "{}-scj{}-job_{}".format(
-                kwargs["sid"], job.cronjob.cjid, data_type
-            )
-        else:
-            job_collection_name = "{}-{}-job_{}".format(
-                kwargs["sid"], kwargs["jid"], data_type
-            )
+        job_collection_name = self.get_collection_name(job, data_type, **kwargs)

        count = spiderdata_db_client.get_estimated_document_count(
            kwargs["pid"], job_collection_name
@@ -145,6 +134,76 @@ def list(self, request, *args, **kwargs):
            }
        )

+    def get_collection_name(self, job, data_type, **kwargs):
+        if (
+            job.cronjob is not None
+            and job.cronjob.unique_collection
+            and data_type == "items"
+        ):
+            job_collection_name = "{}-scj{}-job_{}".format(
+                kwargs["sid"], job.cronjob.cjid, data_type
+            )
+        else:
+            job_collection_name = "{}-{}-job_{}".format(
+                kwargs["sid"], kwargs["jid"], data_type
+            )
+
+        return job_collection_name
+
+    @swagger_auto_schema(
+        methods=["GET"],
+        responses={
+            status.HTTP_200_OK: openapi.Response(
+                schema=openapi.Schema(
+                    type=openapi.TYPE_OBJECT,
+                    required=["results"],
+                    properties={
+                        "results": openapi.Schema(
+                            type=openapi.TYPE_ARRAY,
+                            items=openapi.Items(type=openapi.TYPE_OBJECT),
+                            description="Data items.",
+                        ),
+                    },
+                ),
+                description="",
+            ),
+        },
+        manual_parameters=[
+            openapi.Parameter(
+                "type",
+                openapi.IN_QUERY,
+                description="Spider job data type.",
+                type=openapi.TYPE_STRING,
+                required=False,
+            ),
+        ],
+    )
+    @action(detail=False, methods=["GET"])
+    def download(self, request, *args, **kwargs):
+        data_type = request.query_params.get("type", "items")
+
+        job = SpiderJob.objects.filter(jid=kwargs["jid"]).get()
+        job_collection_name = self.get_collection_name(job, data_type, **kwargs)
+
+        data = []
+        if data_type == "stats":
+            data = spiderdata_db_client.get_job_stats(
+                kwargs["pid"], job_collection_name
+            )
+        else:
+            docs_limit = max(
+                1,
+                (settings.MAX_WEB_DOWNLOAD_SIZE)
+                // spiderdata_db_client.get_estimated_document_size(
+                    kwargs["pid"], job_collection_name
+                ),
+            )
+            data = spiderdata_db_client.get_collection_data(
+                kwargs["pid"], job_collection_name, docs_limit
+            )
+
+        return Response({"results": data})

    @swagger_auto_schema(
        methods=["POST"],
        responses={status.HTTP_200_OK: DeleteJobDataSerializer()},
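The docs_limit expression above caps a web download at roughly settings.MAX_WEB_DOWNLOAD_SIZE bytes by dividing that budget by Mongo's estimated average document size. A worked sketch of the arithmetic; the 2 KiB per-document figure is an assumed example, not a measured value:

    # Worked example of the docs_limit formula; the 2 KiB estimate is assumed.
    MAX_WEB_DOWNLOAD_SIZE = 100 * 1024 * 1024  # 100 MiB budget, from settings

    estimated_document_size = 2 * 1024  # suppose Mongo reports ~2 KiB per item

    # Same expression as in download(): always return at least one document.
    docs_limit = max(1, MAX_WEB_DOWNLOAD_SIZE // estimated_document_size)
    print(docs_limit)  # 51200 documents fit the 100 MiB budget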
5 changes: 2 additions & 3 deletions estela-api/config/settings/base.py
@@ -205,9 +205,8 @@


# API limit data download settings (bytes)
-MAX_DOWNLOADED_SIZE = 8 * 1024 * 1024
-MAX_CHUNK_SIZE = 512 * 1024
+MAX_CHUNK_SIZE = 2 * 1024 * 1024
+MAX_WEB_DOWNLOAD_SIZE = 100 * 1024 * 1024

# Pagination settings used in api_app
API_PAGE_SIZE = 100 # Paginator page size
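MAX_CHUNK_SIZE grows from 512 KiB to 2 MiB, so per the commit message an estela-cli download moves the same data in roughly a quarter of the round trips. A minimal sketch of chunked reading under the new setting; the BytesIO stream is a stand-in for a real network response:

    import io

    MAX_CHUNK_SIZE = 2 * 1024 * 1024  # 2 MiB, matching the new setting

    def iter_chunks(stream, chunk_size=MAX_CHUNK_SIZE):
        # Yield fixed-size chunks from a binary stream until EOF.
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                break
            yield chunk

    src = io.BytesIO(b"x" * (5 * 1024 * 1024))  # stand-in for a network stream
    print(sum(len(c) for c in iter_chunks(src)))  # 5242880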
48 changes: 48 additions & 0 deletions estela-api/docs/api.yaml
@@ -1194,6 +1194,54 @@ paths:
      in: path
      required: true
      type: string
+  /api/projects/{pid}/spiders/{sid}/jobs/{jid}/data/download:
+    get:
+      operationId: api_projects_spiders_jobs_data_download
+      description: ''
+      parameters:
+      - name: page
+        in: query
+        description: A page number within the paginated result set.
+        required: false
+        type: integer
+      - name: page_size
+        in: query
+        description: Number of results to return per page.
+        required: false
+        type: integer
+      - name: type
+        in: query
+        description: Spider job data type.
+        required: false
+        type: string
+      responses:
+        '200':
+          description: ''
+          schema:
+            required:
+            - results
+            type: object
+            properties:
+              results:
+                description: Data items.
+                type: array
+                items:
+                  type: object
+      tags:
+      - api
+    parameters:
+    - name: jid
+      in: path
+      required: true
+      type: string
+    - name: pid
+      in: path
+      required: true
+      type: string
+    - name: sid
+      in: path
+      required: true
+      type: string
  /api/projects/{pid}/usage:
    get:
      operationId: api_projects_usage
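The new route behaves like the other estela API endpoints. A request sketch using Python's requests library; the base URL, ids, and token header scheme are assumptions for illustration:

    # Illustrative request to the new download endpoint; URL and token are assumed.
    import requests

    BASE = "https://api.estela.example.com"  # hypothetical deployment
    pid, sid, jid = "my-project", "my-spider", "42"  # hypothetical ids

    resp = requests.get(
        f"{BASE}/api/projects/{pid}/spiders/{sid}/jobs/{jid}/data/download",
        params={"type": "items"},  # per the diff, "stats" takes a separate branch
        headers={"Authorization": "Token <your-token>"},  # assumed auth scheme
    )
    resp.raise_for_status()
    items = resp.json()["results"]
    print(len(items), "items downloaded")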
1 change: 1 addition & 0 deletions estela-web/package.json
@@ -12,6 +12,7 @@
"moment": "^2.29.4",
"node-fetch": "^2.6.7",
"nth-check": "^2.0.1",
"papaparse": "^5.4.1",
"react": "^17.0.2",
"react-chartjs-2": "^5.1.0",
"react-dom": "^17.0.2",
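papaparse is the new front-end dependency that turns downloaded JSON items into CSV or TSV in the browser. A rough Python analogue of that conversion, for illustration only; the sample fields are made up:

    # Rough Python analogue of the CSV/TSV serialization papaparse does in estela-web.
    import csv
    import io

    results = [  # sample items in the shape the API returns (fields assumed)
        {"title": "Post A", "url": "https://example.com/a"},
        {"title": "Post B", "url": "https://example.com/b"},
    ]

    def to_delimited(rows, delimiter=","):
        # Serialize a list of dicts as CSV (",") or TSV ("\t") text.
        buf = io.StringIO()
        writer = csv.DictWriter(buf, fieldnames=rows[0].keys(), delimiter=delimiter)
        writer.writeheader()
        writer.writerows(rows)
        return buf.getvalue()

    print(to_delimited(results))        # CSV
    print(to_delimited(results, "\t"))  # TSV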
