* Support injecting vineyard sidecar via vineyardctl API.
* Fix some typos in the FAQ doc.

Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
dashanji committed Apr 19, 2023
1 parent 94c799a commit 094ed07
Showing 3 changed files with 69 additions and 21 deletions.
16 changes: 4 additions & 12 deletions coordinator/gscoordinator/cluster_builder.py
@@ -395,12 +395,10 @@ def get_dataset_container(self, volume_mounts):
     def get_engine_pod_spec(self):
         containers = []
         volumes = []
 
-        socket_volume = self.get_vineyard_socket_volume()
-
         shm_volume = self.get_shm_volume()
-        volumes.extend([socket_volume[0], shm_volume[0]])
-
-        engine_volume_mounts = [socket_volume[2], shm_volume[2]]
+        volumes=[shm_volume[0]]
+        engine_volume_mounts = [shm_volume[2]]
+
         if self._volumes and self._volumes is not None:
             udf_volumes = ResourceBuilder.get_user_defined_volumes(self._volumes)
@@ -428,13 +426,6 @@ def get_engine_pod_spec(self):
                 self.get_learning_container(volume_mounts=engine_volume_mounts)
             )
 
-        if self._vineyard_deployment is None:
-            containers.append(
-                self.get_vineyard_container(
-                    volume_mounts=[socket_volume[1], shm_volume[1]]
-                )
-            )
-
         if self._with_dataset:
             dataset_volume = self.get_dataset_volume()
             volumes.append(dataset_volume[0])
@@ -481,6 +472,7 @@ def get_engine_headless_service(self):
         service_spec = ResourceBuilder.get_service_spec(
             "ClusterIP", ports, self._engine_labels, None
         )
+
         # Necessary, create a headless service for statefulset
         service_spec.cluster_ip = "None"
         service = ResourceBuilder.get_service(
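
A note on the tuple indexing in the hunks above: judging from how the results are consumed, helpers like get_shm_volume() appear to return a (volume, vineyard_mount, engine_mount) triple, which is why the code takes [0] for the pod's volume list and [2] for the engine containers, while the now-removed vineyard container took [1]. A minimal sketch of that convention, with an illustrative helper body rather than the project's actual implementation:

    from kubernetes import client as kube_client

    def get_shm_volume():
        # Illustrative only: a memory-backed emptyDir for vineyard's shared memory.
        volume = kube_client.V1Volume(
            name="host-shm",
            empty_dir=kube_client.V1EmptyDirVolumeSource(medium="Memory"),
        )
        # Index 1: mount for the vineyard container; index 2: mount for engine containers.
        vineyard_mount = kube_client.V1VolumeMount(name="host-shm", mount_path="/dev/shm")
        engine_mount = kube_client.V1VolumeMount(name="host-shm", mount_path="/dev/shm")
        return volume, vineyard_mount, engine_mount

    shm_volume = get_shm_volume()
    volumes = [shm_volume[0]]               # pod-level volume
    engine_volume_mounts = [shm_volume[2]]  # per-engine-container mount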
60 changes: 58 additions & 2 deletions coordinator/gscoordinator/kubernetes_launcher.py
@@ -153,6 +153,10 @@ def __init__(
         self._engine_mem = engine_mem
         self._vineyard_shared_mem = vineyard_shared_mem
 
+        self._vineyard_cpu = vineyard_cpu
+        self._vineyard_mem = vineyard_mem
+        self._vineyard_image = vineyard_image
+
         self._with_dataset = with_dataset
         self._preemptive = preemptive
         self._service_type = service_type
@@ -433,6 +437,56 @@ def _create_mars_scheduler(self):
         )
         self._resource_object.append(response)
 
+    # the function is used to inject vineyard as a sidecar container into the workload
+    # and return the new workload
+    def _inject_vineyard_as_sidecar(self, workload):
+        import vineyard
+
+        # add vineyard sidecar annotations to the workload
+        annotations = workload.spec.template.metadata.annotations
+        if annotations is None:
+            annotations = {}
+        annotations['sidecar.v6d.io/name'] = 'default'
+        workload.spec.template.metadata.annotations = annotations
+
+        # add vineyard sidecar labels to the workload
+        labels = workload.spec.template.metadata.labels
+        if labels is None:
+            labels = {}
+        labels['sidecar.v6d.io/enabled'] = "true"
+        workload.spec.template.metadata.labels = labels
+
+        workload_json = json.dumps(
+            self._api_client.sanitize_for_serialization(workload)
+        )
+
+        sts_name = self._engine_cluster.engine_stateful_set_name
+        svc_name = sts_name + "-headless"
+        pod0_dns = f"{sts_name}-0.{svc_name}"
+        etcd_endpoint = "http://" + pod0_dns + "." + self._namespace + ".svc.cluster.local" + ":2379"
+        new_workload_json = vineyard.deploy.vineyardctl.inject(
+            resource=workload_json,
+            sidecar_volume_mountpath='/tmp/vineyard_workspace',
+            name=sts_name + '-vineyard-sidecar',
+            use_internal_etcd=True,
+            etcd_service_name=pod0_dns,
+            sidecar_etcdendpoint=etcd_endpoint,
+            sidecar_image=self._vineyard_image,
+            sidecar_size=self._vineyard_shared_mem,
+            sidecar_cpu=self._vineyard_cpu,
+            sidecar_memory=self._vineyard_mem,
+            deploy_rpc_service=False,
+            deploy_etcd_service=False,
+            output='json',
+            capture=True,
+        )
+
+        normalized_workload_json = json.loads(new_workload_json)
+        fake_kube_response = FakeKubeResponse(normalized_workload_json)
+
+        new_workload = self._api_client.deserialize(fake_kube_response, type(workload))
+        return new_workload
+
     def _create_engine_stateful_set(self):
         logger.info("Create engine headless services...")
         service = self._engine_cluster.get_engine_headless_service()
@@ -448,12 +502,14 @@ def _create_engine_stateful_set(self):
             workload=stateful_set
         )
 
-        stateful_set.metadata.owner_references = self._owner_references
+        new_stateful_set = self._inject_vineyard_as_sidecar(stateful_set)
+        new_stateful_set.metadata.owner_references = self._owner_references
         response = self._apps_api.create_namespaced_stateful_set(
-            self._namespace, stateful_set
+            self._namespace, new_stateful_set
         )
         self._resource_object.append(response)
 
+
     def _create_frontend_deployment(self):
         logger.info("Creating frontend pods...")
         deployment = self._engine_cluster.get_interactive_frontend_deployment()
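
For context on the FakeKubeResponse object used by _inject_vineyard_as_sidecar: the Kubernetes Python client's ApiClient.deserialize expects a response-like object that exposes raw JSON through a .data attribute, so a small wrapper (defined elsewhere in this launcher; sketched here under that assumption) is enough to round-trip a workload through vineyardctl inject and back into a typed model:

    import json

    class FakeKubeResponse:
        # ApiClient.deserialize only reads `.data`, the raw JSON payload,
        # so mimicking that one attribute is sufficient.
        def __init__(self, obj):
            self.data = json.dumps(obj)

    # Usage sketch, mirroring the hunk above:
    #   injected = json.loads(new_workload_json)
    #   new_sts = api_client.deserialize(FakeKubeResponse(injected), type(stateful_set))

The etcd endpoint built above follows the standard StatefulSet DNS scheme: {statefulset}-0.{headless-service}.{namespace}.svc.cluster.local is the stable name of the first pod through its headless service, with 2379 being etcd's usual client port.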
14 changes: 7 additions & 7 deletions docs/frequently_asked_questions.rst
@@ -55,13 +55,13 @@ If you don't find an answer to your question here, feel free to file a `Issues`_
 
 If GraphScope seems to get stuck, the possible cause might be:
 
-- In the session launching stage, the most cases are waiting for Pods ready. The time consuming may be caused by a poor network connection during pulling image, or by failing to acquire the requested resources to launch a session.
+- In the session launching stage, the most cases are waiting for Pods ready. The time consumption may be caused by a poor network connection during pulling image, or by failing to acquire the requested resources to launch a session.
 - In the graph loading stage, it is time consuming to load and build a large graph.
-- When running a user-defined or built-in analytical algorithm, it may takes time to compile the algorithm over the loaded graph.
+- When running a user-defined or built-in analytical algorithm, it may take time to compile the algorithm over the loaded graph.
 
 8. Why `No such file or directory` error when loading graph?
 
-This mostly occur when you are deploying GraphScope in a Kubernetes cluster, the file must be visible to the ``engnine`` Pod of GraphScope. You may need to mount a volume to the Pods or use cloud storage providers.
+This mostly occurs when you are deploying GraphScope in a Kubernetes cluster, the file must be visible to the ``engnine`` Pod of GraphScope. You may need to mount a volume to the Pods or use cloud storage providers.
 
 Specifically, if your cluster is deployed with `kind <https://kind.sigs.k8s.io>`_, you may need to setup `extra-mounts <https://kind.sigs.k8s.io/docs/user/configuration/#extra-mounts>`_ to mount your local directory to kind nodes.

@@ -87,17 +87,17 @@ If you don't find an answer to your question here, feel free to file a `Issues`_
 
 - Check: First use ``kubectl logs graphscope-store-zookeeper-0`` to check log. If the log shows ``mkdir: cannot create directory '/bitnami/zookeeper/data': Permission denied``.
 
-- Reason: Normaly, the permission of NFS directories we created is ``root 755`` (depends on your sepcify environment), but the default user of graphscope-store is ``graphscope(1001)``, so these pods have no permission to write on NFS.
+- Reason: Normally, the permission of NFS directories we created is ``root 755`` (depends on your specific environment), but the default user of graphscope-store is ``graphscope(1001)``, so these pods have no permission to write on NFS.
 
-- Solution: There are two slutions to solve this.
+- Solution: There are two solutions to solve this.
 
 The brutal one is using ``chmod 777`` on all related PV directories, this is efficient but not recommended in production environment.
 
 The elegant one is creating ``graphscope`` user and user group first, and then grant the access permission on ``graphscope`` to the related NFS directories.
 
 12. why ``Timeout Exception`` raised during launching GraphScope instance on kubernetes cluster?
 
-It will take a few minutes for pulling image during the first time for launching GraphScope instance. Thus, the ``Timeout Exception`` may caused by a poor network connection.
+It will take a few minutes for pulling image during the first time for launching GraphScope instance. Thus, the ``Timeout Exception`` may be caused by a poor network connection.
 You can increase the value of ``timeout_seconds`` parameter as your expectation by ``graphscope.set_option(timeout_seconds=600))``.
 
 13. Failed to run GraphScope (either in single machine or in docker container) due to failed connection to building blocks like etcd?
@@ -106,7 +106,7 @@ If you don't find an answer to your question here, feel free to file a `Issues`_
 
 14. How to print debug info in GAE Cython SDK Algorithms?
 
-python3 print function is a convinent way to show useful debug info, use print with param flush=True then the stream is forcibly flushed.
+python3 print function is a convenient way to show useful debug info, use print with param flush=True then the stream is forcibly flushed.
 
 More details please refer to `Python Documentation <https://docs.python.org/3.3/library/functions.html#print>`_.
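
A short sketch of the two calls mentioned in FAQ items 12 and 14 above (the 600-second value is illustrative):

    import graphscope

    # FAQ 12: allow more time for first-run image pulls before timing out.
    graphscope.set_option(timeout_seconds=600)

    # FAQ 14: flush print output immediately so debug info appears in pod logs.
    print("debug: fragment loaded", flush=True)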
