-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
launch.cpp
860 lines (733 loc) · 28.2 KB
/
launch.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <errno.h>
#ifdef __linux__
#include <sched.h>
#include <signal.h>
#endif // __linux__
#include <string.h>
#include <iostream>
#include <set>
#include <string>
#include <glog/logging.h>
#include <glog/raw_logging.h>
#include <process/subprocess.hpp>
#include <stout/foreach.hpp>
#include <stout/os.hpp>
#include <stout/protobuf.hpp>
#include <stout/path.hpp>
#include <stout/unreachable.hpp>
#include <stout/os/write.hpp>
#include <mesos/mesos.hpp>
#include <mesos/type_utils.hpp>
#include <mesos/slave/containerizer.hpp>
#include "common/parse.hpp"
#include "common/status_utils.hpp"
#ifdef __linux__
#include "linux/capabilities.hpp"
#include "linux/fs.hpp"
#include "linux/ns.hpp"
#endif
#ifndef __WINDOWS__
#include "posix/rlimits.hpp"
#endif // __WINDOWS__
#include "slave/containerizer/mesos/launch.hpp"
#include "slave/containerizer/mesos/paths.hpp"
using std::cerr;
using std::cout;
using std::endl;
using std::set;
using std::string;
using std::vector;
#ifdef __linux__
using mesos::internal::capabilities::Capabilities;
using mesos::internal::capabilities::Capability;
using mesos::internal::capabilities::ProcessCapabilities;
#endif // __linux__
using mesos::slave::ContainerLaunchInfo;
namespace mesos {
namespace internal {
namespace slave {
// Identifier string of this class. NOTE(review): presumably the name of
// the subcommand used to select this helper -- confirm against the header.
const string MesosContainerizerLaunch::NAME = "launch";
// Registers every command line flag understood by the `launch` helper.
//
// `launch_info`, `pipe_read` and `pipe_write` are registered on every
// platform; `runtime_directory` is skipped on Windows and the two
// mount namespace flags are only registered on Linux.
MesosContainerizerLaunch::Flags::Flags()
{
  // Description of what to launch. `execute()` refuses to run without
  // it. NOTE: The help text of this flag is intentionally left as is.
  add(&Flags::launch_info,
      "launch_info",
      "");

  // The two ends of the control pipe. `execute()` requires that either
  // both of them or none of them are specified.
  add(&Flags::pipe_read,
      "pipe_read",
      "The read end of the control pipe. This is a file descriptor \n"
      "on Posix, or a handle on Windows. It's caller's responsibility \n"
      "to make sure the file descriptor or the handle is inherited \n"
      "properly in the subprocess. It's used to synchronize with the \n"
      "parent process. If not specified, no synchronization will happen.");

  add(&Flags::pipe_write,
      "pipe_write",
      "The write end of the control pipe. This is a file descriptor \n"
      "on Posix, or a handle on Windows. It's caller's responsibility \n"
      "to make sure the file descriptor or the handle is inherited \n"
      "properly in the subprocess. It's used to synchronize with the \n"
      "parent process. If not specified, no synchronization will happen.");

#ifndef __WINDOWS__
  // When set, `execute()` checkpoints the container's exit status
  // into a status file below this directory.
  add(&Flags::runtime_directory,
      "runtime_directory",
      "The runtime directory for the container (used for checkpointing)");
#endif // __WINDOWS__

#ifdef __linux__
  // Mount namespace handling: enter an existing namespace and/or
  // unshare a fresh one before the command is executed.
  add(&Flags::namespace_mnt_target,
      "namespace_mnt_target",
      "The target 'pid' of the process whose mount namespace we'd like\n"
      "to enter before executing the command.");

  add(&Flags::unshare_namespace_mnt,
      "unshare_namespace_mnt",
      "Whether to launch the command in a new mount namespace.",
      false);
#endif // __linux__
}
// File-local state shared between `execute()` and the signal and exit
// helpers in this file.
//
// Pid of the forked child that runs the container's command. It is
// assigned by the parent in `execute()` right after `fork()` and read
// by `signalHandler()` to decide where to forward signals to.
static Option<pid_t> containerPid = None();
// Path of the file the container's exit status is checkpointed to.
// Only assigned in `execute()` when `--runtime_directory` is given.
static Option<string> containerStatusPath = None();
// Open file descriptor for `containerStatusPath`. When set, the exit
// helpers below write the status to it before terminating.
static Option<int> containerStatusFd = None();
// Forward declarations; both helpers are defined further down in this
// file but `exitWithSignal()` is already needed by `signalHandler()`.
static void exitWithSignal(int sig);
static void exitWithStatus(int status);
#ifndef __WINDOWS__
// Writes the decimal representation of `status` to `containerStatusFd`.
//
// This function is reachable from `signalHandler()` (through
// `exitWithSignal()`), so it must stick to async signal safe
// operations. In particular we must not build a `std::string` (e.g.,
// via `std::to_string()`) here since that might allocate memory.
// Instead, the number is formatted by hand into a buffer on the stack.
//
// The caller has to make sure that `containerStatusFd` is set.
static void signalSafeWriteStatus(int status)
{
  // Three decimal digits per byte is an upper bound for the number of
  // digits of an `int`; add room for the sign and the terminating NUL.
  char buffer[3 * sizeof(int) + 2];

  // Fill the buffer back to front, starting with the terminator.
  char* cursor = buffer + sizeof(buffer) - 1;
  *cursor = '\0';

  // Work on the magnitude as an unsigned value so that negating the
  // smallest representable `int` does not overflow.
  unsigned int magnitude = status < 0
    ? 0u - static_cast<unsigned int>(status)
    : static_cast<unsigned int>(status);

  do {
    *--cursor = static_cast<char>('0' + (magnitude % 10));
    magnitude /= 10;
  } while (magnitude != 0);

  if (status < 0) {
    *--cursor = '-';
  }

  const char* statusString = cursor;

  ssize_t result =
    os::signal_safe::write(containerStatusFd.get(), statusString);

  if (result < 0) {
    // NOTE: We use RAW_LOG instead of LOG because RAW_LOG doesn't
    // allocate any memory or grab locks. And according to
    // https://code.google.com/p/google-glog/issues/detail?id=161
    // it should work in 'most' cases in signal handlers.
    RAW_LOG(ERROR, "Failed to write container status '%d': %d", status, errno);
  }
}
// When launching the executor with an 'init' process, we need to
// forward all relevant signals to it. The functions below help to
// enable this forwarding.
//
// This is the handler that `installSignalHandlers()` registers. As
// long as no container process has been forked, receiving a signal
// terminates this helper via `exitWithSignal()`; afterwards the
// signal is passed on to the container process.
static void signalHandler(int sig)
{
  // The handler might interrupt code that is about to inspect `errno`
  // (e.g., right after a failed system call). Since `os::kill()` below
  // might modify `errno`, we save it here and restore it before
  // returning so that the interrupted code observes its own value.
  const int savedErrno = errno;

  // If we don't yet have a container pid, we treat
  // receiving a signal like a failure and exit.
  if (containerPid.isNone()) {
    exitWithSignal(sig);
  }

  // Otherwise we simply forward the signal to `containerPid`. We
  // purposefully ignore the error here since we have to remain async
  // signal safe. The only possible error scenario relevant to us is
  // ESRCH, but if that happens that means our pid is already gone and
  // the process will exit soon. So we are safe.
  os::kill(containerPid.get(), sig);

  errno = savedErrno;
}
// Registers `signalHandler()` for every signal number in [1, NSIG)
// except SIGCHLD, SIGKILL and SIGSTOP.
//
// Returns an error if registering the handler fails for one of the
// signals 1-31; failures for higher signal numbers are ignored.
static Try<Nothing> installSignalHandlers()
{
  // The NSIG constant is used to determine the number of signals
  // available on a system. However, Darwin, Linux, and BSD differ
  // on their interpretation of the value of NSIG. Linux, for
  // example, sets it to 65, where Darwin sets it to 32. The reason
  // for the discrepancy is that Linux includes the real-time
  // signals in this count, where Darwin does not. However, even on
  // linux, we are not able to arbitrarily install signal handlers
  // for all the real-time signals -- they must have not been
  // registered with the system first. For this reason, we
  // standardize on verifying the installation of handlers for
  // signals 1-31 (since these are defined in the POSIX standard),
  // but we continue to attempt to install handlers up to the value
  // of NSIG without verification.
  const int posixLimit = 32;

  for (int signo = 1; signo < NSIG; ++signo) {
    switch (signo) {
      // We don't want to forward the SIGCHLD signal, nor do we want
      // to handle it ourselves because we reap all children inline
      // in the `execute` function.
      case SIGCHLD:
      // We can't catch or ignore these signals, so we shouldn't try
      // to register a handler for them.
      case SIGKILL:
      case SIGSTOP:
        continue;
      default:
        break;
    }

    // NOTE: The installation is attempted for every remaining signal,
    // but its outcome only matters below `posixLimit`.
    const bool failed = os::signals::install(signo, signalHandler) != 0;

    if (failed && signo < posixLimit) {
      return ErrnoError("Unable to register signal"
                        " '" + stringify(strsignal(signo)) + "'");
    }
  }

  return Nothing();
}
#endif // __WINDOWS__
// Terminates the current process with `EXIT_FAILURE` because of the
// signal `sig`. If a container status file has been opened (never the
// case on Windows), a wait status encoding "terminated by `sig`" is
// written to it and the file is closed beforehand.
//
// NOTE: `::_exit()` is used, hence this function never returns.
static void exitWithSignal(int sig)
{
#ifndef __WINDOWS__
  if (!containerStatusFd.isNone()) {
    const int signaled = W_EXITCODE(0, sig);

    signalSafeWriteStatus(signaled);
    os::close(containerStatusFd.get());
  }
#endif // __WINDOWS__

  ::_exit(EXIT_FAILURE);
}
// Terminates the current process with the exit code `status`. If a
// container status file has been opened (never the case on Windows),
// a wait status encoding "exited with `status`" is written to it and
// the file is closed beforehand.
//
// NOTE: `::_exit()` is used, hence this function never returns.
static void exitWithStatus(int status)
{
#ifndef __WINDOWS__
  if (!containerStatusFd.isNone()) {
    const int exited = W_EXITCODE(status, 0);

    signalSafeWriteStatus(exited);
    os::close(containerStatusFd.get());
  }
#endif // __WINDOWS__

  ::_exit(status);
}
// Entry point of the `launch` helper.
//
// In order, this function:
//   1. Opens the container status file (if `--runtime_directory` is
//      given) and installs the signal handlers (non-Windows only).
//   2. Parses and validates `--launch_info`.
//   3. Synchronizes with the parent through the control pipe (if
//      `--pipe_read` and `--pipe_write` are given).
//   4. Sets the controlling tty and runs the pre-exec commands.
//   5. Prepares the execution context of the command: user lookup,
//      capabilities, mount namespace, rootfs, rlimits, working
//      directory, user switch and environment.
//   6. If the status is checkpointed, forks; the parent then reaps
//      children until the launched process terminates, writes its
//      status and exits.
//   7. Executes the command.
//
// Returns `EXIT_SUCCESS` if only the usage was requested. Otherwise
// this function does not return: it either replaces the process image
// or terminates the process through `exitWithStatus()` / `::_exit()`.
int MesosContainerizerLaunch::execute()
{
  if (flags.help) {
    cerr << flags.usage();
    return EXIT_SUCCESS;
  }

#ifndef __WINDOWS__
  // The existence of the `runtime_directory` flag implies that we
  // want to checkpoint the container's status upon exit.
  if (flags.runtime_directory.isSome()) {
    containerStatusPath = path::join(
        flags.runtime_directory.get(),
        containerizer::paths::STATUS_FILE);

    Try<int> open = os::open(
        containerStatusPath.get(),
        O_WRONLY | O_CREAT | O_CLOEXEC,
        S_IRUSR | S_IWUSR);

    if (open.isError()) {
      cerr << "Failed to open file for writing the container status"
           << " '" << containerStatusPath.get() << "':"
           << " " << open.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    containerStatusFd = open.get();
  }

  // We need a signal fence here to ensure that `containerStatusFd` is
  // actually written to memory and not just to a temporary register.
  // Without this, it's possible that the signal handler we are about
  // to install would never see the correct value since there's no
  // guarantee that it is written to memory until this function
  // completes (which won't happen for a really long time because we
  // do a blocking `waitpid()` below).
  //
  // NOTE: A fence with `std::memory_order_relaxed` has no effect at
  // all, so we have to ask for a stronger ordering to actually stop
  // the compiler from moving the store above past this point.
  std::atomic_signal_fence(std::memory_order_seq_cst);

  // Install signal handlers for all incoming signals.
  Try<Nothing> signals = installSignalHandlers();
  if (signals.isError()) {
    cerr << "Failed to install signal handlers: " << signals.error() << endl;
    exitWithStatus(EXIT_FAILURE);
  }
#endif // __WINDOWS__

  if (flags.launch_info.isNone()) {
    cerr << "Flag --launch_info is not specified" << endl;
    exitWithStatus(EXIT_FAILURE);
  }

  Try<ContainerLaunchInfo> _launchInfo =
    ::protobuf::parse<ContainerLaunchInfo>(flags.launch_info.get());

  if (_launchInfo.isError()) {
    cerr << "Failed to parse launch info: " << _launchInfo.error() << endl;
    exitWithStatus(EXIT_FAILURE);
  }

  ContainerLaunchInfo launchInfo = _launchInfo.get();

  if (!launchInfo.has_command()) {
    cerr << "Launch command is not specified" << endl;
    exitWithStatus(EXIT_FAILURE);
  }

  // Validate the command.
  if (launchInfo.command().shell()) {
    if (!launchInfo.command().has_value()) {
      cerr << "Shell command is not specified" << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  } else {
    if (!launchInfo.command().has_value()) {
      cerr << "Executable path is not specified" << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }

  if ((flags.pipe_read.isSome() && flags.pipe_write.isNone()) ||
      (flags.pipe_read.isNone() && flags.pipe_write.isSome())) {
    cerr << "Flag --pipe_read and --pipe_write should either be "
         << "both set or both not set" << endl;
    exitWithStatus(EXIT_FAILURE);
  }

  bool controlPipeSpecified =
    flags.pipe_read.isSome() && flags.pipe_write.isSome();

  if (controlPipeSpecified) {
    int_fd pipe[2] = { flags.pipe_read.get(), flags.pipe_write.get() };

    // We only ever read from the control pipe, so the write end is
    // closed right away.
    Try<Nothing> close = os::close(pipe[1]);
    if (close.isError()) {
      cerr << "Failed to close pipe[1]: " << close.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    // Do a blocking read on the pipe until the parent signals us to continue.
    char dummy;
    ssize_t length;
    while ((length = os::read(pipe[0], &dummy, sizeof(dummy))) == -1 &&
           errno == EINTR);

    if (length != sizeof(dummy)) {
      // There's a reasonable probability this will occur during
      // agent restarts across a large/busy cluster.
      cerr << "Failed to synchronize with agent "
           << "(it's probably exited)" << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    close = os::close(pipe[0]);
    if (close.isError()) {
      cerr << "Failed to close pipe[0]: " << close.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }

#ifndef __WINDOWS__
  if (launchInfo.has_tty_slave_path()) {
    Try<Nothing> setctty = os::setctty(STDIN_FILENO);
    if (setctty.isError()) {
      cerr << "Failed to set control tty: " << setctty.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }
#endif // __WINDOWS__

  // Run additional preparation commands. These are run as the same
  // user and with the environment as the agent.
  foreach (const CommandInfo& command, launchInfo.pre_exec_commands()) {
    if (!command.has_value()) {
      cerr << "The 'value' of a preparation command is not specified" << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    cout << "Executing pre-exec command '"
         << JSON::protobuf(command) << "'" << endl;

    int status = 0;

    if (command.shell()) {
      // Execute the command using the system shell.
      status = os::system(command.value());
    } else {
      // Directly spawn all non-shell commands to prohibit users
      // from injecting arbitrary shell commands in the arguments.
      vector<string> args;
      foreach (const string& arg, command.arguments()) {
        args.push_back(arg);
      }

      status = os::spawn(command.value(), args);
    }

    if (!WSUCCEEDED(status)) {
      cerr << "Failed to execute pre-exec command '"
           << JSON::protobuf(command) << "': "
           << WSTRINGIFY(status)
           << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }

#ifndef __WINDOWS__
  // NOTE: If 'user' is set, we will get the uid, gid, and the
  // supplementary group ids associated with the specified user before
  // changing the filesystem root. This is because after changing the
  // filesystem root, the current process might no longer have access
  // to /etc/passwd and /etc/group on the host.
  Option<uid_t> uid;
  Option<gid_t> gid;
  vector<gid_t> gids;

  // TODO(gilbert): For the case container user exists, support
  // framework/task/default user -> container user mapping once
  // user namespace and container capabilities is available for
  // mesos container.

  if (launchInfo.has_user()) {
    Result<uid_t> _uid = os::getuid(launchInfo.user());
    if (!_uid.isSome()) {
      cerr << "Failed to get the uid of user '" << launchInfo.user() << "': "
           << (_uid.isError() ? _uid.error() : "not found") << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    // No need to change user/groups if the specified user is the same
    // as that of the current process.
    if (_uid.get() != os::getuid().get()) {
      Result<gid_t> _gid = os::getgid(launchInfo.user());
      if (!_gid.isSome()) {
        cerr << "Failed to get the gid of user '" << launchInfo.user() << "': "
             << (_gid.isError() ? _gid.error() : "not found") << endl;
        exitWithStatus(EXIT_FAILURE);
      }

      Try<vector<gid_t>> _gids = os::getgrouplist(launchInfo.user());
      if (_gids.isError()) {
        cerr << "Failed to get the supplementary gids of user '"
             << launchInfo.user() << "': "
             << _gids.error() << endl;
        exitWithStatus(EXIT_FAILURE);
      }

      uid = _uid.get();
      gid = _gid.get();
      gids = _gids.get();
    }
  }
#else
  if (launchInfo.has_user()) {
    cerr << "Switching user is not supported on Windows" << endl;
    exitWithStatus(EXIT_FAILURE);
  }
#endif // __WINDOWS__

#ifdef __linux__
  // Initialize capabilities support if necessary.
  Option<Capabilities> capabilitiesManager = None();

  if (launchInfo.has_effective_capabilities() ||
      launchInfo.has_bounding_capabilities()) {
    Try<Capabilities> _capabilitiesManager = Capabilities::create();
    if (_capabilitiesManager.isError()) {
      cerr << "Failed to initialize capabilities support: "
           << _capabilitiesManager.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    capabilitiesManager = _capabilitiesManager.get();

    // Prevent clearing of capabilities on `setuid`.
    if (uid.isSome()) {
      Try<Nothing> keepCaps = capabilitiesManager->setKeepCaps();
      if (keepCaps.isError()) {
        cerr << "Failed to set process control for keeping capabilities "
             << "on potential uid change: " << keepCaps.error() << endl;
        exitWithStatus(EXIT_FAILURE);
      }
    }
  }
#else
  if (launchInfo.has_effective_capabilities() ||
      launchInfo.has_bounding_capabilities()) {
    cerr << "Capabilities are not supported on non Linux system" << endl;
    exitWithStatus(EXIT_FAILURE);
  }
#endif // __linux__

#ifdef __linux__
  if (flags.namespace_mnt_target.isSome()) {
    string path = path::join(
        "/proc",
        stringify(flags.namespace_mnt_target.get()),
        "ns",
        "mnt");

    Try<Nothing> setns = ns::setns(path, "mnt", false);
    if (setns.isError()) {
      cerr << "Failed to enter mount namespace: "
           << setns.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }

  if (flags.unshare_namespace_mnt) {
    if (unshare(CLONE_NEWNS) != 0) {
      cerr << "Failed to unshare mount namespace: "
           << os::strerror(errno) << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }
#endif // __linux__

#ifndef __WINDOWS__
  // Change root to a new root, if provided.
  if (launchInfo.has_rootfs()) {
    cout << "Changing root to " << launchInfo.rootfs() << endl;

    // Verify that rootfs is an absolute path.
    Result<string> realpath = os::realpath(launchInfo.rootfs());
    if (realpath.isError()) {
      cerr << "Failed to determine if rootfs is an absolute path: "
           << realpath.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    } else if (realpath.isNone()) {
      cerr << "Rootfs path does not exist" << endl;
      exitWithStatus(EXIT_FAILURE);
    } else if (realpath.get() != launchInfo.rootfs()) {
      cerr << "Rootfs path is not an absolute path" << endl;
      exitWithStatus(EXIT_FAILURE);
    }

#ifdef __linux__
    Try<Nothing> chroot = fs::chroot::enter(launchInfo.rootfs());
#else
    // For any other platform we'll just use POSIX chroot.
    Try<Nothing> chroot = os::chroot(launchInfo.rootfs());
#endif // __linux__

    if (chroot.isError()) {
      cerr << "Failed to enter chroot '" << launchInfo.rootfs()
           << "': " << chroot.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }
#else
  if (launchInfo.has_rootfs()) {
    cerr << "Changing rootfs is not supported on Windows" << endl;
    exitWithStatus(EXIT_FAILURE);
  }
#endif // __WINDOWS__

#ifndef __WINDOWS__
  // Setting resource limits for the process.
  if (launchInfo.has_rlimits()) {
    foreach (const RLimitInfo::RLimit& limit, launchInfo.rlimits().rlimits()) {
      Try<Nothing> set = rlimits::set(limit);
      if (set.isError()) {
        cerr << "Failed to set rlimit: " << set.error() << endl;
        exitWithStatus(EXIT_FAILURE);
      }
    }
  }
#else
  if (launchInfo.has_rlimits()) {
    cerr << "Rlimits are not supported on Windows" << endl;
    exitWithStatus(EXIT_FAILURE);
  }
#endif // __WINDOWS__

  if (launchInfo.has_working_directory()) {
    // If working directory does not exist (e.g., being removed from
    // the container image), create an empty directory even it may
    // not be used. Please note that this case can only be possible
    // if an image has 'WORKDIR' specified in its manifest but that
    // 'WORKDIR' does not exist in the image's rootfs.
    //
    // TODO(gilbert): Set the proper ownership to this working
    // directory to make sure a specified non-root user has the
    // permission to write to this working directory. Right now
    // it is owned by root, and any non-root user will fail to
    // write to this directory. Please note that this is identical
    // to the semantic as docker daemon. The semantic can be
    // verified by:
    // 'docker run -ti -u nobody quay.io/spinnaker/front50:master bash'
    // The ownership of '/workdir' is root. Creating any file under
    // '/workdir' will fail for 'Permission denied'.
    //
    // NOTE: A failure to create the directory is only reported here;
    // the `chdir` below is what decides whether we bail out.
    Try<Nothing> mkdir = os::mkdir(launchInfo.working_directory());
    if (mkdir.isError()) {
      cerr << "Failed to create working directory "
           << "'" << launchInfo.working_directory() << "': "
           << mkdir.error() << endl;
    }

    Try<Nothing> chdir = os::chdir(launchInfo.working_directory());
    if (chdir.isError()) {
      cerr << "Failed to chdir into current working directory "
           << "'" << launchInfo.working_directory() << "': "
           << chdir.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }

#ifndef __WINDOWS__
  // Change user if provided. Note that we do that after executing the
  // preparation commands so that those commands will be run with the
  // same privilege as the mesos-agent.
  if (uid.isSome()) {
    Try<Nothing> setgid = os::setgid(gid.get());
    if (setgid.isError()) {
      cerr << "Failed to set gid to " << gid.get()
           << ": " << setgid.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    Try<Nothing> setgroups = os::setgroups(gids, uid);
    if (setgroups.isError()) {
      cerr << "Failed to set supplementary gids: "
           << setgroups.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    Try<Nothing> setuid = os::setuid(uid.get());
    if (setuid.isError()) {
      cerr << "Failed to set uid to " << uid.get()
           << ": " << setuid.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }
#endif // __WINDOWS__

#ifdef __linux__
  if (capabilitiesManager.isSome()) {
    Try<ProcessCapabilities> capabilities = capabilitiesManager->get();
    if (capabilities.isError()) {
      cerr << "Failed to get capabilities for the current process: "
           << capabilities.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    // After 'setuid', the 'effective' set is cleared. Since `SETPCAP`
    // is required in the `effective` set of a process to change the
    // bounding set, we need to restore it first so we can make the
    // final capability changes.
    capabilities->add(capabilities::EFFECTIVE, capabilities::SETPCAP);

    Try<Nothing> setPcap = capabilitiesManager->set(capabilities.get());
    if (setPcap.isError()) {
      cerr << "Failed to add SETPCAP to the effective set: "
           << setPcap.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    // If the task has any effective capabilities, grant them to all
    // the capability sets.
    if (launchInfo.has_effective_capabilities()) {
      set<Capability> target =
        capabilities::convert(launchInfo.effective_capabilities());

      capabilities->set(capabilities::AMBIENT, target);
      capabilities->set(capabilities::EFFECTIVE, target);
      capabilities->set(capabilities::PERMITTED, target);
      capabilities->set(capabilities::INHERITABLE, target);
      capabilities->set(capabilities::BOUNDING, target);
    }

    // If we also have bounding capabilities, apply that in preference to
    // the effective capabilities.
    if (launchInfo.has_bounding_capabilities()) {
      set<Capability> bounding =
        capabilities::convert(launchInfo.bounding_capabilities());

      capabilities->set(capabilities::BOUNDING, bounding);
    }

    // Force the inherited set to be the same as the bounding set. If we
    // are root and capabilities have not been specified, then this is a
    // no-op. If capabilities have been specified, then we need to clip the
    // inherited set to prevent file-based capabilities granting privileges
    // outside the bounding set.
    capabilities->set(
        capabilities::INHERITABLE,
        capabilities->get(capabilities::BOUNDING));

    Try<Nothing> set = capabilitiesManager->set(capabilities.get());
    if (set.isError()) {
      cerr << "Failed to set process capabilities: " << set.error() << endl;
      exitWithStatus(EXIT_FAILURE);
    }
  }
#endif // __linux__

  // Prepare the executable and the argument list for the child.
  string executable(launchInfo.command().shell()
    ? os::Shell::name
    : launchInfo.command().value().c_str());

  os::raw::Argv argv(launchInfo.command().shell()
    ? vector<string>({
          os::Shell::arg0,
          os::Shell::arg1,
          launchInfo.command().value()})
    : vector<string>(
          launchInfo.command().arguments().begin(),
          launchInfo.command().arguments().end()));

  // Prepare the environment for the child. If 'environment' is not
  // specified, inherit the environment of the current process.
  Option<os::raw::Envp> envp;
  if (launchInfo.has_environment()) {
    // TODO(tillt): `Environment::Variable` is not a string anymore,
    // consider cleaning this up with the complete rollout of `Secrets`.
    // This entire merging should be handled by the solution introduced
    // by MESOS-7299.
    hashmap<string, string> environment;

    foreach (const Environment::Variable& variable,
             launchInfo.environment().variables()) {
      const string& name = variable.name();
      const string& value = variable.value();

      // TODO(tillt): Once we have a solution for MESOS-7292, allow
      // logging of values.
      if (environment.contains(name) && environment[name] != value) {
        cout << "Overwriting environment variable '" << name << "'" << endl;
      }

      environment[name] = value;
    }

    if (!environment.contains("PATH")) {
      environment["PATH"] = os::host_default_path();
    }

#ifdef __WINDOWS__
    // TODO(dpravat): (MESOS-6816) We should allow system environment variables
    // to be overwritten if they are specified by the framework. This might
    // cause applications to not work, but upon overriding system defaults, it
    // becomes the overidder's problem.
    Option<std::map<std::wstring, std::wstring>> systemEnvironment =
      ::internal::windows::get_system_env();
    foreachpair (const std::wstring& key,
                 const std::wstring& value,
                 systemEnvironment.get()) {
      environment[stringify(key)] = stringify(value);
    }
#endif // __WINDOWS__

    envp = os::raw::Envp(environment);
  }

#ifndef __WINDOWS__
  // If we have `containerStatusFd` set, then we need to fork-exec the
  // command we are launching and checkpoint its status on exit. We
  // use fork-exec directly (as opposed to `process::subprocess()`) to
  // avoid initializing libprocess for this simple helper binary.
  //
  // TODO(klueska): Once we move the majority of `process::subprocess()`
  // into stout, update the code below to use it.
  if (containerStatusFd.isSome()) {
    pid_t pid = ::fork();

    if (pid == -1) {
      cerr << "Failed to fork() the command: " << os::strerror(errno) << endl;
      exitWithStatus(EXIT_FAILURE);
    }

    // If we are the parent...
    if (pid > 0) {
      // Set the global `containerPid` variable to enable signal forwarding.
      //
      // NOTE: We need a signal fence here to ensure that `containerPid`
      // is actually written to memory and not just to a temporary register.
      // Without this, it's possible that the signal handler would
      // never notice the change since there's no guarantee that it is
      // written out to memory until this function completes (which
      // won't happen until it's too late because we loop inside a
      // blocking `waitpid()` call below).
      //
      // NOTE: A fence with `std::memory_order_relaxed` has no effect
      // at all, hence the stronger ordering.
      containerPid = pid;
      std::atomic_signal_fence(std::memory_order_seq_cst);

      // Wait for the newly created process to finish.
      int status = 0;
      Result<pid_t> waitpid = None();

      // Reap all descendants, but only continue once we reap the
      // process we just launched.
      while (true) {
        waitpid = os::waitpid(-1, &status, 0);

        if (waitpid.isError()) {
          // If the error was an EINTR, we were interrupted by a
          // signal and should just call `waitpid()` over again.
          if (errno == EINTR) {
            continue;
          }
          cerr << "Failed to os::waitpid(): " << waitpid.error() << endl;
          exitWithStatus(EXIT_FAILURE);
        }

        if (waitpid.isNone()) {
          cerr << "Calling os::waitpid() with blocking semantics "
               << "returned asynchronously" << endl;
          exitWithStatus(EXIT_FAILURE);
        }

        // We only forward the signal if the child has terminated. If
        // the child has stopped due to some signal (e.g., SIGSTOP),
        // we will simply ignore it.
        if (WIFSTOPPED(status)) {
          continue;
        }

        if (pid == waitpid.get()) {
          break;
        }
      }

      // Checkpoint the raw wait status of the launched process. Our
      // own exit code is always `EXIT_SUCCESS` at this point.
      signalSafeWriteStatus(status);
      os::close(containerStatusFd.get());
      ::_exit(EXIT_SUCCESS);
    }
  }
#endif // __WINDOWS__

#ifndef __WINDOWS__
  // Search executable in the current working directory as well.
  // execvpe and execvp will only search executable from the current
  // working directory if environment variable PATH is not set.
  // TODO(aaron.wood): 'os::which' current does not work on Windows.
  // Remove the ifndef guard once it's supported on Windows.
  if (!path::absolute(executable) &&
      launchInfo.has_working_directory()) {
    Option<string> which = os::which(
        executable,
        launchInfo.working_directory());

    if (which.isSome()) {
      executable = which.get();
    }
  }
#endif // __WINDOWS__

  if (envp.isSome()) {
    os::execvpe(executable.c_str(), argv, envp.get());
  } else {
    os::execvp(executable.c_str(), argv);
  }

  // If we get here, the execvp call failed.
  cerr << "Failed to execute command: " << os::strerror(errno) << endl;
  exitWithStatus(EXIT_FAILURE);
  UNREACHABLE();
}
} // namespace slave {
} // namespace internal {
} // namespace mesos {