/
instance_linux.go
358 lines (314 loc) · 10.5 KB
/
instance_linux.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
// Copyright (c) Contributors to the Apptainer project, established as
// Apptainer a Series of LF Projects LLC.
// For website terms of use, trademark policy, privacy policy and other
// project policies see https://lfprojects.org/policies
// Copyright (c) 2018-2020, Sylabs Inc. All rights reserved.
// This software is licensed under a 3-clause BSD license. Please consult the
// LICENSE.md file distributed with the sources of this project regarding your
// rights to use or distribute this software.
// Includes code from https://github.com/docker/cli
// Released under the Apache License Version 2.0
package apptainer
import (
"context"
"encoding/json"
"fmt"
"io"
"math"
"os"
"strings"
"syscall"
"text/tabwriter"
"time"
"github.com/apptainer/apptainer/internal/pkg/cgroups"
"github.com/apptainer/apptainer/internal/pkg/instance"
"github.com/apptainer/apptainer/pkg/sylog"
"github.com/apptainer/apptainer/pkg/util/fs/proc"
"github.com/buger/goterm"
units "github.com/docker/go-units"
libcgroups "github.com/opencontainers/runc/libcontainer/cgroups"
)
// instanceInfo is the JSON representation of a single instance as emitted by
// PrintInstanceList. Field order determines the key order in the output.
type instanceInfo struct {
	// Instance is the instance name.
	Instance string `json:"instance"`
	// Pid is the process ID recorded for the instance.
	Pid int `json:"pid"`
	// Image is the image the instance was started from.
	Image string `json:"img"`
	// IP is the IP address recorded for the instance.
	IP string `json:"ip"`
	// LogErrPath is the path of the instance's stderr log.
	LogErrPath string `json:"logErrPath"`
	// LogOutPath is the path of the instance's stdout log.
	LogOutPath string `json:"logOutPath"`
}
// PrintInstanceList fetches instance list, applying name and
// user filters, and prints it in a regular or a JSON format (if
// formatJSON is true) to the passed writer. Additionally, fetches
// log paths (if showLogs is true).
func PrintInstanceList(w io.Writer, name, user string, formatJSON bool, showLogs bool, all bool) error {
if formatJSON && showLogs {
sylog.Fatalf("more than one flags have been set")
}
tabWriter := tabwriter.NewWriter(w, 0, 8, 4, ' ', 0)
defer tabWriter.Flush()
ii, err := instance.List(user, name, instance.AppSubDir, all)
if err != nil {
return fmt.Errorf("could not retrieve instance list: %v", err)
}
if showLogs {
_, err := fmt.Fprintln(tabWriter, "INSTANCE NAME\tPID\tLOGS")
if err != nil {
return fmt.Errorf("could not write list header: %v", err)
}
for _, i := range ii {
_, err = fmt.Fprintf(tabWriter, "%s\t%d\t%s\n\t\t%s\n", i.Name, i.Pid, i.LogErrPath, i.LogOutPath)
if err != nil {
return fmt.Errorf("could not write instance info: %v", err)
}
}
return nil
}
if !formatJSON {
_, err := fmt.Fprintln(tabWriter, "INSTANCE NAME\tPID\tIP\tIMAGE")
if err != nil {
return fmt.Errorf("could not write list header: %v", err)
}
for _, i := range ii {
_, err = fmt.Fprintf(tabWriter, "%s\t%d\t%s\t%s\n", i.Name, i.Pid, i.IP, i.Image)
if err != nil {
return fmt.Errorf("could not write instance info: %v", err)
}
}
return nil
}
instances := make([]instanceInfo, len(ii))
for i := range instances {
instances[i].Image = ii[i].Image
instances[i].Pid = ii[i].Pid
instances[i].Instance = ii[i].Name
instances[i].IP = ii[i].IP
instances[i].LogErrPath = ii[i].LogErrPath
instances[i].LogOutPath = ii[i].LogOutPath
}
enc := json.NewEncoder(w)
enc.SetIndent("", "\t")
err = enc.Encode(
map[string][]instanceInfo{
"instances": instances,
})
if err != nil {
return fmt.Errorf("could not encode instance list: %v", err)
}
return nil
}
// WriteInstancePidFile fetches the instance's PID and writes it, followed by a
// newline, to pidFile, creating the file with mode 0644 or truncating it if it
// already exists. A symlink at pidFile is not followed (O_NOFOLLOW).
//
// The name should not be a glob, i.e. it must identify exactly one instance;
// otherwise an error is returned. Errors from listing instances, opening,
// writing, or closing the file are returned.
func WriteInstancePidFile(name, pidFile string) error {
	inst, err := instance.List("", name, instance.AppSubDir, true)
	if err != nil {
		return fmt.Errorf("could not retrieve instance list: %w", err)
	}
	if len(inst) != 1 {
		return fmt.Errorf("unexpected instance count: %d", len(inst))
	}

	f, err := os.OpenFile(pidFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC|syscall.O_NOFOLLOW, 0o644)
	if err != nil {
		return fmt.Errorf("could not create pid file: %w", err)
	}

	if _, err := fmt.Fprintf(f, "%d\n", inst[0].Pid); err != nil {
		// The write error is the one worth reporting; a close failure here
		// would add nothing.
		_ = f.Close()
		return fmt.Errorf("could not write pid file: %w", err)
	}

	// Close can report a deferred write failure, so it is checked rather
	// than left to a defer that would discard the error.
	if err := f.Close(); err != nil {
		return fmt.Errorf("could not close pid file: %w", err)
	}
	return nil
}
// instanceListOrError looks up the instances matching instanceUser and name
// and treats an empty result as a failure. The error from instance.List is
// wrapped so that callers get a more specific message. The list obtained from
// instance.List is returned alongside the error in every case.
func instanceListOrError(instanceUser, name string) ([]*instance.File, error) {
	found, listErr := instance.List(instanceUser, name, instance.AppSubDir, true)
	switch {
	case listErr != nil:
		return found, fmt.Errorf("could not retrieve instance list: %w", listErr)
	case len(found) == 0:
		return found, fmt.Errorf("no instance found")
	}
	return found, nil
}
// calculateBlockIO sums the recursive I/O service byte counters in stats and
// returns the read total followed by the write total. Operation names are
// compared after lower-casing; entries for any other operation are ignored.
func calculateBlockIO(stats *libcgroups.BlkioStats) (float64, float64) {
	var readBytes, writeBytes float64
	for _, e := range stats.IoServiceBytesRecursive {
		op := strings.ToLower(e.Op)
		if op == "read" {
			readBytes += float64(e.Value)
		} else if op == "write" {
			writeBytes += float64(e.Value)
		}
	}
	return readBytes, writeBytes
}
// calculateMemoryUsage returns, in order, the current memory usage, the
// memory limit, and usage as a percentage of the limit. The percentage is 0
// when the limit is 0. stats also carries a MaxUsage value, which is not used.
func calculateMemoryUsage(stats *libcgroups.MemoryStats) (float64, float64, float64) {
	used, limit := stats.Usage.Usage, stats.Usage.Limit

	// A limit of max uint64 means "no limit": report total system RAM
	// instead, when sysinfo can be queried. On failure the limit is left as is.
	if limit == math.MaxUint64 {
		var si syscall.Sysinfo_t
		if syscall.Sysinfo(&si) == nil {
			limit = uint64(si.Totalram) * uint64(si.Unit)
		}
	}

	if limit == 0 {
		return float64(used), float64(limit), 0.0
	}
	return float64(used), float64(limit), float64(used) / float64(limit) * 100.0
}
// calculateCPUUsage computes CPU usage over the interval since the previous
// sample.
//
// prevTime is the previous sample's wall-clock time in nanoseconds and prevCPU
// the cumulative CPU usage read at that time. The current cumulative usage is
// read from cpuStats.CpuUsage.TotalUsage.
//
// It returns the usage as a percentage (CPU delta divided by elapsed time,
// times 100), plus the current time and CPU counter for the caller to pass
// back in as prevTime/prevCPU on the next call.
//
// If no time has elapsed, the wall clock has stepped backwards, or the CPU
// counter has gone down, the interval is not meaningful and 0 is reported
// instead of NaN/Inf or a value produced by unsigned wrap-around.
func calculateCPUUsage(prevTime, prevCPU uint64, cpuStats *libcgroups.CpuStats) (cpuPercent float64, curTime, curCPU uint64) {
	curTime = uint64(time.Now().UnixNano())
	curCPU = cpuStats.CpuUsage.TotalUsage

	// Both deltas are unsigned: guard before subtracting so neither wraps,
	// and so the division below never sees a zero denominator.
	if curTime <= prevTime || curCPU < prevCPU {
		return 0, curTime, curCPU
	}

	deltaCPU := float64(curCPU - prevCPU)
	deltaTime := float64(curTime - prevTime)
	cpuPercent = (deltaCPU / deltaTime) * 100
	return cpuPercent, curTime, curCPU
}
// InstanceStats uses the underlying cgroup to report statistics for a single
// named instance on stdout.
//
// name and instanceUser select the instance; exactly one instance must match
// and it must have been started with cgroups enabled, otherwise an error is
// returned.
//
// With formatJSON the stats structure is encoded as indented JSON once and the
// function returns; JSON implies noStream (a warning is logged if noStream was
// not set). Otherwise a one-row table (CPU, memory, block I/O, PIDs) is
// printed: once if noStream is set, or cleared and reprinted every second
// until ctx is cancelled, in which case nil is returned.
//
// The first report is produced one second after an initial reading, so that a
// CPU usage delta is available.
func InstanceStats(ctx context.Context, name, instanceUser string, formatJSON bool, noStream bool) error {
	ii, err := instanceListOrError(instanceUser, name)
	if err != nil {
		return err
	}

	// Instance stats requires exactly one instance; instanceListOrError has
	// already rejected the empty case.
	if len(ii) != 1 {
		return fmt.Errorf("query returned more than one instance (%d)", len(ii))
	}

	// Grab our instance to interact with!
	i := ii[0]
	if !formatJSON {
		sylog.Infof("Stats for %s instance of %s (PID=%d)\n", i.Name, i.Image, i.Pid)
	}

	// JSON output cannot be streamed: fall back to a single timepoint.
	if formatJSON && !noStream {
		sylog.Warningf("JSON output is only available for a single timepoint (--no-stream)")
		noStream = true
	}

	// Cut out early if we do not have cgroups
	if !i.Cgroup {
		url := "the Apptainer instance user guide for instructions"
		return fmt.Errorf("stats are only available if cgroups are enabled, see %s", url)
	}

	// Get a cgroup manager from the instance pid
	manager, err := cgroups.GetManagerForPid(i.Pid)
	if err != nil {
		return fmt.Errorf("while getting cgroup manager for pid: %w", err)
	}

	// Table writer for the non-JSON output. The deferred Flush only matters
	// on early error returns, where it emits whatever was already buffered;
	// the normal path flushes explicitly and checks the result.
	tabWriter := tabwriter.NewWriter(os.Stdout, 0, 8, 4, ' ', 0)
	defer tabWriter.Flush()

	// Retrieve initial state, for first CPU measurement
	stats, err := manager.GetStats()
	if err != nil {
		return fmt.Errorf("while getting stats for pid: %w", err)
	}
	prevCPU := stats.CpuStats.CpuUsage.TotalUsage
	prevTime := uint64(time.Now().UnixNano())
	var cpuPercent float64

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-time.After(1 * time.Second):
			// Streaming clears the terminal and reprints header and stats
			// each time.
			if !noStream {
				goterm.Clear()
				goterm.MoveCursor(1, 1)
				goterm.Flush()
			}

			// Retrieve new stats
			stats, err := manager.GetStats()
			if err != nil {
				return fmt.Errorf("while getting stats for pid: %w", err)
			}

			// Do we want json?
			if formatJSON {
				enc := json.NewEncoder(os.Stdout)
				enc.SetIndent("", "\t")
				if err := enc.Encode(stats); err != nil {
					return fmt.Errorf("could not encode instance stats: %w", err)
				}
				return nil
			}

			// Stats can be added from this set
			// https://github.com/opencontainers/runc/blob/main/libcontainer/cgroups/stats.go
			_, err = fmt.Fprintln(tabWriter, "INSTANCE NAME\tCPU USAGE\tMEM USAGE / LIMIT\tMEM %\tBLOCK I/O\tPIDS")
			if err != nil {
				return fmt.Errorf("could not write stats header: %w", err)
			}

			cpuPercent, prevTime, prevCPU = calculateCPUUsage(prevTime, prevCPU, &stats.CpuStats)
			memUsage, memLimit, memPercent := calculateMemoryUsage(&stats.MemoryStats)
			blockRead, blockWrite := calculateBlockIO(&stats.BlkioStats)

			// Generate a shortened stats list
			_, err = fmt.Fprintf(tabWriter, "%s\t%.2f%%\t%s / %s\t%.2f%%\t%s / %s\t%d\n", i.Name,
				cpuPercent, units.BytesSize(memUsage), units.BytesSize(memLimit),
				memPercent, units.BytesSize(blockRead), units.BytesSize(blockWrite),
				stats.PidsStats.Current)
			if err != nil {
				return fmt.Errorf("could not write instance stats: %w", err)
			}

			// The table only reaches stdout on Flush, so this is where a
			// broken output stream is actually detected.
			if err := tabWriter.Flush(); err != nil {
				return fmt.Errorf("could not write instance stats: %w", err)
			}

			// We don't want a stream, return after just one record
			if noStream {
				return nil
			}
		}
	}
}
// StopInstance fetches the instance list, applying the name and user filters,
// and stops every matching instance by sending it the signal sig.
//
// timeout is a single grace period shared by all instances, measured from the
// moment the signals are sent. If every instance reports as stopped before it
// expires, StopInstance returns immediately. Otherwise each instance that has
// not yet reported is sent SIGKILL and StopInstance returns without waiting
// for it to exit.
//
// An error is returned only when the instance list cannot be retrieved or is
// empty.
func StopInstance(name, user string, sig syscall.Signal, timeout time.Duration) error {
	ii, err := instanceListOrError(user, name)
	if err != nil {
		return err
	}

	// Room for one report per instance, so a killInstance goroutine that
	// finishes after we have returned on timeout never blocks on its send.
	stoppedPID := make(chan int, len(ii))
	for _, i := range ii {
		go killInstance(i, sig, stoppedPID)
	}

	// Arm the timer once: re-creating it on every loop iteration would
	// restart the grace period each time an instance stopped.
	deadline := time.NewTimer(timeout)
	defer deadline.Stop()

	stopped := make(map[int]bool, len(ii))
	for remaining := len(ii); remaining > 0; {
		select {
		case pid := <-stoppedPID:
			stopped[pid] = true
			remaining--
		case <-deadline.C:
			for _, i := range ii {
				if stopped[i.Pid] {
					continue
				}
				sylog.Infof("Killing %s instance of %s (PID=%d) (Timeout)\n", i.Name, i.Image, i.Pid)
				// Best effort: the process may already be gone, in which
				// case the error is of no interest.
				_ = syscall.Kill(i.Pid, syscall.SIGKILL)
			}
			return nil
		}
	}
	return nil
}
// killInstance sends sig to the instance process i.Pid and then polls every
// 10ms until a signal-0 probe of i.PPid fails with ESRCH (no such process),
// at which point i.Pid is sent on stoppedPID and the function returns.
//
// While polling, whenever proc.CountChilds reports zero for i.Pid without an
// error, i.Pid is sent SIGKILL. There is no other exit condition, so the loop
// runs for as long as i.PPid stays alive.
//
// NOTE(review): i.PPid is presumably the parent of the instance process and
// CountChilds presumably counts its child processes — confirm against the
// instance and proc packages.
func killInstance(i *instance.File, sig syscall.Signal, stoppedPID chan<- int) {
	sylog.Infof("Stopping %s instance of %s (PID=%d)\n", i.Name, i.Image, i.Pid)
	syscall.Kill(i.Pid, sig)

	const pollInterval = 10 * time.Millisecond
	for ; ; time.Sleep(pollInterval) {
		// Signal 0 performs the existence check without delivering anything.
		if syscall.Kill(i.PPid, 0) == syscall.ESRCH {
			stoppedPID <- i.Pid
			return
		}

		if n, err := proc.CountChilds(i.Pid); err == nil && n == 0 {
			syscall.Kill(i.Pid, syscall.SIGKILL)
		}
	}
}