Skip to content

Commit

Permalink
Add k8s global memory indicators
Browse files Browse the repository at this point in the history
The kubelet will terminate end-user pods when the worker node has
'MemoryPressure' according to [1]. But confusingly, there exist two
reasons for pods being evicted:
- one is that the whole machine's free memory is too low,
- the other is that k8s's own calculation[2], i.e. memory.available[3],
  is too low.

To resolve such confusion for k8s users, collect and show k8s global
workingset memory to distinguish between these two causes.

Note:
1. Collecting only the k8s global memory stats is enough, because
   cgroupfs stats are propagated from child to parent. Thus the
   parent always notices the change and is updated. And from
   v1.6 k8s[4], allocatable(/sys/fs/cgroup/memory/kubepods/) is more
   convincing than capacity(/sys/fs/cgroup/memory/).
2. There are two cgroup drivers or managers to control resources:
   cgroupfs and systemd[5]. We should take both into account.
   (The 'systemd' cgroup driver always ends with '.slice')
3. The difference between cgroupv1 and cgroupv2: different field names
   in the memory.stat file, and memory.currentUsage is stored in different
   files (cgv1's memory.usage_in_bytes vs. cgv2's memory.current).

[1]https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior
[2]kubernetes/kubernetes#43916
[3]memory.available = memory.allocatable/capacity - memory.workingSet,
   memory.workingSet = memory.currentUsage - memory.inactivefile
[4]kubernetes/kubernetes#42204
   kubernetes/community#348
[5]https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/configure-cgroup-driver/

Signed-off-by: Fei Li <lifei.shirley@bytedance.com>
Reported-by: Teng Hu <huteng.ht@bytedance.com>
  • Loading branch information
ShirleyFei authored and liutingjieni committed Apr 4, 2023
1 parent 639741d commit 5aca82a
Show file tree
Hide file tree
Showing 7 changed files with 386 additions and 0 deletions.
11 changes: 11 additions & 0 deletions deviate.c
Original file line number Diff line number Diff line change
Expand Up @@ -1459,6 +1459,17 @@ deviatsyst(struct sstat *cur, struct sstat *pre, struct sstat *dev,

dev->llc.nrllcs = cur->llc.nrllcs;

dev->k8smem.file = cur->k8smem.file;
dev->k8smem.anon = cur->k8smem.anon;
dev->k8smem.shmem = cur->k8smem.shmem;
dev->k8smem.filemapped = cur->k8smem.filemapped;
dev->k8smem.inactiveanon = cur->k8smem.inactiveanon;
dev->k8smem.activeanon = cur->k8smem.activeanon;
dev->k8smem.inactivefile = cur->k8smem.inactivefile;
dev->k8smem.activefile = cur->k8smem.activefile;
dev->k8smem.usagefile = cur->k8smem.usagefile;
dev->k8smem.workingset = cur->k8smem.workingset;

#if HTTPSTATS
/*
** application-specific counters
Expand Down
17 changes: 17 additions & 0 deletions man/atop.1
Original file line number Diff line number Diff line change
Expand Up @@ -1357,6 +1357,23 @@ the number of memory pages the system wrote to swap space (`swout'), and
the number of out-of-memory kills (`oomkill').
.PP
.TP 5
.B K8S
K8S global /sys/fs/cgroup/[memory/]kubepods/memory.stat.
.br
This line shows the number of file pages for k8s global memcg (`file'),
the number of mapped anonymous pages for k8s global memcg (`anon'),
the number of shmem pages (including tmpfs/GEM pages) for k8s global
memcg (`shmem'), the number of pagecache pages mapped into pagetables
for k8s global memcg (`fmap'), the number of lru inactive anon pages
for k8s global memcg (`inan'), the number of lru active anon pages
for k8s global memcg (`actan'), the number of lru inactive file pages
for k8s global memcg (`infl'), the number of lru active file pages
for k8s global memcg (`actfl'), the number of pages currently in use
by k8s global memcg, including usermem and kmem (`usage'), and the
number of workingset pages (from the k8s point of view: number of
currently used pages minus inactive file pages) for k8s global memcg (`wkset').
.PP
.TP 5
.B PSI
Pressure Stall Information.
.br
Expand Down
153 changes: 153 additions & 0 deletions photosyst.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@
/* recognize numa node */
#define NUMADIR "/sys/devices/system/node"

/*
** recognize k8s global memory.stat and memory current usage
**
** The path of each file is built by string-literal concatenation:
**   <K8S_MEMDIR_CGV1 or K8S_MEMDIR_CGV2> [K8S_SYSTEMD_CM] <file name>
** K8S_SYSTEMD_CM is the optional directory suffix that yields the
** alternative directory name "kubepods.slice"; it is tried when the
** plain "kubepods" directory cannot be opened.
** The usage file differs per cgroup version: memory.usage_in_bytes
** for cgroup v1 and memory.current for cgroup v2.
*/
#define K8S_MEMDIR_CGV1 "/sys/fs/cgroup/memory/kubepods"
#define K8S_MEMDIR_CGV2 "/sys/fs/cgroup/kubepods"
#define K8S_SYSTEMD_CM ".slice"
#define K8S_MEM_STAT "/memory.stat"
#define K8S_MEM_CGV1_USAGE "/memory.usage_in_bytes"
#define K8S_MEM_CGV2_USAGE "/memory.current"

/* recognize LLC monitor data */
#define LLCDIR "/sys/fs/resctrl/mon_data"
#define L3SIZE "/sys/devices/system/cpu/cpu0/cache/index3/size"
Expand Down Expand Up @@ -906,6 +914,151 @@ photosyst(struct sstat *si)
}
}

if ( supportflags & CGROUPV2 )
{
if ( (fp = fopen(K8S_MEMDIR_CGV2 K8S_MEM_STAT, "r")) != NULL ||
(fp = fopen(K8S_MEMDIR_CGV2 K8S_SYSTEMD_CM K8S_MEM_STAT, "r")) != NULL )
{
/* for cgroup v2 */
while ( fgets(linebuf, sizeof(linebuf), fp) != NULL )
{
nr = sscanf(linebuf, "%s %lld\n", nam, &cnts[0]);

if ( strcmp("file", nam) == EQ )
{
si->k8smem.file = cnts[0]/pagesize;
continue;
}
if ( strcmp("anon", nam) == EQ )
{
si->k8smem.anon = cnts[0]/pagesize;
continue;
}
if ( strcmp("shmem", nam) == EQ )
{
si->k8smem.shmem = cnts[0]/pagesize;
continue;
}
if ( strcmp("file_mapped", nam) == EQ )
{
si->k8smem.filemapped = cnts[0]/pagesize;
continue;
}
if ( strcmp("inactive_anon", nam) == EQ )
{
si->k8smem.inactiveanon = cnts[0]/pagesize;
continue;
}
if ( strcmp("active_anon", nam) == EQ )
{
si->k8smem.activeanon = cnts[0]/pagesize;
continue;
}
if ( strcmp("inactive_file", nam) == EQ )
{
si->k8smem.inactivefile = cnts[0]/pagesize;
continue;
}
if ( strcmp("active_file", nam) == EQ )
{
si->k8smem.activefile = cnts[0]/pagesize;
continue;
}
}

fclose(fp);
}

if ( (fp = fopen(K8S_MEMDIR_CGV2 K8S_MEM_CGV2_USAGE, "r")) != NULL ||
(fp = fopen(K8S_MEMDIR_CGV2 K8S_SYSTEMD_CM K8S_MEM_CGV2_USAGE, "r")) != NULL )
{
if ( fscanf(fp, "%lld", &cnts[0]) == 1 )
{
/*
** Refer to https://github.com/kubernetes/kubernetes/issues/43916,
** memory.available := node.status.capacity[memory] - node.stats.memory.workingSet
** && workingSet := $cgroupfs/memory.current - inactive_file
*/
si->k8smem.usagefile = cnts[0]/pagesize;
si->k8smem.workingset = si->k8smem.usagefile - si->k8smem.inactivefile;
}

fclose(fp);
}
}
else
{
if ( (fp = fopen(K8S_MEMDIR_CGV1 K8S_MEM_STAT, "r")) != NULL ||
(fp = fopen(K8S_MEMDIR_CGV1 K8S_SYSTEMD_CM K8S_MEM_STAT, "r")) != NULL )
{
/* for cgroup v1 */
while ( fgets(linebuf, sizeof(linebuf), fp) != NULL )
{
nr = sscanf(linebuf, "%s %lld\n", nam, &cnts[0]);

if ( strcmp("total_cache", nam) == EQ )
{
si->k8smem.file = cnts[0]/pagesize;
continue;
}
if ( strcmp("total_rss", nam) == EQ)
{
si->k8smem.anon = cnts[0]/pagesize;
continue;
}
if ( strcmp("total_shmem", nam) == EQ)
{
si->k8smem.shmem = cnts[0]/pagesize;
continue;
}
if ( strcmp("total_mapped_file", nam) == EQ)
{
si->k8smem.filemapped = cnts[0]/pagesize;
continue;
}
if ( strcmp("total_inactive_anon", nam) == EQ)
{
si->k8smem.inactiveanon = cnts[0]/pagesize;
continue;
}
if ( strcmp("total_active_anon", nam) == EQ)
{
si->k8smem.activeanon = cnts[0]/pagesize;
continue;
}
if ( strcmp("total_inactive_file", nam) == EQ)
{
si->k8smem.inactivefile = cnts[0]/pagesize;
continue;
}
if ( strcmp("total_active_file", nam) == EQ)
{
si->k8smem.activefile = cnts[0]/pagesize;
continue;
}
}

fclose(fp);
}

if ( (fp = fopen(K8S_MEMDIR_CGV1 K8S_MEM_CGV1_USAGE, "r")) != NULL ||
(fp = fopen(K8S_MEMDIR_CGV1 K8S_SYSTEMD_CM K8S_MEM_CGV1_USAGE, "r")) != NULL )
{
if ( fscanf(fp, "%lld", &cnts[0]) == 1 )
{
/*
** Refer to https://github.com/kubernetes/kubernetes/issues/43916,
** memory.available := node.status.capacity[memory] - node.stats.memory.workingSet
** && workingSet := $cgroupfs/memory.usage_in_bytes - total_inactive_file
*/
si->k8smem.usagefile = cnts[0]/pagesize;
si->k8smem.workingset = si->k8smem.usagefile - si->k8smem.inactivefile;
}

fclose(fp);
}
}

/*
** gather per numa memory-related statistics from the file
** /sys/devices/system/node/node0/meminfo, and store them in binary form.
Expand Down
14 changes: 14 additions & 0 deletions photosyst.h
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,19 @@ struct llcstat {
struct perllc perllc[MAXLLC];
};

/*
** memory counters of the k8s global memory cgroup ("kubepods"),
** gathered from its memory.stat file and its current-usage file
**
** all counters are stored as numbers of pages: the byte values read
** from cgroupfs are divided by the page size when they are gathered
*/
struct k8smem {
count_t file; /* number of file pages for k8s global memcg */
count_t anon; /* number of mapped anonymous pages for k8s global memcg */
count_t shmem; /* number of shmem pages (including tmpfs/GEM pages) for k8s global memcg */
count_t filemapped; /* number of pagecache pages mapped into pagetables for k8s global memcg */
count_t inactiveanon; /* number of lru inactive anon pages for k8s global memcg */
count_t activeanon; /* number of lru active anon pages for k8s global memcg */
count_t inactivefile; /* number of lru inactive file pages for k8s global memcg */
count_t activefile; /* number of lru active file pages for k8s global memcg */
count_t usagefile; /* number of pages currently in use by k8s global memcg (from the usage file) */
count_t workingset; /* k8s point of view: usagefile minus inactivefile pages */
};

/************************************************************************/

struct sstat {
Expand All @@ -458,6 +471,7 @@ struct sstat {
struct gpustat gpu;
struct ifbstat ifb;
struct llcstat llc;
struct k8smem k8smem;

struct wwwstat www;
};
Expand Down
60 changes: 60 additions & 0 deletions showlinux.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,19 @@ sys_printdef *llcsyspdefs[] = {
&syspdef_BLANKBOX,
0
};
/*
** print definitions that can be selected for the K8S system line;
** this 0-terminated table is passed to make_sys_prints() when the
** layout of k8smemline is built
*/
sys_printdef *k8smemsyspdefs[] = {
&syspdef_K8SFILE,
&syspdef_K8SANON,
&syspdef_K8SSHMEM,
&syspdef_K8SFILEMAPPED,
&syspdef_K8SINACTIVEANON,
&syspdef_K8SACTIVEANON,
&syspdef_K8SINACTIVEFILE,
&syspdef_K8SACTIVEFILE,
&syspdef_K8SUSAGEFILE,
&syspdef_K8SWORKINGSET,
0
};
sys_printdef *psisyspdefs[] = {
&syspdef_PSICPUSTOT,
&syspdef_PSIMEMSTOT,
Expand Down Expand Up @@ -525,6 +538,7 @@ sys_printpair swpline[MAXITEMS];
sys_printpair memnumaline[MAXITEMS];
sys_printpair cpunumaline[MAXITEMS];
sys_printpair llcline[MAXITEMS];
sys_printpair k8smemline[MAXITEMS];
sys_printpair pagline[MAXITEMS];
sys_printpair psiline[MAXITEMS];
sys_printpair contline[MAXITEMS];
Expand Down Expand Up @@ -1036,6 +1050,23 @@ pricumproc(struct sstat *sstat, struct devtstat *devtstat,
sstat, &extra);
}

if (k8smemline[0].f == 0)
{
make_sys_prints(k8smemline, MAXITEMS,
"K8SFILE:1 "
"K8SANON:1 "
"K8SSHMEM:1 "
"K8SFILEMAPPED:2 "
"K8SACTIVEANON:2 "
"K8SINACTIVEANON:2 "
"K8SACTIVEFILE:2 "
"K8SINACTIVEFILE:1 "
"K8SUSAGEFILE:1 "
"K8SWORKINGSET:1 ",
k8smemsyspdefs, "builtin k8smemline",
sstat, &extra);
}

if (pagline[0].f == 0)
{
make_sys_prints(pagline, MAXITEMS,
Expand Down Expand Up @@ -2020,6 +2051,28 @@ prisyst(struct sstat *sstat, int curline, int nsecs, int avgval,
}
}

/*
** k8s global memory.stat statistics
*/
if (fixedhead ||
sstat->k8smem.file ||
sstat->k8smem.anon ||
sstat->k8smem.shmem ||
sstat->k8smem.filemapped ||
sstat->k8smem.inactiveanon ||
sstat->k8smem.activeanon ||
sstat->k8smem.inactivefile ||
sstat->k8smem.activefile ||
sstat->k8smem.usagefile ||
sstat->k8smem.workingset )
{
if (screen)
move(curline, 0);

showsysline(k8smemline, sstat, &extra, "K8S", 0);
curline++;
}

/*
** PAGING statistics
*/
Expand Down Expand Up @@ -2998,6 +3051,13 @@ do_ownllcline(char *name, char *val)
NULL, NULL);
}

/*
** fill k8smemline (the layout of the K8S system line) from the
** specification string 'val', choosing items from k8smemsyspdefs
**
** 'name' is handed to make_sys_prints() unchanged; the builtin layout
** passes a descriptive label in that position, so it presumably
** identifies the origin of the specification -- TODO confirm
** the last two arguments are passed as NULL here, whereas the
** builtin layout passes sstat and &extra in those positions
*/
void
do_ownk8smemline(char *name, char *val)
{
make_sys_prints(k8smemline, MAXITEMS, val, k8smemsyspdefs, name,
NULL, NULL);
}

void
do_owndskline(char *name, char *val)
{
Expand Down
11 changes: 11 additions & 0 deletions showlinux.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ void do_ownpagline(char *, char *);
void do_ownmemnumaline(char *, char *);
void do_owncpunumaline(char *, char *);
void do_ownllcline(char *, char *);
void do_ownk8smemline(char *, char *);
void do_owndskline(char *, char *);
void do_ownnettransportline(char *, char *);
void do_ownnetnetline(char *, char *);
Expand Down Expand Up @@ -267,6 +268,16 @@ extern sys_printdef syspdef_NUMACPUGUEST;
extern sys_printdef syspdef_LLCMBMTOTAL;
extern sys_printdef syspdef_LLCMBMLOCAL;
extern sys_printdef syspdef_NUMLLC;
/* print definitions for the items of the K8S (k8s global memcg) line */
extern sys_printdef syspdef_K8SFILE;
extern sys_printdef syspdef_K8SANON;
extern sys_printdef syspdef_K8SSHMEM;
extern sys_printdef syspdef_K8SFILEMAPPED;
extern sys_printdef syspdef_K8SACTIVEANON;
extern sys_printdef syspdef_K8SINACTIVEANON;
extern sys_printdef syspdef_K8SACTIVEFILE;
extern sys_printdef syspdef_K8SINACTIVEFILE;
extern sys_printdef syspdef_K8SUSAGEFILE;
extern sys_printdef syspdef_K8SWORKINGSET;
extern sys_printdef syspdef_PAGSCAN;
extern sys_printdef syspdef_PAGSTEAL;
extern sys_printdef syspdef_PAGSTALL;
Expand Down

0 comments on commit 5aca82a

Please sign in to comment.