-
Notifications
You must be signed in to change notification settings - Fork 152
/
zalloc.c
10477 lines (9123 loc) · 278 KB
/
zalloc.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* @OSF_COPYRIGHT@
*/
/*
* Mach Operating System
* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
* All Rights Reserved.
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie Mellon
* the rights to redistribute these changes.
*/
/*
*/
/*
* File: kern/zalloc.c
* Author: Avadis Tevanian, Jr.
*
* Zone-based memory allocator. A zone is a collection of fixed size
* data blocks for which quick allocation/deallocation is possible.
*/
#define ZALLOC_ALLOW_DEPRECATED 1
#if !ZALLOC_TEST
#include <mach/mach_types.h>
#include <mach/vm_param.h>
#include <mach/kern_return.h>
#include <mach/mach_host_server.h>
#include <mach/task_server.h>
#include <mach/machine/vm_types.h>
#include <machine/machine_routines.h>
#include <mach/vm_map.h>
#include <mach/sdt.h>
#if __x86_64__
#include <i386/cpuid.h>
#endif
#include <kern/bits.h>
#include <kern/btlog.h>
#include <kern/startup.h>
#include <kern/kern_types.h>
#include <kern/assert.h>
#include <kern/backtrace.h>
#include <kern/host.h>
#include <kern/macro_help.h>
#include <kern/sched.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>
#include <kern/thread_call.h>
#include <kern/zalloc_internal.h>
#include <kern/kalloc.h>
#include <kern/debug.h>
#include <prng/random.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
#include <pexpert/pexpert.h>
#include <machine/machparam.h>
#include <machine/machine_routines.h> /* ml_cpu_get_info */
#include <os/atomic.h>
#include <libkern/OSDebug.h>
#include <libkern/OSAtomic.h>
#include <libkern/section_keywords.h>
#include <sys/kdebug.h>
#include <san/kasan.h>
#include <libsa/stdlib.h>
#include <sys/errno.h>
#include <IOKit/IOBSD.h>
#if DEBUG
#define z_debug_assert(expr) assert(expr)
#else
#define z_debug_assert(expr) (void)(expr)
#endif
/* Returns pid of the task with the largest number of VM map entries. */
extern pid_t find_largest_process_vm_map_entries(void);
/*
* Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
* For any other pid we try to kill that process synchronously.
*/
extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
extern zone_t vm_map_entry_zone;
extern zone_t vm_object_zone;
extern zone_t ipc_service_port_label_zone;
ZONE_DEFINE_TYPE(percpu_u64_zone, "percpu.64", uint64_t,
ZC_PERCPU | ZC_ALIGNMENT_REQUIRED | ZC_KASAN_NOREDZONE);
#if CONFIG_KERNEL_TBI && KASAN_TBI
#define ZONE_MIN_ELEM_SIZE (sizeof(uint64_t) * 2)
#define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE
#else /* CONFIG_KERNEL_TBI && KASAN_TBI */
#define ZONE_MIN_ELEM_SIZE sizeof(uint64_t)
#define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE
#endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
#define ZONE_MAX_ALLOC_SIZE (32 * 1024)
#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
#define ZONE_CHUNK_ALLOC_SIZE (256 * 1024)
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
/*
 * Values stored in zone_page_metadata::zm_chunk_len (a 4-bit field).
 * Regular chunks use 0..ZM_CHUNK_LEN_MAX; the remaining encodings mark
 * PGZ slots and secondary pages of multi-page chunks.
 */
__enum_closed_decl(zm_len_t, uint16_t, {
	ZM_CHUNK_FREE = 0x0,
	/* 1 through 8 are valid lengths */
	ZM_CHUNK_LEN_MAX = 0x8,
	/* PGZ magical values */
	ZM_PGZ_FREE = 0x0,
	ZM_PGZ_ALLOCATED = 0xa, /* [a]llocated */
	ZM_PGZ_GUARD = 0xb, /* oo[b] */
	ZM_PGZ_DOUBLE_FREE = 0xd, /* [d]ouble_free */
	/* secondary page markers */
	ZM_SECONDARY_PAGE = 0xe,
	ZM_SECONDARY_PCPU_PAGE = 0xf,
});
/*
 * Per-page metadata record for pages managed by the zone allocator.
 *
 * Packed to exactly 16 bytes (enforced by the static_assert below) so the
 * metadata array stays compact.
 */
struct zone_page_metadata {
	union {
		struct {
			/* The index of the zone this metadata page belongs to */
			zone_id_t zm_index : 11;
			/*
			 * Whether `zm_bitmap` is an inline bitmap
			 * or a packed bitmap reference
			 */
			uint16_t zm_inline_bitmap : 1;
			/*
			 * Zones allocate in "chunks" of zone_t::z_chunk_pages
			 * consecutive pages, or zpercpu_count() pages if the
			 * zone is percpu.
			 *
			 * The first page of it has its metadata set with:
			 * - 0 if none of the pages are currently wired
			 * - the number of wired pages in the chunk
			 *   (not scaled for percpu).
			 *
			 * Other pages in the chunk have their zm_chunk_len set
			 * to ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE
			 * depending on whether the zone is percpu or not.
			 * For those, zm_page_index holds the index of that page
			 * in the run.
			 *
			 * Metadata used for PGZ pages can have 3 values:
			 * - ZM_PGZ_FREE:        slot is free
			 * - ZM_PGZ_ALLOCATED:   slot holds an allocated element
			 *                       at offset (zm_pgz_orig_addr & PAGE_MASK)
			 * - ZM_PGZ_DOUBLE_FREE: slot detected a double free
			 *                       (will panic).
			 */
			zm_len_t zm_chunk_len : 4;
		};
		/* the three bitfields above, accessed as one 16-bit word */
		uint16_t zm_bits;
	};
	union {
#define ZM_ALLOC_SIZE_LOCK 1u
		uint16_t zm_alloc_size; /* first page only */
		uint16_t zm_page_index; /* secondary pages only */
		uint16_t zm_oob_offs; /* in guard pages */
	};
	union {
		uint32_t zm_bitmap; /* most zones */
		uint32_t zm_bump; /* permanent zones */
	};
	union {
		struct {
			/* linkage in the zone's page queues */
			zone_pva_t zm_page_next;
			zone_pva_t zm_page_prev;
		};
		vm_offset_t zm_pgz_orig_addr;
		struct zone_page_metadata *zm_pgz_slot_next;
	};
};
static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
/*
 * The kinds of address ranges the zone allocator manages.
 */
__enum_closed_decl(zone_addr_kind_t, uint32_t, {
	ZONE_ADDR_FOREIGN,
	ZONE_ADDR_NATIVE,
	ZONE_ADDR_READONLY
});
#define ZONE_ADDR_KIND_COUNT 3
/* Human readable range names, indexed by zone_addr_kind_t */
static const char * const zone_map_range_names[] = {
	[ZONE_ADDR_FOREIGN]  = "Foreign",
	[ZONE_ADDR_NATIVE]   = "Native",
	[ZONE_ADDR_READONLY] = "Readonly",
};
/*!
* @typedef zone_element_t
*
* @brief
* Type that represents a "resolved" zone element.
*
* @description
* This type encodes an element pointer as a pair of:
* { chunk base, element index }.
*
* The chunk base is extracted with @c trunc_page()
* as it is always page aligned, and occupies the bits above @c PAGE_SHIFT.
*
* The other bits encode the element index in the chunk rather than its address.
*/
typedef struct zone_element {
	/* packed { page-aligned chunk base | element index } pair */
	vm_offset_t ze_value;
} zone_element_t;
/*!
* @typedef zone_magazine_t
*
* @brief
* Magazine of cached allocations.
*
* @field zm_cur how many elements this magazine holds (unused while loaded).
* @field zm_link linkage used by magazine depots.
* @field zm_elems an array of @c zc_mag_size() elements.
*/
typedef struct zone_magazine {
	uint16_t zm_cur;                        /* elements held (unused while loaded) */
	STAILQ_ENTRY(zone_magazine) zm_link;    /* linkage used by magazine depots */
	zone_element_t zm_elems[0];             /* zc_mag_size() elements, inline */
} *zone_magazine_t;
/*!
* @typedef zone_cache_t
*
* @brief
* Magazine of cached allocations.
*
* @discussion
* Below is a diagram of the caching system. This design is inspired by the
* paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and
* Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA
* zone allocator (itself derived from this seminal work).
*
* It is divided into 3 layers:
* - the per-cpu layer,
* - the recirculation depot layer,
* - the Zone Allocator.
*
* The per-cpu and recirculation depot layer use magazines (@c zone_magazine_t),
* which are stacks of up to @c zc_mag_size() elements.
*
* <h2>CPU layer</h2>
*
* The CPU layer (@c zone_cache_t) looks like this:
*
* ╭─ a ─ f ─┬───────── zm_depot ──────────╮
* │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮ │
* │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│ │
* │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│ │
* │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│ │
* │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯ │
* ╰─────────┴─────────────────────────────╯
*
* It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from,
* or free to. Serialization is achieved through disabling preemption, and only
 * the current CPU can access those allocations. This is represented on the left
* hand side of the diagram above.
*
* The right hand side is the per-cpu depot. It consists of @c zm_depot_count
* full magazines, and is protected by the @c zm_depot_lock for access.
* The lock is expected to absolutely never be contended, as only the local CPU
* tends to access the local per-cpu depot in regular operation mode.
*
* However unlike UMA, our implementation allows for the zone GC to reclaim
 * per-CPU magazines aggressively, which is serialized with the @c zm_depot_lock.
*
*
* <h2>Recirculation Depot</h2>
*
* The recirculation depot layer is a list similar to the per-cpu depot,
* however it is different in two fundamental ways:
*
* - it is protected by the regular zone lock,
* - elements referenced by the magazines in that layer appear free
* to the zone layer.
*
*
* <h2>Magazine circulation and sizing</h2>
*
* The caching system sizes itself dynamically. Operations that allocate/free
* a single element call @c zone_lock_nopreempt_check_contention() which records
* contention on the lock by doing a trylock and recording its success.
*
* This information is stored in the @c z_contention_cur field of the zone,
 * and a windowed moving average is maintained in @c z_contention_wma.
* Each time a CPU registers any contention, it will also allow its own per-cpu
* cache to grow, incrementing @c zc_depot_max, which is how the per-cpu layer
* might grow into using its local depot.
*
 * Note that @c zc_depot_max assumes that the (a) and (f) pre-loaded magazines
* on average contain @c zc_mag_size() elements.
*
* When a per-cpu layer cannot hold more full magazines in its depot,
* then it will overflow about 1/3 of its depot into the recirculation depot
 * (see @c zfree_cached_slow()). Conversely, when a depot is empty, then it will
* refill its per-cpu depot to about 1/3 of its size from the recirculation
* depot (see @c zalloc_cached_slow()).
*
* Lastly, the zone layer keeps track of the high and low watermark of how many
* elements have been free per period of time (including being part of the
* recirculation depot) in the @c z_elems_free_min and @c z_elems_free_max
* fields. A weighted moving average of the amplitude of this is maintained in
* the @c z_elems_free_wss which informs the zone GC on how to gently trim
* zones without hurting performance.
*
*
* <h2>Security considerations</h2>
*
* The zone caching layer has been designed to avoid returning elements in
* a strict LIFO behavior: @c zalloc() will allocate from the (a) magazine,
* and @c zfree() free to the (f) magazine, and only swap them when the
* requested operation cannot be fulfilled.
*
* The per-cpu overflow depot or the recirculation depots are similarly used
* in FIFO order.
*
* More importantly, when magazines flow through the recirculation depot,
* the elements they contain are marked as "free" in the zone layer bitmaps.
* Because allocations out of per-cpu caches verify the bitmaps at allocation
* time, this acts as a poor man's double-free quarantine. The magazines
* allow to avoid the cost of the bit-scanning involved in the zone-level
* @c zalloc_item() codepath.
*
*
* @field zc_alloc_cur denormalized number of elements in the (a) magazine
* @field zc_free_cur denormalized number of elements in the (f) magazine
* @field zc_alloc_elems a pointer to the array of elements in (a)
* @field zc_free_elems a pointer to the array of elements in (f)
*
* @field zc_depot_lock a lock to access @c zc_depot, @c zc_depot_cur.
* @field zc_depot a list of @c zc_depot_cur full magazines
* @field zc_depot_cur number of magazines in @c zc_depot
* @field zc_depot_max the maximum number of elements in @c zc_depot,
* protected by the zone lock.
*/
typedef struct zone_cache {
	uint16_t zc_alloc_cur;          /* denormalized count in the (a) magazine */
	uint16_t zc_free_cur;           /* denormalized count in the (f) magazine */
	uint16_t zc_depot_cur;          /* number of magazines in zc_depot */
	uint16_t __zc_padding;
	zone_element_t *zc_alloc_elems; /* array of elements in (a) */
	zone_element_t *zc_free_elems;  /* array of elements in (f) */
	hw_lock_bit_t zc_depot_lock;    /* protects zc_depot, zc_depot_cur */
	uint32_t zc_depot_max;          /* max elements in zc_depot (zone lock) */
	struct zone_depot zc_depot;     /* list of zc_depot_cur full magazines */
} *zone_cache_t;
#if !__x86_64__
static
#endif
__security_const_late struct {
	struct zone_map_range zi_map_range[ZONE_ADDR_KIND_COUNT];
	struct zone_map_range zi_meta_range; /* debugging only */
	struct zone_map_range zi_bits_range; /* bits buddy allocator */
	struct zone_map_range zi_pgz_range;
	struct zone_page_metadata *zi_pgz_meta;
	/*
	 * The metadata lives within the zi_meta_range address range.
	 *
	 * The correct formula to find a metadata index is:
	 *     absolute_page_index - page_index(MIN(zi_map_range[*].min_address))
	 *
	 * And then this index is used to dereference zi_meta_range.min_address
	 * as a `struct zone_page_metadata` array.
	 *
	 * To avoid doing that subtraction all the time in the various fast-paths,
	 * zi_meta_base are pre-offset with that minimum page index to avoid redoing
	 * that math all the time.
	 *
	 * Do note that the array might have a hole punched in the middle,
	 * see zone_metadata_init().
	 */
	struct zone_page_metadata *zi_meta_base;
} zone_info;
/*
* Initial array of metadata for stolen memory.
*
* The numbers here have to be kept in sync with vm_map_steal_memory()
* so that we have reserved enough metadata.
*
* After zone_init() has run (which happens while the kernel is still single
* threaded), the metadata is moved to its final dynamic location, and
* this array is unmapped with the rest of __startup_data at lockdown.
*/
#define ZONE_FOREIGN_META_INLINE_COUNT 64
__startup_data
static struct zone_page_metadata
zone_foreign_meta_array_startup[ZONE_FOREIGN_META_INLINE_COUNT];
__startup_data
static struct zone_map_range zone_early_steal;
/*
* The zone_locks_grp allows for collecting lock statistics.
* All locks are associated to this group in zinit.
* Look at tools/lockstat for debugging lock contention.
*/
static LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
static LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
/*
* The zone metadata lock protects:
* - metadata faulting,
* - VM submap VA allocations,
* - early gap page queue list
*/
#define zone_meta_lock() lck_mtx_lock(&zone_metadata_region_lck);
#define zone_meta_unlock() lck_mtx_unlock(&zone_metadata_region_lck);
/*
* Exclude more than one concurrent garbage collection
*/
static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
static LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
static LCK_SPIN_DECLARE(zone_exhausted_lock, &zone_gc_lck_grp);
/*
* Panic logging metadata
*/
bool panic_include_zprint = false;
bool panic_include_kalloc_types = false;
zone_t kalloc_type_src_zone = ZONE_NULL;
zone_t kalloc_type_dst_zone = ZONE_NULL;
mach_memory_info_t *panic_kext_memory_info = NULL;
vm_size_t panic_kext_memory_size = 0;
vm_offset_t panic_fault_address = 0;
/*
* Protects zone_array, num_zones, num_zones_in_use, and
* zone_destroyed_bitmap
*/
static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
static zone_id_t num_zones_in_use;
zone_id_t _Atomic num_zones;
SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
/*
* Initial globals for zone stats until we can allocate the real ones.
* Those get migrated inside the per-CPU ones during zone_init() and
* this array is unmapped with the rest of __startup_data at lockdown.
*/
/* zone to allocate zone_magazine structs from */
static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone;
/*
* Until pid1 is made, zone caching is off,
 * until compute_zone_working_set_size() runs for the first time.
*
* -1 represents the "never enabled yet" value.
*/
static int8_t zone_caching_disabled = -1;
__startup_data
static struct zone_cache zone_cache_startup[MAX_ZONES];
__startup_data
static struct zone_stats zone_stats_startup[MAX_ZONES];
struct zone zone_array[MAX_ZONES];
SECURITY_READ_ONLY_LATE(zone_security_flags_t) zone_security_array[MAX_ZONES] = {
[0 ... MAX_ZONES - 1] = {
.z_allows_foreign = false,
.z_kheap_id = KHEAP_ID_NONE,
.z_noencrypt = false,
.z_submap_idx = Z_SUBMAP_IDX_GENERAL_0,
.z_kalloc_type = false,
.z_va_sequester = ZSECURITY_CONFIG(SEQUESTER),
},
};
SECURITY_READ_ONLY_LATE(uint16_t) zone_ro_elem_size[MAX_ZONES];
/* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
/* Used to keep track of destroyed slots in the zone_array */
static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
/* number of zone mapped pages used by all zones */
static size_t _Atomic zone_pages_wired;
static size_t _Atomic zone_pages_jetsam_threshold = ~0;
#if CONFIG_PROB_GZALLOC
static int32_t _Atomic zone_guard_pages;
#endif /* CONFIG_PROB_GZALLOC */
#define ZSECURITY_DEFAULT ( \
ZSECURITY_OPTIONS_KERNEL_DATA_MAP | \
0)
TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT);
/* Time in (ms) after which we panic for zone exhaustions */
TUNABLE(int, zone_exhausted_timeout, "zet", 5000);
#if VM_TAG_SIZECLASSES
/* enable tags for zones that ask for it */
static TUNABLE(bool, zone_tagging_on, "-zt", false);
#endif /* VM_TAG_SIZECLASSES */
#if DEBUG || DEVELOPMENT
static int zalloc_simulate_vm_pressure;
TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
#endif /* DEBUG || DEVELOPMENT */
/*
* Zone caching tunables
*
* zc_mag_size():
* size of magazines, larger to reduce contention at the expense of memory
*
* zc_auto_enable_threshold
* number of contentions per second after which zone caching engages
* automatically.
*
* 0 to disable.
*
* zc_grow_threshold
 * number of contentions per second after which the per-cpu depot layer
* grows at each newly observed contention without restriction.
*
* 0 to disable.
*
* zc_recirc_denom
* denominator of the fraction of per-cpu depot to migrate to/from
* the recirculation depot layer at a time. Default 3 (1/3).
*
* zc_defrag_ratio
* percentage of the working set to recirc size below which
* the zone is defragmented. Default is 66%.
*
* zc_defrag_threshold
* how much memory needs to be free before the auto-defrag is even considered.
* Default is 512k.
*
* zc_autogc_ratio
* percentage of the working set to min-free size below which
* the zone is auto-GCed to the working set size. Default is 20%.
*
* zc_autogc_threshold
* how much memory needs to be free before the auto-gc is even considered.
* Default is 4M.
*
* zc_free_batch_size
* The size of batches of frees/reclaim that can be done keeping
* the zone lock held (and preemption disabled).
*/
static TUNABLE(uint16_t, zc_magazine_size, "zc_mag_size", 8);
static TUNABLE(uint32_t, zc_auto_threshold, "zc_auto_enable_threshold", 20);
static TUNABLE(uint32_t, zc_grow_threshold, "zc_grow_threshold", 8);
static TUNABLE(uint32_t, zc_recirc_denom, "zc_recirc_denom", 3);
static TUNABLE(uint32_t, zc_defrag_ratio, "zc_defrag_ratio", 66);
static TUNABLE(uint32_t, zc_defrag_threshold, "zc_defrag_threshold", 512u << 10);
static TUNABLE(uint32_t, zc_autogc_ratio, "zc_autogc_ratio", 20);
static TUNABLE(uint32_t, zc_autogc_threshold, "zc_autogc_threshold", 4u << 20);
static TUNABLE(uint32_t, zc_free_batch_size, "zc_free_batch_size", 256);
static SECURITY_READ_ONLY_LATE(size_t) zone_pages_wired_max;
static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
static SECURITY_READ_ONLY_LATE(vm_map_t) zone_meta_submaps[2];
static char const * const zone_submaps_names[Z_SUBMAP_IDX_COUNT] = {
[Z_SUBMAP_IDX_VM] = "VM",
[Z_SUBMAP_IDX_READ_ONLY] = "RO",
#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
[Z_SUBMAP_IDX_GENERAL_0] = "GEN0",
[Z_SUBMAP_IDX_GENERAL_1] = "GEN1",
[Z_SUBMAP_IDX_GENERAL_2] = "GEN2",
[Z_SUBMAP_IDX_GENERAL_3] = "GEN3",
#else
[Z_SUBMAP_IDX_GENERAL_0] = "GEN",
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
[Z_SUBMAP_IDX_DATA] = "DATA",
};
#if __x86_64__
#define ZONE_ENTROPY_CNT 8
#else
#define ZONE_ENTROPY_CNT 2
#endif
static struct zone_bool_gen {
struct bool_gen zbg_bg;
uint32_t zbg_entropy[ZONE_ENTROPY_CNT];
} zone_bool_gen[MAX_CPUS];
#if CONFIG_PROB_GZALLOC
/*
* Probabilistic gzalloc
* =====================
*
*
* Probabilistic guard zalloc samples allocations and will protect them by
* double-mapping the page holding them and returning the secondary virtual
* address to its callers.
*
* Its data structures are lazily allocated if the `pgz` or `pgz1` boot-args
* are set.
*
*
* Unlike GZalloc, PGZ uses a fixed amount of memory, and is compatible with
* most zalloc/kalloc features:
* - zone_require is functional
* - zone caching or zone tagging is compatible
* - non-blocking allocation work (they will always return NULL with gzalloc).
*
* PGZ limitations:
* - VA sequestering isn't respected, as the slots (which are in limited
* quantity) will be reused for any type, however the PGZ quarantine
* somewhat mitigates the impact.
* - zones with elements larger than a page cannot be protected.
*
*
* Tunables:
* --------
*
* pgz=1:
* Turn on probabilistic guard malloc for all zones
*
* (default on for DEVELOPMENT, off for RELEASE, or if pgz1... are specified)
*
* pgz_sample_rate=0 to 2^31
* average sample rate between two guarded allocations.
* 0 means every allocation.
*
* The default is a random number between 1000 and 10,000
*
* pgz_slots
* how many allocations to protect.
*
* Each costs:
* - a PTE in the pmap (when allocated)
* - 2 zone page meta's (every other page is a "guard" one, 32B total)
* - 64 bytes per backtraces.
* On LP64 this is <16K per 100 slots.
*
* The default is ~200 slots per G of physical ram (32k / G)
*
* TODO:
* - try harder to allocate elements at the "end" to catch OOB more reliably.
*
* pgz_quarantine
* how many slots should be free at any given time.
*
* PGZ will round robin through free slots to be reused, but free slots are
* important to detect use-after-free by acting as a quarantine.
*
* By default, PGZ will keep 33% of the slots around at all time.
*
* pgz1=<name>, pgz2=<name>, ..., pgzn=<name>...
* Specific zones for which to enable probabilistic guard malloc.
* There must be no numbering gap (names after the gap will be ignored).
*/
#if DEBUG || DEVELOPMENT
static TUNABLE(bool, pgz_all, "pgz", true);
#else
static TUNABLE(bool, pgz_all, "pgz", false);
#endif
static TUNABLE(uint32_t, pgz_sample_rate, "pgz_sample_rate", 0);
static TUNABLE(uint32_t, pgz_slots, "pgz_slots", UINT32_MAX);
static TUNABLE(uint32_t, pgz_quarantine, "pgz_quarantine", 0);
#endif /* CONFIG_PROB_GZALLOC */
static zone_t zone_find_largest(uint64_t *zone_size);
#endif /* !ZALLOC_TEST */
#pragma mark Zone metadata
#if !ZALLOC_TEST
/*
 * Returns whether @c z is exactly the @c zid-th entry of zone_array.
 */
static inline bool
zone_has_index(zone_t z, zone_id_t zid)
{
	return z == &zone_array[zid];
}
/*
 * Packs a page-aligned chunk base and an element index into
 * a zone_element_t (the index lives in the low, sub-page bits).
 */
static zone_element_t
zone_element_encode(vm_offset_t base, vm_offset_t eidx)
{
	zone_element_t ze;

	ze.ze_value = base | eidx;
	return ze;
}
/*
 * Recovers the page-aligned chunk base from a packed zone element
 * by truncating away the element-index bits.
 */
static vm_offset_t
zone_element_base(zone_element_t ze)
{
	vm_offset_t packed = ze.ze_value;

	return trunc_page(packed);
}
/*
 * Recovers the element index from a packed zone element
 * (the sub-page bits of the packed value).
 */
static vm_offset_t
zone_element_idx(zone_element_t ze)
{
	vm_offset_t packed = ze.ze_value;

	return packed & PAGE_MASK;
}
/*
 * Converts a packed zone element back into the element's address:
 * chunk base + index scaled by the element size @c esize, shifted by
 * the zone's out-of-band offset.
 */
static vm_offset_t
zone_element_addr(zone_t z, zone_element_t ze, vm_offset_t esize)
{
	vm_offset_t base = zone_element_base(ze);
	vm_offset_t eidx = zone_element_idx(ze);

	return zone_oob_offs(z) + base + esize * eidx;
}
/*
 * Panic because @c zone is not a pointer into the global zone_array.
 */
__abortlike
void
zone_invalid_panic(zone_t zone)
{
	panic("zone %p isn't in the zone_array", zone);
}
/*
 * Panic on a corrupted zone page metadata record; @c kind names
 * the specific corruption that was detected.
 */
__abortlike
static void
zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
    const char *kind)
{
	panic("zone metadata corruption: %s (meta %p, zone %s%s)",
	    kind, meta, zone_heap_name(zone), zone->z_name);
}
/*
 * Panic because @c addr failed zone element pointer validation for @c zone.
 */
__abortlike
static void
zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
{
	panic("zone element pointer validation failed (addr: %p, zone %s%s)",
	    (void *)addr, zone_heap_name(zone), zone->z_name);
}
/*
 * Panic because @c addr was expected to belong to @c zone but its page
 * metadata records a different zone index.
 *
 * Before panicking, this stashes the kalloc_type source/destination zones
 * (when either side is a kalloc_type zone) into the panic-log globals, and
 * first validates that the recorded zone index is in range at all.
 */
__abortlike
static void
zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
    struct zone_page_metadata *meta)
{
	zone_security_flags_t zsflags = zone_security_config(zone), src_zsflags;
	zone_id_t zidx;
	zone_t src_zone;

	/* flag the expected (destination) zone for the panic log */
	if (zsflags.z_kalloc_type) {
		panic_include_kalloc_types = true;
		kalloc_type_dst_zone = zone;
	}

	/* the metadata's index may itself be garbage: check before using it */
	zidx = meta->zm_index;
	if (zidx >= os_atomic_load(&num_zones, relaxed)) {
		panic("%p expected in zone %s%s[%d], but metadata has invalid zidx: %d",
		    (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
		    zidx);
	}

	src_zone = &zone_array[zidx];
	src_zsflags = zone_security_array[zidx];
	/* flag the zone the element actually came from, too */
	if (src_zsflags.z_kalloc_type) {
		panic_include_kalloc_types = true;
		kalloc_type_src_zone = src_zone;
	}
	panic("%p not in the expected zone %s%s[%d], but found in %s%s[%d]",
	    (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
	    zone_heap_name(src_zone), src_zone->z_name, zidx);
}
/*
 * Panic because a foreign metadata index was found enqueued on one of
 * the zone's native page queue heads.
 */
__abortlike
static void
zone_page_metadata_native_queue_corruption(zone_t zone, zone_pva_t *queue)
{
	panic("foreign metadata index %d enqueued in native head %p from zone %s%s",
	    queue->packed_address, queue, zone_heap_name(zone),
	    zone->z_name);
}
/*
 * Panic because walking a metadata page list encountered an
 * inconsistent entry @c meta.
 */
__abortlike
static void
zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
{
	panic("metadata list corruption through element %p detected in zone %s%s",
	    meta, zone_heap_name(zone), zone->z_name);
}
/*
 * Panic because @c addr, freed to a foreign-capable zone, does not
 * come from the foreign address range.
 */
__abortlike __unused
static void
zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr)
{
	panic("addr %p being freed to foreign zone %s%s not from foreign range",
	    (void *)addr, zone_heap_name(zone), zone->z_name);
}
/*
 * Panic on a per-page accounting mismatch for @c meta;
 * @c kind names the counter that went wrong.
 */
__abortlike
static void
zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
    const char *kind)
{
	panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
	    zone_heap_name(zone), zone->z_name, meta);
}
/*
 * Panic because element @c ze was freed twice to @c zone;
 * @c caller identifies the code path that detected the double free.
 */
__abortlike
static void
zone_meta_double_free_panic(zone_t zone, zone_element_t ze, const char *caller)
{
	panic("%s: double free of %p to zone %s%s", caller,
	    (void *)zone_element_addr(zone, ze, zone_elem_size(zone)),
	    zone_heap_name(zone), zone->z_name);
}
/*
 * Panic on a zone-level accounting mismatch;
 * @c kind names the counter that went wrong.
 */
__abortlike
static void
zone_accounting_panic(zone_t zone, const char *kind)
{
	panic("accounting mismatch (%s) for zone %s%s", kind,
	    zone_heap_name(zone), zone->z_name);
}
/*
 * Subtracts @c value from zone counter @c stat, panicking if the
 * counter would wrap below zero; evaluates to the updated value.
 */
#define zone_counter_sub(z, stat, value)  ({ \
	if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \
	        zone_accounting_panic(z, #stat " wrap-around"); \
	} \
	(z)->stat; \
})
/*
 * Adds @c count to the zone's free-element counter and keeps
 * the period's high watermark (z_elems_free_max) up to date.
 */
static inline void
zone_elems_free_add(zone_t z, uint32_t count)
{
	z->z_elems_free += count;
	if (z->z_elems_free > z->z_elems_free_max) {
		z->z_elems_free_max = z->z_elems_free;
	}
}
/*
 * Subtracts @c count from the zone's free-element counter (panicking on
 * wrap-around) and keeps the period's low watermark (z_elems_free_min)
 * up to date.
 */
static inline void
zone_elems_free_sub(zone_t z, uint32_t count)
{
	uint32_t remaining = zone_counter_sub(z, z_elems_free, count);

	if (remaining < z->z_elems_free_min) {
		z->z_elems_free_min = remaining;
	}
}
/*
 * Adds @c esize to the chunk's first-page allocated-size counter,
 * panicking if the 16-bit field overflows; returns the new value.
 */
static inline uint16_t
zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m,
    vm_offset_t esize)
{
	if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) {
		zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
	}
	return m->zm_alloc_size;
}
/*
 * Subtracts @c esize from the chunk's first-page allocated-size counter,
 * panicking if the counter would wrap below zero; returns the new value.
 */
static inline uint16_t
zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m,
    vm_offset_t esize)
{
	if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) {
		zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
	}
	return m->zm_alloc_size;
}
/*
 * Panic because a zalloc(Z_NOFAIL) request can never be satisfied
 * for this zone (the zone is likely leaking).
 */
__abortlike
static void
zone_nofail_panic(zone_t zone)
{
	panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
	    zone_heap_name(zone), zone->z_name);
}
/*
 * Loads r->min_address and r->max_address into rmin/rmax.
 * On arm64 this is forced into a single ldp via inline asm.
 */
#if __arm64__
// <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
#define zone_range_load(r, rmin, rmax) \
	asm("ldp %[rmin], %[rmax], [%[range]]" \
	    : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \
	    : [range] "r"(r), "m"((r)->min_address), "m"((r)->max_address))
#else
#define zone_range_load(r, rmin, rmax) \
	({ rmin = (r)->min_address; rmax = (r)->max_address; })
#endif
/*
 * Returns whether @c addr falls within the half-open range
 * [r->min_address, r->max_address).
 */
__attribute__((overloadable))
__header_always_inline bool
zone_range_contains(const struct zone_map_range *r, vm_offset_t addr)
{
	vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TBI
	addr = VM_KERNEL_TBI_FILL(addr);
#endif /* CONFIG_KERNEL_TBI */

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	zone_range_load(r, rmin, rmax);
	return (addr >= rmin) & (addr < rmax);
}
/*
 * Returns whether the span [addr, addr + size) lies entirely within the
 * range; the middle term also rejects the case where addr + size wraps
 * around zero.
 */
__attribute__((overloadable))
__header_always_inline bool
zone_range_contains(const struct zone_map_range *r, vm_offset_t addr, vm_offset_t size)
{
	vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TBI
	addr = VM_KERNEL_TBI_FILL(addr);
#endif /* CONFIG_KERNEL_TBI */

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	zone_range_load(r, rmin, rmax);
	return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
}
/*
 * Returns whether [addr_start, addr_end] overlaps the read-only
 * zone map range.
 */
__header_always_inline bool
zone_spans_ro_va(vm_offset_t addr_start, vm_offset_t addr_end)
{
	vm_offset_t ro_min, ro_max;

#if CONFIG_KERNEL_TBI
	addr_start = VM_KERNEL_STRIP_UPTR(addr_start);
	addr_end = VM_KERNEL_STRIP_UPTR(addr_end);
#endif /* CONFIG_KERNEL_TBI */

	zone_range_load(&zone_info.zi_map_range[ZONE_ADDR_READONLY], ro_min, ro_max);

	/*
	 * The pair avoids the read-only range only when both endpoints sit
	 * strictly on the same side of it; any other combination spans it.
	 */
	bool fully_left  = (addr_start < ro_min) && (addr_end < ro_min);
	bool fully_right = (addr_start > ro_max) && (addr_end > ro_max);

	return !(fully_left || fully_right);
}
__header_always_inline vm_size_t
zone_range_size(const struct zone_map_range *r)
{