/*
* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* @OSF_COPYRIGHT@
*/
/*
* Mach Operating System
* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
* All Rights Reserved.
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie Mellon
* the rights to redistribute these changes.
*/
/*
*/
/*
* File: kern/zalloc.c
* Author: Avadis Tevanian, Jr.
*
* Zone-based memory allocator. A zone is a collection of fixed size
* data blocks for which quick allocation/deallocation is possible.
*/
#define ZALLOC_ALLOW_DEPRECATED 1
#if !ZALLOC_TEST
#include <mach/mach_types.h>
#include <mach/vm_param.h>
#include <mach/kern_return.h>
#include <mach/mach_host_server.h>
#include <mach/task_server.h>
#include <mach/machine/vm_types.h>
#include <machine/machine_routines.h>
#include <mach/vm_map.h>
#include <mach/sdt.h>
#if __x86_64__
#include <i386/cpuid.h>
#endif
#include <kern/bits.h>
#include <kern/btlog.h>
#include <kern/startup.h>
#include <kern/kern_types.h>
#include <kern/assert.h>
#include <kern/backtrace.h>
#include <kern/host.h>
#include <kern/macro_help.h>
#include <kern/sched.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>
#include <kern/thread_call.h>
#include <kern/zalloc_internal.h>
#include <kern/kalloc.h>
#include <kern/debug.h>
#include <prng/random.h>
#include <vm/pmap.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_memtag.h>
#include <vm/vm_kern_internal.h>
#include <vm/vm_page_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_compressor_xnu.h> /* C_SLOT_PACKED_PTR* */
#include <pexpert/pexpert.h>
#include <machine/machparam.h>
#include <machine/machine_routines.h> /* ml_cpu_get_info */
#include <os/atomic.h>
#include <libkern/OSDebug.h>
#include <libkern/OSAtomic.h>
#include <libkern/section_keywords.h>
#include <sys/kdebug.h>
#include <sys/code_signing.h>
#include <san/kasan.h>
#include <libsa/stdlib.h>
#include <sys/errno.h>
#include <IOKit/IOBSD.h>
#include <arm64/amcc_rorgn.h>
#if DEBUG
#define z_debug_assert(expr) assert(expr)
#else
#define z_debug_assert(expr) (void)(expr)
#endif
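/*
 * Note: unlike a plain assert(), the !DEBUG variant of z_debug_assert()
 * still evaluates `expr` (it only discards the result), so expressions
 * with side effects remain safe in all configurations.
 */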
#if CONFIG_PROB_GZALLOC && CONFIG_SPTM
#error This is not a supported configuration
#endif
/* Returns pid of the task with the largest number of VM map entries. */
extern pid_t find_largest_process_vm_map_entries(void);
/*
* Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
* For any other pid we try to kill that process synchronously.
*/
extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
extern zone_t vm_object_zone;
extern zone_t ipc_service_port_label_zone;
ZONE_DEFINE_TYPE(percpu_u64_zone, "percpu.64", uint64_t,
ZC_PERCPU | ZC_ALIGNMENT_REQUIRED | ZC_KASAN_NOREDZONE);
#if ZSECURITY_CONFIG(ZONE_TAGGING)
#define ZONE_MIN_ELEM_SIZE (sizeof(uint64_t) * 2)
#define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE
#else /* ZSECURITY_CONFIG_ZONE_TAGGING */
#define ZONE_MIN_ELEM_SIZE sizeof(uint64_t)
#define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE
#endif /* ZSECURITY_CONFIG_ZONE_TAGGING */
#define ZONE_MAX_ALLOC_SIZE (32 * 1024)
#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
#define ZONE_CHUNK_ALLOC_SIZE (256 * 1024)
#define ZONE_GUARD_DENSE (32 * 1024)
#define ZONE_GUARD_SPARSE (64 * 1024)
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
#if XNU_PLATFORM_MacOSX
#define ZONE_MAP_MAX (32ULL << 30)
#define ZONE_MAP_VA_SIZE (128ULL << 30)
#else /* XNU_PLATFORM_MacOSX */
#define ZONE_MAP_MAX (8ULL << 30)
#define ZONE_MAP_VA_SIZE (24ULL << 30)
#endif /* !XNU_PLATFORM_MacOSX */
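/*
 * Note (interpretation, not from the original source): the VA budget is
 * deliberately several times larger than the maximum wired size (for
 * example 128GB of VA for at most 32GB wired on macOS), leaving room
 * for guard pages and for keeping freed VA ranges sequestered.
 */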
__enum_closed_decl(zm_len_t, uint16_t, {
ZM_CHUNK_FREE = 0x0,
/* 1 through 8 are valid lengths */
ZM_CHUNK_LEN_MAX = 0x8,
/* PGZ magical values */
ZM_PGZ_FREE = 0x0,
ZM_PGZ_ALLOCATED = 0xa, /* [a]llocated */
ZM_PGZ_GUARD = 0xb, /* oo[b] */
ZM_PGZ_DOUBLE_FREE = 0xd, /* [d]ouble_free */
/* secondary page markers */
ZM_SECONDARY_PAGE = 0xe,
ZM_SECONDARY_PCPU_PAGE = 0xf,
});
static_assert(MAX_ZONES < (1u << 10), "MAX_ZONES must fit in zm_index");
struct zone_page_metadata {
union {
struct {
/* The index of the zone this metadata page belongs to */
zone_id_t zm_index : 10;
/*
* This chunk ends with a guard page.
*/
uint16_t zm_guarded : 1;
/*
* Whether `zm_bitmap` is an inline bitmap
* or a packed bitmap reference
*/
uint16_t zm_inline_bitmap : 1;
/*
* Zones allocate in "chunks" of zone_t::z_chunk_pages
* consecutive pages, or zpercpu_count() pages if the
* zone is percpu.
*
* The first page of it has its metadata set with:
* - 0 if none of the pages are currently wired
* - the number of wired pages in the chunk
* (not scaled for percpu).
*
* Other pages in the chunk have their zm_chunk_len set
* to ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE
* depending on whether the zone is percpu or not.
* For those, zm_page_index holds the index of that page
* in the run, and zm_subchunk_len the remaining length
* within the chunk.
*
* Metadata used for PGZ pages can have 3 values:
* - ZM_PGZ_FREE: slot is free
* - ZM_PGZ_ALLOCATED: slot holds an allocated element
* at offset (zm_pgz_orig_addr & PAGE_MASK)
* - ZM_PGZ_DOUBLE_FREE: slot detected a double free
* (will panic).
*/
zm_len_t zm_chunk_len : 4;
};
uint16_t zm_bits;
};
union {
#define ZM_ALLOC_SIZE_LOCK 1u
uint16_t zm_alloc_size; /* first page only */
struct {
uint8_t zm_page_index; /* secondary pages only */
uint8_t zm_subchunk_len; /* secondary pages only */
};
uint16_t zm_oob_offs; /* in guard pages */
};
union {
uint32_t zm_bitmap; /* most zones */
uint32_t zm_bump; /* permanent zones */
};
union {
struct {
zone_pva_t zm_page_next;
zone_pva_t zm_page_prev;
};
vm_offset_t zm_pgz_orig_addr;
struct zone_page_metadata *zm_pgz_slot_next;
};
};
static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
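/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * source): how the fields above combine. Secondary pages record their
 * position in the run in zm_page_index, so walking back that many
 * metadata entries lands on the chunk's first page, whose zm_chunk_len
 * holds the number of wired pages.
 */
static inline struct zone_page_metadata *
zone_meta_chunk_head_sketch(struct zone_page_metadata *meta)
{
	if (meta->zm_chunk_len == ZM_SECONDARY_PAGE ||
	    meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) {
		/* metadata entries are contiguous, one per page */
		meta -= meta->zm_page_index;
	}
	return meta;
}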
/*!
* @typedef zone_magazine_t
*
* @brief
* Magazine of cached allocations.
*
* @field zm_next linkage used by magazine depots.
* @field zm_elems an array of @c zc_mag_size() elements.
*/
struct zone_magazine {
zone_magazine_t zm_next;
smr_seq_t zm_seq;
vm_offset_t zm_elems[0];
};
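/*
 * Note (sizing shown for illustration): zm_elems is a zero-length
 * array, so a magazine is allocated with room for zc_mag_size()
 * element pointers, i.e. roughly
 * sizeof(struct zone_magazine) + zc_mag_size() * sizeof(vm_offset_t)
 * bytes each.
 */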
/*!
* @typedef zone_cache_t
*
* @brief
* Magazine of cached allocations.
*
* @discussion
* Below is a diagram of the caching system. This design is inspired by the
* paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and
* Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA
* zone allocator (itself derived from this seminal work).
*
* It is divided into 3 layers:
* - the per-cpu layer,
* - the recirculation depot layer,
* - the Zone Allocator.
*
* The per-cpu and recirculation depot layer use magazines (@c zone_magazine_t),
* which are stacks of up to @c zc_mag_size() elements.
*
* <h2>CPU layer</h2>
*
* The CPU layer (@c zone_cache_t) looks like this:
*
* ╭─ a ─ f ─┬───────── zm_depot ──────────╮
* │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮ │
* │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│ │
* │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│ │
* │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│ │
* │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯ │
* ╰─────────┴─────────────────────────────╯
*
* It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from,
* or free to. Serialization is achieved through disabling preemption, and only
* the current CPU can access those allocations. This is represented on the left
* hand side of the diagram above.
*
* The right hand side is the per-cpu depot. It consists of @c zm_depot_count
* full magazines, and is protected by the @c zm_depot_lock for access.
* The lock is expected to absolutely never be contended, as only the local CPU
* tends to access the local per-cpu depot in regular operation mode.
*
* However, unlike UMA, our implementation allows the zone GC to reclaim
* per-CPU magazines aggressively, which is serialized with the @c zm_depot_lock.
*
*
* <h2>Recirculation Depot</h2>
*
* The recirculation depot layer is a list similar to the per-cpu depot,
* however it is different in two fundamental ways:
*
* - it is protected by the regular zone lock,
* - elements referenced by the magazines in that layer appear free
* to the zone layer.
*
*
* <h2>Magazine circulation and sizing</h2>
*
* The caching system sizes itself dynamically. Operations that allocate/free
* a single element call @c zone_lock_nopreempt_check_contention() which records
* contention on the lock by doing a trylock and recording its success.
*
* This information is stored in the @c z_recirc_cont_cur field of the zone,
* and a windowed moving average is maintained in @c z_contention_wma.
* The periodically run function @c compute_zone_working_set_size() will then
* take this into account to decide to grow the number of buckets allowed
* in the depot or shrink it based on the @c zc_grow_level and @c zc_shrink_level
* thresholds.
*
* The per-cpu layer will attempt to work with its depot, finding both full and
* empty magazines cached there. If it can't get what it needs, then it will
* mediate with the zone recirculation layer. Such recirculation is done in
* batches in order to amortize lock holds.
* (See @c {zalloc,zfree}_cached_depot_recirculate()).
*
* The recirculation layer keeps track of the minimum number of magazines
* it held over time for each of the full and empty queues. This allows
* @c compute_zone_working_set_size() to return memory to the system when a zone
* stops being used as much.
*
* <h2>Security considerations</h2>
*
* The zone caching layer has been designed to avoid returning elements in
* a strict LIFO behavior: @c zalloc() will allocate from the (a) magazine,
* and @c zfree() free to the (f) magazine, and only swap them when the
* requested operation cannot be fulfilled.
*
* The per-cpu overflow depot or the recirculation depots are similarly used
* in FIFO order.
*
* @field zc_depot_lock a lock to access @c zc_depot, @c zc_depot_cur.
* @field zc_alloc_cur denormalized number of elements in the (a) magazine
* @field zc_free_cur denormalized number of elements in the (f) magazine
* @field zc_alloc_elems a pointer to the array of elements in (a)
* @field zc_free_elems a pointer to the array of elements in (f)
*
* @field zc_depot a list of @c zc_depot_cur full magazines
*/
typedef struct zone_cache {
hw_lck_ticket_t zc_depot_lock;
uint16_t zc_alloc_cur;
uint16_t zc_free_cur;
vm_offset_t *zc_alloc_elems;
vm_offset_t *zc_free_elems;
struct zone_depot zc_depot;
smr_t zc_smr;
zone_smr_free_cb_t XNU_PTRAUTH_SIGNED_FUNCTION_PTR("zc_free") zc_free;
} __attribute__((aligned(64))) * zone_cache_t;
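/*
 * Illustrative sketch (hypothetical, not part of the original source)
 * of the (a)/(f) discipline described above, ignoring the depot and
 * locking: allocations pop from the (a) magazine, and the magazines
 * are swapped only when (a) runs dry, which avoids strict per-element
 * LIFO reuse.
 */
static inline vm_offset_t
zone_cache_alloc_sketch(zone_cache_t zc)
{
	if (zc->zc_alloc_cur == 0) {
		if (zc->zc_free_cur == 0) {
			return 0; /* fall back to the depot / zone layers */
		}
		/* swap the empty (a) magazine with the (f) magazine */
		vm_offset_t *elems = zc->zc_alloc_elems;
		uint16_t cur = zc->zc_alloc_cur;
		zc->zc_alloc_elems = zc->zc_free_elems;
		zc->zc_alloc_cur = zc->zc_free_cur;
		zc->zc_free_elems = elems;
		zc->zc_free_cur = cur;
	}
	return zc->zc_alloc_elems[--zc->zc_alloc_cur];
}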
#if !__x86_64__
static
#endif
__security_const_late struct {
struct mach_vm_range zi_map_range; /* all zone submaps */
struct mach_vm_range zi_ro_range; /* read-only range */
struct mach_vm_range zi_meta_range; /* debugging only */
struct mach_vm_range zi_bits_range; /* bits buddy allocator */
struct mach_vm_range zi_xtra_range; /* vm tracking metadata */
struct mach_vm_range zi_pgz_range;
struct zone_page_metadata *zi_pgz_meta;
/*
* The metadata lives within the zi_meta_range address range.
*
* The correct formula to find a metadata index is:
* absolute_page_index - page_index(zi_map_range.min_address)
*
* And then this index is used to dereference zi_meta_range.min_address
* as a `struct zone_page_metadata` array.
*
* To avoid redoing that subtraction in the various fast paths,
* zi_meta_base is pre-offset by that minimum page index, so that
* zi_meta_base[atop(addr)] yields the metadata for `addr` directly.
*/
struct zone_page_metadata *zi_meta_base;
} zone_info;
__startup_data static struct mach_vm_range zone_map_range;
__startup_data static vm_map_size_t zone_meta_size;
__startup_data static vm_map_size_t zone_bits_size;
__startup_data static vm_map_size_t zone_xtra_size;
/*
* Initial array of metadata for stolen memory.
*
* The numbers here have to be kept in sync with vm_map_steal_memory()
* so that we have reserved enough metadata.
*
* After zone_init() has run (which happens while the kernel is still single
* threaded), the metadata is moved to its final dynamic location, and
* this array is unmapped with the rest of __startup_data at lockdown.
*/
#define ZONE_EARLY_META_INLINE_COUNT 64
__startup_data
static struct zone_page_metadata
zone_early_meta_array_startup[ZONE_EARLY_META_INLINE_COUNT];
__startup_data __attribute__((aligned(PAGE_MAX_SIZE)))
static uint8_t zone_early_pages_to_cram[PAGE_MAX_SIZE * 16];
/*
* The zone_locks_grp allows for collecting lock statistics.
* All locks are associated with this group in zinit.
* Look at tools/lockstat for debugging lock contention.
*/
LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
static LCK_MTX_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
/*
* The zone metadata lock protects:
* - metadata faulting,
* - VM submap VA allocations,
* - early gap page queue list
*/
#define zone_meta_lock() lck_mtx_lock(&zone_metadata_region_lck);
#define zone_meta_unlock() lck_mtx_unlock(&zone_metadata_region_lck);
/*
* Exclude more than one concurrent garbage collection
*/
static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
static LCK_MTX_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
static LCK_SPIN_DECLARE(zone_exhausted_lock, &zone_gc_lck_grp);
/*
* Panic logging metadata
*/
bool panic_include_zprint = false;
bool panic_include_kalloc_types = false;
zone_t kalloc_type_src_zone = ZONE_NULL;
zone_t kalloc_type_dst_zone = ZONE_NULL;
mach_memory_info_t *panic_kext_memory_info = NULL;
vm_size_t panic_kext_memory_size = 0;
vm_offset_t panic_fault_address = 0;
/*
* Protects zone_array, num_zones, num_zones_in_use, and
* zone_destroyed_bitmap
*/
static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
static zone_id_t num_zones_in_use;
zone_id_t _Atomic num_zones;
SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
/*
* Initial globals for zone stats until we can allocate the real ones.
* Those get migrated inside the per-CPU ones during zone_init() and
* this array is unmapped with the rest of __startup_data at lockdown.
*/
/* zone to allocate zone_magazine structs from */
static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone;
/*
* Zone caching is off until pid 1 is made, and until
* compute_zone_working_set_size() runs for the first time.
*
* -1 represents the "never enabled yet" value.
*/
static int8_t zone_caching_disabled = -1;
__startup_data
static struct zone_stats zone_stats_startup[MAX_ZONES];
struct zone zone_array[MAX_ZONES];
SECURITY_READ_ONLY_LATE(zone_security_flags_t) zone_security_array[MAX_ZONES] = {
[0 ... MAX_ZONES - 1] = {
.z_kheap_id = KHEAP_ID_NONE,
.z_noencrypt = false,
.z_submap_idx = Z_SUBMAP_IDX_GENERAL_0,
.z_kalloc_type = false,
.z_sig_eq = 0,
#if ZSECURITY_CONFIG(ZONE_TAGGING)
.z_tag = 1,
#else /* ZSECURITY_CONFIG(ZONE_TAGGING) */
.z_tag = 0,
#endif /* ZSECURITY_CONFIG(ZONE_TAGGING) */
},
};
SECURITY_READ_ONLY_LATE(struct zone_size_params) zone_ro_size_params[ZONE_ID__LAST_RO + 1];
SECURITY_READ_ONLY_LATE(zone_cache_ops_t) zcache_ops[ZONE_ID__FIRST_DYNAMIC];
#if DEBUG || DEVELOPMENT
unsigned int
zone_max_zones(void)
{
return MAX_ZONES;
}
#endif
/* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
/* Used to keep track of destroyed slots in the zone_array */
static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
/* number of zone mapped pages used by all zones */
static size_t _Atomic zone_pages_jetsam_threshold = ~0;
size_t zone_pages_wired;
size_t zone_guard_pages;
/* Time (in ms) after which we panic for zone exhaustions */
TUNABLE(int, zone_exhausted_timeout, "zet", 5000);
static bool zone_share_always = true;
static TUNABLE_WRITEABLE(uint32_t, zone_early_thres_mul, "zone_early_thres_mul", 5);
#if VM_TAG_SIZECLASSES
/*
* Zone tagging allows for per "tag" accounting of allocations for the kalloc
* zones only.
*
* There are 3 kinds of tags that can be used:
* - pre-registered VM_KERN_MEMORY_*
* - dynamic tags allocated per call sites in core-kernel (using vm_tag_alloc())
* - per-kext tags computed by IOKit (using the magic Z_VM_TAG_BT_BIT marker).
*
* The VM tracks the statistics in lazily allocated structures.
* See vm_tag_will_update_zone(), vm_tag_update_zone_size().
*
* If for some reason the requested tag cannot be accounted for,
* the tag is forced to VM_KERN_MEMORY_KALLOC which is pre-allocated.
*
* Each allocated element also remembers the tag it was assigned,
* which lets zalloc/zfree update statistics correctly.
*/
/* enable tags for zones that ask for it */
static TUNABLE(bool, zone_tagging_on, "-zt", false);
/*
* Array of all sizeclasses used by kalloc variants so that we can
* have accounting per size class for each kalloc callsite
*/
static uint16_t zone_tags_sizeclasses[VM_TAG_SIZECLASSES];
#endif /* VM_TAG_SIZECLASSES */
#if DEBUG || DEVELOPMENT
static int zalloc_simulate_vm_pressure;
#endif /* DEBUG || DEVELOPMENT */
#define Z_TUNABLE(t, n, d) \
TUNABLE(t, _##n, #n, d); \
__pure2 static inline t n(void) { return _##n; }
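/*
 * For example, `Z_TUNABLE(uint16_t, zc_mag_size, 8)` below expands to:
 *
 *     TUNABLE(uint16_t, _zc_mag_size, "zc_mag_size", 8);
 *     __pure2 static inline uint16_t zc_mag_size(void) { return _zc_mag_size; }
 *
 * i.e. a boot-arg backed variable plus a pure inline accessor.
 */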
/*
* Zone caching tunables
*
* zc_mag_size():
* size of magazines, larger to reduce contention at the expense of memory
*
* zc_enable_level
* number of contentions per second after which zone caching engages
* automatically.
*
* 0 to disable.
*
* zc_grow_level
* number of contentions per second x cpu after which the number of magazines
* allowed in the depot can grow. (in "Z_WMA_UNIT" units).
*
* zc_shrink_level
* number of contentions per second x cpu below which the number of magazines
* allowed in the depot will shrink. (in "Z_WMA_UNIT" units).
*
* zc_pcpu_max
* maximum memory size in bytes that can hang from a CPU,
* which will affect how many magazines are allowed in the depot.
*
* The alloc/free magazines are assumed to be on average half-empty
* and to count for "1" unit of magazines.
*
* zc_autotrim_size
* Size allowed to hang extra from the recirculation depot before
* auto-trim kicks in.
*
* zc_autotrim_buckets
*
* How many buckets in excess of the working-set are allowed
* before auto-trim kicks in for empty buckets.
*
* zc_free_batch_size
* The size of batches of frees/reclaim that can be done keeping
* the zone lock held (and preemption disabled).
*/
Z_TUNABLE(uint16_t, zc_mag_size, 8);
static Z_TUNABLE(uint32_t, zc_enable_level, 10);
static Z_TUNABLE(uint32_t, zc_grow_level, 5 * Z_WMA_UNIT);
static Z_TUNABLE(uint32_t, zc_shrink_level, Z_WMA_UNIT / 2);
static Z_TUNABLE(uint32_t, zc_pcpu_max, 128 << 10);
static Z_TUNABLE(uint32_t, zc_autotrim_size, 16 << 10);
static Z_TUNABLE(uint32_t, zc_autotrim_buckets, 8);
static Z_TUNABLE(uint32_t, zc_free_batch_size, 128);
static SECURITY_READ_ONLY_LATE(size_t) zone_pages_wired_max;
static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
static SECURITY_READ_ONLY_LATE(vm_map_t) zone_meta_map;
static char const * const zone_submaps_names[Z_SUBMAP_IDX_COUNT] = {
[Z_SUBMAP_IDX_VM] = "VM",
[Z_SUBMAP_IDX_READ_ONLY] = "RO",
#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
[Z_SUBMAP_IDX_GENERAL_0] = "GEN0",
[Z_SUBMAP_IDX_GENERAL_1] = "GEN1",
[Z_SUBMAP_IDX_GENERAL_2] = "GEN2",
[Z_SUBMAP_IDX_GENERAL_3] = "GEN3",
#else
[Z_SUBMAP_IDX_GENERAL_0] = "GEN",
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
[Z_SUBMAP_IDX_DATA] = "DATA",
};
#if __x86_64__
#define ZONE_ENTROPY_CNT 8
#else
#define ZONE_ENTROPY_CNT 2
#endif
static struct zone_bool_gen {
struct bool_gen zbg_bg;
uint32_t zbg_entropy[ZONE_ENTROPY_CNT];
} zone_bool_gen[MAX_CPUS];
#if CONFIG_PROB_GZALLOC
/*
* Probabilistic gzalloc
* =====================
*
*
* Probabilistic guard zalloc samples allocations and will protect them by
* double-mapping the page holding them and returning the secondary virtual
* address to its callers.
*
* Its data structures are lazily allocated if the `pgz` or `pgz1` boot-args
* are set.
*
*
* Unlike GZalloc, PGZ uses a fixed amount of memory, and is compatible with
* most zalloc/kalloc features:
* - zone_require is functional
* - zone caching and zone tagging are compatible
* - non-blocking allocations work (they always return NULL with gzalloc).
*
* PGZ limitations:
* - VA sequestering isn't respected, as the slots (which are in limited
* quantity) will be reused for any type; however, the PGZ quarantine
* somewhat mitigates the impact.
* - zones with elements larger than a page cannot be protected.
*
*
* Tunables:
* --------
*
* pgz=1:
* Turn on probabilistic guard malloc for all zones
*
* (default on for DEVELOPMENT, off for RELEASE, or if pgz1... are specified)
*
* pgz_sample_rate=0 to 2^31
* average sample rate between two guarded allocations.
* 0 means every allocation.
*
* The default is a random number between 1,000 and 10,000
*
* pgz_slots
* how many allocations to protect.
*
* Each costs:
* - a PTE in the pmap (when allocated)
* - 2 zone page meta's (every other page is a "guard" one, 32B total)
* - 64 bytes per backtraces.
* On LP64 this is <16K per 100 slots.
*
* The default is ~200 slots per GB of physical RAM (32k / GB)
*
* TODO:
* - try harder to allocate elements at the "end" to catch OOB more reliably.
*
* pgz_quarantine
* how many slots should be free at any given time.
*
* PGZ will round robin through free slots to be reused, but free slots are
* important to detect use-after-free by acting as a quarantine.
*
* By default, PGZ will keep 33% of the slots around at all times.
*
* pgz1=<name>, pgz2=<name>, ..., pgzn=<name>...
* Specific zones for which to enable probabilistic guard malloc.
* There must be no numbering gap (names after the gap will be ignored).
*/
#if DEBUG || DEVELOPMENT
static TUNABLE(bool, pgz_all, "pgz", true);
#else
static TUNABLE(bool, pgz_all, "pgz", false);
#endif
static TUNABLE(uint32_t, pgz_sample_rate, "pgz_sample_rate", 0);
static TUNABLE(uint32_t, pgz_slots, "pgz_slots", UINT32_MAX);
static TUNABLE(uint32_t, pgz_quarantine, "pgz_quarantine", 0);
#endif /* CONFIG_PROB_GZALLOC */
static zone_t zone_find_largest(uint64_t *zone_size);
#endif /* !ZALLOC_TEST */
#pragma mark Zone metadata
#if !ZALLOC_TEST
static inline bool
zone_has_index(zone_t z, zone_id_t zid)
{
return zone_array + zid == z;
}
__abortlike
void
zone_invalid_panic(zone_t zone)
{
panic("zone %p isn't in the zone_array", zone);
}
__abortlike
static void
zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
const char *kind)
{
panic("zone metadata corruption: %s (meta %p, zone %s%s)",
kind, meta, zone_heap_name(zone), zone->z_name);
}
__abortlike
static void
zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
{
panic("zone element pointer validation failed (addr: %p, zone %s%s)",
(void *)addr, zone_heap_name(zone), zone->z_name);
}
__abortlike
static void
zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
struct zone_page_metadata *meta)
{
zone_security_flags_t zsflags = zone_security_config(zone), src_zsflags;
zone_id_t zidx;
zone_t src_zone;
if (zsflags.z_kalloc_type) {
panic_include_kalloc_types = true;
kalloc_type_dst_zone = zone;
}
zidx = meta->zm_index;
if (zidx >= os_atomic_load(&num_zones, relaxed)) {
panic("%p expected in zone %s%s[%d], but metadata has invalid zidx: %d",
(void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
zidx);
}
src_zone = &zone_array[zidx];
src_zsflags = zone_security_array[zidx];
if (src_zsflags.z_kalloc_type) {
panic_include_kalloc_types = true;
kalloc_type_src_zone = src_zone;
}
panic("%p not in the expected zone %s%s[%d], but found in %s%s[%d]",
(void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
zone_heap_name(src_zone), src_zone->z_name, zidx);
}
__abortlike
static void
zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
{
panic("metadata list corruption through element %p detected in zone %s%s",
meta, zone_heap_name(zone), zone->z_name);
}
__abortlike
static void
zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
const char *kind)
{
panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
zone_heap_name(zone), zone->z_name, meta);
}
__abortlike
static void
zone_meta_double_free_panic(zone_t zone, vm_offset_t addr, const char *caller)
{
panic("%s: double free of %p to zone %s%s", caller,
(void *)addr, zone_heap_name(zone), zone->z_name);
}
__abortlike
static void
zone_accounting_panic(zone_t zone, const char *kind)
{
panic("accounting mismatch (%s) for zone %s%s", kind,
zone_heap_name(zone), zone->z_name);
}
#define zone_counter_sub(z, stat, value) ({ \
if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \
zone_accounting_panic(z, #stat " wrap-around"); \
} \
(z)->stat; \
})
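/*
 * Usage sketch (field name chosen for illustration):
 *
 *     zone_counter_sub(z, z_elems_free, n);
 *
 * subtracts `n` from z->z_elems_free, panics on wrap-around, and
 * evaluates to the post-subtraction value.
 */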
static inline uint16_t
zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m,
vm_offset_t esize)
{
if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) {
zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
}
return m->zm_alloc_size;
}
static inline uint16_t
zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m,
vm_offset_t esize)
{
if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) {
zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
}
return m->zm_alloc_size;
}
__abortlike
static void
zone_nofail_panic(zone_t zone)
{
panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
zone_heap_name(zone), zone->z_name);
}
__header_always_inline bool
zone_spans_ro_va(vm_offset_t addr_start, vm_offset_t addr_end)
{
const struct mach_vm_range *ro_r = &zone_info.zi_ro_range;
struct mach_vm_range r = { addr_start, addr_end };
return mach_vm_range_intersects(ro_r, &r);
}
#define from_range(r, addr, size) \
__builtin_choose_expr(__builtin_constant_p(size) ? (size) == 1 : 0, \
mach_vm_range_contains(r, (mach_vm_offset_t)(addr)), \
mach_vm_range_contains(r, (mach_vm_offset_t)(addr), size))
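/*
 * Note: the __builtin_choose_expr above statically selects the
 * single-address form of mach_vm_range_contains() when `size` is the
 * compile-time constant 1, and the ranged form otherwise.
 */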
#define from_ro_map(addr, size) \
from_range(&zone_info.zi_ro_range, addr, size)
#define from_zone_map(addr, size) \
from_range(&zone_info.zi_map_range, addr, size)
__header_always_inline bool
zone_pva_is_null(zone_pva_t page)
{
return page.packed_address == 0;
}
__header_always_inline bool
zone_pva_is_queue(zone_pva_t page)
{
// actual kernel pages have the top bit set
return (int32_t)page.packed_address > 0;
}
__header_always_inline bool
zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
{
return pva1.packed_address == pva2.packed_address;
}
__header_always_inline zone_pva_t *
zone_pageq_base(void)
{
extern zone_pva_t data_seg_start[] __SEGMENT_START_SYM("__DATA");
/*
* `-1` so that if the first __DATA variable is a page queue,
* it gets a non-zero index
*/
return data_seg_start - 1;
}
__header_always_inline void
zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
struct zone_page_metadata *meta)
{
zone_pva_t *queue_head = &zone_pageq_base()[queue.packed_address];
if (!zone_pva_is_equal(*queue_head, oldv)) {
zone_page_metadata_list_corruption(z, meta);
}
*queue_head = meta->zm_page_next;
}
__header_always_inline zone_pva_t
zone_queue_encode(zone_pva_t *headp)
{
return (zone_pva_t){ (uint32_t)(headp - zone_pageq_base()) };
}
__header_always_inline zone_pva_t
zone_pva_from_addr(vm_address_t addr)
{
// cannot use atop() because we want to maintain the sign bit
return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
}
__header_always_inline vm_address_t
zone_pva_to_addr(zone_pva_t page)
{
// cause sign extension so that we end up with the right address
return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
}
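/*
 * Worked example (assuming PAGE_SHIFT == 12): the kernel address
 * 0xffffff8012345000 packs to (uint32_t)((intptr_t)addr >> 12) ==
 * 0xf8012345; its top bit is set, so zone_pva_is_queue() is false.
 * Unpacking computes (vm_offset_t)(int32_t)0xf8012345 << 12, whose
 * sign extension restores 0xffffff8012345000 exactly.
 */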
__header_always_inline struct zone_page_metadata *
zone_pva_to_meta(zone_pva_t page)
{
return &zone_info.zi_meta_base[page.packed_address];
}
__header_always_inline zone_pva_t
zone_pva_from_meta(struct zone_page_metadata *meta)
{
return (zone_pva_t){ (uint32_t)(meta - zone_info.zi_meta_base) };
}
__header_always_inline struct zone_page_metadata *
zone_meta_from_addr(vm_offset_t addr)
{
return zone_pva_to_meta(zone_pva_from_addr(addr));
}
__header_always_inline zone_id_t
zone_index_from_ptr(const void *ptr)
{
return zone_pva_to_meta(zone_pva_from_addr((vm_offset_t)ptr))->zm_index;
}
__header_always_inline vm_offset_t
zone_meta_to_addr(struct zone_page_metadata *meta)
{
return ptoa((int32_t)(meta - zone_info.zi_meta_base));
}
__attribute__((overloadable))
__header_always_inline void
zone_meta_validate(zone_t z, struct zone_page_metadata *meta, vm_address_t addr)
{
if (!zone_has_index(z, meta->zm_index)) {
zone_page_metadata_index_confusion_panic(z, addr, meta);
}
}
__attribute__((overloadable))
__header_always_inline void
zone_meta_validate(zone_t z, struct zone_page_metadata *meta)
{
zone_meta_validate(z, meta, zone_meta_to_addr(meta));
}