-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
docstring.rs
1635 lines (1530 loc) · 68.5 KB
/
docstring.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// This gives tons of false positives in this file because of
// "reStructuredText."
#![allow(clippy::doc_markdown)]
use std::{borrow::Cow, collections::VecDeque};
use ruff_python_parser::ParseError;
use {once_cell::sync::Lazy, regex::Regex};
use {
ruff_formatter::{write, FormatOptions, IndentStyle, LineWidth, Printed},
ruff_python_trivia::{is_python_whitespace, PythonWhitespace},
ruff_source_file::Locator,
ruff_text_size::{Ranged, TextLen, TextRange, TextSize},
};
use crate::{prelude::*, DocstringCodeLineWidth, FormatModuleError};
use super::{NormalizedString, QuoteChar};
/// Format a docstring by trimming whitespace and adjusting the indentation.
///
/// `normalized` is the docstring after string normalization (prefix, quotes and text) and `f` is
/// the formatter that receives the output. `is_module` is only consulted at the very end: when it
/// is set and the closing quotes sit on their own line, one additional hard line break is written
/// before the closing quotes.
///
/// Summary of changes we make:
/// * Normalize the string like all other strings
/// * Leave docstrings that contain a backslash-escaped newline as they are (apart from the
///   normalization above)
/// * Trim all trailing whitespace, except for a chaperone space that avoids quotes or backslashes
///   in the last line.
/// * Trim leading whitespace on the first line, again except for a chaperone space
/// * If there is only content in the first line and after that only whitespace, collapse the
///   docstring into one line
/// * Adjust the indentation (see below)
///
/// # Docstring indentation
///
/// Unlike any other string, like black we change the indentation of docstring lines.
///
/// We want to preserve the indentation inside the docstring relative to the suite statement/block
/// indent that the docstring statement is in, but also want to apply the change of the outer
/// indentation in the docstring, e.g.
/// ```python
/// def sparkle_sky():
///   """Make a pretty sparkly sky.
///   *       * ✨        *.    .
///      *       *      ✨      .
///      .  *      . ✨    * .  .
///   """
/// ```
/// should become
/// ```python
/// def sparkle_sky():
///     """Make a pretty sparkly sky.
///     *       * ✨        *.    .
///        *       *      ✨      .
///        .  *      . ✨    * .  .
///     """
/// ```
/// We can't compute the full indentation here since we don't know what the block indent of
/// the doc comment will be yet and which we can only have added by formatting each line
/// separately with a hard line break. This means we need to strip shared indentation from
/// docstring while preserving the in-docstring bigger-than-suite-statement indentation. Example:
/// ```python
/// def f():
///   """first line
///   line a
///      line b
///   """
/// ```
/// The docstring indentation is 2, the block indents will change this to 4 (but we can't
/// determine this at this point). The indentation of line a is 2, so we trim `  line a`
/// to `line a`. For line b it's 5, so we trim it to `line b` and pad with 5-2=3 spaces to
/// `   line b`. The closing quotes, being on their own line, are stripped and get only the
/// default indentation. Fully formatted:
/// ```python
/// def f():
///     """first line
///     line a
///        line b
///     """
/// ```
///
/// Tabs are counted by padding them to the next multiple of 8 according to
/// [`str.expandtabs`](https://docs.python.org/3/library/stdtypes.html#str.expandtabs). When
/// we see indentation that contains a tab or any other whitespace that is not an ASCII space, we
/// rewrite the string.
///
/// Additionally, if any line in the docstring has less indentation than the docstring
/// (effectively a negative indentation with respect to the current level), we pad all lines to
/// the level of the docstring with spaces.
/// ```python
/// def f():
///     """first line
///  line a
///    line b
///      line c
///     """
/// ```
/// Here line a is 3 columns negatively indented, so we pad all lines by an extra 3 spaces:
/// ```python
/// def f():
///     """first line
///     line a
///       line b
///         line c
///     """
/// ```
pub(crate) fn format(
normalized: &NormalizedString,
f: &mut PyFormatter,
is_module: bool,
) -> FormatResult<()> {
let docstring = &normalized.text;
// Black doesn't change the indentation of docstrings that contain an escaped newline, so
// such docstrings are emitted like any other normalized string. Note that, despite its name,
// `contains_unescaped_newline` reports a newline that follows a backslash.
if contains_unescaped_newline(docstring) {
return normalized.fmt(f);
}
// is_borrowed is unstable :/
// A borrowed text is what later allows lines to be emitted as slices of the source instead of
// as freshly built text (see the two `source_text_slice` call sites).
let already_normalized = matches!(docstring, Cow::Borrowed(_));
let mut lines = docstring.lines().peekable();
// Start the string
write!(
f,
[
normalized.prefix,
normalized.quotes,
source_position(normalized.start()),
]
)?;
// We track where in the source docstring we are (in source code byte offsets)
let mut offset = normalized.start();
// The first line directly after the opening quotes has different rules than the rest, mainly
// that we remove all leading whitespace as there's no indentation
let first = lines.next().unwrap_or_default();
// Black trims whitespace using [`str.strip()`](https://docs.python.org/3/library/stdtypes.html#str.strip)
// https://github.com/psf/black/blob/b4dca26c7d93f930bbd5a7b552807370b60d4298/src/black/strings.py#L77-L85
// So we use the unicode whitespace definition through `trim_{start,end}` instead of the python
// tokenizer whitespace definition in `trim_whitespace_{start,end}`.
let trim_end = first.trim_end();
let trim_both = trim_end.trim_start();
// Edge case: The first line is `""" "content`, so we need to insert a chaperone space that
// keeps the inner quote and the opening quotes from getting too close, avoiding `""""content`
if trim_both.starts_with(normalized.quotes.quote_char.as_char()) {
space().fmt(f)?;
}
if !trim_end.is_empty() {
// For the first line of the docstring we strip the leading and trailing whitespace, e.g.
// `""" content ` to `"""content`
let leading_whitespace = trim_end.text_len() - trim_both.text_len();
let trimmed_line_range =
TextRange::at(offset, trim_end.text_len()).add_start(leading_whitespace);
if already_normalized {
source_text_slice(trimmed_line_range).fmt(f)?;
} else {
text(trim_both, Some(trimmed_line_range.start())).fmt(f)?;
}
}
// Advance past the whole first line, including the whitespace that was trimmed above.
offset += first.text_len();
// Check if we have a single line (or empty) docstring
if docstring[first.len()..].trim().is_empty() {
// For `"""\n"""` or other whitespace between the quotes, black keeps a single whitespace,
// but `""""""` doesn't get one inserted.
if needs_chaperone_space(normalized, trim_end)
|| (trim_end.is_empty() && !docstring.is_empty())
{
space().fmt(f)?;
}
normalized.quotes.fmt(f)?;
return Ok(());
}
hard_line_break().fmt(f)?;
// We know that the normalized string has \n line endings
offset += "\n".text_len();
// If some line of the docstring is less indented than the function body, we pad all lines to
// align it with the docstring statement. Conversely, if all lines are over-indented, we strip
// the extra indentation. We call this stripped indentation since it's relative to the block
// indent printer-made indentation.
let stripped_indentation_length = lines
.clone()
// We don't want to count whitespace-only lines as mis-indented
.filter(|line| !line.trim().is_empty())
.map(indentation_length)
.min()
.unwrap_or_default();
// Every line after the first is handed to the line printer, which also takes care of any
// code examples it finds along the way.
DocstringLinePrinter {
f,
action_queue: VecDeque::new(),
offset,
stripped_indentation_length,
already_normalized,
quote_char: normalized.quotes.quote_char,
code_example: CodeExample::default(),
}
.add_iter(lines)?;
// Same special case in the last line as for the first line
let trim_end = docstring
.as_ref()
.trim_end_matches(|c: char| c.is_whitespace() && c != '\n');
if needs_chaperone_space(normalized, trim_end) {
space().fmt(f)?;
}
// `trim_end` ends with a newline exactly when the last line holds nothing but whitespace,
// i.e. the closing quotes are on their own line.
if is_module && trim_end.ends_with('\n') {
hard_line_break().fmt(f)?;
}
write!(f, [source_position(normalized.end()), normalized.quotes])
}
/// Reports whether `haystack` contains a backslash that is followed, after
/// optional Python tokenizer whitespace, by a line feed.
///
/// Note that the name is misleading: a match is a newline that *is* preceded
/// by a backslash. The caller uses a positive result to leave the docstring's
/// indentation alone.
fn contains_unescaped_newline(haystack: &str) -> bool {
    let mut remaining = haystack;

    loop {
        // No further backslash means no further candidate.
        let Some(backslash) = memchr::memchr(b'\\', remaining.as_bytes()) else {
            return false;
        };

        // Continue right after the backslash, skipping whitespace that may
        // sit between it and the end of the line.
        remaining = remaining[backslash + 1..].trim_whitespace_start();

        if remaining.starts_with('\n') {
            return true;
        }
    }
}
/// An abstraction for printing each line of a docstring.
///
/// Lines are handed to the printer one at a time. Each line is either written
/// to the formatter or collected into the current code example; collected
/// lines are written later, when the actions generated for the example are
/// run.
struct DocstringLinePrinter<'ast, 'buf, 'fmt, 'src> {
/// The formatter that all output is written to.
f: &'fmt mut PyFormatter<'ast, 'buf>,
/// A queue of actions to perform.
///
/// Whenever we process a line, it is possible for it to generate multiple
/// actions to take. The most basic, and most common case, is for the line
/// to just simply be printed as-is. But in some cases, a line is part of
/// a code example that we'd like to reformat. In those cases, the actions
/// can be more complicated.
///
/// Actions are pushed on to the end of the queue and popped from the
/// beginning.
action_queue: VecDeque<CodeExampleAddAction<'src>>,
/// The source offset of the beginning of the line that is currently being
/// printed.
offset: TextSize,
/// Indentation alignment based on the least indented line in the
/// docstring.
///
/// This much leading indentation is removed from every printed line.
stripped_indentation_length: TextSize,
/// Whether the docstring is overall already considered normalized. When it
/// is, the formatter can take a fast path and emit slices of the source.
///
/// This is cleared as soon as a reformatted code example is printed.
already_normalized: bool,
/// The quote character used by the docstring being printed.
quote_char: QuoteChar,
/// The current code example detected in the docstring.
code_example: CodeExample<'src>,
}
impl<'ast, 'buf, 'fmt, 'src> DocstringLinePrinter<'ast, 'buf, 'fmt, 'src> {
/// Feeds every line produced by `lines` through this printer and then
/// flushes whatever is still pending.
///
/// Callers may handle the first docstring line themselves, in which case
/// `lines` yields everything except that first line.
fn add_iter(
    &mut self,
    mut lines: std::iter::Peekable<std::str::Lines<'src>>,
) -> FormatResult<()> {
    // The loop body peeks at the upcoming line, which rules out a plain
    // `for` loop over the iterator.
    loop {
        let Some(raw) = lines.next() else { break };
        let current = InputDocstringLine {
            line: raw,
            offset: self.offset,
            next: lines.peek().copied(),
        };
        // The normalized string is known to use \n line endings, so the
        // next line starts one terminator after this one ends.
        self.offset += raw.text_len() + "\n".text_len();
        self.add_one(current)?;
    }

    // A code example that runs up to the end of the docstring still needs
    // to be emitted.
    self.code_example.finish(&mut self.action_queue);
    self.run_action_queue()
}
/// Hands a single line to this printer.
///
/// The line is not necessarily written to the underlying buffer right
/// away: when it starts or continues a code snippet it is held back until
/// the snippet is complete.
fn add_one(&mut self, line: InputDocstringLine<'src>) -> FormatResult<()> {
    // Code snippets are only looked for when docstring code formatting is
    // enabled and we are not already formatting a snippet. The second
    // condition rules out arbitrarily nested snippet formatting, which is
    // likely tricky to get right all of the time and not clearly worth
    // supporting.
    let look_for_code = self.f.options().docstring_code().is_enabled()
        && !self.f.context().docstring().is_some();

    if look_for_code {
        self.code_example.add(line, &mut self.action_queue);
        self.run_action_queue()
    } else {
        // Pass the line through untouched.
        self.print_one(&line.as_output())
    }
}
/// Drains this printer's action queue, performing each action in order.
fn run_action_queue(&mut self) -> FormatResult<()> {
    while let Some(action) = self.action_queue.pop_front() {
        let mut kind = match action {
            // The line was absorbed into a code example; nothing to print.
            CodeExampleAddAction::Kept => continue,
            CodeExampleAddAction::Print { original } => {
                self.print_one(&original.as_output())?;
                continue;
            }
            CodeExampleAddAction::Reset { code } => {
                // Emit the collected lines exactly as they were written.
                for code_line in code {
                    self.print_one(&code_line.original.as_output())?;
                }
                continue;
            }
            CodeExampleAddAction::Format { kind } => kind,
        };

        let Some(formatted_lines) = self.format(&mut kind)? else {
            // The snippet could not be reformatted. Its lines still have
            // to be emitted, and before any other pending action, so the
            // reset jumps to the front of the queue.
            self.action_queue.push_front(CodeExampleAddAction::Reset {
                code: kind.into_code(),
            });
            continue;
        };

        // Reformatted lines no longer correspond to slices of the source.
        self.already_normalized = false;

        let indent_width = match kind {
            CodeExampleKind::Doctest(CodeExampleDoctest { ps1_indent, .. }) => {
                // The first line gets the PS1 prompt, all others PS2.
                for (index, docline) in formatted_lines.into_iter().enumerate() {
                    let prompted = if index == 0 {
                        docline.map(|line| std::format!("{ps1_indent}>>> {line}"))
                    } else {
                        docline.map(|line| std::format!("{ps1_indent}... {line}"))
                    };
                    self.print_one(&prompted)?;
                }
                continue;
            }
            CodeExampleKind::Rst(litblock) => match litblock.min_indent {
                Some(min_indent) => min_indent.to_usize(),
                None => continue,
            },
            CodeExampleKind::Markdown(fenced) => fenced.opening_fence_indent.to_usize(),
        };

        // Indenting with plain spaces looks suspicious, but it's
        // consistent with the whitespace normalization that will occur
        // anyway.
        let indent = " ".repeat(indent_width);
        for docline in formatted_lines {
            self.print_one(&docline.map(|line| std::format!("{indent}{line}")))?;
        }
    }
    Ok(())
}
/// Writes one docstring line to the formatter.
///
/// Trailing whitespace is dropped, the indentation is adjusted by
/// `stripped_indentation_length`, and a line break follows every line
/// except the last one.
fn print_one(&mut self, line: &OutputDocstringLine<'_>) -> FormatResult<()> {
    let content = line.line.trim_end();

    if content.is_empty() {
        if line.is_last {
            // When the docstring ends with ` """`, the last line is ` `.
            // No empty line is wanted there; the docstring just gets
            // closed.
            return Ok(());
        }
        return empty_line().fmt(self.f);
    }

    let indent_needs_rewrite = content
        .chars()
        .take_while(|c| c.is_whitespace())
        .any(|c| c != ' ');

    if indent_needs_rewrite {
        // The indentation contains a tab or some other whitespace that is
        // not an ASCII space. Strip the indentation shared with the
        // docstring statement (which implicitly pads all lines when some
        // line was indented less than the statement, and removes surplus
        // whitespace when every line was over-indented), then rebuild the
        // remaining in-docstring indentation out of spaces.
        let extra_indent = indentation_length(content) - self.stripped_indentation_length;
        let mut rewritten = " ".repeat(usize::from(extra_indent));
        rewritten.push_str(content.trim_start());
        text(&rewritten, Some(line.offset)).fmt(self.f)?;
    } else {
        // Start from the line without its trailing whitespace and skip
        // the stripped leading indentation.
        let kept_range = TextRange::at(line.offset, content.text_len())
            .add_start(self.stripped_indentation_length);

        if self.already_normalized {
            source_text_slice(kept_range).fmt(self.f)?;
        } else {
            // All indents are ascii spaces, so the slicing is correct.
            let kept = &content[usize::from(self.stripped_indentation_length)..];
            text(kept, Some(kept_range.start())).fmt(self.f)?;
        }
    }

    // Closing quotes on their own line were handled above (the last line is
    // then empty except for whitespace). When they share a line with
    // content, no line break is inserted.
    if line.is_last {
        Ok(())
    } else {
        hard_line_break().fmt(self.f)
    }
}
/// Reformats the given code example and returns the result as a sequence
/// of owned docstring lines.
///
/// The code example may be mutated in place when extracting its lines of
/// code requires adjusting which part of each line counts as code.
///
/// An error is generally only returned when the recursive call to the
/// formatter itself yields a `FormatError`. In every other failure case
/// (the snippet is not valid Python, or the reformatted snippet would not
/// be valid inside the docstring), `Ok(None)` is returned and callers
/// should give up on reformatting the snippet.
///
/// Note that `Ok(None)` is currently silent: nothing tells the user that
/// a snippet was skipped.
fn format(
    &mut self,
    kind: &mut CodeExampleKind<'_>,
) -> FormatResult<Option<Vec<OutputDocstringLine<'static>>>> {
    use ruff_python_parser::AsMode;

    let line_width = match self.f.options().docstring_code_line_width() {
        DocstringCodeLineWidth::Fixed(width) => width,
        DocstringCodeLineWidth::Dynamic => {
            // Whatever the surrounding indentation (plus anything the
            // example itself adds) already uses is taken off the global
            // width, but at least one column always remains.
            let global_line_width = self.f.options().line_width().value();
            let indent_width = self.f.options().indent_width();
            let used_columns = self
                .f
                .context()
                .indent_level()
                .to_ascii_spaces(indent_width)
                .saturating_add(kind.extra_indent_ascii_spaces());
            let remaining = global_line_width.saturating_sub(used_columns);
            LineWidth::try_from(std::cmp::max(1, remaining))
                .expect("width is capped at a minimum of 1")
        }
    };

    let code = kind.code();
    let (first_original, last_original) = match (code.first(), code.last()) {
        (Some(first), Some(last)) => (first, last),
        // No code lines means nothing to format.
        _ => return Ok(None),
    };

    let snippet_lines: Vec<&str> = code.iter().map(|line| line.code).collect();
    let codeblob = snippet_lines.join("\n");

    // Hard-coding the indent style is a little odd, but docstrings have
    // their tabs rewritten to ASCII spaces anyway. If a snippet were
    // formatted with tabs that later get rewritten, the indentation would
    // shift in ways that make formatting no longer idempotent. Clobbering
    // the tabs up front gives more consistent results.
    let options = self
        .f
        .options()
        .clone()
        .with_line_width(line_width)
        .with_indent_style(IndentStyle::Space);

    let printed = match docstring_format_source(options, self.quote_char, &codeblob) {
        Ok(printed) => printed,
        Err(FormatModuleError::FormatError(err)) => return Err(err),
        Err(FormatModuleError::ParseError(_) | FormatModuleError::PrintError(_)) => {
            return Ok(None);
        }
    };

    // A little hokey: to find out whether the reformatted snippet would
    // make the overall docstring invalid, wrap it in the same kind of
    // quotes as the surrounding docstring and try to parse that as Python.
    //
    // This is a stop-gap against writing invalid Python because of some
    // oddity of a snippet inside a docstring, and may become removable as
    // corner cases get fixed. See the `doctest_invalid_skipped` tests in
    // `docstring_code_examples.py` for when this check is relevant.
    let wrapped = match self.quote_char {
        QuoteChar::Single => std::format!("'''{}'''", printed.as_code()),
        QuoteChar::Double => {
            std::format!(r#""""{}""""#, printed.as_code())
        }
    };
    let mode = self.f.options().source_type().as_mode();
    if ruff_python_parser::parse(&wrapped, mode).is_err() {
        // Callers reset and pass the original lines through as-is.
        return Ok(None);
    }

    // Every reformatted line reports the offset of the first original line.
    let offset = first_original.original.offset;
    let ends_docstring = last_original.original.is_last();

    let mut lines: Vec<_> = printed
        .as_code()
        .lines()
        .map(|line| OutputDocstringLine {
            line: Cow::Owned(line.to_string()),
            offset,
            is_last: false,
        })
        .collect();

    // Only the final reformatted line can be the docstring's last line, and
    // only if the final original line was.
    if let Some(tail) = lines.last_mut() {
        tail.is_last = ends_docstring;
    }

    Ok(Some(lines))
}
}
/// Represents a single line in a docstring.
///
/// This type is only used to represent the original lines in a docstring.
/// Specifically, the line contained in this type has no changes from the input
/// source.
#[derive(Clone, Copy, Debug)]
struct InputDocstringLine<'src> {
/// The actual text of the line, not including the line terminator.
///
/// This is always borrowed. Lines that may own their text (e.g., lines
/// produced by reformatting a code snippet) are represented by
/// `OutputDocstringLine` instead.
line: &'src str,
/// The offset into the source document which this line corresponds to.
offset: TextSize,
/// For any input line that isn't the last line, this contains a reference
/// to the line immediately following this one.
///
/// This is `None` if and only if this is the last line in the docstring.
next: Option<&'src str>,
}
impl<'src> InputDocstringLine<'src> {
/// Borrow this input docstring line as an output docstring line.
fn as_output(&self) -> OutputDocstringLine<'src> {
OutputDocstringLine {
line: Cow::Borrowed(self.line),
offset: self.offset,
is_last: self.is_last(),
}
}
/// Whether this is the last line in the docstring or not.
fn is_last(&self) -> bool {
self.next.is_none()
}
}
/// Represents a single line that is to be written to the formatted docstring.
///
/// An input source line may be cheaply converted to an output source line.
/// This is the common case: an input source line is printed pretty much as it
/// is, with perhaps some whitespace normalization applied. The less common
/// case is that the output docstring line owns its `line` because it was
/// produced by reformatting a code snippet.
#[derive(Clone, Debug)]
struct OutputDocstringLine<'src> {
/// The output line.
///
/// This is an owned variant in precisely the cases where it corresponds to
/// a line from a reformatted code snippet. In other cases, it is borrowed
/// from the input docstring line as-is.
line: Cow<'src, str>,
/// The offset into the source document which this line corresponds to.
/// Currently, this is an estimate: every line of a reformatted code snippet
/// carries the offset of the snippet's first original line.
offset: TextSize,
/// Whether this is the last line in a docstring or not. This is determined
/// by whether the last line in the code snippet was also the last line in
/// the docstring. If it was, then it follows that the last line in the
/// reformatted code snippet is also the last line in the docstring.
is_last: bool,
}
impl<'src> OutputDocstringLine<'src> {
    /// Consume this line and return one whose text is the result of calling
    /// `map` on the current text. The offset and the last-line flag carry
    /// over unchanged.
    fn map(self, mut map: impl FnMut(&str) -> String) -> OutputDocstringLine<'static> {
        let OutputDocstringLine {
            line,
            offset,
            is_last,
        } = self;
        OutputDocstringLine {
            line: Cow::Owned(map(&line)),
            offset,
            is_last,
        }
    }
}
/// A single code example extracted from a docstring.
///
/// This represents an intermediate state from when the code example was first
/// found all the way up until the point at which the code example has finished
/// and is reformatted.
///
/// Its default state is "empty." That is, no code example is currently being
/// collected.
#[derive(Debug, Default)]
struct CodeExample<'src> {
/// The kind of code example being collected, or `None` if no code example
/// is currently being collected.
///
/// The kind is split out into a separate type so that we can pass it
/// around and have a guarantee that a code example actually exists.
kind: Option<CodeExampleKind<'src>>,
}
impl<'src> CodeExample<'src> {
/// Offer an original docstring line to this code example.
///
/// Depending on the line and on whether an example is currently being
/// collected, an "action" is pushed to `queue` for the caller to carry
/// out. The usual outcome is a "print" action, which tells the caller to
/// print the line as though it were not part of a code snippet.
fn add(
    &mut self,
    original: InputDocstringLine<'src>,
    queue: &mut VecDeque<CodeExampleAddAction<'src>>,
) {
    // `continued` is the example to keep collecting, if the line extended
    // the one that was in progress.
    let continued = match self.kind.take() {
        // Nothing is being collected.
        None => None,
        Some(CodeExampleKind::Doctest(doctest)) => doctest
            .add_code_line(original, queue)
            .map(CodeExampleKind::Doctest),
        Some(CodeExampleKind::Rst(litblock)) => litblock
            .add_code_line(original, queue)
            .map(CodeExampleKind::Rst),
        Some(CodeExampleKind::Markdown(fenced)) => {
            // For Markdown, the line that ends a block is printed as-is
            // and never considered as the start of a new example. The
            // closing line of many fenced code blocks is identical to an
            // opening line, so treating it as a potential start risks
            // opening another Markdown block inappropriately.
            self.kind = fenced
                .add_code_line(original, queue)
                .map(CodeExampleKind::Markdown);
            return;
        }
    };

    match continued {
        Some(kind) => self.kind = Some(kind),
        // Either there was no example or the line ended it: look for the
        // start of a new one, or otherwise report that nothing was found.
        None => self.add_start(original, queue),
    }
}
/// Wrap up the code example, queueing a final action if one is needed.
///
/// An action is typically added when the code example runs up to the end
/// of the docstring.
fn finish(&mut self, queue: &mut VecDeque<CodeExampleAddAction<'src>>) {
    if let Some(kind) = self.kind.take() {
        queue.push_back(CodeExampleAddAction::Format { kind });
    }
}
/// Checks whether the given line starts a code example.
///
/// A line that starts a doctest becomes part of the example and is held
/// back. In every other case the line is queued to be printed as-is; if it
/// opens a reStructuredText or Markdown block, collection of that block
/// begins with the lines that follow.
///
/// # Panics
///
/// This panics when a code example is already being collected. That is,
/// the routine assumes there is no ongoing example and only looks for the
/// beginning of a new one.
fn add_start(
    &mut self,
    original: InputDocstringLine<'src>,
    queue: &mut VecDeque<CodeExampleAddAction<'src>>,
) {
    assert!(self.kind.is_none(), "expected no existing code example");

    if let Some(doctest) = CodeExampleDoctest::new(original) {
        self.kind = Some(CodeExampleKind::Doctest(doctest));
        queue.push_back(CodeExampleAddAction::Kept);
        return;
    }

    if let Some(litblock) = CodeExampleRst::new(original) {
        self.kind = Some(CodeExampleKind::Rst(litblock));
    } else if let Some(fenced) = CodeExampleMarkdown::new(original) {
        self.kind = Some(CodeExampleKind::Markdown(fenced));
    }

    // Whether or not a block was opened, the line itself is printed
    // unchanged.
    queue.push_back(CodeExampleAddAction::Print { original });
}
}
/// The kind of code example observed in a docstring.
///
/// Each variant carries the state collected so far for an example of that
/// kind.
#[derive(Debug)]
enum CodeExampleKind<'src> {
/// Code found in Python "doctests."
///
/// Documentation describing doctests and how they're recognized can be
/// found as part of the Python standard library:
/// <https://docs.python.org/3/library/doctest.html>.
///
/// (You'll likely need to read the [regex matching] used internally by the
/// doctest module to determine more precisely how it works.)
///
/// [regex matching]: https://github.com/python/cpython/blob/0ff6368519ed7542ad8b443de01108690102420a/Lib/doctest.py#L611-L622
Doctest(CodeExampleDoctest<'src>),
/// Code found from a reStructuredText "[literal block]" or "[code block
/// directive]".
///
/// [literal block]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#literal-blocks
/// [code block directive]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block
Rst(CodeExampleRst<'src>),
/// Code found from a Markdown "[fenced code block]".
///
/// [fenced code block]: https://spec.commonmark.org/0.30/#fenced-code-blocks
Markdown(CodeExampleMarkdown<'src>),
}
impl<'src> CodeExampleKind<'src> {
/// Return the lines of code collected so far for this example.
///
/// This is borrowed mutably because it may need to mutate the code lines
/// based on the state accrued so far.
fn code(&mut self) -> &[CodeExampleLine<'src>] {
match *self {
CodeExampleKind::Doctest(ref doctest) => &doctest.lines,
CodeExampleKind::Rst(ref mut litblock) => litblock.indented_code(),
CodeExampleKind::Markdown(ref fenced) => &fenced.lines,
}
}
/// Consume this code example and return only the lines that have been
/// accrued so far.
///
/// This is useful when the code example being collected has been
/// determined to be invalid, and one wants to "give up" and print the
/// original lines through unchanged without attempting formatting.
fn into_code(self) -> Vec<CodeExampleLine<'src>> {
match self {
CodeExampleKind::Doctest(doctest) => doctest.lines,
CodeExampleKind::Rst(litblock) => litblock.lines,
CodeExampleKind::Markdown(fenced) => fenced.lines,
}
}
/// This returns any extra indent that will be added after formatting this
/// code example.
///
/// The extra indent is expressed in units of ASCII space characters.
fn extra_indent_ascii_spaces(&self) -> u16 {
match *self {
CodeExampleKind::Doctest(_) => 4,
_ => 0,
}
}
}
/// State corresponding to a single doctest code example found in a docstring.
#[derive(Debug)]
struct CodeExampleDoctest<'src> {
/// The lines that have been seen so far that make up the doctest.
lines: Vec<CodeExampleLine<'src>>,
/// The indent observed in the first doctest line.
///
/// More precisely, this corresponds to the whitespace observed before
/// the starting `>>> ` (the "PS1 prompt"). It is borrowed from that first
/// line.
ps1_indent: &'src str,
}
impl<'src> CodeExampleDoctest<'src> {
    /// Tries to start a new doctest from the given line.
    ///
    /// The line must consist of optional leading whitespace followed by a
    /// PS1 prompt. When it does, a fresh doctest is returned whose first
    /// code line is the text after the prompt; otherwise `None` is returned.
    fn new(original: InputDocstringLine<'src>) -> Option<CodeExampleDoctest<'src>> {
        let line = original.line;
        let unindented = line.trim_start();
        // A prompt only counts when an ASCII space follows it[1].
        //
        // [1]: https://github.com/python/cpython/blob/0ff6368519ed7542ad8b443de01108690102420a/Lib/doctest.py#L809-L812
        let code = unindented.strip_prefix(">>> ")?;
        // Whatever `trim_start` removed is exactly the PS1 indentation.
        let indent_len = line
            .len()
            .checked_sub(unindented.len())
            .expect("suffix is <= original");
        let ps1_indent = &line[..indent_len];
        Some(CodeExampleDoctest {
            lines: vec![CodeExampleLine { original, code }],
            ps1_indent,
        })
    }

    /// Tries to extend this doctest with a PS2 (`...`) continuation line.
    ///
    /// On success the line is appended, a `Kept` action is queued and the
    /// doctest is handed back so the caller can keep feeding it PS2 lines.
    ///
    /// On failure the line does not belong to this example: the doctest is
    /// converted into a `Format` action on the queue and `None` is returned,
    /// so ownership does not go back to the caller.
    ///
    /// Either way, exactly one action is pushed onto `queue`.
    fn add_code_line(
        mut self,
        original: InputDocstringLine<'src>,
        queue: &mut VecDeque<CodeExampleAddAction<'src>>,
    ) -> Option<CodeExampleDoctest<'src>> {
        let continuation = original
            .line
            .split_once("...")
            // A PS2 prompt has to be indented exactly like its PS1
            // prompt.[1] The 'doctest' Python module raises an error when it
            // isn't; here the line is merely treated as not being part of
            // the doctest.
            //
            // [1]: https://github.com/python/cpython/blob/0ff6368519ed7542ad8b443de01108690102420a/Lib/doctest.py#L733
            .filter(|(ps2_indent, _)| *ps2_indent == self.ps1_indent)
            // A PS2 prompt has to be followed by an ASCII space, except when
            // nothing at all comes after it[2].
            //
            // [2]: https://github.com/python/cpython/blob/0ff6368519ed7542ad8b443de01108690102420a/Lib/doctest.py#L809-L812
            .and_then(|(_, ps2_after)| {
                if ps2_after.is_empty() {
                    Some("")
                } else {
                    ps2_after.strip_prefix(' ')
                }
            });
        match continuation {
            Some(code) => {
                self.lines.push(CodeExampleLine { original, code });
                queue.push_back(CodeExampleAddAction::Kept);
                Some(self)
            }
            None => {
                queue.push_back(self.into_format_action());
                None
            }
        }
    }

    /// Wraps this doctest in a `Format` action, consuming it.
    fn into_format_action(self) -> CodeExampleAddAction<'src> {
        let kind = CodeExampleKind::Doctest(self);
        CodeExampleAddAction::Format { kind }
    }
}
/// State corresponding to a single reStructuredText literal block or
/// code-block directive.
///
/// While a literal block and code-block directive are technically two
/// different reStructuredText constructs, we use one type to represent
/// both because they are exceptionally similar. Basically, they are
/// the same with two main differences:
///
/// 1. Literal blocks are begun with a line that ends with `::`. Code block
///    directives are begun with a line like `.. code-block:: python`.
/// 2. Code block directives permit a list of options as a "field list"
///    immediately after the opening line. Literal blocks have no options.
///
/// Otherwise, everything else, including the indentation structure, is the
/// same.
#[derive(Debug)]
struct CodeExampleRst<'src> {
/// The lines that have been seen so far that make up the block.
lines: Vec<CodeExampleLine<'src>>,
/// The indent of the line "opening" this block measured via
/// `indentation_length`.
///
/// It can either be the indent of a line ending with `::` (for a literal
/// block) or the indent of a line starting with `.. ` (a directive).
///
/// The content body of a block needs to be indented more than the line
/// opening the block, so we use this indentation to look for indentation
/// that is "more than" it.
opening_indent: TextSize,
/// The minimum indent of the block measured via `indentation_length`.
///
/// This is `None` until the first such line is seen. If no such line is
/// found, then we consider it an invalid block and bail out of trying to
/// find a code snippet. Otherwise, we update this indentation as we see
/// lines in the block with less indentation. (Usually, the minimum is the
/// indentation of the first line in the block, but this is not required.)
///
/// By construction, all lines part of the block must have at least this
/// indentation. Additionally, it is guaranteed that the indentation length
/// of the opening indent is strictly less than the indentation of the
/// minimum indent. Namely, the block ends once we find a line that has
/// been unindented to at most the indent of the opening line.
///
/// When the code snippet has been extracted, it is re-built before being
/// reformatted. The minimum indent is stripped from each line when it is
/// re-built.
min_indent: Option<TextSize>,
/// Whether this is a directive block or not. When not a directive, this is
/// a literal block. The main difference between them is that they start
/// differently. A literal block is started merely by trailing a line with
/// `::`. A directive block is started with `.. code-block:: python`.
///
/// The other difference is that directive blocks can have options
/// (represented as a reStructuredText "field list") after the beginning of
/// the directive and before the body content of the directive.
is_directive: bool,
}
impl<'src> CodeExampleRst<'src> {
/// Checks whether the given line opens a reStructuredText [literal block]
/// or [code block directive].
///
/// When it does, a freshly initialized block is returned. The opening line
/// itself is not stored in the block, so callers remain responsible for
/// printing it as given.
///
/// [literal block]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#literal-blocks
/// [code block directive]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block
fn new(original: InputDocstringLine<'src>) -> Option<CodeExampleRst> {
    let (opening_indent, rest) = indent_with_suffix(original.line);
    if rest.starts_with(".. ") {
        // A line beginning with `.. ` is only ever considered as a
        // directive. In theory it could still look like a literal block,
        // but that seems unlikely to be the intent. For example:
        //
        //     .. code-block::
        //
        //         cool_stuff( 1 )
        //
        // The above is not valid because the `language` argument is missing
        // from the `code-block` directive. Since we never fall through to
        // the literal block check below, the above is not treated as a code
        // snippet.
        return CodeExampleRst::new_code_block(original);
    }
    // Not a directive, so the only remaining possibility is a literal
    // block, whose opening line must end with `::`.
    rest.trim_end().ends_with("::").then(|| CodeExampleRst {
        lines: vec![],
        opening_indent: indentation_length(opening_indent),
        min_indent: None,
        is_directive: false,
    })
}
/// Attempts to create a new reStructuredText code example from a
/// `code-block` or `sourcecode` directive. If one couldn't be found, then
/// `None` is returned.
fn new_code_block(original: InputDocstringLine<'src>) -> Option<CodeExampleRst> {
// This regex attempts to parse the start of a reStructuredText code
// block [directive]. From the reStructuredText spec:
//