Skip to content

Commit

Permalink
Maintain consistency when deserializing to JSON (#5114)
Browse files Browse the repository at this point in the history
## Summary

Maintain consistency while deserializing Jupyter notebook to JSON. The
following changes were made:

1. Use string array to store the source value as that's the default
(https://github.com/jupyter/nbformat/blob/57817204230a8f7173b8bec380e571b5d410de0d/nbformat/v4/nbjson.py#L56-L57)
2. Remove unused structs and enums
3. Reorder the keys in alphabetical order as that's the default.
(https://github.com/jupyter/nbformat/blob/57817204230a8f7173b8bec380e571b5d410de0d/nbformat/v4/nbjson.py#L51)

### Side effect

Removing the `preserve_order` feature means that the order of keys in
JSON output (`--format json`) will be in alphabetical order. This is
because the value is represented using `serde_json::Value` which
internally is a `BTreeMap`, thus sorting it as per the string key. For
posterity if this turns out to be not ideal, then we could define a
struct representing the JSON object and the order of struct fields will
determine the order in the JSON string.

## Test Plan

Add a test case to assert the raw JSON string.
  • Loading branch information
dhruvmanila committed Jun 19, 2023
1 parent 94abf7f commit 48f4f2d
Show file tree
Hide file tree
Showing 15 changed files with 339 additions and 363 deletions.
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ rustpython-literal = { git = "https://github.com/astral-sh/RustPython-Parser.git
rustpython-parser = { git = "https://github.com/astral-sh/RustPython-Parser.git", rev = "0dc8fdf52d146698c5bcf0b842fddc9e398ad8db", default-features = false, features = ["full-lexer", "all-nodes-with-ranges"] }
schemars = { version = "0.8.12" }
serde = { version = "1.0.152", features = ["derive"] }
serde_json = { version = "1.0.93", features = ["preserve_order"] }
serde_json = { version = "1.0.93" }
shellexpand = { version = "3.0.0" }
similar = { version = "2.2.1", features = ["inline"] }
smallvec = { version = "1.10.0" }
Expand Down
37 changes: 37 additions & 0 deletions crates/ruff/resources/test/fixtures/jupyter/after_fix.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"math.pi"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (ruff)",
"language": "python",
"name": "ruff"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
38 changes: 38 additions & 0 deletions crates/ruff/resources/test/fixtures/jupyter/before_fix.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import os\n",
"\n",
"math.pi"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (ruff)",
"language": "python",
"name": "ruff"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["def foo():\n", " pass\n", "\n", "%timeit foo()"]
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"cell_type": "markdown",
"id": "1",
"metadata": {},
"source": ["This is a markdown cell\n", "Some more content"]
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["def foo():\n", " pass"]
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": "%timeit print('hello world')"
}
106 changes: 73 additions & 33 deletions crates/ruff/src/jupyter/notebook.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::cmp::Ordering;
use std::fs::File;
use std::io::{BufReader, BufWriter, Cursor, Write};
use std::io::{BufReader, BufWriter, Write};
use std::iter;
use std::path::Path;

Expand All @@ -10,12 +10,12 @@ use serde::Serialize;
use serde_json::error::Category;

use ruff_diagnostics::Diagnostic;
use ruff_python_whitespace::NewlineWithTrailingNewline;
use ruff_python_whitespace::{NewlineWithTrailingNewline, UniversalNewlineIterator};
use ruff_text_size::{TextRange, TextSize};

use crate::autofix::source_map::{SourceMap, SourceMarker};
use crate::jupyter::index::JupyterIndex;
use crate::jupyter::{Cell, CellType, RawNotebook, SourceValue};
use crate::jupyter::schema::{Cell, RawNotebook, SortAlphabetically, SourceValue};
use crate::rules::pycodestyle::rules::SyntaxError;
use crate::IOError;

Expand All @@ -34,9 +34,9 @@ pub fn round_trip(path: &Path) -> anyhow::Result<String> {
})?;
let code = notebook.content().to_string();
notebook.update_cell_content(&code);
let mut buffer = Cursor::new(Vec::new());
let mut buffer = BufWriter::new(Vec::new());
notebook.write_inner(&mut buffer)?;
Ok(String::from_utf8(buffer.into_inner())?)
Ok(String::from_utf8(buffer.into_inner()?)?)
}

/// Return `true` if the [`Path`] appears to be that of a jupyter notebook file (`.ipynb`).
Expand All @@ -49,18 +49,37 @@ pub fn is_jupyter_notebook(path: &Path) -> bool {
}

impl Cell {
/// Return the [`SourceValue`] of the cell.
fn source(&self) -> &SourceValue {
match self {
Cell::Code(cell) => &cell.source,
Cell::Markdown(cell) => &cell.source,
Cell::Raw(cell) => &cell.source,
}
}

/// Update the [`SourceValue`] of the cell.
fn set_source(&mut self, source: SourceValue) {
match self {
Cell::Code(cell) => cell.source = source,
Cell::Markdown(cell) => cell.source = source,
Cell::Raw(cell) => cell.source = source,
}
}

/// Return `true` if it's a valid code cell.
///
/// A valid code cell is a cell where the type is [`CellType::Code`] and the
/// A valid code cell is a cell where the cell type is [`Cell::Code`] and the
/// source doesn't contain a magic, shell or help command.
fn is_valid_code_cell(&self) -> bool {
if self.cell_type != CellType::Code {
return false;
}
let source = match self {
Cell::Code(cell) => &cell.source,
_ => return false,
};
// Ignore a cell if it contains a magic command. There could be valid
// Python code as well, but we'll ignore that for now.
// TODO(dhruvmanila): https://github.com/psf/black/blob/main/src/black/handle_ipynb_magics.py
!match &self.source {
!match source {
SourceValue::String(string) => string.lines().any(|line| {
MAGIC_PREFIX
.iter()
Expand Down Expand Up @@ -92,7 +111,7 @@ pub struct Notebook {
/// The offsets of each cell in the concatenated source code. This includes
/// the first and last character offsets as well.
cell_offsets: Vec<TextSize>,
/// The cell numbers of all valid code cells in the notebook.
/// The cell index of all valid code cells in the notebook.
valid_code_cells: Vec<u32>,
}

Expand All @@ -108,7 +127,7 @@ impl Notebook {
TextRange::default(),
)
})?);
let notebook: RawNotebook = match serde_json::from_reader(reader) {
let raw_notebook: RawNotebook = match serde_json::from_reader(reader) {
Ok(notebook) => notebook,
Err(err) => {
// Translate the error into a diagnostic
Expand Down Expand Up @@ -176,34 +195,34 @@ impl Notebook {
};

// v4 is what everybody uses
if notebook.nbformat != 4 {
if raw_notebook.nbformat != 4 {
// bail because we should have already failed at the json schema stage
return Err(Box::new(Diagnostic::new(
SyntaxError {
message: format!(
"Expected Jupyter Notebook format 4, found {}",
notebook.nbformat
raw_notebook.nbformat
),
},
TextRange::default(),
)));
}

let valid_code_cells = notebook
let valid_code_cells = raw_notebook
.cells
.iter()
.enumerate()
.filter(|(_, cell)| cell.is_valid_code_cell())
.map(|(pos, _)| u32::try_from(pos).unwrap())
.map(|(idx, _)| u32::try_from(idx).unwrap())
.collect::<Vec<_>>();

let mut contents = Vec::with_capacity(valid_code_cells.len());
let mut current_offset = TextSize::from(0);
let mut cell_offsets = Vec::with_capacity(notebook.cells.len());
let mut cell_offsets = Vec::with_capacity(valid_code_cells.len());
cell_offsets.push(TextSize::from(0));

for &pos in &valid_code_cells {
let cell_contents = match &notebook.cells[pos as usize].source {
for &idx in &valid_code_cells {
let cell_contents = match &raw_notebook.cells[idx as usize].source() {
SourceValue::String(string) => string.clone(),
SourceValue::StringArray(string_array) => string_array.join(""),
};
Expand All @@ -213,7 +232,7 @@ impl Notebook {
}

Ok(Self {
raw: notebook,
raw: raw_notebook,
index: OnceCell::new(),
// The additional newline at the end is to maintain consistency for
// all cells. These newlines will be removed before updating the
Expand Down Expand Up @@ -267,30 +286,33 @@ impl Notebook {
/// can happen only if the cell offsets were not updated before calling
/// this method or the offsets were updated incorrectly.
fn update_cell_content(&mut self, transformed: &str) {
for (&pos, (start, end)) in self
for (&idx, (start, end)) in self
.valid_code_cells
.iter()
.zip(self.cell_offsets.iter().tuple_windows::<(_, _)>())
{
let cell_content = transformed
.get(start.to_usize()..end.to_usize())
.unwrap_or_else(|| {
panic!("Transformed content out of bounds ({start:?}..{end:?}) for cell {pos}");
panic!(
"Transformed content out of bounds ({start:?}..{end:?}) for cell at {idx:?}"
);
});
self.raw.cells[pos as usize].source = SourceValue::String(
cell_content
self.raw.cells[idx as usize].set_source(SourceValue::StringArray(
UniversalNewlineIterator::from(
// We only need to strip the trailing newline which we added
// while concatenating the cell contents.
.strip_suffix('\n')
.unwrap_or(cell_content)
.to_string(),
);
cell_content.strip_suffix('\n').unwrap_or(cell_content),
)
.map(|line| line.as_full_str().to_string())
.collect::<Vec<_>>(),
));
}
}

/// Build and return the [`JupyterIndex`].
///
/// # Notes
/// ## Notes
///
/// Empty cells don't have any newlines, but there's a single visible line
/// in the UI. That single line needs to be accounted for.
Expand All @@ -317,8 +339,8 @@ impl Notebook {
let mut row_to_cell = vec![0];
let mut row_to_row_in_cell = vec![0];

for &pos in &self.valid_code_cells {
let line_count = match &self.raw.cells[pos as usize].source {
for &idx in &self.valid_code_cells {
let line_count = match &self.raw.cells[idx as usize].source() {
SourceValue::String(string) => {
if string.is_empty() {
1
Expand All @@ -336,7 +358,7 @@ impl Notebook {
}
}
};
row_to_cell.extend(iter::repeat(pos + 1).take(line_count as usize));
row_to_cell.extend(iter::repeat(idx + 1).take(line_count as usize));
row_to_row_in_cell.extend(1..=line_count);
}

Expand Down Expand Up @@ -390,7 +412,7 @@ impl Notebook {
// https://github.com/psf/black/blob/69ca0a4c7a365c5f5eea519a90980bab72cab764/src/black/__init__.py#LL1041
let formatter = serde_json::ser::PrettyFormatter::with_indent(b" ");
let mut ser = serde_json::Serializer::with_formatter(writer, formatter);
self.raw.serialize(&mut ser)?;
SortAlphabetically(&self.raw).serialize(&mut ser)?;
Ok(())
}

Expand All @@ -404,6 +426,7 @@ impl Notebook {

#[cfg(test)]
mod test {
use std::io::BufWriter;
use std::path::Path;

use anyhow::Result;
Expand Down Expand Up @@ -536,4 +559,21 @@ print("after empty cells")
assert_messages!(diagnostics, path, source_kind);
Ok(())
}

#[test]
fn test_json_consistency() -> Result<()> {
let path = "before_fix.ipynb".to_string();
let (_, source_kind) = test_notebook_path(
path,
Path::new("after_fix.ipynb"),
&settings::Settings::for_rule(Rule::UnusedImport),
)?;
let mut writer = BufWriter::new(Vec::new());
source_kind.expect_jupyter().write_inner(&mut writer)?;
let actual = String::from_utf8(writer.into_inner()?)?;
let expected =
std::fs::read_to_string(test_resource_path("fixtures/jupyter/after_fix.ipynb"))?;
assert_eq!(actual, expected);
Ok(())
}
}
Loading

0 comments on commit 48f4f2d

Please sign in to comment.