Implement --diff for Jupyter Notebooks (#6149)

## Summary Implement `--diff` for Jupyter Notebooks ## Test Plan 1. Use `crates/ruff/resources/test/fixtures/jupyter/isort.ipynb` as a test case and add a markdown cell in between the code cells to check that the diff outputs the correct cell index. 2. Run the command: `cargo run --bin ruff --package ruff_cli -- check --no-cache --isolated --select=ALL crates/ruff/resources/test/fixtures/jupyter/isort.ipynb --fix --diff` <details><summary>Example output:</summary> <p> ```diff --- /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 0 +++ /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 0 @@ -1,3 +0,0 @@ -from pathlib import Path -import random -import math --- /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 4 +++ /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 4 @@ -1,5 +1,3 @@ -from typing import Any -import collections # Newline should be added here def foo(): pass --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 8 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 8 @@ -1,8 +1,7 @@ import pprint import tempfile -from IPython import display import matplotlib.pyplot as plt - import tensorflow as tf -import tensorflow_datasets as tfds +import tensorflow_datasets as tfds +from IPython import display --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 10 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 10 @@ -1,5 +1,4 @@ import tensorflow_models as tfm # These are not in the tfm public API for v2.9. They will be available in v2.10 -from official.vision.serving import export_saved_model_lib -import official.core.train_lib +from official.vision.serving import export_saved_model_lib --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 13 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 13 @@ -1,5 +1,5 @@ -exp_config = tfm.core.exp_factory.get_exp_config('resnet_imagenet') -tfds_name = 'cifar10' +exp_config = tfm.core.exp_factory.get_exp_config("resnet_imagenet") +tfds_name = "cifar10" ds,ds_info = tfds.load( tfds_name, with_info=True) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 15 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 15 @@ -6,12 +6,12 @@ # Configure training and testing data batch_size = 128 -exp_config.task.train_data.input_path = '' +exp_config.task.train_data.input_path = "" exp_config.task.train_data.tfds_name = tfds_name -exp_config.task.train_data.tfds_split = 'train' +exp_config.task.train_data.tfds_split = "train" exp_config.task.train_data.global_batch_size = batch_size -exp_config.task.validation_data.input_path = '' +exp_config.task.validation_data.input_path = "" exp_config.task.validation_data.tfds_name = tfds_name -exp_config.task.validation_data.tfds_split = 'test' +exp_config.task.validation_data.tfds_split = "test" exp_config.task.validation_data.global_batch_size = batch_size --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 17 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 17 @@ -1,16 +1,16 @@ logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices()] -if 'GPU' in ''.join(logical_device_names): - print('This may be broken in Colab.') - device = 'GPU' -elif 'TPU' in ''.join(logical_device_names): - print('This may be broken in Colab.') - device = 'TPU' +if "GPU" in "".join(logical_device_names): + print("This may be broken in Colab.") + device = "GPU" +elif "TPU" in "".join(logical_device_names): + print("This may be broken in Colab.") + device = "TPU" else: - print('Running on CPU is slow, so only train for a few steps.') - device = 'CPU' + print("Running on CPU is slow, so only train for a few steps.") + device = "CPU" -if device=='CPU': +if device=="CPU": train_steps = 20 exp_config.trainer.steps_per_loop = 5 else: @@ -20,9 +20,9 @@ exp_config.trainer.summary_interval = 100 exp_config.trainer.checkpoint_interval = train_steps exp_config.trainer.validation_interval = 1000 -exp_config.trainer.validation_steps = ds_info.splits['test'].num_examples // batch_size +exp_config.trainer.validation_steps = ds_info.splits["test"].num_examples // batch_size exp_config.trainer.train_steps = train_steps -exp_config.trainer.optimizer_config.learning_rate.type = 'cosine' +exp_config.trainer.optimizer_config.learning_rate.type = "cosine" exp_config.trainer.optimizer_config.learning_rate.cosine.decay_steps = train_steps exp_config.trainer.optimizer_config.learning_rate.cosine.initial_learning_rate = 0.1 exp_config.trainer.optimizer_config.warmup.linear.warmup_steps = 100 --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 21 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 21 @@ -1,14 +1,14 @@ logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices()] if exp_config.runtime.mixed_precision_dtype == tf.float16: - tf.keras.mixed_precision.set_global_policy('mixed_float16') + tf.keras.mixed_precision.set_global_policy("mixed_float16") -if 'GPU' in ''.join(logical_device_names): +if "GPU" in "".join(logical_device_names): distribution_strategy = tf.distribute.MirroredStrategy() -elif 'TPU' in ''.join(logical_device_names): +elif "TPU" in "".join(logical_device_names): tf.tpu.experimental.initialize_tpu_system() - tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='/device:TPU_SYSTEM:0') + tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="/device:TPU_SYSTEM:0") distribution_strategy = tf.distribute.experimental.TPUStrategy(tpu) else: - print('Warning: this will be really slow.') + print("Warning: this will be really slow.") distribution_strategy = tf.distribute.OneDeviceStrategy(logical_device_names[0]) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 23 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 23 @@ -1,5 +1,3 @@ with distribution_strategy.scope(): model_dir = tempfile.mkdtemp() task = tfm.core.task_factory.get_task(exp_config.task, logging_dir=model_dir) - -# tf.keras.utils.plot_model(task.build_model(), show_shapes=True) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 24 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 24 @@ -1,4 +1,4 @@ for images, labels in task.build_inputs(exp_config.task.train_data).take(1): print() - print(f'images.shape: {str(images.shape):16} images.dtype: {images.dtype!r}') - print(f'labels.shape: {str(labels.shape):16} labels.dtype: {labels.dtype!r}') + print(f"images.shape: {images.shape!s:16} images.dtype: {images.dtype!r}") + print(f"labels.shape: {labels.shape!s:16} labels.dtype: {labels.dtype!r}") --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 27 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 27 @@ -1 +1 @@ -plt.hist(images.numpy().flatten()); +plt.hist(images.numpy().flatten()) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 29 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 29 @@ -1,2 +1,2 @@ -label_info = ds_info.features['label'] +label_info = ds_info.features["label"] label_info.int2str(1) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 31 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 31 @@ -10,9 +10,6 @@ if predictions is None: plt.title(label_info.int2str(labels[i])) else: - if labels[i] == predictions[i]: - color = 'g' - else: - color = 'r' + color = "g" if labels[i] == predictions[i] else "r" plt.title(label_info.int2str(predictions[i]), color=color) plt.axis("off") --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 35 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 35 @@ -1,3 +1,3 @@ -plt.figure(figsize=(10, 10)); +plt.figure(figsize=(10, 10)) for images, labels in task.build_inputs(exp_config.task.validation_data).take(1): show_batch(images, labels) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 37 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 37 @@ -1,7 +1,7 @@ model, eval_logs = tfm.core.train_lib.run_experiment( distribution_strategy=distribution_strategy, task=task, - mode='train_and_eval', + mode="train_and_eval", params=exp_config, model_dir=model_dir, run_post_eval=True) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 38 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 38 @@ -1 +0,0 @@ -# tf.keras.utils.plot_model(model, show_shapes=True) --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 40 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 40 @@ -1,4 +1,4 @@ for key, value in eval_logs.items(): if isinstance(value, tf.Tensor): value = value.numpy() - print(f'{key:20}: {value:.3f}') + print(f"{key:20}: {value:.3f}") --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 42 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 42 @@ -4,5 +4,5 @@ show_batch(images, labels, tf.cast(predictions, tf.int32)) -if device=='CPU': - plt.suptitle('The model was only trained for a few steps, it is not expected to do well.') +if device=="CPU": + plt.suptitle("The model was only trained for a few steps, it is not expected to do well.") --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 45 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 45 @@ -1,8 +1,8 @@ # Saving and exporting the trained model export_saved_model_lib.export_inference_graph( - input_type='image_tensor', + input_type="image_tensor", batch_size=1, input_image_size=[32, 32], params=exp_config, checkpoint_path=tf.train.latest_checkpoint(model_dir), - export_dir='./export/') + export_dir="./export/") --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 47 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 47 @@ -1,3 +1,3 @@ # Importing SavedModel -imported = tf.saved_model.load('./export/') -model_fn = imported.signatures['serving_default'] +imported = tf.saved_model.load("./export/") +model_fn = imported.signatures["serving_default"] --- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 49 +++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 49 @@ -1,10 +1,10 @@ plt.figure(figsize=(10, 10)) -for data in tfds.load('cifar10', split='test').batch(12).take(1): +for data in tfds.load("cifar10", split="test").batch(12).take(1): predictions = [] - for image in data['image']: - index = tf.argmax(model_fn(image[tf.newaxis, ...])['logits'], axis=1)[0] + for image in data["image"]: + index = tf.argmax(model_fn(image[tf.newaxis, ...])["logits"], axis=1)[0] predictions.append(index) - show_batch(data['image'], data['label'], predictions) + show_batch(data["image"], data["label"], predictions) - if device=='CPU': - plt.suptitle('The model was only trained for a few steps, it is not expected to do better than random.') + if device=="CPU": + plt.suptitle("The model was only trained for a few steps, it is not expected to do better than random.") Would fix 61 errors. ``` </p> </details> resolves: #4727
astral-sh · Jul 29, 2023 · 3c99fbf · 3c99fbf
1 parent 4802c7c
commit 3c99fbf
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 8 deletions.
diff --git a/crates/ruff/src/jupyter/notebook.rs b/crates/ruff/src/jupyter/notebook.rs
@@ -1,4 +1,5 @@
 use std::cmp::Ordering;
+use std::fmt::Display;
 use std::fs::File;
 use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
 use std::iter;
@@ -41,6 +42,20 @@ pub fn round_trip(path: &Path) -> anyhow::Result<String> {
     Ok(String::from_utf8(writer)?)
 }
 
+impl Display for SourceValue {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            SourceValue::String(string) => f.write_str(string),
+            SourceValue::StringArray(string_array) => {
+                for string in string_array {
+                    f.write_str(string)?;
+                }
+                Ok(())
+            }
+        }
+    }
+}
+
 impl Cell {
     /// Return the [`SourceValue`] of the cell.
     fn source(&self) -> &SourceValue {
@@ -407,6 +422,11 @@ impl Notebook {
         self.content = transformed.to_string();
     }
 
+    /// Return a slice of [`Cell`] in the Jupyter notebook.
+    pub fn cells(&self) -> &[Cell] {
+        &self.raw.cells
+    }
+
     /// Return `true` if the notebook is a Python notebook, `false` otherwise.
     pub fn is_python_notebook(&self) -> bool {
         self.raw

diff --git a/crates/ruff_cli/src/diagnostics.rs b/crates/ruff_cli/src/diagnostics.rs
@@ -17,7 +17,7 @@ use similar::TextDiff;
 use std::os::unix::fs::PermissionsExt;
 
 use crate::cache::Cache;
-use ruff::jupyter::Notebook;
+use ruff::jupyter::{Cell, Notebook};
 use ruff::linter::{lint_fix, lint_only, FixTable, FixerResult, LinterResult};
 use ruff::logging::DisplayParseError;
 use ruff::message::Message;
@@ -261,13 +261,64 @@ pub(crate) fn lint_path(
                         }
                     },
                     flags::FixMode::Diff => {
-                        let mut stdout = io::stdout().lock();
-                        TextDiff::from_lines(contents.as_str(), &transformed)
-                            .unified_diff()
-                            .header(&fs::relativize_path(path), &fs::relativize_path(path))
-                            .to_writer(&mut stdout)?;
-                        stdout.write_all(b"\n")?;
-                        stdout.flush()?;
+                        match &source_kind {
+                            SourceKind::Python(_) => {
+                                let mut stdout = io::stdout().lock();
+                                TextDiff::from_lines(contents.as_str(), &transformed)
+                                    .unified_diff()
+                                    .header(&fs::relativize_path(path), &fs::relativize_path(path))
+                                    .to_writer(&mut stdout)?;
+                                stdout.write_all(b"\n")?;
+                                stdout.flush()?;
+                            }
+                            SourceKind::Jupyter(dest_notebook) => {
+                                // We need to load the notebook again, since we might've
+                                // mutated it.
+                                let src_notebook = match load_jupyter_notebook(path) {
+                                    Ok(notebook) => notebook,
+                                    Err(diagnostic) => return Ok(*diagnostic),
+                                };
+                                let mut stdout = io::stdout().lock();
+                                for ((idx, src_cell), dest_cell) in src_notebook
+                                    .cells()
+                                    .iter()
+                                    .enumerate()
+                                    .zip(dest_notebook.cells().iter())
+                                {
+                                    let (Cell::Code(src_code_cell), Cell::Code(dest_code_cell)) = (src_cell, dest_cell) else {
+                                        continue;
+                                    };
+                                    TextDiff::from_lines(
+                                        &src_code_cell.source.to_string(),
+                                        &dest_code_cell.source.to_string(),
+                                    )
+                                    .unified_diff()
+                                    // Jupyter notebook cells don't necessarily have a newline
+                                    // at the end. For example,
+                                    //
+                                    // ```python
+                                    // print("hello")
+                                    // ```
+                                    //
+                                    // For a cell containing the above code, there'll only be one line,
+                                    // and it won't have a newline at the end. If it did, there'd be
+                                    // two lines, and the second line would be empty:
+                                    //
+                                    // ```python
+                                    // print("hello")
+                                    //
+                                    // ```
+                                    .missing_newline_hint(false)
+                                    .header(
+                                        &format!("{}:cell {}", &fs::relativize_path(path), idx),
+                                        &format!("{}:cell {}", &fs::relativize_path(path), idx),
+                                    )
+                                    .to_writer(&mut stdout)?;
+                                }
+                                stdout.write_all(b"\n")?;
+                                stdout.flush()?;
+                            }
+                        }
                     }
                     flags::FixMode::Generate => {}
                 }