From 3c99fbf808d74e07f0eba0525ae38f21c90dd752 Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala
Date: Sat, 29 Jul 2023 09:52:56 +0530
Subject: [PATCH] Implement `--diff` for Jupyter Notebooks (#6149)

## Summary

Implement `--diff` for Jupyter Notebooks.

## Test Plan

1. Use `crates/ruff/resources/test/fixtures/jupyter/isort.ipynb` as a test case and add a markdown cell in between the code cells to check that the diff outputs the correct cell index.
2. Run the command: `cargo run --bin ruff --package ruff_cli -- check --no-cache --isolated --select=ALL crates/ruff/resources/test/fixtures/jupyter/isort.ipynb --fix --diff`
Example output:

```diff
--- /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 0
+++ /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 0
@@ -1,3 +0,0 @@
-from pathlib import Path
-import random
-import math
--- /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 4
+++ /Users/dhruv/playground/ruff/notebooks/test.ipynb:cell 4
@@ -1,5 +1,3 @@
-from typing import Any
-import collections
 # Newline should be added here
 def foo():
     pass
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 8
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 8
@@ -1,8 +1,7 @@
 import pprint
 import tempfile
 
-from IPython import display
 import matplotlib.pyplot as plt
-
 import tensorflow as tf
-import tensorflow_datasets as tfds
+import tensorflow_datasets as tfds
+from IPython import display
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 10
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 10
@@ -1,5 +1,4 @@
 import tensorflow_models as tfm
 
 # These are not in the tfm public API for v2.9. They will be available in v2.10
-from official.vision.serving import export_saved_model_lib
-import official.core.train_lib
+from official.vision.serving import export_saved_model_lib
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 13
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 13
@@ -1,5 +1,5 @@
-exp_config = tfm.core.exp_factory.get_exp_config('resnet_imagenet')
-tfds_name = 'cifar10'
+exp_config = tfm.core.exp_factory.get_exp_config("resnet_imagenet")
+tfds_name = "cifar10"
 ds,ds_info = tfds.load(
 tfds_name,
 with_info=True)
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 15
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 15
@@ -6,12 +6,12 @@
 # Configure training and testing data
 batch_size = 128
 
-exp_config.task.train_data.input_path = ''
+exp_config.task.train_data.input_path = ""
 exp_config.task.train_data.tfds_name = tfds_name
-exp_config.task.train_data.tfds_split = 'train'
+exp_config.task.train_data.tfds_split = "train"
 exp_config.task.train_data.global_batch_size = batch_size
 
-exp_config.task.validation_data.input_path = ''
+exp_config.task.validation_data.input_path = ""
 exp_config.task.validation_data.tfds_name = tfds_name
-exp_config.task.validation_data.tfds_split = 'test'
+exp_config.task.validation_data.tfds_split = "test"
 exp_config.task.validation_data.global_batch_size = batch_size
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 17
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 17
@@ -1,16 +1,16 @@
 logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices()]
 
-if 'GPU' in ''.join(logical_device_names):
-  print('This may be broken in Colab.')
-  device = 'GPU'
-elif 'TPU' in ''.join(logical_device_names):
-  print('This may be broken in Colab.')
-  device = 'TPU'
+if "GPU" in "".join(logical_device_names):
+  print("This may be broken in Colab.")
+  device = "GPU"
+elif "TPU" in "".join(logical_device_names):
+  print("This may be broken in Colab.")
+  device = "TPU"
 else:
-  print('Running on CPU is slow, so only train for a few steps.')
-  device = 'CPU'
+  print("Running on CPU is slow, so only train for a few steps.")
+  device = "CPU"
 
-if device=='CPU':
+if device=="CPU":
   train_steps = 20
   exp_config.trainer.steps_per_loop = 5
 else:
@@ -20,9 +20,9 @@
 exp_config.trainer.summary_interval = 100
 exp_config.trainer.checkpoint_interval = train_steps
 exp_config.trainer.validation_interval = 1000
-exp_config.trainer.validation_steps = ds_info.splits['test'].num_examples // batch_size
+exp_config.trainer.validation_steps = ds_info.splits["test"].num_examples // batch_size
 exp_config.trainer.train_steps = train_steps
-exp_config.trainer.optimizer_config.learning_rate.type = 'cosine'
+exp_config.trainer.optimizer_config.learning_rate.type = "cosine"
 exp_config.trainer.optimizer_config.learning_rate.cosine.decay_steps = train_steps
 exp_config.trainer.optimizer_config.learning_rate.cosine.initial_learning_rate = 0.1
 exp_config.trainer.optimizer_config.warmup.linear.warmup_steps = 100
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 21
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 21
@@ -1,14 +1,14 @@
 logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices()]
 
 if exp_config.runtime.mixed_precision_dtype == tf.float16:
-  tf.keras.mixed_precision.set_global_policy('mixed_float16')
+  tf.keras.mixed_precision.set_global_policy("mixed_float16")
 
-if 'GPU' in ''.join(logical_device_names):
+if "GPU" in "".join(logical_device_names):
   distribution_strategy = tf.distribute.MirroredStrategy()
-elif 'TPU' in ''.join(logical_device_names):
+elif "TPU" in "".join(logical_device_names):
   tf.tpu.experimental.initialize_tpu_system()
-  tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='/device:TPU_SYSTEM:0')
+  tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="/device:TPU_SYSTEM:0")
   distribution_strategy = tf.distribute.experimental.TPUStrategy(tpu)
 else:
-  print('Warning: this will be really slow.')
+  print("Warning: this will be really slow.")
   distribution_strategy = tf.distribute.OneDeviceStrategy(logical_device_names[0])
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 23
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 23
@@ -1,5 +1,3 @@
 with distribution_strategy.scope():
   model_dir = tempfile.mkdtemp()
   task = tfm.core.task_factory.get_task(exp_config.task, logging_dir=model_dir)
-
-# tf.keras.utils.plot_model(task.build_model(), show_shapes=True)
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 24
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 24
@@ -1,4 +1,4 @@
 for images, labels in task.build_inputs(exp_config.task.train_data).take(1):
   print()
-  print(f'images.shape: {str(images.shape):16} images.dtype: {images.dtype!r}')
-  print(f'labels.shape: {str(labels.shape):16} labels.dtype: {labels.dtype!r}')
+  print(f"images.shape: {images.shape!s:16} images.dtype: {images.dtype!r}")
+  print(f"labels.shape: {labels.shape!s:16} labels.dtype: {labels.dtype!r}")
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 27
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 27
@@ -1 +1 @@
-plt.hist(images.numpy().flatten());
+plt.hist(images.numpy().flatten())
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 29
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 29
@@ -1,2 +1,2 @@
-label_info = ds_info.features['label']
+label_info = ds_info.features["label"]
 label_info.int2str(1)
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 31
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 31
@@ -10,9 +10,6 @@
     if predictions is None:
      plt.title(label_info.int2str(labels[i]))
    else:
-      if labels[i] == predictions[i]:
-        color = 'g'
-      else:
-        color = 'r'
+      color = "g" if labels[i] == predictions[i] else "r"
      plt.title(label_info.int2str(predictions[i]), color=color)
    plt.axis("off")
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 35
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 35
@@ -1,3 +1,3 @@
-plt.figure(figsize=(10, 10));
+plt.figure(figsize=(10, 10))
 for images, labels in task.build_inputs(exp_config.task.validation_data).take(1):
   show_batch(images, labels)
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 37
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 37
@@ -1,7 +1,7 @@
 model, eval_logs = tfm.core.train_lib.run_experiment(
     distribution_strategy=distribution_strategy,
     task=task,
-    mode='train_and_eval',
+    mode="train_and_eval",
     params=exp_config,
     model_dir=model_dir,
     run_post_eval=True)
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 38
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 38
@@ -1 +0,0 @@
-# tf.keras.utils.plot_model(model, show_shapes=True)
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 40
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 40
@@ -1,4 +1,4 @@
 for key, value in eval_logs.items():
     if isinstance(value, tf.Tensor):
       value = value.numpy()
-    print(f'{key:20}: {value:.3f}')
+    print(f"{key:20}: {value:.3f}")
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 42
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 42
@@ -4,5 +4,5 @@
 
 show_batch(images, labels, tf.cast(predictions, tf.int32))
 
-if device=='CPU':
-  plt.suptitle('The model was only trained for a few steps, it is not expected to do well.')
+if device=="CPU":
+  plt.suptitle("The model was only trained for a few steps, it is not expected to do well.")
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 45
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 45
@@ -1,8 +1,8 @@
 # Saving and exporting the trained model
 export_saved_model_lib.export_inference_graph(
-    input_type='image_tensor',
+    input_type="image_tensor",
     batch_size=1,
     input_image_size=[32, 32],
     params=exp_config,
     checkpoint_path=tf.train.latest_checkpoint(model_dir),
-    export_dir='./export/')
+    export_dir="./export/")
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 47
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 47
@@ -1,3 +1,3 @@
 # Importing SavedModel
-imported = tf.saved_model.load('./export/')
-model_fn = imported.signatures['serving_default']
+imported = tf.saved_model.load("./export/")
+model_fn = imported.signatures["serving_default"]
--- /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 49
+++ /Users/dhruv/playground/ruff/notebooks/image_classification.ipynb:cell 49
@@ -1,10 +1,10 @@
 plt.figure(figsize=(10, 10))
-for data in tfds.load('cifar10', split='test').batch(12).take(1):
+for data in tfds.load("cifar10", split="test").batch(12).take(1):
   predictions = []
-  for image in data['image']:
-    index = tf.argmax(model_fn(image[tf.newaxis, ...])['logits'], axis=1)[0]
+  for image in data["image"]:
+    index = tf.argmax(model_fn(image[tf.newaxis, ...])["logits"], axis=1)[0]
     predictions.append(index)
-  show_batch(data['image'], data['label'], predictions)
+  show_batch(data["image"], data["label"], predictions)
 
-  if device=='CPU':
-    plt.suptitle('The model was only trained for a few steps, it is not expected to do better than random.')
+  if device=="CPU":
+    plt.suptitle("The model was only trained for a few steps, it is not expected to do better than random.")

Would fix 61 errors.
```
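
The approach, in short: rather than emitting a single diff over the notebook's concatenated source, the CLI emits one unified diff per code cell and labels both header paths with the cell index, which is what produces the `path:cell N` headers above. Below is a minimal, self-contained sketch of that idea using the `similar` crate; `print_notebook_diff`, its `path` parameter, and the `cell_sources` pairs are illustrative placeholders rather than the actual Ruff types (the real change walks `Notebook::cells()` and renders each cell's `SourceValue` through its `Display` impl):

```rust
use std::io::{self, Write};

use similar::TextDiff;

/// Print one unified diff per (original, fixed) cell source pair, labelling
/// each header with the notebook path and the cell index.
fn print_notebook_diff(path: &str, cell_sources: &[(String, String)]) -> io::Result<()> {
    let mut stdout = io::stdout().lock();
    for (idx, (original, fixed)) in cell_sources.iter().enumerate() {
        // Unchanged cells produce no hunks; skip them to keep the output short.
        if original == fixed {
            continue;
        }
        TextDiff::from_lines(original.as_str(), fixed.as_str())
            .unified_diff()
            // Notebook cells frequently end without a trailing newline, so
            // suppress the "no newline at end of file" hint for every cell.
            .missing_newline_hint(false)
            .header(
                &format!("{}:cell {}", path, idx),
                &format!("{}:cell {}", path, idx),
            )
            .to_writer(&mut stdout)?;
    }
    stdout.write_all(b"\n")?;
    stdout.flush()
}
```

In the actual change, markdown and raw cells are skipped (only code cells are diffed) and cells without fixes simply produce no hunks, so the cell indices in the output above are not contiguous.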

resolves: #4727
---
 crates/ruff/src/jupyter/notebook.rs | 20 +++++++++
 crates/ruff_cli/src/diagnostics.rs  | 67 +++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/crates/ruff/src/jupyter/notebook.rs b/crates/ruff/src/jupyter/notebook.rs
index d6e5885e56776..37d372a65d5f1 100644
--- a/crates/ruff/src/jupyter/notebook.rs
+++ b/crates/ruff/src/jupyter/notebook.rs
@@ -1,4 +1,5 @@
 use std::cmp::Ordering;
+use std::fmt::Display;
 use std::fs::File;
 use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
 use std::iter;
@@ -41,6 +42,20 @@ pub fn round_trip(path: &Path) -> anyhow::Result<String> {
     Ok(String::from_utf8(writer)?)
 }
 
+impl Display for SourceValue {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            SourceValue::String(string) => f.write_str(string),
+            SourceValue::StringArray(string_array) => {
+                for string in string_array {
+                    f.write_str(string)?;
+                }
+                Ok(())
+            }
+        }
+    }
+}
+
 impl Cell {
     /// Return the [`SourceValue`] of the cell.
     fn source(&self) -> &SourceValue {
@@ -407,6 +422,11 @@ impl Notebook {
         self.content = transformed.to_string();
     }
 
+    /// Return a slice of [`Cell`] in the Jupyter notebook.
+    pub fn cells(&self) -> &[Cell] {
+        &self.raw.cells
+    }
+
     /// Return `true` if the notebook is a Python notebook, `false` otherwise.
     pub fn is_python_notebook(&self) -> bool {
         self.raw
diff --git a/crates/ruff_cli/src/diagnostics.rs b/crates/ruff_cli/src/diagnostics.rs
index 5e494a0482713..47217002981a2 100644
--- a/crates/ruff_cli/src/diagnostics.rs
+++ b/crates/ruff_cli/src/diagnostics.rs
@@ -17,7 +17,7 @@ use similar::TextDiff;
 use std::os::unix::fs::PermissionsExt;
 
 use crate::cache::Cache;
-use ruff::jupyter::Notebook;
+use ruff::jupyter::{Cell, Notebook};
 use ruff::linter::{lint_fix, lint_only, FixTable, FixerResult, LinterResult};
 use ruff::logging::DisplayParseError;
 use ruff::message::Message;
@@ -261,13 +261,64 @@ pub(crate) fn lint_path(
                 }
             },
             flags::FixMode::Diff => {
-                let mut stdout = io::stdout().lock();
-                TextDiff::from_lines(contents.as_str(), &transformed)
-                    .unified_diff()
-                    .header(&fs::relativize_path(path), &fs::relativize_path(path))
-                    .to_writer(&mut stdout)?;
-                stdout.write_all(b"\n")?;
-                stdout.flush()?;
+                match &source_kind {
+                    SourceKind::Python(_) => {
+                        let mut stdout = io::stdout().lock();
+                        TextDiff::from_lines(contents.as_str(), &transformed)
+                            .unified_diff()
+                            .header(&fs::relativize_path(path), &fs::relativize_path(path))
+                            .to_writer(&mut stdout)?;
+                        stdout.write_all(b"\n")?;
+                        stdout.flush()?;
+                    }
+                    SourceKind::Jupyter(dest_notebook) => {
+                        // We need to load the notebook again, since we might've
+                        // mutated it.
+                        let src_notebook = match load_jupyter_notebook(path) {
+                            Ok(notebook) => notebook,
+                            Err(diagnostic) => return Ok(*diagnostic),
+                        };
+                        let mut stdout = io::stdout().lock();
+                        for ((idx, src_cell), dest_cell) in src_notebook
+                            .cells()
+                            .iter()
+                            .enumerate()
+                            .zip(dest_notebook.cells().iter())
+                        {
+                            let (Cell::Code(src_code_cell), Cell::Code(dest_code_cell)) = (src_cell, dest_cell) else {
+                                continue;
+                            };
+                            TextDiff::from_lines(
+                                &src_code_cell.source.to_string(),
+                                &dest_code_cell.source.to_string(),
+                            )
+                            .unified_diff()
+                            // Jupyter notebook cells don't necessarily have a newline
+                            // at the end. For example,
+                            //
+                            // ```python
+                            // print("hello")
+                            // ```
+                            //
+                            // For a cell containing the above code, there'll only be one line,
+                            // and it won't have a newline at the end. If it did, there'd be
+                            // two lines, and the second line would be empty:
+                            //
+                            // ```python
+                            // print("hello")
+                            //
+                            // ```
+                            .missing_newline_hint(false)
+                            .header(
+                                &format!("{}:cell {}", &fs::relativize_path(path), idx),
+                                &format!("{}:cell {}", &fs::relativize_path(path), idx),
+                            )
+                            .to_writer(&mut stdout)?;
+                        }
+                        stdout.write_all(b"\n")?;
+                        stdout.flush()?;
+                    }
+                }
             }
             flags::FixMode::Generate => {}
         }