In [2]:
import numpy as np
import pandas as pd

# Previous behavior
- Some indexing operations return views
- Some indexing operations return copies

In [3]:
# Create a DataFrame
# Copy-on-Write is switched off explicitly so that this "previous behavior"
# walkthrough gives the same results even when these cells are re-run after
# Copy-on-Write has been enabled further down in the notebook.
pd.options.mode.copy_on_write = False

df = pd.DataFrame({
    'foo': [1, 2, 3],
    'bar': [4, 5, 6]
})

df

Unnamed: 0,foo,bar
0,1,4
1,2,5
2,3,6


In [None]:
# Select subset of DataFrame -- the column 'foo'
# The following cells write into this subset and then check df to show
# whether the two objects share their data
subset = df['foo']
subset

0    1
1    2
2    3
Name: foo, dtype: int64

In [5]:
# Select and change first element of subset
# (positional write through .iloc; the next cell checks the effect on df)
subset.iloc[0] = 100
subset

0    100
1      2
2      3
Name: foo, dtype: int64

In [6]:
# The original DataFrame has also been changed!
# Column 'foo' now starts with 100 although the assignment went through subset
df

Unnamed: 0,foo,bar
0,100,4
1,2,5
2,3,6


In [None]:
# Turn Copy-on-Write on (the option stays set for the following cells)
# to avoid this behavior
pd.set_option('mode.copy_on_write', True)

# Fresh DataFrame with the same contents as before
df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [4, 5, 6]})

# Take the column 'foo' as a subset and overwrite its first element
subset = df['foo']
subset.iloc[0] = 100

# With Copy-on-Write the write stays in subset; df is left as created
df

Unnamed: 0,foo,bar
0,1,4
1,2,5
2,3,6


# Migrating to Copy-on-Write
- Chained assignment will never work: it no longer updates the original DataFrame or Series
- Accessing the underlying array of a pandas object will return a read-only view
- Only one pandas object is updated at once
- Constructors now copy NumPy arrays by default

# Description

In [8]:
# Build the example DataFrame: columns 'foo' and 'bar', three rows each
df = pd.DataFrame(dict(foo=[1, 2, 3], bar=[4, 5, 6]))

df

Unnamed: 0,foo,bar
0,1,4
1,2,5
2,3,6


In [9]:
# Set element at first row and first column to 100
# (.iloc is positional: row 0, column 0 is the first value of 'foo')
df.iloc[0, 0] = 100
df

Unnamed: 0,foo,bar
0,100,4
1,2,5
2,3,6


In [10]:
# Start again from an unmodified DataFrame with the original values
df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [4, 5, 6]})

df

Unnamed: 0,foo,bar
0,1,4
1,2,5
2,3,6


In [None]:
# Create lazy copy of DataFrame using reset_index()
# drop=True discards the old index instead of inserting it as a column,
# so df2 shows the same columns and values as df
df2 = df.reset_index(drop=True)
df2

Unnamed: 0,foo,bar
0,1,4
1,2,5
2,3,6


In [12]:
# Set the element at the first row and column of the new DataFrame to 100
# This triggers a copy of the data
# (the next cell displays df to check that it was not affected)
df2.iloc[0, 0] = 100
df2

Unnamed: 0,foo,bar
0,100,4
1,2,5
2,3,6


In [13]:
# The original DataFrame is unchanged
# Only df2 received the value 100; df still holds 1 in its first 'foo' row
df

Unnamed: 0,foo,bar
0,1,4
1,2,5
2,3,6


In [14]:
# Emulate an in-place operation: the result of reset_index() is bound back to
# the name df, replacing the old DataFrame under that name
df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [4, 5, 6]})

# Rebind first, then write into the rebound DataFrame
df = df.reset_index(drop=True)
df.iloc[0, 0] = 100

df

Unnamed: 0,foo,bar
0,100,4
1,2,5
2,3,6


In [15]:
# Legacy behavior, with Copy-on-Write switched off inside this block only:
#   1. build a DataFrame
#   2. take a view of it with a full slice
#   3. write into the DataFrame -- this changes both DataFrame and view!

with pd.option_context('mode.copy_on_write', False):
    df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [4, 5, 6]})
    view = df[:]

    df.iloc[0, 0] = 100

    # Print the DataFrame first, then the view, one below the other
    print(df, view, sep='\n')

   foo  bar
0  100    4
1    2    5
2    3    6
   foo  bar
0  100    4
1    2    5
2    3    6


In [16]:
# Same steps as the legacy example, now with Copy-on-Write enabled:
#   1. build a DataFrame
#   2. take a view of it with a full slice
#   3. write into the DataFrame -- only the original DataFrame changes, not the view!
df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [4, 5, 6]})
view = df[:]

df.iloc[0, 0] = 100

# Print the DataFrame first, then the view, one below the other
print(df, view, sep='\n')

   foo  bar
0  100    4
1    2    5
2    3    6
   foo  bar
0    1    4
1    2    5
2    3    6


# Chained Assignment

In [None]:
# Create DataFrame
# Select elements of column 'foo' where elements of column 'bar' are greater than 5 through chained [] operators
# Modify these elements
# The original DataFrame is changed
# Note the warning this produces!
# Copy-on-Write is disabled only inside the with-block so the legacy behavior can be shown

with pd.option_context('mode.copy_on_write', False):
    df = pd.DataFrame({
        'foo': [1, 2, 3],
        'bar': [4, 5, 6]
    })

    # Chained assignment: df['foo'] is evaluated first, then the masked
    # assignment is applied to that intermediate object
    df['foo'][df['bar'] > 5] = 100
    print(df)

   foo  bar
0    1    4
1    2    5
2  100    6


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['foo'][df['bar'] > 5] = 100


In [19]:
# Single-step alternative to chained assignment: .loc[] receives the row mask
# (elements of 'bar' greater than 5) and the column label 'foo' together,
# so the modification is applied to the original DataFrame
df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [4, 5, 6]})

rows_to_update = df['bar'] > 5
df.loc[rows_to_update, 'foo'] = 100
df

Unnamed: 0,foo,bar
0,1,4
1,2,5
2,100,6


# Read-only `NumPy` arrays

In [None]:
# Columns 'a' (integers) and 'b' (floats) are stored as more than one array,
# so to_numpy() returns a copy of the data.
# Writing into that copy does not change the DataFrame.
df = pd.DataFrame({'a': [1, 2], 'b': [1.5, 2.5]})

arr = df.to_numpy()
arr[0, 0] = 100

# Print the modified array first, then the untouched DataFrame
print(arr, df, sep='\n')

[[100.    1.5]
 [  2.    2.5]]
   a    b
0  1  1.5
1  2  2.5


In [28]:
# Both columns hold integers, so the DataFrame consists of a single array and
# to_numpy() returns a read-only view of it (with Copy-on-Write enabled).
# Writing into the array is not allowed; the error is caught and printed.
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

arr = df.to_numpy()
try:
    arr[0, 0] = 100
except ValueError as err:
    print("ValueError:", err)

ValueError: assignment destination is read-only


# Patterns to avoid
- If data is no longer needed, modifying it in place by reassigning to the same variable is more performant than creating copies

# Copy-on-Write optimizations
- Lazy copy mechanism -- Original object is only copied if and when an object sharing data with the original object is modified
- This allows many methods to return views instead of copies, improving performance

# How to enable CoW

In [29]:
# Enable Copy-on-Write mode
# Option 1: assign to the option through attribute access on pd.options
pd.options.mode.copy_on_write = True

In [30]:
# Enable Copy-on-Write mode
# Option 2: set the same option by its name with pd.set_option()
pd.set_option('mode.copy_on_write', True)