# How do I avoid a SettingWithCopyWarning in pandas?

In [1]:
import pandas as pd

In [2]:
# Read the movie ratings CSV from the bit.ly short link into a DataFrame
movies = pd.read_csv(filepath_or_buffer='http://bit.ly/imdbratings')

In [3]:
# Preview the first five rows to see which columns are available
movies.head(n=5)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [4]:
# How many entries of the content_rating column are missing?
movies['content_rating'].isna().sum()

3

In [5]:
# Show every row (all columns) whose content_rating is missing
movies.loc[movies['content_rating'].isna(), :]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
187,8.2,Butch Cassidy and the Sundance Kid,,Biography,110,"[u'Paul Newman', u'Robert Redford', u'Katharin..."
649,7.7,Where Eagles Dare,,Action,158,"[u'Richard Burton', u'Clint Eastwood', u'Mary ..."
936,7.4,True Grit,,Adventure,128,"[u'John Wayne', u'Kim Darby', u'Glen Campbell']"


In [6]:
# The frequency table shows a 'NOT RATED' value, which should really be
# represented as a missing value
movies['content_rating'].value_counts()

R            460
PG-13        189
PG           123
NOT RATED     65
APPROVED      47
UNRATED       38
G             32
NC-17          7
PASSED         7
X              4
GP             3
TV-MA          1
Name: content_rating, dtype: int64

In [8]:
# Look at the first few rows whose content_rating is 'NOT RATED'
movies.loc[movies['content_rating'] == 'NOT RATED'].head(n=5)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
41,8.5,Sunset Blvd.,NOT RATED,Drama,110,"[u'William Holden', u'Gloria Swanson', u'Erich..."
63,8.4,M,NOT RATED,Crime,99,"[u'Peter Lorre', u'Ellen Widmann', u'Inge Land..."
66,8.4,Munna Bhai M.B.B.S.,NOT RATED,Comedy,156,"[u'Sunil Dutt', u'Sanjay Dutt', u'Arshad Warsi']"


In [24]:
# We want to access the content_rating Series and convert its NOT RATED values to NaN

# The expression we used to select only the NOT RATED rows is the first part of the code we need.
# Although it looks complex, it simply evaluates to a DataFrame: movies.loc[movies.content_rating=='NOT RATED']
# From that DataFrame we can then access the Series in multiple ways, e.g. bracket notation or dot notation.

# Bracket notation: movies.loc[movies.content_rating=='NOT RATED']['content_rating']
# Dot notation (used here):
movies.loc[movies.content_rating=='NOT RATED'].content_rating.head()

5     NOT RATED
6     NOT RATED
41    NOT RATED
63    NOT RATED
66    NOT RATED
Name: content_rating, dtype: object

In [21]:
# Now that we have the selection we want, we import numpy so we can use np.nan as the missing value
import numpy as np

In [22]:
# BUT when the assignment is written like this, pandas shows a SettingWithCopyWarning:
# movies.loc[...] is evaluated first, and the .content_rating assignment is then applied to
# that intermediate result, which pandas reports as a copy of a slice of the DataFrame.
# (Kept on purpose to demonstrate the warning; this is NOT the recommended way.)
movies.loc[movies.content_rating=='NOT RATED',].content_rating = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [27]:
# The number of missing values is unchanged, so the assignment did not take effect
movies['content_rating'].isna().sum()

3

In [28]:
# The proper way is a single .loc[] call that names both the rows and the column,
# exactly as the warning suggests: .loc[row_indexer,col_indexer] = value
movies.loc[movies['content_rating'] == 'NOT RATED', 'content_rating'] = np.nan

In [29]:
# This time the number of missing values has gone up, so the assignment worked
movies['content_rating'].isna().sum()

68

In [31]:
# Creating a new DataFrame for the next exercise: the movies with a star_rating of 9 or higher.
# Note that top_movies is derived from movies by row selection, without an explicit .copy().
top_movies = movies.loc[movies.star_rating>=9]
top_movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [33]:
# Imagine we wanted to change the duration of the 1st movie from 142 to 150.
# Even if we do it the way pandas told us before (.loc[row_indexer, col_indexer] = value),
# we still get a warning, because top_movies was itself selected from movies.
top_movies.loc[0,'duration'] = 150

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [36]:
# But if we check, we can see that the assignment worked as expected despite the warning:
# the duration in row 0 of top_movies has been updated
top_movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,150,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [37]:
# The warning is telling us that pandas isn't sure whether we're trying to modify the copy (top_movies)
# or the original (movies):
# SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.

# We can fix it with a small change to the code that creates top_movies, so pandas knows it is a copy
top_movies = movies.loc[movies.star_rating>=9].copy() # .copy() is the only change from the earlier cell

In [38]:
# With top_movies created via .copy(), the same kind of .loc assignment no longer raises the warning.
top_movies.loc[0,'duration'] = 160

In [39]:
# And we can see it worked: the duration in row 0 of top_movies has been updated
top_movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,160,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [40]:
# The original DataFrame is untouched: the first movie keeps its original duration
movies.head(n=5)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
