In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

In [15]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## change options for each recipe

In [2]:
pd.set_option('max_columns', 8, 'max_rows', 10)

In [4]:
movie = pd.read_csv('../data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


![dataframe anatomy](./images/ch01_dataframe_anatomy.png)

# Understanding data types

In [13]:
movie.get_dtype_counts()

float64    13
int64       3
object     12
dtype: int64

In [17]:
director = movie['director_name'] # save Series to variable
director.name

'director_name'

In [18]:
director.to_frame().head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


# Calling Series methods

## Getting ready...

In [19]:
set('abcdabcd')

{'a', 'b', 'c', 'd'}

In [8]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

439

In [20]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

445

In [21]:
len(s_attr_methods & df_attr_methods)

376

## How to do it...

In [22]:
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [25]:
with pd.option_context('max_rows', 10):
    display(director.value_counts())

Steven Spielberg       26
Woody Allen            22
Martin Scorsese        20
Clint Eastwood         20
Spike Lee              16
                       ..
Drew Goddard            1
Thomas L. Phillips      1
Georgia Hilton          1
Giuliano Montaldo       1
Perry Andelin Blake     1
Name: director_name, Length: 2397, dtype: int64

In [26]:
pd.set_option('max_rows', 20)
actor_1_fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
13000.0    123
14000.0    120
10000.0    109
18000.0    106
22000.0     80
          ... 
437.0        1
406.0        1
762.0        1
432.0        1
644.0        1
362.0        1
216.0        1
859.0        1
225.0        1
334.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [28]:
director.shape

(4916,)

In [29]:
director.size

4916

In [30]:
len(director)

4916

In [31]:
director.count()

4814

In [32]:
actor_1_fb_likes.count()

4909

In [33]:
actor_1_fb_likes.quantile() # 等分

982.0

In [36]:
actor_1_fb_likes.quantile([.3,.5,.6])

0.3     694.0
0.5     982.0
0.6    1000.0
Name: actor_1_facebook_likes, dtype: float64

In [35]:
actor_1_fb_likes.min()
actor_1_fb_likes.max()
actor_1_fb_likes.mean()
actor_1_fb_likes.median()
actor_1_fb_likes.std()
actor_1_fb_likes.sum()

0.0

640000.0

6494.488490527602

982.0

15106.986883848309

31881444.0

In [35]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

## There's more...

In [44]:
director.value_counts(normalize=True) # 相当于director.value_counts()/director.count()

Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
                      ...   
Benny Boom          0.000208
Alister Grierson    0.000208
Tadeo Garcia        0.000208
Julio DePietro      0.000208
Name: director_name, Length: 2397, dtype: float64

In [46]:
director.hasnans

True

In [47]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

# Working with operators on a Series

In [56]:
pd.options.display.max_rows=6

In [49]:
7 in [1, 2, 6]    # in operator checks for membership of a list

False

In [50]:
set([1,2,3]) & set([2,3,4]) # 交集

{2, 3}

In [58]:
a = set([1,2,3])     
a[0]                 # the indexing operator does not work with sets

TypeError: 'set' object does not support indexing

## Getting ready...

In [57]:
imdb_score = movie['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
       ... 
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [58]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
       ... 
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [59]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
       ... 
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [60]:
imdb_score > 7

0        True
1        True
2       False
        ...  
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [61]:
director = movie['director_name']

In [62]:
director == 'James Cameron'

0        True
1       False
2       False
        ...  
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

## There's more...

In [63]:
imdb_score.add(1)              # imdb_score + 1

0       8.9
1       8.1
2       7.8
       ... 
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [64]:
imdb_score.mul(2.5)            # imdb_score * 2.5

0       19.75
1       17.75
2       17.00
        ...  
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [65]:
imdb_score.floordiv(7)         # imdb_score // 7

0       1.0
1       1.0
2       0.0
       ... 
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [66]:
imdb_score.gt(7)               # imdb_score > 7

0        True
1        True
2       False
        ...  
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [67]:
director.eq('James Cameron')   # director == 'James Cameron'

0        True
1       False
2       False
        ...  
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [68]:
imdb_score.astype(int).mod(5)

0       2
1       2
2       1
       ..
4913    1
4914    1
4915    1
Name: imdb_score, Length: 4916, dtype: int64

In [69]:
a = type(1)

In [70]:
type(1)
type(a)

int

type

In [71]:
a = type(imdb_score)

In [72]:
a
a([1,2,3])

pandas.core.series.Series

0    1
1    2
2    3
dtype: int64

# Chaining Series methods together

In [74]:
director.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Name: director_name, dtype: int64

In [75]:
actor_1_fb_likes.isnull().sum()

7

In [76]:
actor_1_fb_likes.dtype

dtype('float64')

In [77]:
actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

## There's more...

In [80]:
actor_1_fb_likes.isnull().describe()

count      4916
unique        2
top       False
freq       4909
Name: actor_1_facebook_likes, dtype: object

In [81]:
actor_1_fb_likes.isnull().mean()

0.0014239218877135883

In [82]:
(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

# Make a Meaningful Index

In [83]:
movie.shape

(4916, 28)

In [88]:
pd.set_option('max_columns',5)
movie2 = movie.set_index('movie_title')
movie2

Unnamed: 0_level_0,color,director_name,...,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,2.35,0
Spectre,Color,Sam Mendes,...,2.35,85000
...,...,...,...,...,...
A Plague So Pleasant,Color,Benjamin Roberds,...,,16
Shanghai Calling,Color,Daniel Hsia,...,2.35,660
My Date with Drew,Color,Jon Gunn,...,1.85,456


In [89]:
pd.read_csv('../data/movie.csv', index_col='movie_title')

Unnamed: 0_level_0,color,director_name,...,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,2.35,0
Spectre,Color,Sam Mendes,...,2.35,85000
...,...,...,...,...,...
A Plague So Pleasant,Color,Benjamin Roberds,...,,16
Shanghai Calling,Color,Daniel Hsia,...,2.35,660
My Date with Drew,Color,Jon Gunn,...,1.85,456


# There's more...

In [90]:
movie2.reset_index()

Unnamed: 0,movie_title,color,...,aspect_ratio,movie_facebook_likes
0,Avatar,Color,...,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,...,2.35,0
2,Spectre,Color,...,2.35,85000
...,...,...,...,...,...
4913,A Plague So Pleasant,Color,...,,16
4914,Shanghai Calling,Color,...,2.35,660
4915,My Date with Drew,Color,...,1.85,456


# Renaming row and column labels

In [94]:
movie = pd.read_csv('../data/movie.csv', index_col='movie_title')

In [96]:
movie.index

Index(['Avatar', 'Pirates of the Caribbean: At World's End', 'Spectre',
       'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens',
       'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron',
       'Harry Potter and the Half-Blood Prince',
       ...
       'Primer', 'Cavite', 'El Mariachi', 'The Mongol King', 'Newlyweds',
       'Signed Sealed Delivered', 'The Following', 'A Plague So Pleasant',
       'Shanghai Calling', 'My Date with Drew'],
      dtype='object', name='movie_title', length=4916)

In [97]:
indexes_renamed = {'Avatar':'Ratava', 'Spectre': 'Ertceps'} 
columns_renamed = {'director_name':'Director Name', 
                       'num_critic_for_reviews': 'Critical Reviews'} 

In [98]:
movie.rename(index=indexes_renamed, columns=columns_renamed).head()

Unnamed: 0_level_0,color,Director Name,...,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ratava,Color,James Cameron,...,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,...,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,...,,0


# There's more

In [100]:
movie = pd.read_csv('../data/movie.csv', index_col='movie_title')
index = movie.index
column = movie.columns

In [105]:
index_list = index.tolist()
column_list = column.tolist()

In [106]:
index_list[0] = 'Ratava'
column_list[1] = 'Director Name'

In [108]:
index[0]
index_list[0]
print(index_list[:5])

'Avatar'

'Ratava'

['Ratava', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens']


In [109]:
movie_index = index_list
movie_columns = column_list

# Creating and deleting columns

In [110]:
movie['has_seen'] = 0

In [111]:
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes', 'has_seen'],
      dtype='object')

In [112]:
movie['actor_director_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                              movie['actor_2_facebook_likes'] + 
                                              movie['actor_3_facebook_likes'] + 
                                              movie['director_facebook_likes'])

In [113]:
movie['actor_director_facebook_likes'].isnull().sum()

122

In [114]:
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [115]:
movie['is_cast_likes_more'] = (movie['cast_total_facebook_likes'] >= 
                                  movie['actor_director_facebook_likes'])

In [116]:
movie['is_cast_likes_more'].all()

False

In [117]:
movie = movie.drop('actor_director_facebook_likes', axis='columns')

In [118]:
movie['actor_total_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                       movie['actor_2_facebook_likes'] + 
                                       movie['actor_3_facebook_likes'])

movie['actor_total_facebook_likes'] = movie['actor_total_facebook_likes'].fillna(0)

In [119]:
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'] >= \
                                  movie['actor_total_facebook_likes']
    
movie['is_cast_likes_more'].all()

True

In [120]:
movie['pct_actor_cast_like'] = (movie['actor_total_facebook_likes'] / 
                                movie['cast_total_facebook_likes'])

In [121]:
movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max() 

(0.0, 1.0)

## There's more...

In [130]:
movie.columns.get_loc('color')#Get integer location for requested label.

0

In [136]:
profit_index = movie.columns.get_loc('gross') + 1 

In [138]:
movie.insert(8, 'profit', movie.gross - movie.budget) # 插入行

ValueError: cannot insert profit, already exists

In [139]:
pd.set_option('max_columns',16)
movie

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,...,936.0,7.9,1.78,33000,0,True,2791.0,0.577369
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,...,5000.0,7.1,2.35,0,0,True,46000.0,0.951396
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,...,393.0,6.8,2.35,85000,0,True,11554.0,0.987521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,...,0.0,6.3,,16,0,True,0.0,
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,...,719.0,6.3,2.35,660,0,True,2154.0,0.902766
My Date with Drew,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,...,23.0,6.6,1.85,456,0,True,125.0,0.766871
