# Chapter 1: Pandas Foundations

In [1]:
import pandas as pd
import numpy as np

## Introduction

## Dissecting the anatomy of a DataFrame

In [2]:
# Keep rendered frames compact: at most 4 columns and 10 rows are displayed.
pd.set_option("display.max_columns", 4)
pd.set_option("display.max_rows", 10)

In [3]:
movies = pd.read_csv("../data/movie.csv")
movies.head()

Unnamed: 0,color,director_name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


### How it works...

## DataFrame Attributes

### How to do it... {#how-to-do-it-1}

In [4]:
movies = pd.read_csv("../data/movie.csv")
columns = movies.columns  # column labels
index = movies.index  # row labels
# DataFrame.to_numpy() is the accessor the pandas documentation recommends over
# the legacy `.values` attribute; it yields the same 2-D NumPy array of the data.
data = movies.to_numpy()

In [5]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [6]:
index

RangeIndex(start=0, stop=4916, step=1)

In [7]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]],
      shape=(4916, 28), dtype=object)

In [8]:
type(index)

pandas.core.indexes.range.RangeIndex

In [9]:
type(columns)

pandas.core.indexes.base.Index

In [10]:
type(data)

numpy.ndarray

In [11]:
issubclass(pd.RangeIndex, pd.Index)

True

### How it works...

### There's more

In [12]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915], shape=(4916,))

In [13]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

## Understanding Data Types

### How to do it... {#how-to-do-it-2}

In [14]:
movies = pd.read_csv("../data/movie.csv")

In [15]:
movies.dtypes

color                       object
director_name               object
num_critic_for_reviews     float64
duration                   float64
director_facebook_likes    float64
                            ...   
title_year                 float64
actor_2_facebook_likes     float64
imdb_score                 float64
aspect_ratio               float64
movie_facebook_likes         int64
Length: 28, dtype: object

In [16]:
movies.dtypes.value_counts()

float64    13
object     12
int64       3
Name: count, dtype: int64

In [17]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4897 non-null   object 
 1   director_name              4814 non-null   object 
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object 
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object 
 10  actor_1_name               4909 non-null   object 
 11  movie_title                4916 non-null   object 
 12  num_voted_users            4916 non-null   int64  
 13  cast_total_facebook_likes  4916 non-null   int64

### How it works...

In [18]:
pd.Series(["Paul", np.nan, "George"]).dtype

dtype('O')

### There's more...

### See also

## Selecting a Column

### How to do it... {#how-to-do-it-3}

In [19]:
movies = pd.read_csv("../data/movie.csv")
movies["director_name"]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [20]:
movies.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [21]:
movies.loc[:, "director_name"]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [22]:
movies.iloc[:, 1]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [23]:
movies["director_name"].index

RangeIndex(start=0, stop=4916, step=1)

In [24]:
movies["director_name"].dtype

dtype('O')

In [25]:
movies["director_name"].size

4916

In [26]:
movies["director_name"].name

'director_name'

In [27]:
type(movies["director_name"])

pandas.core.series.Series

In [28]:
movies["director_name"].apply(type).unique()

array([<class 'str'>, <class 'float'>], dtype=object)

### How it works...

### There's more

### See also

## Calling Series Methods

In [29]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

419

In [30]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

437

In [31]:
len(s_attr_methods & df_attr_methods)

362

### How to do it... {#how-to-do-it-4}

In [32]:
movies = pd.read_csv("../data/movie.csv")
director = movies["director_name"]
fb_likes = movies["actor_1_facebook_likes"]

In [33]:
director.dtype

dtype('O')

In [34]:
fb_likes.dtype

dtype('float64')

In [35]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [36]:
director.sample(n=5, random_state=42)

2347      Brian Percival
4687         Lucio Fulci
691        Phillip Noyce
3911       Sam Peckinpah
2488    Rowdy Herrington
Name: director_name, dtype: object

In [37]:
fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [38]:
director.value_counts()

director_name
Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: count, Length: 2397, dtype: int64

In [39]:
fb_likes.value_counts()

actor_1_facebook_likes
1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
703.0        1
208.0        1
79.0         1
269.0        1
291.0        1
Name: count, Length: 877, dtype: int64

In [40]:
director.size

4916

In [41]:
director.shape

(4916,)

In [42]:
len(director)

4916

In [43]:
director.unique()

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'],
      shape=(2398,), dtype=object)

In [44]:
director.count()

np.int64(4814)

In [45]:
fb_likes.count()

np.int64(4909)

In [46]:
fb_likes.quantile()

np.float64(982.0)

In [47]:
fb_likes.min()

np.float64(0.0)

In [48]:
fb_likes.max()

np.float64(640000.0)

In [49]:
fb_likes.mean()

np.float64(6494.488490527602)

In [50]:
fb_likes.median()

np.float64(982.0)

In [51]:
fb_likes.std()

np.float64(15106.986883848185)

In [52]:
fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [53]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [54]:
fb_likes.quantile(0.2)

np.float64(510.0)

In [55]:
fb_likes.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [56]:
director.isna()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [57]:
fb_likes_filled = fb_likes.fillna(0)
fb_likes_filled.count()

np.int64(4916)

In [58]:
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

4909

### How it works...

### There's more...

In [59]:
director.value_counts(normalize=True)

director_name
Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
Ridley Scott        0.003324
                      ...   
John Putch          0.000208
Luca Guadagnino     0.000208
Sam Fell            0.000208
Dan Fogelman        0.000208
Daniel Hsia         0.000208
Name: proportion, Length: 2397, dtype: float64

In [60]:
director.hasnans

True

In [61]:
director.notna()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

### See also

## Series Operations

In [62]:
5 + 9  # plus operator example. Adds 5 and 9

14

### How to do it... {#how-to-do-it-5}

In [63]:
movies = pd.read_csv("../data/movie.csv")
imdb_score = movies["imdb_score"]
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [64]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [65]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [66]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [67]:
imdb_score * 7

0       55.3
1       49.7
2       47.6
3       59.5
4       49.7
        ... 
4911    53.9
4912    52.5
4913    44.1
4914    44.1
4915    46.2
Name: imdb_score, Length: 4916, dtype: float64

In [68]:
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [69]:
director = movies["director_name"]
director == "James Cameron"

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

### How it works...

### There's more...

In [70]:
imdb_score.add(1)  # imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [71]:
imdb_score.gt(7)  # imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

### See also

## Chaining Series Methods

### How to do it... {#how-to-do-it-6}

In [72]:
movies = pd.read_csv("../data/movie.csv")
fb_likes = movies["actor_1_facebook_likes"]
director = movies["director_name"]

In [73]:
director.value_counts().head(3)

director_name
Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Name: count, dtype: int64

In [74]:
fb_likes.isna().sum()

np.int64(7)

In [75]:
fb_likes.dtype

dtype('float64')

In [76]:
(fb_likes.fillna(0).astype(int).head())

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

### How it works...

### There's more...

In [77]:
(
    fb_likes.fillna(0)
    # .astype(int)
    # .head()
)

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

In [78]:
(
    fb_likes.fillna(0).astype(int)
    # .head()
)

0        1000
1       40000
2       11000
3       27000
4         131
        ...  
4911      637
4912      841
4913        0
4914      946
4915       86
Name: actor_1_facebook_likes, Length: 4916, dtype: int64

In [79]:
fb_likes.isna().mean()

np.float64(0.0014239218877135883)

In [80]:
fb_likes.fillna(0).astype(int).head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

In [81]:
def debug_df(ser):
    """Echo the object passing through a ``.pipe`` chain and hand it back unchanged.

    Prints ``BEFORE``, then ``ser``, then ``AFTER`` (one ``print`` call each),
    so whatever the following chain steps display appears after the ``AFTER``
    marker.

    Parameters
    ----------
    ser : object
        The value received from ``.pipe`` (a Series in this notebook).

    Returns
    -------
    object
        The very same ``ser`` object, so the chain can continue.
    """
    for item in ("BEFORE", ser, "AFTER"):
        print(item)
    return ser

In [82]:
(fb_likes.fillna(0).pipe(debug_df).astype(int).head())

BEFORE
0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64
AFTER


0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

In [83]:
# Module-level slot that get_intermediate() fills with the value it last saw.
intermediate = None


def get_intermediate(ser):
    """Stash the object passing through a ``.pipe`` chain in a global and return it.

    Parameters
    ----------
    ser : object
        The value received from ``.pipe`` (a Series in this notebook).

    Returns
    -------
    object
        The same object that was passed in; it is also bound to the
        module-level name ``intermediate`` for later inspection.
    """
    global intermediate
    intermediate = ser
    return intermediate

In [84]:
res = fb_likes.fillna(0).pipe(get_intermediate).astype(int).head()

In [85]:
intermediate

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

## Renaming Column Names

### How to do it...

In [86]:
movies = pd.read_csv("../data/movie.csv")

In [87]:
col_map = {
    "director_name": "Director Name",
    "num_critic_for_reviews": "Critical Reviews",
}

In [88]:
movies.rename(columns=col_map).head()

Unnamed: 0,color,Director Name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


### How it works... {#how-it-works-8}

### There's more {#theres-more-7}

In [89]:
idx_map = {
    "Avatar": "Ratava",
    "Spectre": "Ertceps",
    "Pirates of the Caribbean: At World's End": "POC",
}
col_map = {"aspect_ratio": "aspect", "movie_facebook_likes": "fblikes"}
(movies.set_index("movie_title").rename(index=idx_map, columns=col_map).head(3))

Unnamed: 0_level_0,color,director_name,...,aspect,fblikes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [90]:
movies = pd.read_csv("../data/movie.csv", index_col="movie_title")
ids = movies.index.tolist()
columns = movies.columns.tolist()

In [91]:
display(movies.index[:3])
display(movies.columns[:3])

Index(['Avatar', 'Pirates of the Caribbean: At World's End', 'Spectre'], dtype='object', name='movie_title')

Index(['color', 'director_name', 'num_critic_for_reviews'], dtype='object')

In [92]:
display(ids[:3])
display(columns[:3])

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre']

['color', 'director_name', 'num_critic_for_reviews']

### Rename the row and column labels with list assignments

In [93]:
# `ids` and `columns` are plain Python lists (built with .tolist()), so single
# labels can be swapped by position before assigning the lists back.
ids[0] = "Ratava"  # was 'Avatar'
ids[1] = "POC"  # was "Pirates of the Caribbean: At World's End"
ids[2] = "Ertceps"  # was 'Spectre'
columns[1] = "director"  # was 'director_name'
columns[-2] = "aspect"  # was 'aspect_ratio'
columns[-1] = "fblikes"  # was 'movie_facebook_likes'
# Assigning to .index / .columns replaces every label of `movies` in place.
movies.index = ids
movies.columns = columns

In [94]:
movies.head(3)

Unnamed: 0,color,director,...,aspect,fblikes
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [95]:
def to_clean(val):
    """Normalise a label into lower snake case.

    Surrounding whitespace is trimmed, the text is lowercased, and every
    remaining space character is replaced by an underscore.

    Parameters
    ----------
    val : str
        The label to clean.

    Returns
    -------
    str
        The cleaned label.
    """
    trimmed = val.strip()
    lowered = trimmed.lower()
    return lowered.replace(" ", "_")

In [96]:
movies.rename(columns=to_clean).head(3)

Unnamed: 0,color,director,...,aspect,fblikes
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [97]:
movies.rename(columns=lambda x: x.strip().lower().replace(" ", "_")).head(3)

Unnamed: 0,color,director,...,aspect,fblikes
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [98]:
cols = [col.strip().lower().replace(" ", "_") for col in movies.columns]
movies.columns = cols
movies.head(3)

Unnamed: 0,color,director,...,aspect,fblikes
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


## Creating and Deleting Columns

### How to do it... {#how-to-do-it-9}

In [99]:
movies = pd.read_csv("../data/movie.csv", index_col="movie_title")
movies["has_seen"] = 0

In [100]:
idx_map = {
    "Avatar": "Ratava",
    "Spectre": "Ertceps",
    "Pirates of the Caribbean: At World's End": "POC",
}
col_map = {"aspect_ratio": "aspect", "movie_facebook_likes": "fblikes"}
(movies.rename(index=idx_map, columns=col_map).assign(has_seen=0))

Unnamed: 0_level_0,color,director_name,...,fblikes,has_seen
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ratava,Color,James Cameron,...,33000,0
POC,Color,Gore Verbinski,...,0,0
Ertceps,Color,Sam Mendes,...,85000,0
The Dark Knight Rises,Color,Christopher Nolan,...,164000,0
Star Wars: Episode VII - The Force Awakens,,Doug Walker,...,0,0
...,...,...,...,...,...
Signed Sealed Delivered,Color,Scott Smith,...,84,0
The Following,Color,,...,32000,0
A Plague So Pleasant,Color,Benjamin Roberds,...,16,0
Shanghai Calling,Color,Daniel Hsia,...,660,0


In [101]:
# Element-wise `+` propagates missing values: a movie with NaN in any one of
# the four columns ends up with a NaN total.
total = (
    movies["actor_1_facebook_likes"]
    + movies["actor_2_facebook_likes"]
    + movies["actor_3_facebook_likes"]
    + movies["director_facebook_likes"]
)

In [102]:
total.head(5)

movie_title
Avatar                                         2791.0
Pirates of the Caribbean: At World's End      46563.0
Spectre                                       11554.0
The Dark Knight Rises                         95000.0
Star Wars: Episode VII - The Force Awakens        NaN
dtype: float64

In [103]:
# Same four columns as the chained `+` version, summed row-wise instead.
cols = [
    "actor_1_facebook_likes",
    "actor_2_facebook_likes",
    "actor_3_facebook_likes",
    "director_facebook_likes",
]
# DataFrame.sum skips NaN by default, so every row gets a numeric total here.
sum_col = movies[cols].sum(axis="columns")
sum_col.head(5)

movie_title
Avatar                                         2791.0
Pirates of the Caribbean: At World's End      46563.0
Spectre                                       11554.0
The Dark Knight Rises                         95000.0
Star Wars: Episode VII - The Force Awakens      274.0
dtype: float64

In [104]:
movies.assign(total_likes=sum_col).head(5)

Unnamed: 0_level_0,color,director_name,...,has_seen,total_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,0,2791.0
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,0,46563.0
Spectre,Color,Sam Mendes,...,0,11554.0
The Dark Knight Rises,Color,Christopher Nolan,...,0,95000.0
Star Wars: Episode VII - The Force Awakens,,Doug Walker,...,0,274.0


In [105]:
def sum_likes(df):
    """Return the row-wise sum of every column whose name contains ``"like"``.

    Intended as a callable for ``DataFrame.assign``. Missing values are
    skipped by ``sum``.

    Note: the match is a plain substring test, so on the movie data it also
    selects ``cast_total_facebook_likes`` and ``movie_facebook_likes`` in
    addition to the actor and director columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.Series
        One total per row, indexed like ``df``.
    """
    like_columns = []
    for name in df.columns:
        if "like" in name:
            like_columns.append(name)
    return df[like_columns].sum(axis="columns")

In [106]:
movies.assign(total_likes=sum_likes).head(5)

Unnamed: 0_level_0,color,director_name,...,has_seen,total_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,0,40625.0
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,0,94913.0
Spectre,Color,Sam Mendes,...,0,108254.0
The Dark Knight Rises,Color,Christopher Nolan,...,0,365759.0
Star Wars: Episode VII - The Force Awakens,,Doug Walker,...,0,417.0


In [107]:
(movies.assign(total_likes=sum_col)["total_likes"].isna().sum())

np.int64(0)

In [108]:
(movies.assign(total_likes=total)["total_likes"].isna().sum())

np.int64(122)

In [109]:
(movies.assign(total_likes=total.fillna(0))["total_likes"].isna().sum())

np.int64(0)

In [110]:
def cast_like_gt_actor_director(df):
    """Flag rows whose cast likes are at least the combined ``total_likes``.

    Intended as a callable for ``DataFrame.assign``; ``df`` must already hold
    a ``total_likes`` column. Despite "gt" in the name, the comparison is
    ``>=``. Rows where either side is NaN compare as ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``cast_total_facebook_likes`` and ``total_likes`` columns.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``df``.
    """
    cast_likes = df["cast_total_facebook_likes"]
    combined_likes = df["total_likes"]
    return cast_likes >= combined_likes

In [111]:
df2 = movies.assign(total_likes=total, is_cast_likes_more=cast_like_gt_actor_director)

In [112]:
df2["is_cast_likes_more"].all()

np.False_

In [113]:
df2 = df2.drop(columns="total_likes")

In [114]:
actor_sum = movies[[c for c in movies.columns if "actor_" in c and "_likes" in c]].sum(
    axis="columns"
)

In [115]:
actor_sum.head(5)

movie_title
Avatar                                         2791.0
Pirates of the Caribbean: At World's End      46000.0
Spectre                                       11554.0
The Dark Knight Rises                         73000.0
Star Wars: Episode VII - The Force Awakens      143.0
dtype: float64

In [116]:
movies["cast_total_facebook_likes"] >= actor_sum

movie_title
Avatar                                        True
Pirates of the Caribbean: At World's End      True
Spectre                                       True
The Dark Knight Rises                         True
Star Wars: Episode VII - The Force Awakens    True
                                              ... 
Signed Sealed Delivered                       True
The Following                                 True
A Plague So Pleasant                          True
Shanghai Calling                              True
My Date with Drew                             True
Length: 4916, dtype: bool

In [117]:
movies["cast_total_facebook_likes"].ge(actor_sum)

movie_title
Avatar                                        True
Pirates of the Caribbean: At World's End      True
Spectre                                       True
The Dark Knight Rises                         True
Star Wars: Episode VII - The Force Awakens    True
                                              ... 
Signed Sealed Delivered                       True
The Following                                 True
A Plague So Pleasant                          True
Shanghai Calling                              True
My Date with Drew                             True
Length: 4916, dtype: bool

In [118]:
movies["cast_total_facebook_likes"].ge(actor_sum).all()

np.True_

In [119]:
pct_like = actor_sum.div(movies["cast_total_facebook_likes"])

In [120]:
pct_like.describe()

count    4883.000000
mean        0.833279
std         0.140566
min         0.300767
25%         0.735284
50%         0.869289
75%         0.954774
max         1.000000
dtype: float64

In [121]:
# `movies` was loaded with index_col="movie_title", so the titles live in the
# index, not in a column. Build the Series straight from the index instead of
# probing for a "movie_title" column and catching the KeyError.
pd.Series(pct_like.to_numpy(), index=movies.index.to_numpy()).head()


### How it works... {#how-it-works-9}

### There's more... {#theres-more-8}

In [122]:
[el for el in dir(pd.Index) if el.startswith("get")]

['get_indexer',
 'get_indexer_for',
 'get_indexer_non_unique',
 'get_level_values',
 'get_loc',
 'get_slice_bound']

In [123]:
profit_index = movies.columns.get_loc("gross") + 1
profit_index

9

In [124]:
# DataFrame.insert mutates `movies` in place and raises ValueError if the column
# already exists. Guarding on the column name keeps the cell re-runnable without
# also hiding unrelated ValueErrors (for example an out-of-range `loc`).
if "profit" not in movies.columns:
    movies.insert(
        loc=profit_index, column="profit", value=movies["gross"] - movies["budget"]
    )

In [125]:
try:
    del movies["director_name"]
except KeyError as e:
    print("KeyError : ", e)
    print("movies.columns : ", movies.columns)