# Chapter 1: Pandas 基础

## 第一章目录
* [解析DataFrame数据类型](#解析DataFrame数据类型)
* [DataFrame数据类型的主要属性](#DataFrame数据类型的主要属性)
* [理解原始数据的数据类型](#理解原始数据的数据类型)
* [选择原始数据中的某一列作为一个Series](#选择原始数据中的某一列作为一个Series)
* [调用Series的方法](#调用Series的方法)
* [对Series进行操作](#对Series进行操作)
* [与Series相关的方法](#与Series相关的方法)
* [给行索引赋值，使其有意义](#给行索引赋值，使其有意义)
* [重命名行和列的名称](#重命名行和列的名称)
* [创建和删除列](#创建和删除列)

In [30]:
import pandas as pd
import numpy as np

# 解析DataFrame数据类型

In [11]:
# Truncate rendered output to at most 8 columns and 10 rows.
# Fully qualified option names are required: bare patterns such as 'max_rows'
# also match 'styler.render.max_rows' in recent pandas and raise OptionError.
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)

In [14]:
# Load the movie dataset from CSV into a DataFrame.
movie = pd.read_csv('data/movie.csv')
movie.head()  # show only the first 5 rows

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


# DataFrame数据类型的主要属性

In [15]:
# Split the DataFrame into its three components:
# column labels, row labels, and the underlying array of values.
columns, index, data = movie.columns, movie.index, movie.values

In [16]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [6]:
index

RangeIndex(start=0, stop=4916, step=1)

In [7]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ..., 
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [17]:
type(index)  # inspect the type of the row index

pandas.core.indexes.range.RangeIndex

In [18]:
type(columns)  # inspect the type of the column index

pandas.core.indexes.base.Index

In [19]:
type(data)  # inspect the type of the object holding the DataFrame's values

numpy.ndarray

In [20]:
index.values   # the concrete row-label values as an array

array([   0,    1,    2, ..., 4913, 4914, 4915])

In [21]:
columns.values  # the concrete column-label values as an array

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'], dtype=object)

# 理解原始数据的数据类型

In [14]:
movie = pd.read_csv('data/movie.csv')

In [15]:
movie.dtypes

color                       object
director_name               object
num_critic_for_reviews     float64
duration                   float64
director_facebook_likes    float64
                            ...   
title_year                 float64
actor_2_facebook_likes     float64
imdb_score                 float64
aspect_ratio               float64
movie_facebook_likes         int64
Length: 28, dtype: object

In [22]:
# Count how many columns there are of each dtype.
# DataFrame.get_dtype_counts() was removed in pandas 1.0; counting the values of
# .dtypes replaces it. Casting to str and sorting orders the result by dtype name.
movie.dtypes.astype(str).value_counts().sort_index()

float64    13
int64       3
object     12
dtype: int64

# 选择原始数据中的某一列作为一个Series

In [23]:
movie = pd.read_csv('data/movie.csv')

In [25]:
movie['director_name']  # selecting a single column yields a Series

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [26]:
movie.director_name  # attribute-style access selects the same column

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [32]:
type(movie['director_name'])  # confirm that the selected column is a Series

pandas.core.series.Series

In [35]:
director = movie['director_name'] # keep the Series in its own variable
director.name   # the Series name is the label of the column it came from

'director_name'

In [36]:
director.to_frame().head()  # convert the Series to a one-column DataFrame

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


# 调用Series的方法

In [43]:
# Gather every attribute and method name exposed by pd.Series, then count them.
s_attr_methods = {attr_name for attr_name in dir(pd.Series)}
len(s_attr_methods)

441

In [44]:
# Gather every attribute and method name exposed by pd.DataFrame, then count them.
# (A stray `len(s_attr_methods)` was removed: its value was discarded, because a
# cell only displays its final expression.)
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

444

In [45]:
len(s_attr_methods & df_attr_methods)  # number of attribute/method names shared by Series and DataFrame

375

In [46]:
# Reload the dataset and keep two of its columns as standalone Series
# (every DataFrame column is a Series).
movie = pd.read_csv('data/movie.csv')
director, actor_1_fb_likes = movie['director_name'], movie['actor_1_facebook_likes']

In [47]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [48]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [52]:
# Show at most 8 rows. The fully qualified option name is required because the
# bare pattern 'max_rows' matches several options in recent pandas (OptionError).
pd.set_option('display.max_rows', 8)
director.value_counts()  # number of occurrences of each distinct value

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
                    ..
Valentine            1
Dena Seidel          1
Stephen Kijak        1
John Bonito          1
Name: director_name, Length: 2397, dtype: int64

In [51]:
actor_1_fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
          ... 
216.0        1
859.0        1
225.0        1
334.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [53]:
director.size  # number of rows in the column, missing values included

4916

In [58]:
director.shape  #  shape of the column; NOTE: not displayed, only the cell's last expression is echoed
director.ndim  # number of dimensions: a Series is a 1-D vector

1

In [55]:
len(director)  # length of the column

4916

In [56]:
director.count()  # number of values in the column, excluding missing ones

4814

In [59]:
actor_1_fb_likes.size

4916

In [35]:
actor_1_fb_likes.count()

4909

In [61]:
actor_1_fb_likes.quantile()  # with no argument this is the 0.5 quantile, i.e. the median; quantiles apply to numeric data

982.0

In [65]:
# Summary statistics of the column gathered into one tuple:
# (minimum, maximum, mean, median, standard deviation, sum).
(
    actor_1_fb_likes.min(),
    actor_1_fb_likes.max(),
    actor_1_fb_likes.mean(),
    actor_1_fb_likes.median(),
    actor_1_fb_likes.std(),
    actor_1_fb_likes.sum(),
)

(0.0, 640000.0, 6494.4884905276022, 982.0, 15106.986883848185, 31881444.0)

In [38]:
actor_1_fb_likes.describe()  # descriptive summary of the column (count, mean, std, min, quartiles, max)

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [66]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [40]:
actor_1_fb_likes.quantile(.2)

510.0

In [41]:
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
        ...   
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, Length: 9, dtype: float64

In [67]:
director.isnull()  # element-wise mask: True where the value is missing

0       False
1       False
2       False
3       False
        ...  
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [70]:
actor_1_fb_likes.count()  # number of values, excluding missing ones

4909

In [74]:
actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)  # replace missing values with 0
actor_1_fb_likes_filled.count()  # now equals the full length, since nothing is missing

4916

In [75]:
actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()  # discard the entries whose value is missing
actor_1_fb_likes_dropped.size  # number of entries that remain

4909

In [76]:
director.value_counts()  # number of occurrences of each distinct value in the column

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
                    ..
Valentine            1
Dena Seidel          1
Stephen Kijak        1
John Bonito          1
Name: director_name, Length: 2397, dtype: int64

In [77]:
director.value_counts(normalize=True)  # same counts, expressed as a fraction of the non-missing values

Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
                      ...   
Valentine           0.000208
Dena Seidel         0.000208
Stephen Kijak       0.000208
John Bonito         0.000208
Name: director_name, Length: 2397, dtype: float64

In [80]:
director.hasnans  # True if the Series contains at least one missing value

True

In [81]:
director.notnull()  # element-wise mask: True where a value is present, False where it is missing

0        True
1        True
2        True
3        True
        ...  
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

# 对Series进行操作

In [90]:
# Show at most 8 rows (attribute-style equivalent: pd.options.display.max_rows).
# The fully qualified name is required: the bare pattern 'max_rows' matches
# several options in recent pandas and raises OptionError.
pd.set_option('display.max_rows', 8)

In [84]:
5 + 9   

14

In [50]:
4 ** 2   

16

In [51]:
a = 10   

In [52]:
5 <= 9   

True

In [53]:
'abcde' + 'fg'    

'abcdefg'

In [54]:
not (5 <= 9)     

False

In [55]:
7 in [1, 2, 6]   

False

In [56]:
set([1,2,3]) & set([2,3,4])  # set intersection

{2, 3}

In [57]:
# Demonstration: a plain Python list does not support arithmetic with a scalar.
# The TypeError is caught and printed so the notebook still runs top to bottom.
try:
    [1, 2, 3] - 3
except TypeError as exc:
    list_sub_error = f'TypeError: {exc}'
    print(list_sub_error)

TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [58]:
# Demonstration: a set is unordered, so positional indexing is not supported.
# The TypeError is caught and printed so the notebook still runs top to bottom.
a = set([1, 2, 3])
try:
    a[0]
except TypeError as exc:
    print(f'TypeError: {exc}')

TypeError: 'set' object does not support indexing

In [91]:
movie = pd.read_csv('data/movie.csv')
# Equivalent bracket form: imdb_score = movie['imdb_score']
imdb_score = movie.imdb_score
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
       ... 
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [92]:
imdb_score + 1  # the scalar is applied to every element, similar to broadcasting

0       8.9
1       8.1
2       7.8
3       9.5
       ... 
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [94]:
imdb_score * 2.5  # the scalar is applied to every element, similar to broadcasting

0       19.75
1       17.75
2       17.00
3       21.25
        ...  
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [95]:
imdb_score // 7  # floor division

0       1.0
1       1.0
2       0.0
3       1.0
       ... 
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [96]:
imdb_score > 7

0        True
1        True
2       False
3        True
        ...  
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [97]:
director = movie['director_name']

In [98]:
director == 'James Cameron'

0        True
1       False
2       False
3       False
        ...  
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [99]:
imdb_score.add(1)              # equivalent to imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
       ... 
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [100]:
imdb_score.mul(2.5)            # equivalent to imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
        ...  
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [101]:
imdb_score.floordiv(7)         # equivalent to imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
       ... 
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [104]:
imdb_score.lt(7)               # equivalent to imdb_score < 7

0       False
1       False
2        True
3       False
        ...  
4912    False
4913     True
4914     True
4915     True
Name: imdb_score, Length: 4916, dtype: bool

In [105]:
director.eq('James Cameron')   # equivalent to director == 'James Cameron'

0        True
1       False
2       False
3       False
        ...  
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [106]:
imdb_score.head()

0    7.9
1    7.1
2    6.8
3    8.5
4    7.1
Name: imdb_score, dtype: float64

In [108]:
imdb_score.astype(int).head()  # cast to int; the fractional part is dropped

0    7
1    7
2    6
3    8
4    7
Name: imdb_score, dtype: int64

In [109]:
imdb_score.astype(int).mod(5)  # cast to int, then take the remainder after dividing by 5

0       2
1       2
2       1
3       3
       ..
4912    2
4913    1
4914    1
4915    1
Name: imdb_score, Length: 4916, dtype: int64

In [115]:
a = type(imdb_score)
a

pandas.core.series.Series

In [117]:
a([1,2,3])  # `a` is the Series class itself, so calling it builds a new Series

0    1
1    2
2    3
dtype: int64

# 与Series相关的方法

In [118]:
movie = pd.read_csv('data/movie.csv')
actor_1_fb_likes = movie['actor_1_facebook_likes']
director = movie['director_name']

In [120]:
director.value_counts().head()

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Spike Lee           16
Name: director_name, dtype: int64

In [130]:
actor_1_fb_likes.value_counts().head()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
Name: actor_1_facebook_likes, dtype: int64

In [125]:
actor_1_fb_likes.count()

4909

In [126]:
actor_1_fb_likes.size

4916

In [128]:
isnull = actor_1_fb_likes.size - actor_1_fb_likes.count()
isnull

7

In [122]:
actor_1_fb_likes.isnull().sum()  # number of missing values (7 here)

7

In [79]:
actor_1_fb_likes.dtype  

dtype('float64')

In [131]:
actor_1_fb_likes.fillna(0).astype(int).head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

In [137]:
temp = actor_1_fb_likes.fillna(0).astype(int)  # fill missing values with 0, then cast to int
temp.isnull().sum()  # after filling, no missing values remain

0

In [138]:
actor_1_fb_likes.isnull()

0       False
1       False
2       False
3       False
        ...  
4912    False
4913    False
4914    False
4915    False
Name: actor_1_facebook_likes, Length: 4916, dtype: bool

In [140]:
actor_1_fb_likes.isnull().mean()  # mean of the boolean mask = fraction of values that are missing

0.0014239218877135883

# 给行索引赋值，使其有意义

In [141]:
movie = pd.read_csv('data/movie.csv')

In [142]:
movie.shape

(4916, 28)

In [144]:
movie2 = movie.set_index('movie_title')  # use the movie_title column as the row index
movie2

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
...,...,...,...,...,...,...,...,...,...
The Following,Color,,43.0,43.0,...,593.0,7.5,16.00,32000
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660
My Date with Drew,Color,Jon Gunn,43.0,90.0,...,23.0,6.6,1.85,456


In [149]:
pd.read_csv('data/movie.csv', index_col='movie_title') # use movie_title as the row index directly while reading

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
...,...,...,...,...,...,...,...,...,...
The Following,Color,,43.0,43.0,...,593.0,7.5,16.00,32000
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660
My Date with Drew,Color,Jon Gunn,43.0,90.0,...,23.0,6.6,1.85,456


In [158]:
# Move the movie_title index back into an ordinary column and restore the
# default integer row index. This must operate on movie2 (indexed by title):
# `movie` still has its default RangeIndex here, so resetting it would only
# add a redundant 'index' column.
movie2.reset_index()

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Avatar,Color,James Cameron,723.0,...,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,...,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,...,393.0,6.8,2.35,85000
3,The Dark Knight Rises,Color,Christopher Nolan,813.0,...,23000.0,8.5,2.35,164000
...,...,...,...,...,...,...,...,...,...
4912,The Following,Color,,43.0,...,593.0,7.5,16.00,32000
4913,A Plague So Pleasant,Color,Benjamin Roberds,13.0,...,0.0,6.3,,16
4914,Shanghai Calling,Color,Daniel Hsia,14.0,...,719.0,6.3,2.35,660
4915,My Date with Drew,Color,Jon Gunn,43.0,...,23.0,6.6,1.85,456


# 重命名行和列的名称

In [154]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


In [159]:
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [153]:
# Old-label -> new-label mappings for the row index and for the columns.
idx_rename = {
    'Avatar': 'Ratava',
    'Spectre': 'Ertceps',
}
col_rename = {
    'director_name': 'Director Name',
    'num_critic_for_reviews': 'Critical Reviews',
}

In [90]:
movie.rename(index=idx_rename, columns=col_rename).head()

Unnamed: 0_level_0,color,Director Name,Critical Reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ratava,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


In [171]:
# Alternative approach: copy the row and column labels into plain Python lists
# and overwrite individual entries by position.
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
index, columns = movie.index, movie.columns

index_list, column_list = index.tolist(), columns.tolist()

# Row positions 0 and 2 hold 'Avatar' and 'Spectre'.
index_list[0], index_list[2] = 'Ratava', 'Ertceps'
# Column positions 1 and 2 hold 'director_name' and 'num_critic_for_reviews'.
column_list[1:3] = ['Director Name', 'Critical Reviews']

In [172]:
print(index_list[:5])

['Ratava', "Pirates of the Caribbean: At World's End", 'Ertceps', 'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens']


In [173]:
print(column_list)

['color', 'Director Name', 'Critical Reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes']


In [174]:
movie.index = index_list
movie.columns = column_list

In [175]:
movie.head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
Ratava,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


# 创建和删除列

In [179]:
movie = pd.read_csv('data/movie.csv')
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [183]:
movie['has_seen'] = 0  # create a new column; the scalar 0 is assigned to every row

In [181]:
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'has_seen'],
      dtype='object')

In [211]:
# Create a new column: combined Facebook likes of the three actors and the director.
# Missing values propagate, so a row lacking any of the four inputs becomes NaN.
movie['actor_director_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
    .add(movie['director_facebook_likes'])
)

In [212]:
movie['actor_director_facebook_likes'].isnull().sum()  # count the missing values in the column

122

In [213]:
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [214]:
# Flag the rows where the cast's total likes are at least as large as the
# combined actor + director likes.
movie['is_cast_likes_more'] = (
    movie['actor_director_facebook_likes'] <= movie['cast_total_facebook_likes']
)

In [215]:
movie['is_cast_likes_more'].all()

False

In [216]:
movie = movie.drop('actor_director_facebook_likes', axis='columns')  # drop the actor_director_facebook_likes column

In [217]:
# Create a new column: combined Facebook likes of the three actors.
# Rows where any input is missing come out as NaN and are then filled with 0.
movie['actor_total_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
    .fillna(0)
)

In [218]:
# Flag the rows where the cast's total likes are at least as large as the
# combined likes of the three actors.
movie['is_cast_likes_more'] = (
    movie['actor_total_facebook_likes'] <= movie['cast_total_facebook_likes']
)

movie['is_cast_likes_more'].all()  # True only if every element of the column is True

True

In [219]:
# Share of the cast's total likes that comes from the three actors combined.
movie['pct_actor_cast_like'] = (movie['actor_total_facebook_likes'] / 
                                movie['cast_total_facebook_likes'])

In [225]:
movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max() 

(0.0, 1.0)

In [231]:
# Take the current column labels as an array, then print them and their count.
columns = movie.columns
columns_value = columns.values
print('查看所有列的值:\n', columns_value)  # printed label means "all column values"
print('统计有多少列:', len(columns_value))  # printed label means "number of columns"

查看所有列的值:
 ['color' 'director_name' 'num_critic_for_reviews' 'duration'
 'director_facebook_likes' 'actor_3_facebook_likes' 'actor_2_name'
 'actor_1_facebook_likes' 'gross' 'genres' 'actor_1_name' 'movie_title'
 'num_voted_users' 'cast_total_facebook_likes' 'actor_3_name'
 'facenumber_in_poster' 'plot_keywords' 'movie_imdb_link'
 'num_user_for_reviews' 'language' 'country' 'content_rating' 'budget'
 'title_year' 'actor_2_facebook_likes' 'imdb_score' 'aspect_ratio'
 'movie_facebook_likes' 'has_seen' 'is_cast_likes_more'
 'actor_total_facebook_likes' 'pct_actor_cast_like']
统计有多少列: 32


In [224]:
movie['pct_actor_cast_like'].head()

0    0.577369
1    0.951396
2    0.987521
3    0.683783
4    0.000000
Name: pct_actor_cast_like, dtype: float64

In [223]:
movie.set_index('movie_title')['pct_actor_cast_like'].head()

movie_title
Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    0.000000
Name: pct_actor_cast_like, dtype: float64

In [233]:
# Compute the integer position immediately after the 'gross' column.
profit_index = movie.columns.get_loc('gross') + 1
profit_index

9

In [234]:
# Insert a 'profit' column (gross minus budget) at position profit_index.
# DataFrame.insert mutates the frame in place and raises ValueError when the
# label already exists, so the guard keeps this cell safe to re-run.
if 'profit' not in movie.columns:
    movie.insert(loc=profit_index, column='profit', value=movie['gross'] - movie['budget'])

In [235]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,...,0,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,...,0,True,11554.0,0.987521
3,Color,Christopher Nolan,813.0,164.0,...,0,True,73000.0,0.683783
4,,Doug Walker,,,...,0,True,0.0,0.0


In [236]:
# Take the current column labels as an array, then print them and their count.
columns = movie.columns
columns_value = columns.values
print('查看所有列的值:\n', columns_value)  # printed label means "all column values"
print('统计有多少列:', len(columns_value))  # printed label means "number of columns"

查看所有列的值:
 ['color' 'director_name' 'num_critic_for_reviews' 'duration'
 'director_facebook_likes' 'actor_3_facebook_likes' 'actor_2_name'
 'actor_1_facebook_likes' 'gross' 'profit' 'genres' 'actor_1_name'
 'movie_title' 'num_voted_users' 'cast_total_facebook_likes' 'actor_3_name'
 'facenumber_in_poster' 'plot_keywords' 'movie_imdb_link'
 'num_user_for_reviews' 'language' 'country' 'content_rating' 'budget'
 'title_year' 'actor_2_facebook_likes' 'imdb_score' 'aspect_ratio'
 'movie_facebook_likes' 'has_seen' 'is_cast_likes_more'
 'actor_total_facebook_likes' 'pct_actor_cast_like']
统计有多少列: 33
