<a href="https://colab.research.google.com/github/adi0229/ML-DL/blob/master/top_pandas_25_tricks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 载入示例数据集

In [0]:
# 调包
import pandas as pd
import numpy as np

In [0]:
# 读取演示数据

# 饮品数据
drinks = pd.read_csv("http://bit.ly/drinksbycountry")

# IMDB电影评分数据
movies = pd.read_csv("http://bit.ly/imdbratings")

# 订单数据
orders = pd.read_csv("http://bit.ly/chiporders",sep='\t')
orders['item_price'] = orders.item_price.str.replace("$","").astype("float")

# 股票数据
stocks = pd.read_csv("http://bit.ly/smallstocks", parse_dates=['Date'])

# 泰坦尼克号数据
titanic = pd.read_csv("http://bit.ly/kaggletrain")

# 不明飞行物数据
ufo = pd.read_csv("http://bit.ly/uforeports",parse_dates=['Time'])

### 1.显示安装版本

In [0]:
pd.__version__

'0.25.3'

In [0]:
# 显示 Pandas 的依赖项
pd.show_versions()

  """)



INSTALLED VERSIONS
------------------
commit           : None
python           : 3.6.9.final.0
python-bits      : 64
OS               : Linux
OS-release       : 4.14.137+
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 0.25.3
numpy            : 1.17.4
pytz             : 2018.9
dateutil         : 2.6.1
pip              : 19.3.1
setuptools       : 42.0.2
Cython           : 0.29.14
pytest           : 3.6.4
hypothesis       : None
sphinx           : 1.8.5
blosc            : None
feather          : 0.4.0
xlsxwriter       : None
lxml.etree       : 4.2.6
html5lib         : 1.0.1
pymysql          : None
psycopg2         : 2.7.6.1 (dt dec pq3 ext lo64)
jinja2           : 2.10.3
IPython          : 5.5.0
pandas_datareader: 0.7.4
bs4              : 4.6.3
bottleneck       : 1.3.1
fastparquet      : None
gcsfs            : None
lxml.etree       : 4.2.6
matplotlib  

### 2.创建 DataFrame 示例

In [0]:
# 字典传入 DataFrame 构建器，键是列名，值是也是列的值
df = pd.DataFrame({"列1":[42,420], "列 2":[24,240]})
df

Unnamed: 0,列1,列 2
0,42,24
1,420,240


In [0]:
# random 函数随机生成数字
pd.DataFrame(np.random.rand(3,9))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.104009,0.094394,0.388348,0.657919,0.090957,0.289313,0.820612,0.038782,0.833982
1,0.121815,0.294482,0.029123,0.734957,0.410179,0.912114,0.8875,0.634482,0.806066
2,0.721472,0.850854,0.666635,0.358737,0.796974,0.428657,0.070066,0.888116,0.828093


In [0]:
# random 函数随机生成数字，使用列表的值作为列的名称
pd.DataFrame(np.random.rand(3,9),columns=list("一二三四五六七八九"))

Unnamed: 0,一,二,三,四,五,六,七,八,九
0,0.875658,0.310665,0.751236,0.716105,0.7337,0.998898,0.762344,0.506449,0.125859
1,0.640718,0.427353,0.678902,0.449891,0.74331,0.524009,0.314052,0.495913,0.199729
2,0.415798,0.276754,0.955417,0.844547,0.033066,0.136106,0.649239,0.052769,0.637021


### 3. 重命名 列的名称

In [0]:
df

Unnamed: 0,列1,列 2
0,42,24
1,420,240


In [0]:
df = df.rename({"列1":"新的列名很清楚","列 2":"列名要易懂"}, axis="columns")
df

Unnamed: 0,新的列名很清楚,列名要易懂
0,42,24
1,420,240


In [0]:
df.columns = ["特征 1","特征 2"]
df

Unnamed: 0,特征 1,特征 2
0,42,24
1,420,240


In [0]:
df.columns = df.columns.str.replace(" ","_")
df

Unnamed: 0,特征_1,特征_2
0,42,24
1,420,240


In [0]:
df.add_prefix("X_")

Unnamed: 0,X_特征_1,X_特征_2
0,42,24
1,420,240


In [0]:
df.add_suffix("_Y")

Unnamed: 0,特征_1_Y,特征_2_Y
0,42,24
1,420,240


### 4. 倒转数据行（Row）的顺序

In [0]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [0]:
# loc函数 + [::-1]倒转list元素的语法
drinks.loc[::-1].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia
189,Vietnam,111,2,1,2.0,Asia
188,Venezuela,333,100,3,7.7,South America


In [0]:
# 重新设置新索引（完全丢掉旧的索引）
drinks.loc[::-1].reset_index(drop=True).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia
3,Vietnam,111,2,1,2.0,Asia
4,Venezuela,333,100,3,7.7,South America


### 5. 倒转数据列（column）的顺序

In [0]:
# 与上一个技巧类似，行位置用:全选，在列位置用::-1倒转
drinks.loc[:,::-1].head()

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria
3,Europe,12.4,312,138,245,Andorra
4,Africa,5.9,45,57,217,Angola


### 6.  根据数据类型（datatype）选择列（columns）

In [0]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [0]:
# 选择数据类型是数字类型（整数和浮点数）的列
drinks.select_dtypes(include='number').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


In [0]:
# 选择数据类型是object类型的列
drinks.select_dtypes(include='object').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


In [0]:
drinks.select_dtypes(include=['number','object','category','datetime']).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [0]:
# 排除数据类型是字符串类型的列
drinks.select_dtypes(exclude='object').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


### 7.   字符串（strings）转换数字（numbers）

In [0]:
df = pd.DataFrame({'col_one':["1.1","2.1","3.1"],
                   'col_two':["4.1","5.1","6.1"],
                   'col_three':["7.1","8.1","-"]})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.1,7.1
1,2.1,5.1,8.1
2,3.1,6.1,-


In [0]:
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [0]:
df.astype({"col_one":"float", "col_two":"float"}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [0]:
pd.to_numeric(df.col_three, errors='coerce')

0    7.1
1    8.1
2    NaN
Name: col_three, dtype: float64

In [0]:
pd.to_numeric(df.col_three, errors='coerce').fillna(0)

0    7.1
1    8.1
2    0.0
Name: col_three, dtype: float64

In [0]:
# 更便捷的方法，一行转换所有字符串成为可运算的数字，并填充替代NaN值
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [0]:
df.dtypes

col_one      float64
col_two      float64
col_three    float64
dtype: object

In [0]:
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.1,7.1
1,2.1,5.1,8.1
2,3.1,6.1,0.0


### 8. 减少 Dataframe 的容量体积

In [0]:
drinks.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [0]:
# 载入数据时，只读取所需的列（columns）
cols = ['beer_servings','continent']
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols)
# 存储占用空间变小
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
beer_servings    193 non-null int64
continent        193 non-null object
dtypes: int64(1), object(1)
memory usage: 13.7 KB


In [0]:
dtypes = {'continent':'category'}
smaller_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols, dtype=dtypes)
smaller_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
beer_servings    193 non-null int64
continent        193 non-null category
dtypes: category(1), int64(1)
memory usage: 2.4 KB


### 9. 从多个文件的数据，构建一个 DataFrame（row-wise）

In [0]:
# pd.read_csv("data/stocks1.csv")
# pd.read_csv("data/stocks2.csv")
# pd.read_csv("data/stocks3.csv")

# from glob import glob

# stock_files = sorted(glob('data/stocks*.csv'))

# pd.concat((pd.read_csv(file) for file in stock_files), ignore_index=True)

### 10. 从多个文件的数据，构建一个 DataFrame（column-wise）

In [0]:
# pd.read_csv("data/stocks1.csv")
# pd.read_csv("data/stocks2.csv")
# pd.read_csv("data/stocks3.csv")

# from glob import glob

# stock_files = sorted(glob('data/stocks*.csv'))

# pd.concat((pd.read_csv(file) for file in stock_files), axis='columns')

### 11. 从剪贴板，构建一个 DataFrame

In [0]:
# colab环境无法模拟本地操作系统的复制/粘贴机制 
df = pd.read_clipboard()
df

PyperclipException: ignored

### 12. 随机分割 DataFrame 成为两个数据子集

In [0]:
len(movies)

979

In [0]:
movie_1 = movies.sample(frac=0.82, random_state=42)

In [0]:
movie_2 = movies.drop(movie_1.index)

In [0]:
len(movie_1) + len(movie_2)

979

In [0]:
movie_1.index.sort_values()

Int64Index([  0,   2,   3,   5,   6,   7,   8,   9,  10,  11,
            ...
            967, 968, 969, 970, 972, 973, 974, 976, 977, 978],
           dtype='int64', length=803)

In [0]:
movie_2.index.sort_values()

Int64Index([  1,   4,  13,  14,  20,  21,  27,  32,  34,  40,
            ...
            875, 877, 897, 917, 942, 955, 959, 961, 971, 975],
           dtype='int64', length=176)

### 13. 在一个 DataFrame 中，筛选多种类型（categories）

In [0]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [0]:
movies.genre.unique()

array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography',
       'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi',
       'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)

In [0]:
movies[
       (movies.genre == 'Action')|
       (movies.genre == 'Drama') |
       (movies.genre == 'Western')
].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."


In [0]:
movies[movies.genre.isin(['Action','Drama','Western'])].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."


In [0]:
# 加上“~"排除列表中包含类型的数据
movies[~movies.genre.isin(['Action','Drama','Western'])].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."


### 14. 在一个 DataFrame 中， 根据数量最多的类别进行筛选

In [0]:
counts = movies.genre.value_counts()
counts

Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Thriller       5
Sci-Fi         5
Film-Noir      3
Family         2
History        1
Fantasy        1
Name: genre, dtype: int64

In [0]:
counts.nlargest(3)

Drama     278
Comedy    156
Action    136
Name: genre, dtype: int64

In [0]:
counts.nlargest(3).index

Index(['Drama', 'Comedy', 'Action'], dtype='object')

In [0]:
movies[movies.genre.isin(counts.nlargest(3).index)].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."
12,8.8,Star Wars: Episode V - The Empire Strikes Back,PG,Action,124,"[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi..."


### 15. 处理缺失值

In [0]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,1930-06-01 22:00:00
1,Willingboro,,OTHER,NJ,1930-06-30 20:00:00
2,Holyoke,,OVAL,CO,1931-02-15 14:00:00
3,Abilene,,DISK,KS,1931-06-01 13:00:00
4,New York Worlds Fair,,LIGHT,NY,1933-04-18 19:00:00


In [0]:
ufo.isna().sum()

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [0]:
ufo.isna().mean()

City               0.001371
Colors Reported    0.842004
Shape Reported     0.144948
State              0.000000
Time               0.000000
dtype: float64

In [0]:
ufo.dropna(axis='columns').head()

Unnamed: 0,State,Time
0,NY,1930-06-01 22:00:00
1,NJ,1930-06-30 20:00:00
2,CO,1931-02-15 14:00:00
3,KS,1931-06-01 13:00:00
4,NY,1933-04-18 19:00:00


In [0]:
ufo.dropna(thresh=len(ufo)*0.9, axis='columns').head()

Unnamed: 0,City,State,Time
0,Ithaca,NY,1930-06-01 22:00:00
1,Willingboro,NJ,1930-06-30 20:00:00
2,Holyoke,CO,1931-02-15 14:00:00
3,Abilene,KS,1931-06-01 13:00:00
4,New York Worlds Fair,NY,1933-04-18 19:00:00


### 16.  字符串分割为多列

In [0]:
df = pd.DataFrame({"name":["John Arthur Doe","Jane Ann Smith"],
                   'location':["Los Angeles, CA", "Washington, DC"]})
df

Unnamed: 0,name,location
0,John Arthur Doe,"Los Angeles, CA"
1,Jane Ann Smith,"Washington, DC"


In [0]:
df.name.str.split(' ', expand=True)


Unnamed: 0,0,1,2
0,John,Arthur,Doe
1,Jane,Ann,Smith


In [0]:
df[['first','middle','last']] = df.name.str.split(' ', expand=True)
df

Unnamed: 0,name,location,first,middle,last
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith


In [0]:
df.location.str.split(', ', expand=True)


Unnamed: 0,0,1
0,Los Angeles,CA
1,Washington,DC


In [0]:
df['city'] = df.location.str.split(', ', expand=True)[0]
df['code'] = df.location.str.split(', ', expand=True)[1]
df

Unnamed: 0,name,location,first,middle,last,city,code
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe,Los Angeles,CA
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith,Washington,DC


### 17.  列表扩展至 DataFrame 之中（Expand a Series of lists into a DataFrame）

In [0]:
df = pd.DataFrame({'col_one':['a','b','c'], 'col_two':[[10, 40], [20, 50], [30, 60]]})
df

Unnamed: 0,col_one,col_two
0,a,"[10, 40]"
1,b,"[20, 50]"
2,c,"[30, 60]"


In [0]:
df_new = df.col_two.apply(pd.Series)
df_new

Unnamed: 0,0,1
0,10,40
1,20,50
2,30,60


In [0]:
pd.concat([df, df_new], axis='columns')

Unnamed: 0,col_one,col_two,0,1
0,a,"[10, 40]",10,40
1,b,"[20, 50]",20,50
2,c,"[30, 60]",30,60


### 18.  使用多个函数来合计运算（Aggregate）

In [0]:
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98
6,3,1,Side of Chips,,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25


In [0]:
orders[orders.order_id == 1].item_price.sum()

11.56

In [0]:
orders[orders.order_id == 42].item_price.sum()

13.7

In [0]:
orders.groupby(['order_id']).item_price.sum().head()

order_id
1    11.56
2    16.98
3    12.67
4    21.00
5    13.70
Name: item_price, dtype: float64

In [0]:
orders.groupby('order_id').item_price.agg(['sum', 'count']).head()

Unnamed: 0_level_0,sum,count
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,11.56,4
2,16.98,1
3,12.67,2
4,21.0,2
5,13.7,2


### 19.  合并总计数据的输出到 DataFrame（Aggregate）

In [0]:
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98
6,3,1,Side of Chips,,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25


In [0]:
orders.groupby('order_id').item_price.sum().head()

order_id
1    11.56
2    16.98
3    12.67
4    21.00
5    13.70
Name: item_price, dtype: float64

In [0]:
len(orders.groupby('order_id').item_price.sum().head())

5

In [0]:
len(orders.item_price)

4622

In [0]:
# 每个订单id的总价的数目，与订单id不同，使用 transform 方法
total_price = orders.groupby('order_id').item_price.transform('sum')
len(total_price)

4622

In [0]:
orders['total_price'] = total_price
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39,11.56
1,1,1,Izze,[Clementine],3.39,11.56
2,1,1,Nantucket Nectar,[Apple],3.39,11.56
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,11.56
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98,12.67
6,3,1,Side of Chips,,1.69,12.67
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75,21.0
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25,21.0
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25,13.7


In [0]:
orders['percent_of_total'] = orders.item_price / orders.total_price
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price,pecent_of_total,percent_of_total
0,1,1,Chips and Fresh Tomato Salsa,,2.39,11.56,0.206747,0.206747
1,1,1,Izze,[Clementine],3.39,11.56,0.293253,0.293253
2,1,1,Nantucket Nectar,[Apple],3.39,11.56,0.293253,0.293253
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,11.56,0.206747,0.206747
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,16.98,1.0,1.0
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98,12.67,0.866614,0.866614
6,3,1,Side of Chips,,1.69,12.67,0.133386,0.133386
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75,21.0,0.559524,0.559524
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25,21.0,0.440476,0.440476
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25,13.7,0.675182,0.675182


### 20.  选择行列的一个切片（Slice）

In [0]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [0]:
# 查看 min 到 max 的切片
titanic.describe().loc['min':'max']

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [0]:
titanic.describe().loc['min':'max','Age':'Fare']

Unnamed: 0,Age,SibSp,Parch,Fare
min,0.42,0.0,0.0,0.0
25%,20.125,0.0,0.0,7.9104
50%,28.0,0.0,0.0,14.4542
75%,38.0,1.0,0.0,31.0
max,80.0,8.0,6.0,512.3292


### 21.  重塑多重索引Series

In [0]:
titanic.Survived.mean()

0.3838383838383838

In [0]:
titanic.groupby('Sex').Survived.mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [0]:
# 按照性别、船舱级别分组，计算存活率，得出多重索引
titanic.groupby(['Sex','Pclass']).Survived.mean()

Sex     Pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: Survived, dtype: float64

In [0]:
# 使用 unstack()，转换化成 DataFrame 对象
titanic.groupby(['Sex','Pclass']).Survived.mean().unstack()

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


### 22.   创建一个透视表（Pivot Table）







In [0]:
titanic.pivot_table(index='Sex', columns='Pclass', values='Survived', aggfunc='mean')

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [0]:
titanic.pivot_table(index='Sex', columns='Pclass', values='Survived',
                    aggfunc='mean', margins=True)

Pclass,1,2,3,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


In [0]:
titanic.pivot_table(index='Sex', columns='Pclass', values='Survived',
                    aggfunc='count', margins=True)

Pclass,1,2,3,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,94,76,144,314
male,122,108,347,577
All,216,184,491,891


### 23.   连续型（continuous）数值转化为类别型（categorical）数据


In [0]:
titanic.Age.head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [0]:
pd.cut(titanic.Age, bins=[0, 18, 25, 99],
       labels=['child','young adult','adult']).head()

0    young adult
1          adult
2          adult
3          adult
4          adult
Name: Age, dtype: category
Categories (3, object): [child < young adult < adult]

### 24.   更改显示选项

In [0]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
# 显示浮点数字的小数点后的2位数
pd.set_option('display.float_format', '{:.2f}'.format)

In [0]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
pd.reset_option('display.float_format')

In [0]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 25.   样式化 DataFrame

In [0]:
stocks

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


In [0]:
format_dict = {"Date":"{:%m/%d/%y}", "Close":"${:.2f}", "Volume":"{:,}"}

In [0]:
stocks.style.format(format_dict)

Unnamed: 0,Date,Close,Volume,Symbol
0,10/03/16,$31.50,14070500,CSCO
1,10/03/16,$112.52,21701800,AAPL
2,10/03/16,$57.42,19189500,MSFT
3,10/04/16,$113.00,29736800,AAPL
4,10/04/16,$57.24,20085900,MSFT
5,10/04/16,$31.35,18460400,CSCO
6,10/05/16,$57.64,16726400,MSFT
7,10/05/16,$31.59,11808600,CSCO
8,10/05/16,$113.05,21453100,AAPL


In [0]:
(
    stocks.style.format(format_dict)
    .hide_index()
    .highlight_min('Close', color='red')
    .highlight_max('Close', color='lightgreen')
)

Date,Close,Volume,Symbol
10/03/16,$31.50,14070500,CSCO
10/03/16,$112.52,21701800,AAPL
10/03/16,$57.42,19189500,MSFT
10/04/16,$113.00,29736800,AAPL
10/04/16,$57.24,20085900,MSFT
10/04/16,$31.35,18460400,CSCO
10/05/16,$57.64,16726400,MSFT
10/05/16,$31.59,11808600,CSCO
10/05/16,$113.05,21453100,AAPL


In [0]:
(
    stocks.style.format(format_dict)
    .hide_index()
    .background_gradient(subset='Volume', cmap='Blues')
    
)

Date,Close,Volume,Symbol
10/03/16,$31.50,14070500,CSCO
10/03/16,$112.52,21701800,AAPL
10/03/16,$57.42,19189500,MSFT
10/04/16,$113.00,29736800,AAPL
10/04/16,$57.24,20085900,MSFT
10/04/16,$31.35,18460400,CSCO
10/05/16,$57.64,16726400,MSFT
10/05/16,$31.59,11808600,CSCO
10/05/16,$113.05,21453100,AAPL


In [0]:
(
    stocks.style.format(format_dict)
    .hide_index()
    .bar('Volume', color='lightblue', align='zero')
    .set_caption("Stock Prices from October 2016")
    
)

Date,Close,Volume,Symbol
10/03/16,$31.50,14070500,CSCO
10/03/16,$112.52,21701800,AAPL
10/03/16,$57.42,19189500,MSFT
10/04/16,$113.00,29736800,AAPL
10/04/16,$57.24,20085900,MSFT
10/04/16,$31.35,18460400,CSCO
10/05/16,$57.64,16726400,MSFT
10/05/16,$31.59,11808600,CSCO
10/05/16,$113.05,21453100,AAPL


### 奖励加餐：一行代码查看DataFrame各种统计属性

In [0]:
import pandas_profiling

pandas_profiling.ProfileReport(titanic)

  variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)


0,1
Number of variables,12
Number of observations,891
Total Missing (%),8.1%
Total size in memory,83.7 KiB
Average record size in memory,96.1 B

0,1
Numeric,6
Categorical,4
Boolean,1
Date,0
Text (Unique),1
Rejected,0
Unsupported,0

0,1
Distinct count,891
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,446
Minimum,1
Maximum,891
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,45.5
Q1,223.5
Median,446.0
Q3,668.5
95-th percentile,846.5
Maximum,891.0
Range,890.0
Interquartile range,445.0

0,1
Standard deviation,257.35
Coef of variation,0.57703
Kurtosis,-1.2
Mean,446
MAD,222.75
Skewness,0
Sum,397386
Variance,66231
Memory size,7.1 KiB

Value,Count,Frequency (%),Unnamed: 3
891,1,0.1%,
293,1,0.1%,
304,1,0.1%,
303,1,0.1%,
302,1,0.1%,
301,1,0.1%,
300,1,0.1%,
299,1,0.1%,
298,1,0.1%,
297,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.1%,
2,1,0.1%,
3,1,0.1%,
4,1,0.1%,
5,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
887,1,0.1%,
888,1,0.1%,
889,1,0.1%,
890,1,0.1%,
891,1,0.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.38384

0,1
0,549
1,342

Value,Count,Frequency (%),Unnamed: 3
0,549,61.6%,
1,342,38.4%,

0,1
Distinct count,3
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.3086
Minimum,1
Maximum,3
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,2
Median,3
Q3,3
95-th percentile,3
Maximum,3
Range,2
Interquartile range,1

0,1
Standard deviation,0.83607
Coef of variation,0.36215
Kurtosis,-1.28
Mean,2.3086
MAD,0.76197
Skewness,-0.63055
Sum,2057
Variance,0.69902
Memory size,7.1 KiB

Value,Count,Frequency (%),Unnamed: 3
3,491,55.1%,
1,216,24.2%,
2,184,20.7%,

Value,Count,Frequency (%),Unnamed: 3
1,216,24.2%,
2,184,20.7%,
3,491,55.1%,

Value,Count,Frequency (%),Unnamed: 3
1,216,24.2%,
2,184,20.7%,
3,491,55.1%,

First 3 values
"Morrow, Mr. Thomas Rowan"
"Cherry, Miss. Gladys"
"Mangan, Miss. Mary"

Last 3 values
"McGovern, Miss. Mary"
"Samaan, Mr. Youssef"
"Vande Walle, Mr. Nestor Cyriel"

Value,Count,Frequency (%),Unnamed: 3
"Abbing, Mr. Anthony",1,0.1%,
"Abbott, Mr. Rossmore Edward",1,0.1%,
"Abbott, Mrs. Stanton (Rosa Hunt)",1,0.1%,
"Abelson, Mr. Samuel",1,0.1%,
"Abelson, Mrs. Samuel (Hannah Wizosky)",1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
"de Mulder, Mr. Theodore",1,0.1%,
"de Pelsmaeker, Mr. Alfons",1,0.1%,
"del Carlo, Mr. Sebastiano",1,0.1%,
"van Billiard, Mr. Austin Blyler",1,0.1%,
"van Melkebeke, Mr. Philemon",1,0.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
male,577
female,314

Value,Count,Frequency (%),Unnamed: 3
male,577,64.8%,
female,314,35.2%,

0,1
Distinct count,89
Unique (%),10.0%
Missing (%),19.9%
Missing (n),177
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,29.699
Minimum,0.42
Maximum,80
Zeros (%),0.0%

0,1
Minimum,0.42
5-th percentile,4.0
Q1,20.125
Median,28.0
Q3,38.0
95-th percentile,56.0
Maximum,80.0
Range,79.58
Interquartile range,17.875

0,1
Standard deviation,14.526
Coef of variation,0.48912
Kurtosis,0.17827
Mean,29.699
MAD,11.323
Skewness,0.38911
Sum,21205
Variance,211.02
Memory size,7.1 KiB

Value,Count,Frequency (%),Unnamed: 3
24.0,30,3.4%,
22.0,27,3.0%,
18.0,26,2.9%,
28.0,25,2.8%,
19.0,25,2.8%,
30.0,25,2.8%,
21.0,24,2.7%,
25.0,23,2.6%,
36.0,22,2.5%,
29.0,20,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0.42,1,0.1%,
0.67,1,0.1%,
0.75,2,0.2%,
0.83,2,0.2%,
0.92,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
70.0,2,0.2%,
70.5,1,0.1%,
71.0,2,0.2%,
74.0,1,0.1%,
80.0,1,0.1%,

0,1
Distinct count,7
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.52301
Minimum,0
Maximum,8
Zeros (%),68.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,3
Maximum,8
Range,8
Interquartile range,1

0,1
Standard deviation,1.1027
Coef of variation,2.1085
Kurtosis,17.88
Mean,0.52301
MAD,0.71378
Skewness,3.6954
Sum,466
Variance,1.216
Memory size,7.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,608,68.2%,
1,209,23.5%,
2,28,3.1%,
4,18,2.0%,
3,16,1.8%,
8,7,0.8%,
5,5,0.6%,

Value,Count,Frequency (%),Unnamed: 3
0,608,68.2%,
1,209,23.5%,
2,28,3.1%,
3,16,1.8%,
4,18,2.0%,

Value,Count,Frequency (%),Unnamed: 3
2,28,3.1%,
3,16,1.8%,
4,18,2.0%,
5,5,0.6%,
8,7,0.8%,

0,1
Distinct count,7
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.38159
Minimum,0
Maximum,6
Zeros (%),76.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,2
Maximum,6
Range,6
Interquartile range,0

0,1
Standard deviation,0.80606
Coef of variation,2.1123
Kurtosis,9.7781
Mean,0.38159
MAD,0.58074
Skewness,2.7491
Sum,340
Variance,0.64973
Memory size,7.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,678,76.1%,
1,118,13.2%,
2,80,9.0%,
5,5,0.6%,
3,5,0.6%,
4,4,0.4%,
6,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,678,76.1%,
1,118,13.2%,
2,80,9.0%,
3,5,0.6%,
4,4,0.4%,

Value,Count,Frequency (%),Unnamed: 3
2,80,9.0%,
3,5,0.6%,
4,4,0.4%,
5,5,0.6%,
6,1,0.1%,

0,1
Distinct count,681
Unique (%),76.4%
Missing (%),0.0%
Missing (n),0

0,1
CA. 2343,7
1601,7
347082,7
Other values (678),870

Value,Count,Frequency (%),Unnamed: 3
CA. 2343,7,0.8%,
1601,7,0.8%,
347082,7,0.8%,
347088,6,0.7%,
3101295,6,0.7%,
CA 2144,6,0.7%,
S.O.C. 14879,5,0.6%,
382652,5,0.6%,
113760,4,0.4%,
PC 17757,4,0.4%,

0,1
Distinct count,248
Unique (%),27.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,32.204
Minimum,0
Maximum,512.33
Zeros (%),1.7%

0,1
Minimum,0.0
5-th percentile,7.225
Q1,7.9104
Median,14.454
Q3,31.0
95-th percentile,112.08
Maximum,512.33
Range,512.33
Interquartile range,23.09

0,1
Standard deviation,49.693
Coef of variation,1.5431
Kurtosis,33.398
Mean,32.204
MAD,28.164
Skewness,4.7873
Sum,28694
Variance,2469.4
Memory size,7.1 KiB

Value,Count,Frequency (%),Unnamed: 3
8.05,43,4.8%,
13.0,42,4.7%,
7.8958,38,4.3%,
7.75,34,3.8%,
26.0,31,3.5%,
10.5,24,2.7%,
7.925,18,2.0%,
7.775,16,1.8%,
26.55,15,1.7%,
0.0,15,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0.0,15,1.7%,
4.0125,1,0.1%,
5.0,1,0.1%,
6.2375,1,0.1%,
6.4375,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
227.525,4,0.4%,
247.5208,2,0.2%,
262.375,2,0.2%,
263.0,4,0.4%,
512.3292,3,0.3%,

0,1
Distinct count,148
Unique (%),16.6%
Missing (%),77.1%
Missing (n),687

0,1
B96 B98,4
G6,4
C23 C25 C27,4
Other values (144),192
(Missing),687

Value,Count,Frequency (%),Unnamed: 3
B96 B98,4,0.4%,
G6,4,0.4%,
C23 C25 C27,4,0.4%,
C22 C26,3,0.3%,
E101,3,0.3%,
D,3,0.3%,
F33,3,0.3%,
F2,3,0.3%,
F G73,2,0.2%,
C125,2,0.2%,

0,1
Distinct count,4
Unique (%),0.4%
Missing (%),0.2%
Missing (n),2

0,1
S,644
C,168
Q,77
(Missing),2

Value,Count,Frequency (%),Unnamed: 3
S,644,72.3%,
C,168,18.9%,
Q,77,8.6%,
(Missing),2,0.2%,

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
