http://ithelp.ithome.com.tw/articles/10185922

# Pandas 與 data frame 的常用屬性或方法

## 建立 data frame

使用 pandas 套件的 DataFrame() 方法將一個 dictionary 的資料結構轉換成 data frame。

In [2]:
import pandas as pd

# 截至 2016-12-14 上午 11 時第 8 屆 iT 邦幫忙各組的鐵人分別是 59、9、19、14、6 與 77 人
groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

ironmen_df = pd.DataFrame(ironmen_dict)
ironmen_df

Unnamed: 0,groups,ironmen
0,Modern Web,59
1,DevOps,9
2,Cloud,19
3,Big Data,14
4,Security,6
5,自我挑戰組,77


# 瞭解 data frame 的概觀

>+ ndim 屬性
>+ shape 屬性
>+ dtypes 屬性

In [3]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

# 使用屬性
print(ironmen_df.ndim)
print("---") # 分隔線
print(ironmen_df.shape)
print("---") # 分隔線
print(ironmen_df.dtypes)

2
---
(6, 2)
---
groups     object
ironmen     int64
dtype: object


# 刪除觀測值或欄位

data frame 可以透過 drop() 方法來刪除觀測值或欄位，

指定參數 axis = 0 表示要刪除觀測值（row），

指定參數 axis = 1 表示要刪除欄位（column）。

In [5]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

# 刪除觀測值
ironmen_df_no_mw = ironmen_df.drop(0, axis = 0)
print(ironmen_df_no_mw)
print("---") # 分隔線

# 刪除欄位
ironmen_df_no_groups = ironmen_df.drop("groups", axis = 1)
print(ironmen_df_no_groups)

     groups  ironmen
1    DevOps        9
2     Cloud       19
3  Big Data       14
4  Security        6
5     自我挑戰組       77
---
   ironmen
0       59
1        9
2       19
3       14
4        6
5       77


# 透過 ix 屬性篩選 data frame

我們可以透過 ix 屬性（利用索引值）篩選 data frame。

In [6]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

# 選擇欄位
print(ironmen_df.ix[:, "groups"])
print("---") # 分隔線

# 選擇觀測值
print(ironmen_df.ix[0])
print("---") # 分隔線

# 同時選擇欄位與觀測值
print(ironmen_df.ix[0, "groups"])

0    Modern Web
1        DevOps
2         Cloud
3      Big Data
4      Security
5         自我挑戰組
Name: groups, dtype: object
---
groups     Modern Web
ironmen            59
Name: 0, dtype: object
---
Modern Web


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  from ipykernel import kernelapp as app


# 透過布林值篩選 data frame

In [7]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

filter = ironmen_df["ironmen"] > 10 # 參賽人數大於 10
ironmen_df[filter] # 篩選 data frame

Unnamed: 0,groups,ironmen
0,Modern Web,59
2,Cloud,19
3,Big Data,14
5,自我挑戰組,77


# 排序

>+ sort_index() 方法
>+ sort_values() 方法
>+ 使用 data frame 的 sort_index() 方法可以用索引值排序。

In [8]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen, columns = ["ironmen"], index = groups)

# 用索引值排序
ironmen_df.sort_index()

Unnamed: 0,ironmen
Big Data,14
Cloud,19
DevOps,9
Modern Web,59
Security,6
自我挑戰組,77


使用 data frame 的 sort_values() 方法可以用指定欄位的數值排序。

In [9]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen, columns = ["ironmen"], index = groups)

# 用數值排序
ironmen_df.sort_values(by = "ironmen")

Unnamed: 0,ironmen
Security,6
DevOps,9
Big Data,14
Cloud,19
Modern Web,59
自我挑戰組,77


# 描述統計

data frame 有 sum()、mean()、median() 與 describe() 等統計方法可以使用。

In [10]:
import pandas as pd

groups = ["Modern Web", "DevOps", "Cloud", "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, 77]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

print(ironmen_df.sum()) # 計算總鐵人數
print("---") # 分隔線
print(ironmen_df.mean()) # 計算平均鐵人數
print("---") # 分隔線
print(ironmen_df.median()) # 計算中位數
print("---") # 分隔線
print(ironmen_df.describe()) # 描述統計

groups     Modern WebDevOpsCloudBig DataSecurity自我挑戰組
ironmen                                           184
dtype: object
---
ironmen    30.666667
dtype: float64
---
ironmen    16.5
dtype: float64
---
         ironmen
count   6.000000
mean   30.666667
std    29.803803
min     6.000000
25%    10.250000
50%    16.500000
75%    49.000000
max    77.000000


# 相異值個數

透過 pandas 的 value_counts() 方法可以統計相異值的個數。

In [11]:
import pandas as pd

gender = ["Male", "Male", "Female", "Male", "Male", "Male", "Female", "Male", "Male"]
name = ["蒙其·D·魯夫", "羅羅亞·索隆", "娜美", "騙人布", "文斯莫克·香吉士", "多尼多尼·喬巴", "妮可·羅賓", "佛朗基", "布魯克"]

# 建立 data frame
ironmen_df = pd.DataFrame(gender, columns = ["gender"], index = name)

# 計算男女各有幾個觀測值
pd.value_counts(ironmen_df.gender)

Male      7
Female    2
Name: gender, dtype: int64

# 遺失值

## 判斷遺失值

>+ isnull() 方法
>+ notnull() 方法


In [12]:
import numpy as np
import pandas as pd

groups = ["Modern Web", "DevOps", np.nan, "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, np.nan]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

print(ironmen_df.ix[:, "groups"].isnull()) # 判斷哪些組的組名是遺失值
print("---") # 分隔線
print(ironmen_df.ix[:, "ironmen"].notnull()) # 判斷哪些組的鐵人數不是遺失值

0    False
1    False
2     True
3    False
4    False
5    False
Name: groups, dtype: bool
---
0     True
1     True
2     True
3     True
4     True
5    False
Name: ironmen, dtype: bool


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  from ipykernel import kernelapp as app


## 處理遺失值

>+ dropna() 方法
>+ fillna() 方法

In [13]:
import numpy as np
import pandas as pd

groups = ["Modern Web", "DevOps", np.nan, "Big Data", "Security", "自我挑戰組"]
ironmen = [59, 9, 19, 14, 6, np.nan]

ironmen_dict = {
                "groups": groups,
                "ironmen": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

ironmen_df_na_dropped = ironmen_df.dropna() # 有遺失值的觀測值都刪除
print(ironmen_df_na_dropped)
print("---") # 分隔線
ironmen_df_na_filled = ironmen_df.fillna(0) # 有遺失值的觀測值填補 0
print(ironmen_df_na_filled)
print("---") # 分隔線
ironmen_df_na_filled = ironmen_df.fillna({"groups": "Cloud", "ironmen": 71}) # 依欄位填補遺失值
print(ironmen_df_na_filled)

       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
3    Big Data     14.0
4    Security      6.0
---
       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
2           0     19.0
3    Big Data     14.0
4    Security      6.0
5       自我挑戰組      0.0
---
       groups  ironmen
0  Modern Web     59.0
1      DevOps      9.0
2       Cloud     19.0
3    Big Data     14.0
4    Security      6.0
5       自我挑戰組     71.0
