In [2]:
from collections import Counter

import numpy as np
import pandas as pd

- Pandas
    - DataFrame retrieve
        - [去除0的資料](#去除0的資料)
        - [取特定幾筆資料](#取特定幾筆資料)
    - [合併DataFrame(以col)](#合併DataFrame(以col))
    - [尋找不重複的資料](#尋找不重複的資料)
    - [特徵onehot編碼](#特徵onehot編碼)

- Numpy
    - [Numpy建立指定大小陣列](#Numpy建立指定大小陣列)
    - [Numpy取指定欄位資料](#Numpy取指定欄位資料)
    - [Numpy堆疊資料(以col)](#Numpy堆疊資料(以col))
    - [計算元素出現次數](#計算元素出現次數)

## DataFrame retrieve
### 去除0的資料
To remove every row of a NumPy array that contains only 0, build a boolean mask over the rows and index the array with it:

In [22]:
import numpy as np

# Draw 100 random integers from 0..9 and shape them into a 10x10 matrix.
data = np.random.choice(10, 100).reshape(10, 10)

# Zero out three whole rows so there is something to filter away.
for zero_row in (3, 7, 8):
    data[zero_row, :] = 0

print(data)
print('original data shape', data.shape)

print('----- after removing rows with only 0 -----')

# Keep every row that has at least one non-zero entry, which is the same
# as dropping the rows whose entries are all 0.
has_nonzero = np.any(data != 0, axis=1)
data = data[has_nonzero]

print(data)
print('new data shape', data.shape)

[[5 5 6 9 7 3 9 0 4 4]
 [5 5 7 4 0 1 1 0 7 3]
 [0 2 6 6 7 6 8 4 4 9]
 [0 0 0 0 0 0 0 0 0 0]
 [9 8 7 3 1 3 6 1 1 9]
 [5 2 9 8 4 8 1 8 0 1]
 [2 0 6 7 4 2 2 0 3 4]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [1 5 1 3 8 5 3 5 9 9]]
original data shape (10, 10)
----- after removing rows with only 0 -----
[[5 5 6 9 7 3 9 0 4 4]
 [5 5 7 4 0 1 1 0 7 3]
 [0 2 6 6 7 6 8 4 4 9]
 [9 8 7 3 1 3 6 1 1 9]
 [5 2 9 8 4 8 1 8 0 1]
 [2 0 6 7 4 2 2 0 3 4]
 [1 5 1 3 8 5 3 5 9 9]]
new data shape (7, 10)


### 取特定幾筆資料
使用 `isin()` 篩選出指定欄位的值落在給定 list 之中的那些資料列（row）

In [12]:
# Twenty-row demo frame: IDs 0..19 paired with random profit values.
df = pd.DataFrame(
    dict(
        ID=list(range(20)),
        Profit=np.random.rand(20),
    )
)

In [13]:
# Keep only the rows whose ID is one of 5, 10 or 15.
df.loc[df["ID"].isin([5, 10, 15])]

Unnamed: 0,ID,Profit
5,5,0.153827
10,10,0.786268
15,15,0.066038


反過來，若要篩選出值「不在」某個 list 之中的那些資料列（row），關鍵就在於用 `~` 將布林遮罩取反

In [14]:
# Keep only the rows whose ID is NOT one of 5, 10 or 15; `~` inverts the boolean mask.
df.loc[~df["ID"].isin([5, 10, 15])]

Unnamed: 0,ID,Profit
0,0,0.261335
1,1,0.725721
2,2,0.075262
3,3,0.811761
4,4,0.627254
6,6,0.817034
7,7,0.06452
8,8,0.332003
9,9,0.039607
11,11,0.077607


### 特徵onehot編碼 

In [3]:
import seaborn as sns

# Load the penguins sample dataset and discard rows with missing values.
penguins = sns.load_dataset("penguins").dropna()

# Features are every column except body mass; body mass is the target,
# kept as a one-column DataFrame.
X = penguins.drop(columns="body_mass_g")
y = penguins[["body_mass_g"]]

# One-hot encode the species column.
pd.get_dummies(X["species"])

Unnamed: 0,Adelie,Chinstrap,Gentoo
0,1,0,0
1,1,0,0
2,1,0,0
4,1,0,0
5,1,0,0
...,...,...,...
338,0,0,1
340,0,0,1
341,0,0,1
342,0,0,1


## Numpy

### Numpy建立指定大小陣列

In [5]:
import numpy as np

# 5x5 array pre-filled with the float value 0.0.
arr = np.full(shape=(5, 5), fill_value=0.0)

# Set the element at row 0, column 3.
arr[0, 3] = 19

In [6]:
# Show the array as the cell's output.
arr

array([[ 0.,  0.,  0., 19.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

### Numpy取指定欄位資料


In [4]:
# 100 random integers from 0..9, laid out as a 10x10 matrix.
arr = np.reshape(np.random.choice(10, 100), (10, 10))

arr

array([[7, 3, 8, 1, 9, 0, 6, 5, 6, 6],
       [8, 8, 7, 1, 8, 3, 0, 9, 0, 4],
       [3, 8, 5, 7, 4, 5, 1, 4, 5, 7],
       [9, 5, 5, 6, 6, 8, 0, 8, 1, 7],
       [4, 7, 3, 2, 9, 7, 5, 1, 1, 5],
       [2, 7, 2, 7, 4, 5, 7, 3, 5, 4],
       [4, 9, 9, 4, 6, 2, 4, 6, 8, 6],
       [5, 4, 3, 0, 2, 6, 1, 4, 1, 2],
       [9, 0, 3, 4, 2, 3, 0, 5, 6, 5],
       [2, 5, 9, 5, 4, 9, 6, 2, 0, 1]])

In [6]:
arr[:,[0,1,3]] # Select columns 0, 1 and 3 from every row

array([[7, 3, 1],
       [8, 8, 1],
       [3, 8, 7],
       [9, 5, 6],
       [4, 7, 2],
       [2, 7, 7],
       [4, 9, 4],
       [5, 4, 0],
       [9, 0, 4],
       [2, 5, 5]])

In [11]:
arr[:,[0,1,3]][[7,8,9]] # The second [] then picks the rows at positions 7, 8 and 9 out of the 10 rows

array([[5, 4, 0],
       [9, 0, 4],
       [2, 5, 5]])

### Numpy堆疊資料(以col)

In [5]:
# Three length-2 vectors used by the stacking examples.
a = np.array([0, 1])
b = np.array([2, 1])
c = np.array([-1, -1])

In [6]:
# Using NumPy: vstack stacks a, b and c as rows; .T transposes so each one becomes a column
np.vstack((a,b,c)).T

array([[ 0,  2, -1],
       [ 1,  1, -1]])

In [7]:
# Also NumPy (not pandas): column_stack places a, b and c side by side as columns
np.column_stack((a,b,c))

array([[ 0,  2, -1],
       [ 1,  1, -1]])

## 合併DataFrame(以col)

In [5]:
# Two small two-row frames with different columns.
df1 = pd.DataFrame({'letter': ['a', 'b'], 'number': [1, 2]})
df2 = pd.DataFrame({'animal': ['bird', 'monkey'], 'name': ['polly', 'george']})

# Join them side by side (column-wise).
pd.concat([df1, df2], axis='columns')

Unnamed: 0,letter,number,animal,name
0,a,1,bird,polly
1,b,2,monkey,george


## 尋找不重複的資料

In [9]:
# Single-column frame that contains repeated ids.
df1 = pd.DataFrame({'id': [1, 2, 2, 3, 4, 5, 3]})

# Distinct ids found in the column.
df1['id'].unique()

array([1, 2, 3, 4, 5])

## 計算元素出現次數

In [29]:
# Count how many times each element occurs.
a = np.array([0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4])

# With return_counts=True, np.unique returns the distinct values together
# with how often each one appears.
unique, counts = np.unique(a, return_counts=True)

# Pair each distinct value with its count.
{value: count for value, count in zip(unique, counts)}

{0: 7, 1: 4, 2: 1, 3: 2, 4: 1}

In [10]:
# Built-in list.count: number of times the value 1 occurs in the list
[1, 2, 3, 4, 1, 4, 1].count(1)

3

In [26]:
# Use bincount to count the occurrences of each value in an array of non-negative integers
np.bincount(np.array([0, 1, 1, 3, 2, 1, 7]))

array([1, 3, 1, 1, 0, 0, 0, 1])

In [27]:
# Count how many 1s the list contains: every element is 0 or 1, so the sum equals the number of 1s
np.sum([0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1])

4

In [14]:
# Pick out the values that occur more than twice (i.e. at least three times).
# Counter tallies every value in a single pass, instead of calling
# mylist.count() once per element, which rescans the whole list each time.
# (Change the threshold to `> 1` to list every value that appears more than once.)
mylist = [1,7,7,7,3,9,9,9,7,9,10,0]
occurrences = Counter(mylist)
repeated = sorted(value for value, count in occurrences.items() if count > 2)
print(repeated)

[7, 9]
