## Python自定義函數

In [1]:
def sum(x, y):
    """Return the result of ``x + y``.

    Note: binding this name hides Python's built-in ``sum`` in the
    namespace where it is defined.
    """
    total = x + y
    return total

In [2]:
# Call the two-argument function with positional arguments.
sum(1,5)

6

In [3]:
# A parameter can be given a default value directly in the signature.
# NOTE: this definition replaces any earlier ``sum`` bound in the session.

def sum(a, b, c=1):
    """Return ``a + b + c``; ``c`` falls back to 1 when it is not supplied."""
    partial = a + b
    return partial + c

In [4]:
# c is omitted here, so its default value of 1 is used: 1 + 4 + 1.
sum(1,4)

6

In [5]:
#練習區

## Pandas 讀取資料

In [6]:
import numpy as np
import pandas as pd

In [7]:
# When the file name contains Chinese characters, passing the path straight to
# the pandas reader can raise an error (the recorded failure was
# "OSError: Initializing from file failed"). There are two workarounds.
#
# The error is caught and printed so that this demonstration does not abort a
# full notebook run; `table` is left as None when the read fails, in which case
# the final expression displays nothing.
try:
    table = pd.read_table('測試.txt', encoding='utf8')
except OSError as err:
    table = None
    print('OSError:', err)
table

OSError: Initializing from file failed

In [8]:
# Workaround 1: open the file with Python's built-in open() and hand the file
# object to pandas instead of the path.
# Details on open(): http://www.runoob.com/python/python-func-open.html

# The with-block closes the handle as soon as pandas has finished reading.
with open('測試.txt', encoding = 'utf8') as f:
    # sep=r'\s+' splits fields on runs of whitespace; it replaces the
    # deprecated delim_whitespace=True.
    table = pd.read_table(f, sep=r'\s+', encoding='utf8')
# NOTE(review): the recorded output shows the first data row being used as the
# column header; if the file has no header row, header=None is probably
# wanted -- confirm against the file.
table

Unnamed: 0,1,2016-03-22,00:06:24.4463094,中文测试字符
0,2,2016-03-22,00:06:32.4565680,需要编辑encoding
1,3,2016-03-22,00:06:32.6835965,abc
2,4,2016-03-22,00:06:32.8041945,egb


In [9]:
# Workaround 2: rename the file so that its name uses English (ASCII) characters.

# sep=r'\s+' splits fields on runs of whitespace; it replaces the deprecated
# delim_whitespace=True.
test = pd.read_table('test.txt', sep=r'\s+')
test

Unnamed: 0,1,2016-03-22,00:06:24.4463094,中文测试字符
0,2,2016-03-22,00:06:32.4565680,需要编辑encoding
1,3,2016-03-22,00:06:32.6835965,abc
2,4,2016-03-22,00:06:32.8041945,egb


In [10]:
# Read an Excel spreadsheet into a DataFrame and display it.

excel = pd.read_excel('excel.xlsx')
excel

Unnamed: 0,time,value
0,2000-01-01,40
1,2010-01-01,200
2,2012-01-01,300
3,2014-01-01,160


In [11]:
#練習區

## 檢查數據

In [12]:
# Preview the first rows of the frame.
excel.head()

Unnamed: 0,time,value
0,2000-01-01,40
1,2010-01-01,200
2,2012-01-01,300
3,2014-01-01,160


In [13]:
# Summary statistics: count, mean, std, min, quartiles and max.
excel.describe()

Unnamed: 0,value
count,4.0
mean,175.0
std,107.548439
min,40.0
25%,130.0
50%,180.0
75%,225.0
max,300.0


In [14]:
# Print the index range, each column's non-null count and dtype, and memory usage.
excel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
time     4 non-null datetime64[ns]
value    4 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 144.0 bytes


In [15]:
#練習區

## 選擇、過濾數據

In [16]:
# Select a single column by its label; the result prints as a Series.
print(excel['value'])

0     40
1    200
2    300
3    160
Name: value, dtype: int64


In [17]:
# To get only the values held in a column (without the index), use .values.

print(excel['time'].values)

['2000-01-01T00:00:00.000000000' '2010-01-01T00:00:00.000000000'
 '2012-01-01T00:00:00.000000000' '2014-01-01T00:00:00.000000000']


In [18]:
# Select several columns at once by passing a list of labels.
print(excel.loc[:, ['time', 'value']])

        time  value
0 2000-01-01     40
1 2010-01-01    200
2 2012-01-01    300
3 2014-01-01    160


In [19]:
# Filter rows with a boolean condition.
# The string '2000' is compared against the datetime column; in the recorded
# output the 2000-01-01 row is excluded, i.e. the comparison is strict.
print(excel.loc[excel['time'] > '2000'])

        time  value
1 2010-01-01    200
2 2012-01-01    300
3 2014-01-01    160


In [20]:
#練習區

## 處理遺失值

In [21]:


# Build a small frame containing missing values, written column by column.
f = pd.DataFrame(
    {
        'A': [0, 3, np.nan, np.nan],
        'B': [2, 4, np.nan, 3],
        'C': [np.nan, np.nan, np.nan, np.nan],
        'D': [0, 1, np.nan, 4],
    },
    columns=list('ABCD'),
)
f

Unnamed: 0,A,B,C,D
0,0.0,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,
3,,3.0,,4.0


In [22]:
# Drop a row as soon as it contains at least one missing value.
f_how_any = f.dropna(axis=0, how='any')
f_how_any

Unnamed: 0,A,B,C,D


In [23]:
# Drop a row only when every value in it is missing.
f_how_all = f.dropna(axis='index', how='all')
f_how_all

Unnamed: 0,A,B,C,D
0,0.0,2.0,,0.0
1,3.0,4.0,,1.0
3,,3.0,,4.0


In [24]:
# Replace every missing value with 0.

f_fillna_0 = f.fillna(value=0)
f_fillna_0

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0.0
1,3.0,4.0,0.0,1.0
2,0.0,0.0,0.0,0.0
3,0.0,3.0,0.0,4.0


In [25]:
#練習區

## 文字相關

In [26]:
# Upper/lower-case handling: start from names written with mixed casing.
name = ["harry", "Jack", "TIna", "peTer"]

# The dict key becomes the column label of the frame.
name_dict = dict(name=name)


name_df = pd.DataFrame(data=name_dict)
name_df

Unnamed: 0,name
0,harry
1,Jack
2,TIna
3,peTer


In [27]:
# Convert every entry of the column to upper case.
name_df['name'].str.upper()

0    HARRY
1     JACK
2     TINA
3    PETER
Name: name, dtype: object

In [28]:
# Convert every entry of the column to lower case.
name_df['name'].str.lower()

0    harry
1     jack
2     tina
3    peter
Name: name, dtype: object

In [29]:
#練習區

## 數據合併

In [30]:
# Three example frames for the concatenation demos. Each cell value is the
# column letter followed by a number (e.g. 'A0', 'B7').

# df1: columns A-D, values numbered 0-3, row labels 0-3.
df1 = pd.DataFrame(
    {col: [col + str(i) for i in range(0, 4)] for col in 'ABCD'},
    index=[0, 1, 2, 3],
)

# df2: same columns as df1, values numbered 4-7, row labels 5-8.
df2 = pd.DataFrame(
    {col: [col + str(i) for i in range(4, 8)] for col in 'ABCD'},
    index=[5, 6, 7, 8],
)

# df3: columns B, D, F only; value numbers equal the row labels 2, 3, 6, 7.
df3 = pd.DataFrame(
    {col: [col + str(i) for i in (2, 3, 6, 7)] for col in 'BDF'},
    index=[2, 3, 6, 7],
)

In [31]:
# Display df1 (columns A-D, row labels 0-3).
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [32]:
# Display df2 (columns A-D, row labels 5-8).
df2

Unnamed: 0,A,B,C,D
5,A4,B4,C4,D4
6,A5,B5,C5,D5
7,A6,B6,C6,D6
8,A7,B7,C7,D7


In [33]:
# Display df3 (columns B, D, F; row labels 2, 3, 6, 7).
df3

Unnamed: 0,B,D,F
2,B2,D2,F2
3,B3,D3,F3
6,B6,D6,F6
7,B7,D7,F7


In [34]:
# Both frames have the same columns: stack them along axis=0 (row-wise).
pd.concat([df1, df2], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
5,A4,B4,C4,D4
6,A5,B5,C5,D5
7,A6,B6,C6,D6
8,A7,B7,C7,D7


In [35]:
# Both frames have the same columns: place them side by side along axis=1.
# Rows are aligned on the index labels; labels present in only one frame get NaN.
pd.concat([df1, df2], axis='columns')

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
5,,,,,A4,B4,C4,D4
6,,,,,A5,B5,C5,D5
7,,,,,A6,B6,C6,D6
8,,,,,A7,B7,C7,D7


In [36]:
# With ignore_index=True the original row labels are discarded and the
# result is renumbered from 0.

pd.concat([df1, df2], axis=0, ignore_index=True)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [37]:
# The frames have different columns: stacking along axis=0 keeps the union of
# the columns and fills the gaps with NaN.
pd.concat([df1, df3], axis=0, join='outer')

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [38]:
# The frames have different columns and join='inner' is requested:
# only the columns common to both frames (B and D) are kept.

pd.concat(objs=[df1, df3], axis=0, join='inner')

Unnamed: 0,B,D
0,B0,D0
1,B1,D1
2,B2,D2
3,B3,D3
2,B2,D2
3,B3,D3
6,B6,D6
7,B7,D7


In [39]:
# Side-by-side concatenation with join='inner':
# only the row labels present in both frames (index 2 and 3) are kept.

pd.concat(objs=[df1, df3], axis='columns', join='inner')

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [40]:
# Sample data for the group-by example: two key columns (A, B) and two
# numeric columns (C, D) filled with random draws.


# A locally seeded generator makes C and D identical on every run, so
# re-executing the notebook reproduces the same table. The global NumPy
# random state is left untouched.
rng = np.random.RandomState(42)

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                           'foo', 'bar', 'foo', 'foo'],
                    'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                    'C' : rng.randn(8),
                    'D' : rng.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,1.110699,-0.681306
1,bar,one,0.707609,-0.023292
2,foo,two,-0.683871,0.453758
3,bar,three,-0.344804,-0.038643
4,foo,two,0.412123,-0.226106
5,bar,two,-0.567503,1.619959
6,foo,one,0.319965,-0.39886
7,foo,three,-1.166331,0.33613


In [43]:
# Group the rows by the (A, B) key pair and total the other columns per group.
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.707609,-0.023292
bar,three,-0.344804,-0.038643
bar,two,-0.567503,1.619959
foo,one,1.430665,-1.080166
foo,three,-1.166331,0.33613
foo,two,-0.271748,0.227653


In [None]:
#https://my.oschina.net/lionets/blog/280332

## pandas處理時間序列數據