In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 形の違うデータの計算

In [2]:
# Three integers labelled A, B, C.
ser1 = Series(
    data=[0, 1, 2],
    index=list('ABC'),
)

In [3]:
ser1

A    0
B    1
C    2
dtype: int64

In [4]:
# Four integers labelled A..D; label 'D' has no counterpart in the A..C Series.
ser2 = Series(
    data=[3, 4, 5, 6],
    index=list('ABCD'),
)

In [5]:
ser2

A    3
B    4
C    5
D    6
dtype: int64

In [6]:
# Addition aligns on index labels. 'D' exists only in ser2, so the result for
# 'D' is NaN and the whole result is upcast to float64.
ser1 + ser2

A    3.0
B    5.0
C    7.0
D    NaN
dtype: float64

In [7]:
# 2x2 frame holding 0..3, columns A/B, rows NY/LA.
df1 = DataFrame(
    np.arange(4).reshape(2, 2),
    index=['NY', 'LA'],
    columns=['A', 'B'],
)

In [8]:
df1

Unnamed: 0,A,B
NY,0,1
LA,2,3


In [11]:
# 3x3 frame holding 0..8, columns A/D/C, rows NY/SF/LA.
df2 = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['NY', 'SF', 'LA'],
    columns=['A', 'D', 'C'],
)

In [12]:
df2

Unnamed: 0,A,D,C
NY,0,1,2
SF,3,4,5
LA,6,7,8


In [13]:
# Frames are aligned on both row and column labels. Only cells present in
# both operands (rows NY/LA, column A) get a value; every other cell is NaN.
df1 + df2

Unnamed: 0,A,B,C,D
LA,8.0,,,
NY,0.0,,,
SF,,,,


In [14]:
# Same aligned addition, but a cell missing from one operand is treated as 0.
# A cell missing from both operands (SF, B) still comes out as NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,A,B,C,D
LA,8.0,3.0,8.0,7.0
NY,0.0,1.0,2.0,1.0
SF,3.0,,5.0,4.0


In [15]:
df2

Unnamed: 0,A,D,C
NY,0,1,2
SF,3,4,5
LA,6,7,8


In [17]:
# First row of df2 by position, returned as a Series indexed by the column
# labels and named after the row label (NY).
ser3 = df2.iloc[0]

In [18]:
ser3

A    0
D    1
C    2
Name: NY, dtype: int64

In [19]:
# The Series is matched against the column labels and subtracted from every
# row, so the NY row becomes all zeros.
df2 - ser3

Unnamed: 0,A,D,C
NY,0,0,0
SF,3,3,3
LA,6,6,6


# データの並べ替えと順位付け

In [31]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [32]:
# Values 0..2 with deliberately unsorted labels C, A, B.
ser1 = Series(
    range(3),
    index=list('CAB'),
)

In [33]:
ser1

C    0
A    1
B    2
dtype: int64

In [34]:
# Returns a new Series ordered by index label; ser1 itself keeps its
# original C, A, B order.
ser1.sort_index()

A    1
B    2
C    0
dtype: int64

In [35]:
ser1

C    0
A    1
B    2
dtype: int64

In [36]:
from numpy.random import randn

In [37]:
# Ten draws from randn with a default 0..9 index.
# NOTE(review): no seed is set in the visible cells, so the values printed
# below will differ on every run.
ser2 = Series(randn(10))

In [38]:
ser2

0   -0.210967
1   -0.284728
2   -1.456951
3   -1.911833
4    1.892703
5   -0.431785
6   -0.649372
7    1.719287
8   -0.610569
9    0.264762
dtype: float64

In [39]:
# Rank of each value in ascending order: the smallest value gets 1.0 and the
# largest gets 10.0. Ranks are returned as floats.
ser2.rank()

0     7.0
1     6.0
2     2.0
3     1.0
4    10.0
5     5.0
6     3.0
7     9.0
8     4.0
9     8.0
dtype: float64

In [42]:
ser2.sort_values(ascending=False)

4    1.892703
7    1.719287
9    0.264762
0   -0.210967
1   -0.284728
5   -0.431785
8   -0.610569
6   -0.649372
2   -1.456951
3   -1.911833
dtype: float64

In [44]:
# Keep the descending-sorted copy of ser2.
# NOTE(review): despite the "df" in the name, this holds a Series.
sorted_df2 = ser2.sort_values(ascending=False)

In [45]:
sorted_df2

4    1.892703
7    1.719287
9    0.264762
0   -0.210967
1   -0.284728
5   -0.431785
8   -0.610569
6   -0.649372
2   -1.456951
3   -1.911833
dtype: float64

In [46]:
# Ranks depend on the values, not on row position: each label receives the
# same rank it had before sorting.
sorted_df2.rank()

4    10.0
7     9.0
9     8.0
0     7.0
1     6.0
5     5.0
8     4.0
6     3.0
2     2.0
3     1.0
dtype: float64

# データの統計量

In [47]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [48]:
# 2x3 float array with one NaN in each row.
arr = np.array([
    [1, 2, np.nan],
    [np.nan, 3, 4],
])

In [49]:
arr

array([[  1.,   2.,  nan],
       [ nan,   3.,   4.]])

In [50]:
# Wrap the array in a frame: rows A/B, columns One/Two/Three.
df1 = DataFrame(
    arr,
    columns=['One', 'Two', 'Three'],
    index=['A', 'B'],
)

In [51]:
df1

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,3.0,4.0


In [52]:
# Column-wise totals; NaN entries are skipped rather than propagated.
df1.sum()

One      1.0
Two      5.0
Three    4.0
dtype: float64

In [53]:
# Row-wise totals (axis=1); NaN entries are skipped here as well.
df1.sum(axis=1)

A    3.0
B    7.0
dtype: float64

In [54]:
df1.min()

One      1.0
Two      2.0
Three    4.0
dtype: float64

In [55]:
# For each column, the row label at which the minimum occurs (NaN ignored).
df1.idxmin()

One      A
Two      A
Three    B
dtype: object

In [56]:
df1.max()

One      1.0
Two      3.0
Three    4.0
dtype: float64

In [57]:
df1

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,3.0,4.0


## 累積和を求める

In [58]:
# Running total down each column. A NaN cell stays NaN in the result but
# does not reset or poison the totals around it.
df1.cumsum()

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,5.0,4.0


In [59]:
# Summary statistics per column. `count` only counts non-NaN values, and
# `std` is NaN for columns that have a single observation.
df1.describe()

Unnamed: 0,One,Two,Three
count,1.0,2.0,1.0
mean,1.0,2.5,4.0
std,,0.707107,
min,1.0,2.0,4.0
25%,1.0,2.25,4.0
50%,1.0,2.5,4.0
75%,1.0,2.75,4.0
max,1.0,3.0,4.0


# 欠損値の扱い

In [106]:
import numpy as np
from numpy import nan
import pandas as pd
from pandas import Series, DataFrame

In [107]:
# String Series with one missing entry at position 2.
data = Series(
    data=['One', 'Two', nan, 'Four'],
)

In [108]:
data

0     One
1     Two
2     NaN
3    Four
dtype: object

In [109]:
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [110]:
data.dropna()

0     One
1     Two
3    Four
dtype: object

In [111]:
# 4x3 frame: one complete row, two rows with a single NaN, one all-NaN row.
df = DataFrame([
    [1, 2, 3],
    [nan, 5, 6],
    [7, nan, 9],
    [nan, nan, nan],
])

In [112]:
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [113]:
df.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [114]:
# how='all' drops a row only when every value in it is NaN, so just the last
# row is removed.
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [115]:
df.dropna(axis=0)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [116]:
# axis=1 drops columns instead of rows. Every column here contains at least
# one NaN, so no columns survive and only the row index remains.
df.dropna(axis=1)

0
1
2
3


In [117]:
df.dropna(how='any')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [118]:
# 4x4 frame whose rows hold 3, 3, 2 and 1 non-missing values respectively.
df2 = DataFrame([
    [1, 2, 3, nan],
    [2, nan, 5, 6],
    [nan, 7, nan, 9],
    [1, nan, nan, nan],
])

In [119]:
df2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


## 欠損値でないデータがn個以上ある行だけを残す:dropna(thresh=n)

In [120]:
# Keep rows that have at least 2 non-NaN values; the last row has only one
# and is dropped.
df2.dropna(thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [121]:
# Keep rows that have at least 3 non-NaN values; only the first two rows
# qualify.
df2.dropna(thresh=3)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0


## 欠損値を別の値で埋める:fillna(n)

In [122]:
df2.fillna(1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,1.0
1,2.0,1.0,5.0,6.0
2,1.0,7.0,1.0,9.0
3,1.0,1.0,1.0,1.0


In [123]:
df2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [124]:
# A dict gives a separate fill value per column label: NaNs in column 0
# become 0, in column 1 become 1, and so on. df2 itself is not modified.
df2.fillna({0:0, 1:1, 2:2, 3:3})

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,2.0,1.0,5.0,6.0
2,0.0,7.0,2.0,9.0
3,1.0,1.0,2.0,3.0


In [125]:
df2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [126]:
# Replace every NaN with 0 and rebind df2 to the filled copy. Rebinding,
# instead of inplace=True, avoids mutating the frame object in place while
# leaving df2 with the same filled contents for the cells that follow.
df2 = df2.fillna(0)

In [127]:
df2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,0.0
1,2.0,0.0,5.0,6.0
2,0.0,7.0,0.0,9.0
3,1.0,0.0,0.0,0.0


# indexの階層構造

In [161]:
import numpy as np
from numpy.random import randn
import pandas as pd
from pandas import Series, DataFrame

In [162]:
# Six random values under a two-level index: outer level 1/2, inner level a/b/c.
# No seed is set here, so the values differ from run to run.
ser = Series(
    np.random.randn(6),
    index=[
        [1, 1, 1, 2, 2, 2],
        ['a', 'b', 'c', 'a', 'b', 'c'],
    ],
)

In [163]:
ser

1  a   -1.813810
   b   -0.754311
   c   -0.766721
2  a    0.214925
   b   -0.843587
   c    0.466687
dtype: float64

In [164]:
ser.index

MultiIndex(levels=[[1, 2], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [165]:
# Selecting a single outer-level label returns the sub-Series under that
# label, indexed by the inner level (a, b, c).
ser[1]

a   -1.813810
b   -0.754311
c   -0.766721
dtype: float64

In [166]:
ser[2]

a    0.214925
b   -0.843587
c    0.466687
dtype: float64

In [167]:
# Slice across the outer level and pick inner label 'a': one value per outer
# label.
ser[:, 'a']

1   -1.813810
2    0.214925
dtype: float64

In [168]:
# Pivot the inner index level into columns: the result is a DataFrame with
# the outer level (1, 2) as rows and a/b/c as columns.
df = ser.unstack()

In [169]:
df

Unnamed: 0,a,b,c
1,-1.81381,-0.754311,-0.766721
2,0.214925,-0.843587,0.466687


In [170]:
# Unstacking the frame yields a Series again, but keyed by (column, row),
# i.e. the two levels come out in the opposite order from the original ser.
df.unstack()

a  1   -1.813810
   2    0.214925
b  1   -0.754311
   2   -0.843587
c  1   -0.766721
   2    0.466687
dtype: float64

In [171]:
# Transposing first puts the levels back in (outer, inner) order, which
# reproduces the layout of the original ser.
df.T.unstack()

1  a   -1.813810
   b   -0.754311
   c   -0.766721
2  a    0.214925
   b   -0.843587
   c    0.466687
dtype: float64

In [172]:
# 4x4 frame of 0..15 with two-level labels on both axes:
# rows are (letter, number) pairs, columns are (city, temperature) pairs.
df2 = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[
        ['a', 'a', 'b', 'b'],
        [1, 1, 2, 2],
    ],
    columns=[
        ['NY', 'NY', 'LA', 'SF'],
        ['cold', 'hot', 'hot', 'cold'],
    ],
)

In [173]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
a,1,0,1,2,3
a,1,4,5,6,7
b,2,8,9,10,11
b,2,12,13,14,15


In [174]:
# Name the two row-index levels; this modifies df2's index in place.
df2.index.names = ['INDEX_1', 'INDEX_2']

In [175]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,1,4,5,6,7
b,2,8,9,10,11
b,2,12,13,14,15


In [176]:
# Name the two column levels so they can be referred to by name
# ('Cities', 'Temp') instead of by position.
df2.columns.names = ['Cities', 'Temp']

In [177]:
df2

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,1,4,5,6,7
b,2,8,9,10,11
b,2,12,13,14,15


In [178]:
# Swap the order of the two column levels (axis=1) so 'Temp' sits above
# 'Cities'. Column order and data are unchanged, and df2 itself is untouched.
df2.swaplevel('Cities', 'Temp', axis=1)

Unnamed: 0_level_0,Temp,cold,hot,hot,cold
Unnamed: 0_level_1,Cities,NY,NY,LA,SF
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,1,4,5,6,7
b,2,8,9,10,11
b,2,12,13,14,15


In [179]:
# Sort the columns by their labels (LA, NY, SF). `axis` is passed by keyword
# because sort_index no longer accepts positional arguments in pandas >= 2.0.
# Returns a new frame; df2 is not modified.
df2.sort_index(axis=1)

Unnamed: 0_level_0,Cities,LA,NY,NY,SF
Unnamed: 0_level_1,Temp,hot,cold,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,2,0,1,3
a,1,6,4,5,7
b,2,10,8,9,11
b,2,14,12,13,15


In [180]:
df2

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,1,4,5,6,7
b,2,8,9,10,11
b,2,12,13,14,15


In [182]:
# Sort the columns first, then the rows. `axis` is passed by keyword because
# sort_index no longer accepts positional arguments in pandas >= 2.0.
df2.sort_index(axis=1).sort_index(axis=0)

Unnamed: 0_level_0,Cities,LA,NY,NY,SF
Unnamed: 0_level_1,Temp,hot,cold,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,2,0,1,3
a,1,6,4,5,7
b,2,10,8,9,11
b,2,14,12,13,15


In [184]:
# Sum the columns that share the same 'Temp' label (cold / hot) for each row.
# The `level=` argument of DataFrame.sum was removed in pandas 2.0, so the
# grouping is done explicitly: transpose so the column levels become the row
# index, group by the 'Temp' level, sum, and transpose back.
df2.T.groupby(level='Temp').sum().T

Unnamed: 0_level_0,Temp,cold,hot
INDEX_1,INDEX_2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3,3
a,1,11,11
b,2,19,19
b,2,27,27
