# Pandas
* Series: インデックス（ラベル）付きの1次元配列
* DataFrame: 行ラベルと列ラベルを持つ2次元の表形式データ

## データの取り扱い

In [5]:
import numpy as np
import pandas as pd

# A Series is one-dimensional; its printed form shows the index labels on the
# left and the values on the right.
s = pd.Series([1, 2, 3])

# print() joins its arguments with a single space, so the sum is written on
# the same line as the trailing "dtype" line of the Series.
print(s, s.sum())

0    1
1    2
2    3
dtype: int64 6


In [8]:
# Build a small frame from a dict: each key becomes a column label and each
# list becomes that column's values.
df = pd.DataFrame(
    {
        "A": [1, 2],
        "B": [3, 4],
    }
)
print(df)

# Rebind df to a 6x4 frame of standard-normal samples; with no labels given,
# rows and columns get the default integer labels.
df = pd.DataFrame(np.random.standard_normal((6, 4)))
print(df)

   A  B
0  1  3
1  2  4
          0         1         2         3
0  0.527904  0.064980 -0.754208 -1.337228
1 -0.036329 -0.878987  2.527335  1.367659
2  0.963007  0.213922 -0.249781 -1.364490
3 -0.321657 -1.110548 -0.460904  0.464913
4 -0.912040 -0.302405  0.731294 -1.094151
5 -1.091501 -0.652010  0.232474  0.278643


In [9]:
# Label the rows: use six consecutive days starting 2022-01-01 as the index.
# The columns keep their default integer labels in this cell.
df = pd.DataFrame(
    np.random.standard_normal((6, 4)),
    index=pd.date_range("20220101", periods=6),
)
df

Unnamed: 0,0,1,2,3
2022-01-01,-1.183619,-0.339819,0.428886,-0.293968
2022-01-02,-0.217525,-0.877012,0.14022,-1.261291
2022-01-03,-1.510796,0.417685,0.068797,0.733771
2022-01-04,-0.721062,0.948308,1.796441,1.202504
2022-01-05,-1.126037,0.018592,0.534462,0.536118
2022-01-06,0.991293,-0.870274,0.089913,0.560146


In [11]:
# A 6x4 random frame with both axes labelled: dates on the rows and the
# letters A-D on the columns.
df = pd.DataFrame(
    data=np.random.standard_normal((6, 4)),
    columns=["A", "B", "C", "D"],
    index=pd.date_range("20220101", periods=6),
)
df

Unnamed: 0,A,B,C,D
2022-01-01,-0.801423,-0.521256,2.613659,-1.264521
2022-01-02,0.17557,0.21141,-1.132272,0.330294
2022-01-03,0.648094,2.468972,0.237504,-0.577279
2022-01-04,-1.083805,-0.804736,0.219717,0.798769
2022-01-05,0.974522,-0.910128,0.652141,0.316324
2022-01-06,0.332762,0.685423,0.514601,1.679364


In [12]:
# Sorting: order the rows by the values in column B, smallest first.
# The sorted frame is returned as a new object; df is not modified.
df.sort_values("B")

Unnamed: 0,A,B,C,D
2022-01-05,0.974522,-0.910128,0.652141,0.316324
2022-01-04,-1.083805,-0.804736,0.219717,0.798769
2022-01-01,-0.801423,-0.521256,2.613659,-1.264521
2022-01-02,0.17557,0.21141,-1.132272,0.330294
2022-01-06,0.332762,0.685423,0.514601,1.679364
2022-01-03,0.648094,2.468972,0.237504,-0.577279


## データの取り出し

In [13]:
df.iloc[:3]  # take the first three rows, selected by position

Unnamed: 0,A,B,C,D
2022-01-01,-0.801423,-0.521256,2.613659,-1.264521
2022-01-02,0.17557,0.21141,-1.132272,0.330294
2022-01-03,0.648094,2.468972,0.237504,-0.577279


In [17]:
# Label-based selection with .loc, printed one result after another:
#   1. a single row picked by its date,
#   2. the same row restricted to columns A and B,
#   3. a date range (both endpoints included) restricted to columns A and B.
print(
    df.loc["20220101"],
    df.loc["20220101", ["A", "B"]],
    df.loc["20220101":"20220104", ["A", "B"]],
    sep="\n",
)

A   -0.801423
B   -0.521256
C    2.613659
D   -1.264521
Name: 2022-01-01 00:00:00, dtype: float64
A   -0.801423
B   -0.521256
Name: 2022-01-01 00:00:00, dtype: float64
                   A         B
2022-01-01 -0.801423 -0.521256
2022-01-02  0.175570  0.211410
2022-01-03  0.648094  2.468972
2022-01-04 -1.083805 -0.804736


## 条件に当てはまるデータの抽出
boolean indexing（ブーリアンインデクシング）：真偽値（True/False）の条件を使って、条件に当てはまる行や要素だけを取り出す方法。

In [20]:
# A boolean Series as the key keeps only the rows where column A is positive.
# A boolean frame as the key keeps the full shape and shows NaN in every
# position where the condition is false.
print(
    df[df["A"] > 0],
    df[df > 0],
    sep="\n",
)

                   A         B         C         D
2022-01-02  0.175570  0.211410 -1.132272  0.330294
2022-01-03  0.648094  2.468972  0.237504 -0.577279
2022-01-05  0.974522 -0.910128  0.652141  0.316324
2022-01-06  0.332762  0.685423  0.514601  1.679364
                   A         B         C         D
2022-01-01       NaN       NaN  2.613659       NaN
2022-01-02  0.175570  0.211410       NaN  0.330294
2022-01-03  0.648094  2.468972  0.237504       NaN
2022-01-04       NaN       NaN  0.219717  0.798769
2022-01-05  0.974522       NaN  0.652141  0.316324
2022-01-06  0.332762  0.685423  0.514601  1.679364


## データの追加・結合

In [22]:
# Work on a copy so that df keeps its original columns, then attach a new
# column E holding one string label per row.
df2 = df.copy(deep=True)
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2022-01-01,-0.801423,-0.521256,2.613659,-1.264521,one
2022-01-02,0.17557,0.21141,-1.132272,0.330294,one
2022-01-03,0.648094,2.468972,0.237504,-0.577279,two
2022-01-04,-1.083805,-0.804736,0.219717,0.798769,three
2022-01-05,0.974522,-0.910128,0.652141,0.316324,four
2022-01-06,0.332762,0.685423,0.514601,1.679364,three


In [25]:
# isin(): keep only the rows whose E value is one of the listed labels
df2.loc[df2["E"].isin(["one", "four"])]

Unnamed: 0,A,B,C,D,E
2022-01-01,-0.801423,-0.521256,2.613659,-1.264521,one
2022-01-02,0.17557,0.21141,-1.132272,0.330294,one
2022-01-05,0.974522,-0.910128,0.652141,0.316324,four


In [26]:
# shift(): move every value down by one row while the index stays as it is.
# No row is added -- the first row is filled with missing values and the
# values of the last row drop out of the result. df2 itself is not modified.
df2.shift(periods=1)

Unnamed: 0,A,B,C,D,E
2022-01-01,,,,,
2022-01-02,-0.801423,-0.521256,2.613659,-1.264521,one
2022-01-03,0.17557,0.21141,-1.132272,0.330294,one
2022-01-04,0.648094,2.468972,0.237504,-0.577279,two
2022-01-05,-1.083805,-0.804736,0.219717,0.798769,three
2022-01-06,0.974522,-0.910128,0.652141,0.316324,four


In [29]:
# Concatenation: stack a 2x2 random frame on top of itself, row-wise.
# The original index labels are kept, so 0 and 1 each appear twice.
df = pd.DataFrame(np.random.standard_normal((2, 2)))
pd.concat([df, df], axis=0)

Unnamed: 0,0,1
0,1.256729,0.499931
1,0.929016,1.076345
0,1.256729,0.499931
1,0.929016,1.076345


In [33]:
# Grouping: collect the rows that share the same key in column A and add up
# column B within each group.
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar"],
        "B": np.random.standard_normal(4),
    }
)
df.groupby(by="A").sum()

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
bar,2.660197
foo,-0.639427
