# 第1章 pandasの基礎知識

In [1]:
import pandas as pd
import numpy as np

---

## 1.1 データの参照と更新（`loc` / `iloc`）

In [2]:
# Three students, keyed by the non-contiguous integer row labels 10, 20, 30.
df = pd.DataFrame(
    {
        "Name": ["Alice", "Bob", "Carol"],
        "Point": [87, 65, 92],
    },
    index=[10, 20, 30],
)
df

Unnamed: 0,Name,Point
10,Alice,87
20,Bob,65
30,Carol,92


In [None]:
# Take the "Name" column out of df as a Series.
sr = df.Name  # can also be written as df["Name"]
sr

In [None]:
# Label-based lookup: the row whose index label is 10.
df.loc[10]

In [None]:
# Label-based lookup on a Series: the element whose index label is 10.
sr.loc[10]

In [None]:
# All rows (":") of the single column "Name".
df.loc[:, "Name"]

In [None]:
# Two equivalent spellings of the same column selection. Only the last
# expression of a cell is displayed, so the output comes from df["Name"].
df.loc[:, "Name"]
df["Name"]

In [None]:
# Example that triggers a warning

# Chained indexing: df2["Name"] is evaluated first and the assignment is then
# applied to that intermediate object. pandas warns about this pattern, and
# whether df2 itself ends up modified depends on the pandas version.
df2 = df.copy()
df2["Name"][20] = "Bill"

In [None]:
# Assign through a single .loc call (row label 20, column "Name") instead of
# chaining two indexing operations.
df2.loc[20, "Name"] = "Bill"

In [None]:
# Label slice with .loc: both endpoints (10 and 20) are included.
df.loc[10:20]

In [None]:
# The same inclusive label slice applied to a Series.
sr.loc[10:20]

In [None]:
# One-row frame with month-named columns holding the values 1..4.
columns = ["Jan", "Feb", "Mar", "Apr"]
df4 = pd.DataFrame(
    {month: [value] for value, month in enumerate(columns, start=1)}
)
# A label slice over columns includes both endpoints: "Feb", "Mar", "Apr".
df4.loc[:, "Feb":"Apr"]

In [None]:
# Work on a copy of df whose row labels are converted to strings.
df3 = df.copy()
df3.index = df3.index.astype(str)
# With a slice, plain [] selects rows; a label slice includes both endpoints.
df3["10":"20"]

In [None]:
sr2 = sr.copy()
sr2.index = sr2.index.astype(str)  # convert the row labels to strings
# Label slice on the string index; both endpoints are included.
sr2["10":"20"]

In [None]:
sr3 = sr.copy()
sr3.index = ["20", "10", "30"]  # index that is not sorted
# Same label slice as before, but here "10" comes after "20" in the index.
sr3["10":"20"]

In [None]:
# The end label "3" does not exist in sr2's index ("10", "20", "30").
sr2["10":"3"]

In [None]:
# A list of labels selects exactly those rows (10 and 30).
df.loc[[10, 30]]

In [None]:
# A list of labels selects exactly those elements (10 and 30).
sr.loc[[10, 30]]

In [None]:
# Passing the column as a one-element list returns a DataFrame, not a Series.
df.loc[:, ["Name"]]

In [None]:
# Two equivalent spellings of the same one-column DataFrame selection. Only the
# last expression of a cell is displayed.
df.loc[:, ["Name"]]
df[["Name"]]

In [None]:
# Boolean mask: keep the rows where "Point" is 80 or more.
df.loc[df["Point"] >= 80]

In [None]:
# Boolean mask built with the .str accessor: keep values starting with "A".
sr.loc[sr.str.startswith("A")]

In [None]:
# Two equivalent spellings of the same boolean row filter. Only the last
# expression of a cell is displayed.
df.loc[df["Point"] >= 80]
df[df["Point"] >= 80]

In [None]:
# Example that triggers a warning

df5 = df.copy()
# Chained indexing: df5[mask] produces a new object, and the assignment to its
# "Point" column is applied to that temporary object rather than to df5.
df5[df5["Point"] >= 80]["Point"] = 80
df5

In [None]:
# Row mask and column label in a single .loc call, so the assignment is
# applied to df5 itself.
df5.loc[df5["Point"] >= 80, "Point"] = 80
df5

In [None]:
# Boolean mask over the column labels: keep columns whose name starts with "Na".
df.loc[:, df.columns.str.startswith("Na")]

In [None]:
# Position-based lookup: the first row, regardless of its label.
df.iloc[0]

In [None]:
# Position-based lookup: all rows of the first column.
df.iloc[:, 0]

In [None]:
# Positional slice: the first two rows (the end position is excluded).
df.iloc[:2]

In [None]:
# An integer slice with plain [] is also positional: the first two rows.
df[:2]

In [None]:
# A list of positions selects exactly those rows (first and second).
df.iloc[[0, 1]]

In [None]:
# A one-element list of column positions returns a DataFrame, not a Series.
df.iloc[:, [0]]

## 1.2 行の絞り込み（ブールインデックス）

In [None]:
# Two students with their math and science scores.
df = pd.DataFrame(
    {
        "Name": ["Alice", "Bob"],
        "Math": [87, 65],
        "Sci": [76, 88],
    }
)
df

In [None]:
# Element-wise comparison: a boolean Series, True where "Math" is 80 or more.
df["Math"] >= 80

In [None]:
# Use the boolean Series as a row filter.
df[df["Math"] >= 80]

In [None]:
# Compare two columns row by row: True where "Math" is lower than "Sci".
df["Math"] < df["Sci"]

In [None]:
# Keep the rows where "Math" is lower than "Sci".
df[df["Math"] < df["Sci"]]

In [None]:
# Method form of the "<" comparison (lt = "less than").
df["Math"].lt(df["Sci"])

In [None]:
# Row filter written with the method form of the comparison.
df[df["Math"].lt(df["Sci"])]

## 1.3 インデックスの設定（`DataFrame.set_index()`）

In [None]:
# Price table: one row per combination of the 日別 and 年齢別 values.
df = pd.DataFrame(
    {
        "日別": ["平日", "平日", "土日祝", "土日祝"],
        "年齢別": ["大人", "小人", "大人", "小人"],
        "料金": [2000, 1000, 3000, 1500],
    }
)
df

In [None]:
# Return a new frame that uses the "日別" column as its index; the result is
# not assigned, so df itself is unchanged.
df.set_index("日別")

In [None]:
# A list of columns produces a two-level (hierarchical) index.
df.set_index(["日別", "年齢別"])

## 1.4 インデックスのリセット（`DataFrame.reset_index()`）

In [None]:
# Build a frame with one missing value, then drop that row so the surviving
# row labels (0 and 2) are no longer contiguous.
df = pd.DataFrame({"Point": [2, np.nan, 1]}).dropna()
df

In [None]:
# Renumber the rows from 0 and discard the old labels (drop=True).
# NOTE(review): df already had dropna() applied in the cell that builds it, so
# the dropna() here looks redundant — confirm whether it is kept for illustration.
df.dropna().reset_index(drop=True)

In [None]:
# One row per (student, subject) score.
df = pd.DataFrame(
    {
        "Name": ["Alice", "Alice", "Bob", "Bob"],
        "Subject": ["国語", "数学", "国語", "理科"],
        "Point": [100, 80, 40, 80],
    }
)
df

In [None]:
# Mean per student. numeric_only=True restricts the aggregation to numeric
# columns, so the string column "Subject" is left out; "Name" becomes the index.
df.groupby("Name").mean(numeric_only=True)

In [None]:
# Same aggregation, then reset_index() turns the "Name" index back into a
# regular column.
df.groupby("Name").mean(numeric_only=True).reset_index()

## 1.5 Seriesのインデックスのリセット（`Series.reset_index()`）

In [None]:
# One row per (student, subject) score.
df = pd.DataFrame(
    {
        "Name": ["Alice", "Alice", "Bob", "Bob"],
        "Subject": ["国語", "数学", "国語", "理科"],
        "Point": [100, 80, 40, 80],
    }
)
# Mean score per student, as a Series indexed by "Name".
sr = df.groupby("Name")["Point"].mean()
sr

In [None]:
# drop=True discards the "Name" index; the result is still a Series.
sr.reset_index(drop=True)

In [None]:
# Without drop=True the index becomes a column, so the result is a DataFrame.
sr.reset_index()

## 1.6 データの結合（`DataFrame.merge()`）

In [None]:
# Left table: names keyed by ID (ID 0 appears twice).
df1 = pd.DataFrame({"ID": [0, 0, 1], "Name": ["Alice", "Bob", "Carol"]})
# Right table: subjects keyed by ID (only ID 0 is present).
df2 = pd.DataFrame({"ID": [0, 0], "Subject": ["国語", "数学"]})
# on=None joins on the columns the two frames share, which here is "ID".
df1.merge(df2, how="inner", on=None)

In [None]:
# Left join: every row of df1 is kept; rows with no match in df2 get a missing
# value in the columns that come from df2.
df1.merge(df2, how="left")

## 1.7 データの結合（`DataFrame.join()`）

In [None]:
# Both frames carry the join key in their index rather than in a column.
df1 = pd.DataFrame({"Name": ["Alice", "Bob", "Carol"]}, index=[0, 0, 1])
df2 = pd.DataFrame({"Subject": ["国語", "数学"]}, index=[0, 0])
# on=None joins index to index.
df1.join(df2, on=None, how="left")

In [None]:
# Here the key lives in a regular column ("ID") of the left frame.
df3 = pd.DataFrame({"ID": [0, 0, 1], "Name": ["Alice", "Bob", "Carol"]})
# on="ID" matches df3's "ID" column against df2's index.
df3.join(df2, on="ID", how="left")

In [None]:
# Index-to-index inner join: only index labels present in both frames remain.
df1.join(df2, on=None, how="inner")

## 1.8 関数の適用（`DataFrame.apply()` / `Series.apply()`）

In [None]:
# Users' ages together with the discount type applied to each
# (an empty string means no discount).
df = pd.DataFrame(
    {
        "年齢": [12, 20, 32],
        "割引種別": ["", "学割利用", ""],
    }
)
df

In [None]:
def categorize(x):
    """Label a single age value.

    Args:
        x: One element of the column the function is applied to.

    Returns:
        "子供" when ``x`` is below 13, otherwise "大人".
    """
    if x < 13:
        return "子供"
    return "大人"

# Apply categorize() to each element of the "年齢" column.
df["年齢"].apply(categorize)

In [None]:
def categorize(sr):
    """Label one row by age group, appending the discount type when present.

    Args:
        sr: One row, indexable by the keys "年齢" and "割引種別".

    Returns:
        "子供" (age below 13) or "大人", followed by the discount type wrapped
        in full-width parentheses when that value is truthy.
    """
    label = "大人"
    if sr["年齢"] < 13:
        label = "子供"
    discount = sr["割引種別"]
    if discount:
        # Concatenate the age group and the discount type.
        label += f"（{discount}）"
    return label

# Apply categorize() to each row: with axis=1 the function receives one row at
# a time as a Series.
df.apply(categorize, axis=1)

## 1.9 データのグループ化（`DataFrame.groupby()`）

In [None]:
# Height and weight recorded for each student, together with their class.
df = pd.DataFrame(
    {
        "クラス": ["A", "A", "B", "B"],
        "身長": [172, 160, 155, 162],
        "体重": [63, 54, 51, 59],
    }
)
df

In [None]:
# Mean of every remaining column for each class.
df.groupby("クラス").mean()

In [None]:
# Mean height for each class (selecting one column yields a Series).
df.groupby("クラス")["身長"].mean()