In [1]:
# ライブラリのimport
import polars as pl
import numpy as np
import math
from sklearn.model_selection import train_test_split, TimeSeriesSplit


# dtypeの設定
dtypes = {
    "customer_id": str,
    "gender_cd": str,
    "postal_cd": str,
    "application_store_cd": str,
    "status_cd": str,
    "category_major_cd": str,
    "category_medium_cd": str,
    "category_small_cd": str,
    "product_cd": str,
    "store_cd": str,
    "prefecture_cd": str,
    "tel_no": str,
    "postal_cd": str,
    "street": str,
    "application_date": str,
    "birth_day": str,
}

# DataFrameの作成
df_category = pl.read_csv("../data/category.csv", dtypes=dtypes)
df_customer = pl.read_csv("../data/customer.csv", dtypes=dtypes)
df_geocode = pl.read_csv("../data/geocode.csv", dtypes=dtypes)
df_product = pl.read_csv("../data/product.csv", dtypes=dtypes)
df_receipt = pl.read_csv("../data/receipt.csv", dtypes=dtypes)
df_store = pl.read_csv("../data/store.csv", dtypes=dtypes)

print("pandasと異なる部分:")
print("1. shapeが表示される")
print("2. データ型が表示される")
print("3. indexが存在しない")

pandasと異なる部分:
1. shapeが表示される
2. データ型が表示される
3. indexが存在しない


# 演習問題
## 表示

---
> P-001: レシート明細データ（df_receipt）から全項目の先頭10件を表示し、どのようなデータを保有しているか目視で確認せよ。

In [2]:
df_receipt.head(10)

sales_ymd,sales_epoch,store_cd,receipt_no,receipt_sub_no,customer_id,product_cd,quantity,amount
i64,i64,str,i64,i64,str,str,i64,i64
20181103,1541203200,"""S14006""",112,1,"""CS006214000001…","""P070305012""",1,158
20181118,1542499200,"""S13008""",1132,2,"""CS008415000097…","""P070701017""",1,81
20170712,1499817600,"""S14028""",1102,1,"""CS028414000014…","""P060101005""",1,170
20190205,1549324800,"""S14042""",1132,1,"""ZZ000000000000…","""P050301001""",1,25
20180821,1534809600,"""S14025""",1102,2,"""CS025415000050…","""P060102007""",1,90
20190605,1559692800,"""S13003""",1112,1,"""CS003515000195…","""P050102002""",1,138
20181205,1543968000,"""S14024""",1102,2,"""CS024514000042…","""P080101005""",1,30
20190922,1569110400,"""S14040""",1102,1,"""CS040415000178…","""P070501004""",1,128
20170504,1493856000,"""S13020""",1112,2,"""ZZ000000000000…","""P071302010""",1,770
20191010,1570665600,"""S14027""",1102,1,"""CS027514000015…","""P071101003""",1,680


In [3]:
# スライスでも可能
df_receipt[:10]

sales_ymd,sales_epoch,store_cd,receipt_no,receipt_sub_no,customer_id,product_cd,quantity,amount
i64,i64,str,i64,i64,str,str,i64,i64
20181103,1541203200,"""S14006""",112,1,"""CS006214000001…","""P070305012""",1,158
20181118,1542499200,"""S13008""",1132,2,"""CS008415000097…","""P070701017""",1,81
20170712,1499817600,"""S14028""",1102,1,"""CS028414000014…","""P060101005""",1,170
20190205,1549324800,"""S14042""",1132,1,"""ZZ000000000000…","""P050301001""",1,25
20180821,1534809600,"""S14025""",1102,2,"""CS025415000050…","""P060102007""",1,90
20190605,1559692800,"""S13003""",1112,1,"""CS003515000195…","""P050102002""",1,138
20181205,1543968000,"""S14024""",1102,2,"""CS024514000042…","""P080101005""",1,30
20190922,1569110400,"""S14040""",1102,1,"""CS040415000178…","""P070501004""",1,128
20170504,1493856000,"""S13020""",1112,2,"""ZZ000000000000…","""P071302010""",1,770
20191010,1570665600,"""S14027""",1102,1,"""CS027514000015…","""P071101003""",1,680


---
> P-002: レシート明細データ（df_receipt）から売上年月日（sales_ymd）、顧客ID（customer_id）、商品コード（product_cd）、売上金額（amount）の順に列を指定し、10件表示せよ。

In [4]:
# DataFrameから特定の列を選択する場合は.select()メソッドを用いる
# .select()メソッドの中身はリスト型で渡してもいい
df_receipt.select("sales_ymd", "customer_id", "product_cd", "amount").head(10)

sales_ymd,customer_id,product_cd,amount
i64,str,str,i64
20181103,"""CS006214000001…","""P070305012""",158
20181118,"""CS008415000097…","""P070701017""",81
20170712,"""CS028414000014…","""P060101005""",170
20190205,"""ZZ000000000000…","""P050301001""",25
20180821,"""CS025415000050…","""P060102007""",90
20190605,"""CS003515000195…","""P050102002""",138
20181205,"""CS024514000042…","""P080101005""",30
20190922,"""CS040415000178…","""P070501004""",128
20170504,"""ZZ000000000000…","""P071302010""",770
20191010,"""CS027514000015…","""P071101003""",680


In [5]:
# エクスプレッション(pl.Expr())を渡してもいい
df_receipt.select(
    pl.col("sales_ymd"), pl.col("customer_id"), pl.col("product_cd"), pl.col("amount")
).head(10)

sales_ymd,customer_id,product_cd,amount
i64,str,str,i64
20181103,"""CS006214000001…","""P070305012""",158
20181118,"""CS008415000097…","""P070701017""",81
20170712,"""CS028414000014…","""P060101005""",170
20190205,"""ZZ000000000000…","""P050301001""",25
20180821,"""CS025415000050…","""P060102007""",90
20190605,"""CS003515000195…","""P050102002""",138
20181205,"""CS024514000042…","""P080101005""",30
20190922,"""CS040415000178…","""P070501004""",128
20170504,"""ZZ000000000000…","""P071302010""",770
20191010,"""CS027514000015…","""P071101003""",680


---
> P-003: レシート明細データ（df_receipt）から売上年月日（sales_ymd）、顧客ID（customer_id）、商品コード（product_cd）、売上金額（amount）の順に列を指定し、5件表示せよ。ただし、sales_ymdをsales_dateに項目名を変更しながら抽出すること。

In [6]:
# renameは辞書形式で行う(pandasも同じ感じ)

df_003 = df_receipt.select("sales_ymd", "customer_id", "product_cd", "amount").rename(
    {"sales_ymd": "sales_date"}
)

df_003.head()

sales_date,customer_id,product_cd,amount
i64,str,str,i64
20181103,"""CS006214000001…","""P070305012""",158
20181118,"""CS008415000097…","""P070701017""",81
20170712,"""CS028414000014…","""P060101005""",170
20190205,"""ZZ000000000000…","""P050301001""",25
20180821,"""CS025415000050…","""P060102007""",90


In [7]:
# aliasを用いた方法

df_003 = df_receipt.select(
    pl.col("sales_ymd").alias("sales_date"), "customer_id", "product_cd", "amount"
)

df_003.head()

sales_date,customer_id,product_cd,amount
i64,str,str,i64
20181103,"""CS006214000001…","""P070305012""",158
20181118,"""CS008415000097…","""P070701017""",81
20170712,"""CS028414000014…","""P060101005""",170
20190205,"""ZZ000000000000…","""P050301001""",25
20180821,"""CS025415000050…","""P060102007""",90


## 抽出

---
> P-004: レシート明細データ（df_receipt）から売上日（sales_ymd）、顧客ID（customer_id）、商品コード（product_cd）、売上金額（amount）の順に列を指定し、以下の条件を満たすデータを抽出せよ。
> - 顧客ID（customer_id）が"CS018205000001"

In [8]:
# エクスプレッション(pl.expr())を渡してフィルタリングをする。(pandasの.query()メソッドに似ている)
# pl.col()エクスプレッションは列選択を意味する
# and -> &, or -> | で繋ぐことができる(pandasと同様)
df_004 = df_receipt.select("sales_ymd", "customer_id", "product_cd", "amount").filter(
    pl.col("customer_id") == "CS018205000001"
)

df_004.head()

sales_ymd,customer_id,product_cd,amount
i64,str,str,i64
20180911,"""CS018205000001…","""P071401012""",2200
20180414,"""CS018205000001…","""P060104007""",600
20170614,"""CS018205000001…","""P050206001""",990
20170614,"""CS018205000001…","""P060702015""",108
20190216,"""CS018205000001…","""P071005024""",102


---
> P-005: レシート明細データ（df_receipt）から売上日（sales_ymd）、顧客ID（customer_id）、商品コード（product_cd）、売上金額（amount）の順に列を指定し、以下の全ての条件を満たすデータを抽出せよ。
> - 顧客ID（customer_id）が"CS018205000001"
> - 売上金額（amount）が1,000以上

In [9]:
# &や|で繋ぐときは()で囲ってあげること

df_005 = df_receipt.select("sales_ymd", "customer_id", "product_cd", "amount").filter(
    (pl.col("customer_id") == "CS018205000001") & (pl.col("amount") >= 1000)
)

df_005.head()

sales_ymd,customer_id,product_cd,amount
i64,str,str,i64
20180911,"""CS018205000001…","""P071401012""",2200
20190226,"""CS018205000001…","""P071401020""",2200
20180911,"""CS018205000001…","""P071401005""",1100


---
> P-006: レシート明細データ（df_receipt）から売上日（sales_ymd）、顧客ID（customer_id）、商品コード（product_cd）、売上数量（quantity）、売上金額（amount）の順に列を指定し、以下の全ての条件を満たすデータを抽出せよ。
> - 顧客ID（customer_id）が"CS018205000001"
> - 売上金額（amount）が1,000以上または売上数量（quantity）が5以上

In [10]:
df_006 = df_receipt.select(
    "sales_ymd", "customer_id", "product_cd", "quantity", "amount"
).filter(
    (pl.col("customer_id") == "CS018205000001")
    & ((pl.col("amount") >= 1000) | (pl.col("quantity") > 5))
)

df_006.head()

sales_ymd,customer_id,product_cd,quantity,amount
i64,str,str,i64,i64
20180911,"""CS018205000001…","""P071401012""",1,2200
20180414,"""CS018205000001…","""P060104007""",6,600
20190226,"""CS018205000001…","""P071401020""",1,2200
20180911,"""CS018205000001…","""P071401005""",1,1100


---
> P-007: レシート明細データ（df_receipt）から売上日（sales_ymd）、顧客ID（customer_id）、商品コード（product_cd）、売上金額（amount）の順に列を指定し、以下の全ての条件を満たすデータを抽出せよ。
> - 顧客ID（customer_id）が"CS018205000001"
> - 売上金額（amount）が1,000以上2,000以下

In [11]:
df_007 = df_receipt.select("sales_ymd", "customer_id", "product_cd", "amount").filter(
    (pl.col("customer_id") == "CS018205000001")
    & ((pl.col("amount") >= 1000) & (pl.col("amount") <= 2000))
)

df_007.head()

sales_ymd,customer_id,product_cd,amount
i64,str,str,i64
20180911,"""CS018205000001…","""P071401005""",1100


In [12]:
df_007 = df_receipt.select("sales_ymd", "customer_id", "product_cd", "amount").filter(
    (pl.col("customer_id") == "CS018205000001")
    & (pl.col("amount").is_between(lower_bound=1000, upper_bound=2000, closed="both"))
)

df_007

sales_ymd,customer_id,product_cd,amount
i64,str,str,i64
20180911,"""CS018205000001…","""P071401005""",1100


---
> P-008: レシート明細データ（df_receipt）から売上日（sales_ymd）、顧客ID（customer_id）、商品コード（product_cd）、売上金額（amount）の順に列を指定し、以下の全ての条件を満たすデータを抽出せよ。
> - 顧客ID（customer_id）が"CS018205000001"
> - 商品コード（product_cd）が"P071401019"以外

In [13]:
df_008 = df_receipt.select("sales_ymd", "customer_id", "product_cd", "amount").filter(
    (pl.col("customer_id") == "CS018205000001") & (pl.col("product_cd") != "P071401019")
)

df_008

sales_ymd,customer_id,product_cd,amount
i64,str,str,i64
20180911,"""CS018205000001…","""P071401012""",2200
20180414,"""CS018205000001…","""P060104007""",600
20170614,"""CS018205000001…","""P050206001""",990
20170614,"""CS018205000001…","""P060702015""",108
20190216,"""CS018205000001…","""P071005024""",102
20180414,"""CS018205000001…","""P071101002""",278
20190226,"""CS018205000001…","""P070902035""",168
20190924,"""CS018205000001…","""P060805001""",495
20190226,"""CS018205000001…","""P071401020""",2200
20180911,"""CS018205000001…","""P071401005""",1100


---
> P-009: 以下の処理において、出力結果を変えずにORをANDに書き換えよ。
> 
> pandas:`df_store.query('not(prefecture_cd == "13" | floor_area > 900)')` <br>
> polars:`df_store.filter(~((pl.col("prefecture_cd") == "13") | (pl.col("floor_area") > 900)))`

否定は`~`でもいいが、`pl.Expr.is_not`を使用しても良い

> `df_store.filter(((pl.col("prefecture_cd") == "13") |(pl.col("floor_area")> 900)).not_())`

In [14]:
df_009 = df_store.filter(
    (pl.col("prefecture_cd") != "13") & (pl.col("floor_area") <= 900)
)

df_009

store_cd,store_name,prefecture_cd,prefecture,address,address_kana,tel_no,longitude,latitude,floor_area
str,str,str,str,str,str,str,f64,f64,f64
"""S14046""","""北山田店""","""14""","""神奈川県""","""神奈川県横浜市都筑区北山田一…","""カナガワケンヨコハマシツヅキ…","""045-123-4049""",139.5916,35.56189,831.0
"""S14011""","""日吉本町店""","""14""","""神奈川県""","""神奈川県横浜市港北区日吉本町…","""カナガワケンヨコハマシコウホ…","""045-123-4033""",139.6316,35.54655,890.0
"""S12013""","""習志野店""","""12""","""千葉県""","""千葉県習志野市芝園一丁目""","""チバケンナラシノシシバゾノイ…","""047-123-4002""",140.022,35.66122,808.0


---
> P-010: 店舗データ（df_store）から、店舗コード（store_cd）が"S14"で始まるものだけ全項目抽出し、5件表示せよ。

文字列の条件
| メソッド            | 処理                    |
| --------------- | --------------------- |
| str.starts_with | 指定文字列が先頭にあるか          |
| str.ends_with   | 指定文字列が終端にあるか          |
| str.contains    | 指定文字列を含むかどうか(正規表現に対応) |

In [15]:
df_010 = df_store.filter(pl.col("store_cd").str.starts_with("S14"))

df_010.head()

store_cd,store_name,prefecture_cd,prefecture,address,address_kana,tel_no,longitude,latitude,floor_area
str,str,str,str,str,str,str,f64,f64,f64
"""S14010""","""菊名店""","""14""","""神奈川県""","""神奈川県横浜市港北区菊名一丁…","""カナガワケンヨコハマシコウホ…","""045-123-4032""",139.6326,35.50049,1732.0
"""S14033""","""阿久和店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区阿久和西…","""カナガワケンヨコハマシセヤク…","""045-123-4043""",139.4961,35.45918,1495.0
"""S14036""","""相模原中央店""","""14""","""神奈川県""","""神奈川県相模原市中央二丁目""","""カナガワケンサガミハラシチュ…","""042-123-4045""",139.3716,35.57327,1679.0
"""S14040""","""長津田店""","""14""","""神奈川県""","""神奈川県横浜市緑区長津田みな…","""カナガワケンヨコハマシミドリ…","""045-123-4046""",139.4994,35.52398,1548.0
"""S14050""","""阿久和西店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区阿久和西…","""カナガワケンヨコハマシセヤク…","""045-123-4053""",139.4961,35.45918,1830.0


---
> P-011: 顧客データ（df_customer）から顧客ID（customer_id）の末尾が1のものだけ全項目抽出し、5件表示せよ。

In [16]:
df_011 = df_customer.filter(pl.col("customer_id").str.ends_with(1))

df_011.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS037613000071…","""六角 雅彦""","""9""","""不明""","""1952-04-01""",66,"""136-0076""","""東京都江東区南砂******…","""S13037""","""20150414""","""0-00000000-0"""
"""CS028811000001…","""堀井 かおり""","""1""","""女性""","""1933-03-27""",86,"""245-0016""","""神奈川県横浜市泉区和泉町**…","""S14028""","""20160115""","""0-00000000-0"""
"""CS040412000191…","""川井 郁恵""","""1""","""女性""","""1977-01-05""",42,"""226-0021""","""神奈川県横浜市緑区北八朔町*…","""S14040""","""20151101""","""1-20091025-4"""
"""CS028314000011…","""小菅 あおい""","""1""","""女性""","""1983-11-26""",35,"""246-0038""","""神奈川県横浜市瀬谷区宮沢**…","""S14028""","""20151123""","""1-20080426-5"""
"""CS039212000051…","""藤島 恵梨香""","""1""","""女性""","""1997-02-03""",22,"""166-0001""","""東京都杉並区阿佐谷北****…","""S13039""","""20171121""","""1-20100215-4"""


---
> P-012: 店舗データ（df_store）から、住所 (address) に"横浜市"が含まれるものだけ全項目表示せよ。

In [17]:
df_012 = df_store.filter(pl.col("address").str.contains("横浜市"))

df_012

store_cd,store_name,prefecture_cd,prefecture,address,address_kana,tel_no,longitude,latitude,floor_area
str,str,str,str,str,str,str,f64,f64,f64
"""S14010""","""菊名店""","""14""","""神奈川県""","""神奈川県横浜市港北区菊名一丁…","""カナガワケンヨコハマシコウホ…","""045-123-4032""",139.6326,35.50049,1732.0
"""S14033""","""阿久和店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区阿久和西…","""カナガワケンヨコハマシセヤク…","""045-123-4043""",139.4961,35.45918,1495.0
"""S14040""","""長津田店""","""14""","""神奈川県""","""神奈川県横浜市緑区長津田みな…","""カナガワケンヨコハマシミドリ…","""045-123-4046""",139.4994,35.52398,1548.0
"""S14050""","""阿久和西店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区阿久和西…","""カナガワケンヨコハマシセヤク…","""045-123-4053""",139.4961,35.45918,1830.0
"""S14028""","""二ツ橋店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区二ツ橋町…","""カナガワケンヨコハマシセヤク…","""045-123-4042""",139.4963,35.46304,1574.0
"""S14012""","""本牧和田店""","""14""","""神奈川県""","""神奈川県横浜市中区本牧和田""","""カナガワケンヨコハマシナカク…","""045-123-4034""",139.6582,35.42156,1341.0
"""S14046""","""北山田店""","""14""","""神奈川県""","""神奈川県横浜市都筑区北山田一…","""カナガワケンヨコハマシツヅキ…","""045-123-4049""",139.5916,35.56189,831.0
"""S14011""","""日吉本町店""","""14""","""神奈川県""","""神奈川県横浜市港北区日吉本町…","""カナガワケンヨコハマシコウホ…","""045-123-4033""",139.6316,35.54655,890.0
"""S14048""","""中川中央店""","""14""","""神奈川県""","""神奈川県横浜市都筑区中川中央…","""カナガワケンヨコハマシツヅキ…","""045-123-4051""",139.5758,35.54912,1657.0
"""S14042""","""新山下店""","""14""","""神奈川県""","""神奈川県横浜市中区新山下二丁…","""カナガワケンヨコハマシナカク…","""045-123-4047""",139.6593,35.43894,1044.0


---
> P-013: 顧客データ（df_customer）から、ステータスコード（status_cd）の先頭がアルファベットのA〜Fで始まるデータを全項目抽出し、5件表示せよ。

先頭の文字は`^`で表す(https://qiita.com/luohao0404/items/7135b2b96f9b0b196bf3 参照)

In [18]:
df_013 = df_customer.filter(pl.col("status_cd").str.contains(r"^[A-F]"))

df_013.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS031415000172…","""宇多田 貴美子""","""1""","""女性""","""1976-10-04""",42,"""151-0053""","""東京都渋谷区代々木*****…","""S13031""","""20150529""","""D-20100325-C"""
"""CS015414000103…","""奥野 陽子""","""1""","""女性""","""1977-08-09""",41,"""136-0073""","""東京都江東区北砂******…","""S13015""","""20150722""","""B-20100609-B"""
"""CS011215000048…","""芦田 沙耶""","""1""","""女性""","""1992-02-01""",27,"""223-0062""","""神奈川県横浜市港北区日吉本町…","""S14011""","""20150228""","""C-20100421-9"""
"""CS029415000023…","""梅田 里穂""","""1""","""女性""","""1976-01-17""",43,"""279-0043""","""千葉県浦安市富士見*****…","""S12029""","""20150610""","""D-20100918-E"""
"""CS035415000029…","""寺沢 真希""","""9""","""不明""","""1977-09-27""",41,"""158-0096""","""東京都世田谷区玉川台****…","""S13035""","""20141220""","""F-20101029-F"""


---
> P-014: 顧客データ（df_customer）から、ステータスコード（status_cd）の末尾が数字の1〜9で終わるデータを全項目抽出し、5件表示せよ。

最後尾の文字は`$`で表す

In [19]:
df_014 = df_customer.filter(pl.col("status_cd").str.contains("[1-9]$"))

df_014.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS001215000145…","""田崎 美紀""","""1""","""女性""","""1995-03-29""",24,"""144-0055""","""東京都大田区仲六郷*****…","""S13001""","""20170605""","""6-20090929-2"""
"""CS033513000180…","""安斎 遥""","""1""","""女性""","""1962-07-11""",56,"""241-0823""","""神奈川県横浜市旭区善部町**…","""S14033""","""20150728""","""6-20080506-5"""
"""CS011215000048…","""芦田 沙耶""","""1""","""女性""","""1992-02-01""",27,"""223-0062""","""神奈川県横浜市港北区日吉本町…","""S14011""","""20150228""","""C-20100421-9"""
"""CS040412000191…","""川井 郁恵""","""1""","""女性""","""1977-01-05""",42,"""226-0021""","""神奈川県横浜市緑区北八朔町*…","""S14040""","""20151101""","""1-20091025-4"""
"""CS009315000023…","""皆川 文世""","""1""","""女性""","""1980-04-15""",38,"""154-0012""","""東京都世田谷区駒沢*****…","""S13009""","""20150319""","""5-20080322-1"""


---
> P-015: 顧客データ（df_customer）から、ステータスコード（status_cd）の先頭がアルファベットのA〜Fで始まり、末尾が数字の1〜9で終わるデータを全項目抽出し、5件表示せよ。

In [20]:
df_015 = df_customer.filter((pl.col("status_cd").str.contains("^[A-F].*[1-9]$")))

df_015.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS011215000048…","""芦田 沙耶""","""1""","""女性""","""1992-02-01""",27,"""223-0062""","""神奈川県横浜市港北区日吉本町…","""S14011""","""20150228""","""C-20100421-9"""
"""CS022513000105…","""島村 貴美子""","""1""","""女性""","""1962-03-12""",57,"""249-0002""","""神奈川県逗子市山の根****…","""S14022""","""20150320""","""A-20091115-7"""
"""CS001515000096…","""水野 陽子""","""9""","""不明""","""1960-11-29""",58,"""144-0053""","""東京都大田区蒲田本町****…","""S13001""","""20150614""","""A-20100724-7"""
"""CS013615000053…","""西脇 季衣""","""1""","""女性""","""1953-10-18""",65,"""261-0026""","""千葉県千葉市美浜区幕張西**…","""S12013""","""20150128""","""B-20100329-6"""
"""CS020412000161…","""小宮 薫""","""1""","""女性""","""1974-05-21""",44,"""174-0042""","""東京都板橋区東坂下*****…","""S13020""","""20150822""","""B-20081021-3"""


---
> P-016: 店舗データ（df_store）から、電話番号（tel_no）が3桁-3桁-4桁のデータを全項目表示せよ。

In [21]:
df_016 = df_store.filter(pl.col("tel_no").str.contains("[0-9]{3}-[0-9]{3}-[0-9]{4}"))

df_016

store_cd,store_name,prefecture_cd,prefecture,address,address_kana,tel_no,longitude,latitude,floor_area
str,str,str,str,str,str,str,f64,f64,f64
"""S12014""","""千草台店""","""12""","""千葉県""","""千葉県千葉市稲毛区千草台一丁…","""チバケンチバシイナゲクチグサ…","""043-123-4003""",140.118,35.63559,1698.0
"""S13002""","""国分寺店""","""13""","""東京都""","""東京都国分寺市本多二丁目""","""トウキョウトコクブンジシホン…","""042-123-4008""",139.4802,35.70566,1735.0
"""S14010""","""菊名店""","""14""","""神奈川県""","""神奈川県横浜市港北区菊名一丁…","""カナガワケンヨコハマシコウホ…","""045-123-4032""",139.6326,35.50049,1732.0
"""S14033""","""阿久和店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区阿久和西…","""カナガワケンヨコハマシセヤク…","""045-123-4043""",139.4961,35.45918,1495.0
"""S14036""","""相模原中央店""","""14""","""神奈川県""","""神奈川県相模原市中央二丁目""","""カナガワケンサガミハラシチュ…","""042-123-4045""",139.3716,35.57327,1679.0
"""S14040""","""長津田店""","""14""","""神奈川県""","""神奈川県横浜市緑区長津田みな…","""カナガワケンヨコハマシミドリ…","""045-123-4046""",139.4994,35.52398,1548.0
"""S14050""","""阿久和西店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区阿久和西…","""カナガワケンヨコハマシセヤク…","""045-123-4053""",139.4961,35.45918,1830.0
"""S13052""","""森野店""","""13""","""東京都""","""東京都町田市森野三丁目""","""トウキョウトマチダシモリノサ…","""042-123-4030""",139.4383,35.55293,1087.0
"""S14028""","""二ツ橋店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区二ツ橋町…","""カナガワケンヨコハマシセヤク…","""045-123-4042""",139.4963,35.46304,1574.0
"""S14012""","""本牧和田店""","""14""","""神奈川県""","""神奈川県横浜市中区本牧和田""","""カナガワケンヨコハマシナカク…","""045-123-4034""",139.6582,35.42156,1341.0


## ソート

---
> P-017: 顧客データ（df_customer）を生年月日（birth_day）で高齢順にソートし、先頭から全項目を5件表示せよ。

ソートにはエクスプレッションに対して`sort`（選択列のみ）や`sort_by`（選択列に従ってエクスプレッションがソート）を用いる

In [22]:
df_017 = df_customer.sort("birth_day")

df_017.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS003813000014…","""村山 菜々美""","""1""","""女性""","""1928-11-26""",90,"""182-0007""","""東京都調布市菊野台*****…","""S13003""","""20160214""","""0-00000000-0"""
"""CS026813000004…","""吉村 朝陽""","""1""","""女性""","""1928-12-14""",90,"""251-0043""","""神奈川県藤沢市辻堂元町***…","""S14026""","""20150723""","""0-00000000-0"""
"""CS018811000003…","""熊沢 美里""","""1""","""女性""","""1929-01-07""",90,"""204-0004""","""東京都清瀬市野塩******…","""S13018""","""20150403""","""0-00000000-0"""
"""CS027803000004…","""内村 拓郎""","""0""","""男性""","""1929-01-12""",90,"""251-0031""","""神奈川県藤沢市鵠沼藤が谷**…","""S14027""","""20151227""","""0-00000000-0"""
"""CS013801000003…","""天野 拓郎""","""0""","""男性""","""1929-01-15""",90,"""274-0824""","""千葉県船橋市前原東*****…","""S12013""","""20160120""","""0-00000000-0"""


In [23]:
df_017 = df_customer.select(pl.all().sort_by("birth_day"))

df_017.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS003813000014…","""村山 菜々美""","""1""","""女性""","""1928-11-26""",90,"""182-0007""","""東京都調布市菊野台*****…","""S13003""","""20160214""","""0-00000000-0"""
"""CS026813000004…","""吉村 朝陽""","""1""","""女性""","""1928-12-14""",90,"""251-0043""","""神奈川県藤沢市辻堂元町***…","""S14026""","""20150723""","""0-00000000-0"""
"""CS018811000003…","""熊沢 美里""","""1""","""女性""","""1929-01-07""",90,"""204-0004""","""東京都清瀬市野塩******…","""S13018""","""20150403""","""0-00000000-0"""
"""CS027803000004…","""内村 拓郎""","""0""","""男性""","""1929-01-12""",90,"""251-0031""","""神奈川県藤沢市鵠沼藤が谷**…","""S14027""","""20151227""","""0-00000000-0"""
"""CS013801000003…","""天野 拓郎""","""0""","""男性""","""1929-01-15""",90,"""274-0824""","""千葉県船橋市前原東*****…","""S12013""","""20160120""","""0-00000000-0"""


---
> P-018: 顧客データ（df_customer）を生年月日（birth_day）で若い順にソートし、先頭から全項目を5件表示せよ。

In [24]:
# 引数にdescenging=Trueを指定すると降順になる

df_018 = df_customer.sort("birth_day", descending=True)

df_018.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS035114000004…","""大村 美里""","""1""","""女性""","""2007-11-25""",11,"""156-0053""","""東京都世田谷区桜******…","""S13035""","""20150619""","""6-20091205-6"""
"""CS022103000002…","""福山 はじめ""","""9""","""不明""","""2007-10-02""",11,"""249-0006""","""神奈川県逗子市逗子*****…","""S14022""","""20160909""","""0-00000000-0"""
"""CS002113000009…","""柴田 真悠子""","""1""","""女性""","""2007-09-17""",11,"""184-0014""","""東京都小金井市貫井南町***…","""S13002""","""20160304""","""0-00000000-0"""
"""CS004115000014…","""松井 京子""","""1""","""女性""","""2007-08-09""",11,"""165-0031""","""東京都中野区上鷺宮*****…","""S13004""","""20161120""","""1-20081231-1"""
"""CS002114000010…","""山内 遥""","""1""","""女性""","""2007-06-03""",11,"""184-0015""","""東京都小金井市貫井北町***…","""S13002""","""20160920""","""6-20100510-1"""


---
> P-019: レシート明細データ（df_receipt）に対し、1件あたりの売上金額（amount）が高い順にランクを付与し、先頭から10件表示せよ。項目は顧客ID（customer_id）、売上金額（amount）、付与したランクを表示させること。なお、売上金額（amount）が等しい場合は同一順位を付与するものとする。

新しい列を生成するときは`.with_columns()`メソッドを用いる<br>
ランク付けにはエクスプレッションに対して`.rank()`メソッドを用いる。引数に与えるパラメータとして以下がある
- `average`:同じ値のものはランクの平均値になる(例:1, 2, 3, 3, 4, 4, 5) -> rank(1, 2, 3.5, 3.5, 5.5, 5.5, 7)
- `min`:同じ値のものはランクは最小値で表す(例:1, 2, 3, 3, 4, 4, 5) -> rank(1, 2, 3, 3, 5, 5, 7)
- `max`:同じ値のものはランクは最大値で表す(例:1, 2, 3, 3, 4, 4, 5) -> rank(1, 2, 4, 4, 6, 6, 7)
- `dense`:`min`と似ているが、割り当てられるランクはその前のランクの次の値になる(例:1, 2, 3, 3, 4, 4, 5) -> rank(1, 2, 3, 3, 4, 4, 5)
- `ordinal`:`min`と似ているが、同値は値の出現順で重複なしでランクを与えられる
- `random`:`ordinal`と似ているが、同値に与えられるランクはランダム

ちょっとわかりづらいので、上記のメソッドを変更しながら確認するのを推奨

In [25]:
df_019 = (
    df_receipt.select("customer_id", "amount")
    .with_columns(
        (pl.col("amount").rank(method="min", descending=True)).alias("ranking")
    )
    .sort("ranking")
)

df_019.head(10)

customer_id,amount,ranking
str,i64,u32
"""CS011415000006…",10925,1
"""ZZ000000000000…",6800,2
"""CS028605000002…",5780,3
"""CS015515000034…",5480,4
"""ZZ000000000000…",5480,4
"""ZZ000000000000…",5480,4
"""ZZ000000000000…",5440,7
"""CS021515000089…",5440,7
"""CS015515000083…",5280,9
"""CS017414000114…",5280,9


---
> P-020: レシート明細データ（df_receipt）に対し、1件あたりの売上金額（amount）が高い順にランクを付与し、先頭から10件表示せよ。項目は顧客ID（customer_id）、売上金額（amount）、付与したランクを表示させること。なお、売上金額（amount）が等しい場合でも別順位を付与すること。

In [26]:
df_020 = (
    df_receipt.select(["customer_id", "amount"])
    .with_columns(
        [(pl.col("amount").rank(method="ordinal", descending=True)).alias("ranking")]
    )
    .sort("ranking")
)

df_020.head(10)

customer_id,amount,ranking
str,i64,u32
"""CS011415000006…",10925,1
"""ZZ000000000000…",6800,2
"""CS028605000002…",5780,3
"""CS015515000034…",5480,4
"""ZZ000000000000…",5480,5
"""ZZ000000000000…",5480,6
"""ZZ000000000000…",5440,7
"""CS021515000089…",5440,8
"""CS015515000083…",5280,9
"""CS017414000114…",5280,10


## 集計

---
> P-021: レシート明細データ（df_receipt）に対し、件数をカウントせよ。

In [27]:
len(df_receipt)

104681

---
> P-022: レシート明細データ（df_receipt）の顧客ID（customer_id）に対し、ユニーク件数をカウントせよ。

In [28]:
# len(df_receipt.select(pl.col("customer_id").unique()))
df_receipt.select(pl.col("customer_id").unique().count())

customer_id
u32
8307


---
> P-023: レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）と売上数量（quantity）を合計せよ。

In [29]:
df_023 = (
    df_receipt.group_by("store_cd")
    .agg(pl.col("amount").sum(), pl.col("quantity").sum())
    .sort(pl.col("store_cd"))
)
df_023.head(5)

store_cd,amount,quantity
str,i64,i64
"""S12007""",638761,2099
"""S12013""",787513,2425
"""S12014""",725167,2358
"""S12029""",794741,2555
"""S12030""",684402,2403


---
> P-024: レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに最も新しい売上年月日（sales_ymd）を求め、5件表示せよ。

In [30]:
df_024 = (
    df_receipt.group_by("customer_id")
    .agg(pl.col("sales_ymd").max())
    .sort(pl.col("sales_ymd"), descending=True)
)

df_024.head(5)

customer_id,sales_ymd
str,i64
"""CS003513000561…",20191031
"""CS039513000140…",20191031
"""CS039513000004…",20191031
"""CS023512000051…",20191031
"""CS022615000144…",20191031


---
> P-025: レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに最も古い売上年月日（sales_ymd）を求め、10件表示せよ。

In [31]:
df_025 = (
    df_receipt.group_by("customer_id")
    .agg(pl.col("sales_ymd").min())
    .sort(pl.col("sales_ymd"), descending=False)
)

df_025.head(5)

customer_id,sales_ymd
str,i64
"""CS029214000004…",20170101
"""CS021514000082…",20170101
"""CS014411000048…",20170101
"""CS006515000089…",20170101
"""CS008414000056…",20170101


---
> P-026: レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに最も新しい売上年月日（sales_ymd）と古い売上年月日を求め、両者が異なるデータを10件表示せよ。

In [32]:
df_026 = df_receipt.group_by("customer_id").agg(
    pl.col("sales_ymd").min().alias("sales_ymd_min"),
    pl.col("sales_ymd").max().alias("sales_ymd_max"),
)
df_026.head(10)

customer_id,sales_ymd_min,sales_ymd_max
str,i64,i64
"""CS020413000033…",20170908,20170908
"""CS002305000010…",20180915,20190518
"""CS039412000056…",20170916,20190314
"""CS011412000049…",20170310,20180209
"""CS011214000001…",20170202,20191023
"""CS032414000002…",20170307,20190909
"""CS019605000001…",20170410,20190413
"""CS011515000016…",20170408,20190831
"""CS002412000172…",20171021,20171021
"""CS038512000063…",20171208,20171208


---
> P-027: レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の平均を計算し、降順でTOP5を表示せよ。

In [33]:
df_027 = (
    df_receipt.group_by("store_cd")
    .agg(pl.col("amount").mean())
    .sort(pl.col("amount"), descending=True)
)

df_027.head(5)

store_cd,amount
str,f64
"""S13052""",402.86747
"""S13015""",351.11196
"""S13003""",350.915519
"""S14010""",348.791262
"""S13001""",348.470386


---
> P-028: レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の中央値を計算し、降順でTOP5を表示せよ。

In [34]:
df_028 = (
    df_receipt.group_by("store_cd")
    .agg(pl.col("amount").median())
    .sort(pl.col("amount"), descending=True)
)
df_028.head(5)

store_cd,amount
str,f64
"""S13052""",190.0
"""S14010""",188.0
"""S14050""",185.0
"""S14040""",180.0
"""S13003""",180.0


---
> P-029: レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに商品コード（product_cd）の最頻値を求め、10件表示させよ。

In [35]:
df_029 = df_receipt.group_by("store_cd").agg(
    pl.col("product_cd").mode(), pl.col("product_cd").mode().count().alias("lebel_1")
)

df_029.head(5)

store_cd,product_cd,lebel_1
str,list[str],u32
"""S12013""","[""P060303001""]",1
"""S12030""","[""P060303001""]",1
"""S14049""","[""P060303001""]",1
"""S14046""","[""P060303001""]",1
"""S13009""","[""P060303001""]",1


---
> P-030: レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の分散を計算し、降順で5件表示せよ。

In [36]:
df_030 = (
    df_receipt.group_by("store_cd")
    .agg(pl.col("amount").var(ddof=0).alias("var_amount"))  # ddof=0:母標準偏差
    .sort("var_amount", descending=True)
)

df_030.head(5)

store_cd,var_amount
str,f64
"""S13052""",440088.701311
"""S14011""",306314.558164
"""S14034""",296920.081011
"""S13001""",295431.993329
"""S13015""",295294.361116


---
> P-031: レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の標準偏差を計算し、降順で5件表示せよ。

In [37]:
df_031 = (
    df_receipt.group_by("store_cd")
    .agg(pl.col("amount").std(ddof=0).alias("std_amount"))
    .sort("std_amount", descending=True)
)
df_031.head(5)

store_cd,std_amount
str,f64
"""S13052""",663.391816
"""S14011""",553.456916
"""S14034""",544.903736
"""S13001""",543.536561
"""S13015""",543.409938


---
> P-032: レシート明細データ（df_receipt）の売上金額（amount）について、25％刻みでパーセンタイル値を求めよ。

In [38]:
df_032 = df_receipt.select(
    pl.col("amount").quantile(0).alias("q_0"),
    pl.col("amount").quantile(0.25).alias("q_25"),
    pl.col("amount").quantile(0.50).alias("q_50"),
    pl.col("amount").quantile(0.75).alias("q_75"),
    pl.col("amount").quantile(1).alias("q_100"),
)

df_032

q_0,q_25,q_50,q_75,q_100
f64,f64,f64,f64,f64
10.0,102.0,170.0,288.0,10925.0


In [39]:
# for文で作っても可
df_032 = df_receipt.select(
    [pl.col("amount").quantile(i / 100).alias(f"q_{i}") for i in [0, 25, 50, 75, 100]]
)

df_032

q_0,q_25,q_50,q_75,q_100
f64,f64,f64,f64,f64
10.0,102.0,170.0,288.0,10925.0


---
> P-033: レシート明細データ（df_receipt）に対し、店舗コード（store_cd）ごとに売上金額（amount）の平均を計算し、330以上のものを抽出せよ。

In [40]:
df_033 = (
    df_receipt.group_by("store_cd")
    .agg(pl.col("amount").mean().alias("amount_mean"))
    .select(pl.all())
    .filter(pl.col("amount_mean") >= 330)
)
df_033

store_cd,amount_mean
str,f64
"""S14010""",348.791262
"""S13019""",330.208616
"""S13052""",402.86747
"""S14011""",335.718333
"""S13001""",348.470386
"""S13015""",351.11196
"""S13020""",337.879932
"""S13003""",350.915519
"""S12013""",330.19413
"""S14045""",330.082073


---
> P-034: レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに売上金額（amount）を合計して全顧客の平均を求めよ。ただし、顧客IDが"Z"から始まるものは非会員を表すため、除外して計算すること。

In [41]:
df_034 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .select(pl.col("amount").mean())
)

df_034

amount
f64
2547.742235


---
> P-035: レシート明細データ（df_receipt）に対し、顧客ID（customer_id）ごとに売上金額（amount）を合計して全顧客の平均を求め、平均以上に買い物をしている顧客を抽出し、10件表示せよ。ただし、顧客IDが"Z"から始まるものは非会員を表すため、除外して計算すること。

In [42]:
df_035 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .with_columns(pl.col("amount").mean().alias("avg_amount"))
    .filter(pl.col("amount") >= pl.col("avg_amount"))
    .select(pl.col("customer_id"), pl.col("amount"))
    .sort(pl.col("customer_id"))
)

df_035.head(10)

customer_id,amount
str,i64
"""CS001115000010…",3044
"""CS001205000006…",3337
"""CS001214000009…",4685
"""CS001214000017…",4132
"""CS001214000052…",5639
"""CS001215000040…",3496
"""CS001304000006…",3726
"""CS001305000005…",3485
"""CS001305000011…",4370
"""CS001315000180…",3300


## DataFrameの結合

---
> P-036: レシート明細データ（df_receipt）と店舗データ（df_store）を内部結合し、レシート明細データの全項目と店舗データの店舗名（store_name）を10件表示せよ。

- 内部結合(inner):結合させる両方のDataFrameに存在する結合キー(on)で、一致する項目のみを抽出する結合方法
- 左外部結合(left):結合させる両方のDataFrameに存在する結合キー(on)で、基準となるDataFrameを左とする。基準DataFrameはすべての行が出力され、結合されたDataFrameは結合キーの値が基準の結合キーと一致した行のみ出力される
- 完全外部結合(outer):結合させる両方のDataFrameに存在する結合キー(on)で、基準となるDataFrameを左とする。基準のDataFrameおよび結合されたDataFrameのすべての行が出力される

In [43]:
df_036 = df_receipt.join(
    df_store.select(pl.col("store_cd"), pl.col("store_name")),
    how="inner",  # 他に左結合(left)と外部結合(outer)がある 右結合(right)はない。
    on=["store_cd"],
)  # listまたはstrで渡す(今回はlistを用いた)

df_036.head(10)

sales_ymd,sales_epoch,store_cd,receipt_no,receipt_sub_no,customer_id,product_cd,quantity,amount,store_name
i64,i64,str,i64,i64,str,str,i64,i64,str
20181103,1541203200,"""S14006""",112,1,"""CS006214000001…","""P070305012""",1,158,"""葛が谷店"""
20181118,1542499200,"""S13008""",1132,2,"""CS008415000097…","""P070701017""",1,81,"""成城店"""
20170712,1499817600,"""S14028""",1102,1,"""CS028414000014…","""P060101005""",1,170,"""二ツ橋店"""
20190205,1549324800,"""S14042""",1132,1,"""ZZ000000000000…","""P050301001""",1,25,"""新山下店"""
20180821,1534809600,"""S14025""",1102,2,"""CS025415000050…","""P060102007""",1,90,"""大和店"""
20190605,1559692800,"""S13003""",1112,1,"""CS003515000195…","""P050102002""",1,138,"""狛江店"""
20181205,1543968000,"""S14024""",1102,2,"""CS024514000042…","""P080101005""",1,30,"""三田店"""
20190922,1569110400,"""S14040""",1102,1,"""CS040415000178…","""P070501004""",1,128,"""長津田店"""
20170504,1493856000,"""S13020""",1112,2,"""ZZ000000000000…","""P071302010""",1,770,"""十条仲原店"""
20191010,1570665600,"""S14027""",1102,1,"""CS027514000015…","""P071101003""",1,680,"""南藤沢店"""


---
> P-037: 商品データ（df_product）とカテゴリデータ（df_category）を内部結合し、商品データの全項目とカテゴリデータのカテゴリ小区分名（category_small_name）を10件表示せよ。

In [44]:
df_037 = df_product.join(
    other=df_category.select("category_small_cd", "category_small_name"),
    how="inner",
    on="category_small_cd",
)

df_037.head(10)

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost,category_small_name
str,str,str,str,i64,i64,str
"""P040101001""","""04""","""0401""","""040101""",198,149,"""弁当類"""
"""P040101002""","""04""","""0401""","""040101""",218,164,"""弁当類"""
"""P040101003""","""04""","""0401""","""040101""",230,173,"""弁当類"""
"""P040101004""","""04""","""0401""","""040101""",248,186,"""弁当類"""
"""P040101005""","""04""","""0401""","""040101""",268,201,"""弁当類"""
"""P040101006""","""04""","""0401""","""040101""",298,224,"""弁当類"""
"""P040101007""","""04""","""0401""","""040101""",338,254,"""弁当類"""
"""P040101008""","""04""","""0401""","""040101""",420,315,"""弁当類"""
"""P040101009""","""04""","""0401""","""040101""",498,374,"""弁当類"""
"""P040101010""","""04""","""0401""","""040101""",580,435,"""弁当類"""


---
> P-038: 顧客データ（df_customer）とレシート明細データ（df_receipt）から、顧客ごとの売上金額合計を求め、10件表示せよ。ただし、売上実績がない顧客については売上金額を0として表示させること。また、顧客は性別コード（gender_cd）が女性（1）であるものを対象とし、非会員（顧客IDが"Z"から始まるもの）は除外すること。

In [45]:
df_038 = (
    df_customer.join(df_receipt, how="left", on="customer_id")
    .filter(
        (pl.col("gender_cd") == "1")
        & (pl.col("customer_id").str.starts_with("Z")).not_()
    )
    .group_by("customer_id")
    .agg(pl.col("amount").sum().fill_null(0))
)

df_038.head(10)

customer_id,amount
str,i64
"""CS008415000015…",4769
"""CS023515000032…",3768
"""CS005512000014…",458
"""CS029311000026…",320
"""CS004515000267…",528
"""CS014215000047…",2467
"""CS012515000012…",5996
"""CS035613000125…",0
"""CS005414000110…",3220
"""CS023712000050…",0


---
> P-039: レシート明細データ（df_receipt）から、売上日数の多い顧客の上位20件を抽出したデータと、売上金額合計の多い顧客の上位20件を抽出したデータをそれぞれ作成し、さらにその2つを完全外部結合せよ。ただし、非会員（顧客IDが"Z"から始まるもの）は除外すること。

In [46]:
df_cnt = (
    df_receipt.filter((pl.col("customer_id").str.starts_with("Z")).not_())
    .group_by("customer_id")
    .agg(pl.col("sales_ymd").n_unique().alias("sales_count"))
    .sort("sales_count", descending=True)
    .head(20)
)

df_sum = (
    df_receipt.filter((pl.col("customer_id").str.starts_with("Z")).not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum().alias("amount_sum"))
    .sort("amount_sum", descending=True)
    .head(20)
)

df_039 = df_cnt.join(df_sum, how="outer", on="customer_id").sort(
    "sales_count", descending=True
)

df_039

customer_id,sales_count,amount_sum
str,u32,i64
"""CS015515000034…",,15300
"""CS030415000034…",,15468
"""CS009414000059…",,15492
"""CS007514000094…",,15735
"""CS034415000047…",,16083
"""CS011415000006…",,16094
"""CS016415000101…",,16348
"""CS032414000072…",,16563
"""CS021515000089…",,17580
"""CS035414000024…",,17615


---
> P-040: 全ての店舗と全ての商品を組み合わせたデータを作成したい。店舗データ（df_store）と商品データ（df_product）を直積し、件数を計算せよ。

In [47]:
df_040 = df_store.with_columns(pl.lit(0).alias("key")).join(
    df_product.with_columns(pl.lit(0).alias("key")), on="key", how="outer"
)

print(df_store.shape, len(df_store))
print(df_product.shape, len(df_product))
len(df_040)

df_040.head()

(53, 10) 53
(10030, 6) 10030


store_cd,store_name,prefecture_cd,prefecture,address,address_kana,tel_no,longitude,latitude,floor_area,key,product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost
str,str,str,str,str,str,str,f64,f64,f64,i32,str,str,str,str,i64,i64
"""S12014""","""千草台店""","""12""","""千葉県""","""千葉県千葉市稲毛区千草台一丁…","""チバケンチバシイナゲクチグサ…","""043-123-4003""",140.118,35.63559,1698.0,0,"""P040101001""","""04""","""0401""","""040101""",198,149
"""S13002""","""国分寺店""","""13""","""東京都""","""東京都国分寺市本多二丁目""","""トウキョウトコクブンジシホン…","""042-123-4008""",139.4802,35.70566,1735.0,0,"""P040101001""","""04""","""0401""","""040101""",198,149
"""S14010""","""菊名店""","""14""","""神奈川県""","""神奈川県横浜市港北区菊名一丁…","""カナガワケンヨコハマシコウホ…","""045-123-4032""",139.6326,35.50049,1732.0,0,"""P040101001""","""04""","""0401""","""040101""",198,149
"""S14033""","""阿久和店""","""14""","""神奈川県""","""神奈川県横浜市瀬谷区阿久和西…","""カナガワケンヨコハマシセヤク…","""045-123-4043""",139.4961,35.45918,1495.0,0,"""P040101001""","""04""","""0401""","""040101""",198,149
"""S14036""","""相模原中央店""","""14""","""神奈川県""","""神奈川県相模原市中央二丁目""","""カナガワケンサガミハラシチュ…","""042-123-4045""",139.3716,35.57327,1679.0,0,"""P040101001""","""04""","""0401""","""040101""",198,149


---
> P-041: レシート明細データ（df_receipt）の売上金額（amount）を日付（sales_ymd）ごとに集計し、前回売上があった日からの売上金額増減を計算せよ。そして結果を10件表示せよ。

In [48]:
df_041 = (
    df_receipt.group_by("sales_ymd")
    .agg(pl.col("amount").sum())
    .sort("sales_ymd")
    .with_columns(
        (
            pl.col("amount") - pl.col("amount").shift(n=1)
        ).alias(  # polarsにはindexという概念がないため、shift()を用いて場所を指定する
            "diff_amount"
        )
    )
)

df_041.head()

sales_ymd,amount,diff_amount
i64,i64,i64
20170101,33723,
20170102,24165,-9558.0
20170103,27503,3338.0
20170104,36165,8662.0
20170105,37830,1665.0


---
> P-042: レシート明細データ（df_receipt）の売上金額（amount）を日付（sales_ymd）ごとに集計し、各日付のデータに対し、前回、前々回、3回前に売上があった日のデータを結合せよ。そして結果を10件表示せよ。

In [49]:
# filterでnull削除
df_042 = (
    df_receipt.group_by("sales_ymd")
    .agg(pl.col("amount").sum())
    .sort(pl.col("sales_ymd"))
    .with_columns(
        [pl.col("sales_ymd").shift(i).alias(f"sales_ymd_lag{i}") for i in range(1, 4)]
    ).with_columns(
        [pl.col("amount").shift(i).alias(f"amount_lag{i}") for i in range(1, 4)]
    ).filter(pl.col("sales_ymd_lag3").is_not_null())
)

df_042.head()

sales_ymd,amount,sales_ymd_lag1,sales_ymd_lag2,sales_ymd_lag3,amount_lag1,amount_lag2,amount_lag3
i64,i64,i64,i64,i64,i64,i64,i64
20170104,36165,20170103,20170102,20170101,27503,24165,33723
20170105,37830,20170104,20170103,20170102,36165,27503,24165
20170106,32387,20170105,20170104,20170103,37830,36165,27503
20170107,23415,20170106,20170105,20170104,32387,37830,36165
20170108,24737,20170107,20170106,20170105,23415,32387,37830


In [50]:
df_042 = (
    df_receipt.group_by("sales_ymd")
    .agg(pl.col("amount").sum())
    .sort(pl.col("sales_ymd"))
    .with_columns(
        [pl.col("sales_ymd").shift(i).alias(f"sales_ymd_lag{i}") for i in range(1, 4)]
    ).with_columns(
        [pl.col("amount").shift(i).alias(f"amount_lag{i}") for i in range(1, 4)]
    ).drop_nulls()
)

df_042.head()

sales_ymd,amount,sales_ymd_lag1,sales_ymd_lag2,sales_ymd_lag3,amount_lag1,amount_lag2,amount_lag3
i64,i64,i64,i64,i64,i64,i64,i64
20170104,36165,20170103,20170102,20170101,27503,24165,33723
20170105,37830,20170104,20170103,20170102,36165,27503,24165
20170106,32387,20170105,20170104,20170103,37830,36165,27503
20170107,23415,20170106,20170105,20170104,32387,37830,36165
20170108,24737,20170107,20170106,20170105,23415,32387,37830


---
> P-043： レシート明細データ（df_receipt）と顧客データ（df_customer）を結合し、性別コード（gender_cd）と年代（ageから計算）ごとに売上金額（amount）を合計した売上サマリデータを作成せよ。性別コードは0が男性、1が女性、9が不明を表すものとする。
>
> ただし、項目構成は年代、女性の売上金額、男性の売上金額、性別不明の売上金額の4項目とすること（縦に年代、横に性別のクロス集計）。また、年代は10歳ごとの階級とすること。

In [51]:
df_043 = (
    df_receipt.join(df_customer, on="customer_id", how="inner")
    .with_columns(((pl.col("age") / 10).floor() * 10).alias("era"))
    .pivot(values="amount", index="era", columns="gender_cd", aggregate_function="sum")
    .rename({"1": "female", "0": "male", "9": "unknown"})
    .sort("era")
)

df_043

era,female,male,unknown
f64,i64,i64,i64
10.0,149836,1591.0,4317.0
20.0,1363724,72940.0,44328.0
30.0,693047,177322.0,50441.0
40.0,9320791,19355.0,483512.0
50.0,6685192,54320.0,342923.0
60.0,987741,272469.0,71418.0
70.0,29764,13435.0,2427.0
80.0,262923,46360.0,5111.0
90.0,6260,,


In [52]:
# 別解(apply)

gender_mapping = {"0": "male", "1": "female", "9": "unknown"}

df_043 = (
    df_customer.join(df_receipt, how="left", on="customer_id")
    .with_columns(
        ((pl.col("age") / 10).floor() * 10).alias("era"),
        pl.col("gender_cd").replace(gender_mapping).alias("gender"),
    )
    .group_by(["gender", "era"])
    .agg(pl.col("amount").sum())
    .pivot(values="amount", index="era", columns="gender")
    .sort("era")
)

df_043

era,male,unknown,female
f64,i64,i64,i64
10.0,1591,4317.0,149836
20.0,72940,44328.0,1363724
30.0,177322,50441.0,693047
40.0,19355,483512.0,9320791
50.0,54320,342923.0,6685192
60.0,272469,71418.0,987741
70.0,13435,2427.0,29764
80.0,46360,5111.0,262923
90.0,0,,6260


---
> P-044： 043で作成した売上サマリデータ（df_sales_summary）は性別の売上を横持ちさせたものであった。このデータから性別を縦持ちさせ、年代、性別コード、売上金額の3項目に変換せよ。ただし、性別コードは男性を"00"、女性を"01"、不明を"99"とする。

In [53]:
df_044 = (
    df_043.rename({"male": "00", "female": "01", "unknown": "99"})
    .melt(
        id_vars="era",
        value_vars=["00", "01", "99"],
        variable_name="gender_cd",
        value_name="amount",
    )
    .sort("era")
)

df_044

era,gender_cd,amount
f64,str,i64
10.0,"""00""",1591
10.0,"""01""",149836
10.0,"""99""",4317
20.0,"""00""",72940
20.0,"""01""",1363724
20.0,"""99""",44328
30.0,"""00""",177322
30.0,"""01""",693047
30.0,"""99""",50441
40.0,"""00""",19355


## Date型

---
> P-045: 顧客データ（df_customer）の生年月日（birth_day）は日付型でデータを保有している。これをYYYYMMDD形式の文字列に変換し、顧客ID（customer_id）とともに10件表示せよ。

In [54]:
df_045 = df_customer.select(
    "customer_id",
    pl.col("birth_day")
    .str.strptime(dtype=pl.Date, format="%Y-%m-%d")  # string型 -> date型へ一度変換
    .dt.strftime("%Y%m%d"),
)  # date型からstring型(YYYYMMDD)へ変換

df_045.head()

customer_id,birth_day
str,str
"""CS021313000114…","""19810429"""
"""CS037613000071…","""19520401"""
"""CS031415000172…","""19761004"""
"""CS028811000001…","""19330327"""
"""CS001215000145…","""19950329"""


---
> P-046: 顧客データ（df_customer）の申し込み日（application_date）はYYYYMMDD形式の文字列型でデータを保有している。これを日付型に変換し、顧客ID（customer_id）とともに10件表示せよ。

In [55]:
df_046 = df_customer.select(
    "customer_id",
    pl.col("application_date").str.strptime(dtype=pl.Date, format="%Y%m%d")
)
df_046.head()

customer_id,application_date
str,date
"""CS021313000114…",2015-09-05
"""CS037613000071…",2015-04-14
"""CS031415000172…",2015-05-29
"""CS028811000001…",2016-01-15
"""CS001215000145…",2017-06-05


---
> P-047: レシート明細データ（df_receipt）の売上日（sales_ymd）はYYYYMMDD形式の数値型でデータを保有している。これを日付型に変換し、レシート番号（receipt_no）、レシートサブ番号（receipt_sub_no）とともに10件表示せよ。

In [56]:
df_047 = df_receipt.select(
    "receipt_no",
    "receipt_sub_no",
    pl.col("sales_ymd").cast(str).str.strptime(dtype=pl.Date, format="%Y%m%d"),
)

df_047.head()

receipt_no,receipt_sub_no,sales_ymd
i64,i64,date
112,1,2018-11-03
1132,2,2018-11-18
1102,1,2017-07-12
1132,1,2019-02-05
1102,2,2018-08-21


---
> P-048: レシート明細データ（df_receipt）の売上エポック秒（sales_epoch）は数値型のUNIX秒でデータを保有している。これを日付型に変換し、レシート番号(receipt_no)、レシートサブ番号（receipt_sub_no）とともに10件表示せよ。

In [57]:
df_048 = df_receipt.select(
    "receipt_no",
    "receipt_sub_no",
    pl.col("sales_epoch").cast(dtype=pl.Utf8).str.strptime(dtype=pl.Datetime, format="%s"),
)

df_048.head()

receipt_no,receipt_sub_no,sales_epoch
i64,i64,datetime[μs]
112,1,2018-11-03 00:00:00
1132,2,2018-11-18 00:00:00
1102,1,2017-07-12 00:00:00
1132,1,2019-02-05 00:00:00
1102,2,2018-08-21 00:00:00


---
> P-049: レシート明細データ（df_receipt）の売上エポック秒（sales_epoch）を日付型に変換し、「年」だけ取り出してレシート番号(receipt_no)、レシートサブ番号（receipt_sub_no）とともに10件表示せよ。

In [58]:
df_049 = df_receipt.select(
    "receipt_no",
    "receipt_sub_no",
    pl.col("sales_epoch").cast(dtype=pl.Utf8).str.strptime(dtype=pl.Datetime, format="%s").dt.year(),
)

df_049.head()

receipt_no,receipt_sub_no,sales_epoch
i64,i64,i32
112,1,2018
1132,2,2018
1102,1,2017
1132,1,2019
1102,2,2018


---
> P-050: レシート明細データ（df_receipt）の売上エポック秒（sales_epoch）を日付型に変換し、「月」だけ取り出してレシート番号(receipt_no)、レシートサブ番号（receipt_sub_no）とともに10件表示せよ。なお、「月」は0埋め2桁で取り出すこと。

In [59]:
df_050 = df_receipt.select(
    "receipt_no",
    "receipt_sub_no",
    pl.col("sales_epoch")
    .cast(dtype=pl.Utf8)
    .str.strptime(dtype=pl.Datetime, format="%s")
    .dt.strftime("%m"),
)  # 0埋めなので文字列"strftime"で抽出

df_050.head()

receipt_no,receipt_sub_no,sales_epoch
i64,i64,str
112,1,"""11"""
1132,2,"""11"""
1102,1,"""07"""
1132,1,"""02"""
1102,2,"""08"""


---
> P-051: レシート明細データ（df_receipt）の売上エポック秒を日付型に変換し、「日」だけ取り出してレシート番号(receipt_no)、レシートサブ番号（receipt_sub_no）とともに10件表示せよ。なお、「日」は0埋め2桁で取り出すこと。

In [60]:
df_051 = df_receipt.select(
    "receipt_no",
    "receipt_sub_no",
    pl.col("sales_epoch")
    .cast(dtype=pl.Utf8)
    .str.strptime(dtype=pl.Datetime, format="%s")
    .dt.strftime("%d"),
)

df_051.head()

receipt_no,receipt_sub_no,sales_epoch
i64,i64,str
112,1,"""03"""
1132,2,"""18"""
1102,1,"""12"""
1132,1,"""05"""
1102,2,"""21"""


## 条件分岐
.when()や.map_elements()を用いる

---
> P-052: レシート明細データ（df_receipt）の売上金額（amount）を顧客ID（customer_id）ごとに合計の上、売上金額合計に対して2,000円以下を0、2,000円より大きい金額を1に二値化し、顧客ID、売上金額合計とともに10件表示せよ。ただし、顧客IDが"Z"から始まるのものは非会員を表すため、除外して計算すること。

In [61]:
df_052 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .select(
        pl.col("customer_id"),
        pl.col("amount"),
        pl.when(pl.col("amount") <= 2000).then(0).otherwise(1).alias("sales_flg"),
    )
    .sort(pl.col("customer_id"))
)

df_052.head(10)

customer_id,amount,sales_flg
str,i64,i32
"""CS001113000004…",1298,0
"""CS001114000005…",626,0
"""CS001115000010…",3044,1
"""CS001205000004…",1988,0
"""CS001205000006…",3337,1
"""CS001211000025…",456,0
"""CS001212000027…",448,0
"""CS001212000031…",296,0
"""CS001212000046…",228,0
"""CS001212000070…",456,0


In [62]:
# applyを使っても良いが処理がpythonなのでwhenより遅い
df_052 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .select(
        "customer_id",
        "amount",
        pl.col("amount").map_elements(lambda x: 1 if x > 2000 else 0).alias("sales_flg"),
    )
    .sort(pl.col("customer_id"))
)

df_052.head(10)

customer_id,amount,sales_flg
str,i64,i64
"""CS001113000004…",1298,0
"""CS001114000005…",626,0
"""CS001115000010…",3044,1
"""CS001205000004…",1988,0
"""CS001205000006…",3337,1
"""CS001211000025…",456,0
"""CS001212000027…",448,0
"""CS001212000031…",296,0
"""CS001212000046…",228,0
"""CS001212000070…",456,0


---
> P-053: 顧客データ（df_customer）の郵便番号（postal_cd）に対し、東京（先頭3桁が100〜209のもの）を1、それ以外のものを0に二値化せよ。さらにレシート明細データ（df_receipt）と結合し、全期間において売上実績のある顧客数を、作成した二値ごとにカウントせよ。

In [63]:
df_053 = (
    df_customer.select(
        pl.col("customer_id"),
        pl.when(
            pl.col("postal_cd")
            .str.slice(0, 3)
            .cast(pl.Int16)
            .is_between(100, 209, closed="both")
        )
        .then(1)  # .is_between()で範囲指定。colosed = "both", "left", "right"がある
        .otherwise(0)
        .alias("postal_flag"),
    )
    .join(df_receipt, on="customer_id", how="inner")
    .group_by(pl.col("postal_flag"))
    .agg(pl.col("customer_id").n_unique())  # .count()だとcustomer_idの重複を認めてしまうのでNG
)

df_053.head(10)

postal_flag,customer_id
i32,u32
0,3906
1,4400


---
> P-054: 顧客データ（df_customer）の住所（address）は、埼玉県、千葉県、東京都、神奈川県のいずれかとなっている。都道府県毎にコード値を作成し、顧客ID、住所とともに10件表示せよ。値は埼玉県を11、千葉県を12、東京都を13、神奈川県を14とすること。

In [64]:
df_054 = df_customer.select(
    "customer_id",
    "address",
    pl.when(pl.col("address").str.starts_with("埼玉県"))
    .then(pl.lit("11"))
    .when(pl.col("address").str.starts_with("千葉県"))
    .then(pl.lit("12"))
    .when(pl.col("address").str.starts_with("東京都"))
    .then(pl.lit("13"))
    .when(pl.col("address").str.starts_with("神奈川県"))
    .then(pl.lit("14"))
    .otherwise(pl.lit("00"))
    .alias("predecture_cd"),
)

df_054.head(10)

customer_id,address,predecture_cd
str,str,str
"""CS021313000114…","""神奈川県伊勢原市粟窪****…","""14"""
"""CS037613000071…","""東京都江東区南砂******…","""13"""
"""CS031415000172…","""東京都渋谷区代々木*****…","""13"""
"""CS028811000001…","""神奈川県横浜市泉区和泉町**…","""14"""
"""CS001215000145…","""東京都大田区仲六郷*****…","""13"""
"""CS020401000016…","""東京都板橋区若木******…","""13"""
"""CS015414000103…","""東京都江東区北砂******…","""13"""
"""CS029403000008…","""千葉県浦安市海楽******…","""12"""
"""CS015804000004…","""東京都江東区北砂******…","""13"""
"""CS033513000180…","""神奈川県横浜市旭区善部町**…","""14"""


In [65]:
# 別解

df_054 = df_customer.with_columns(
    pl.col("address")
    .str.replace(r"埼玉県.*", "11")
    .str.replace(r"千葉県.*", "12")
    .str.replace(r"東京都.*", "13")
    .str.replace(r"神奈川県.*", "14")
    .alias("prefecture_cd")
).select(["customer_id", "address", "prefecture_cd"])

df_054.head(10)

customer_id,address,prefecture_cd
str,str,str
"""CS021313000114…","""神奈川県伊勢原市粟窪****…","""14"""
"""CS037613000071…","""東京都江東区南砂******…","""13"""
"""CS031415000172…","""東京都渋谷区代々木*****…","""13"""
"""CS028811000001…","""神奈川県横浜市泉区和泉町**…","""14"""
"""CS001215000145…","""東京都大田区仲六郷*****…","""13"""
"""CS020401000016…","""東京都板橋区若木******…","""13"""
"""CS015414000103…","""東京都江東区北砂******…","""13"""
"""CS029403000008…","""千葉県浦安市海楽******…","""12"""
"""CS015804000004…","""東京都江東区北砂******…","""13"""
"""CS033513000180…","""神奈川県横浜市旭区善部町**…","""14"""


---
> P-055: レシート明細（df_receipt）データの売上金額（amount）を顧客ID（customer_id）ごとに合計し、その合計金額の四分位点を求めよ。その上で、顧客ごとの売上金額合計に対して以下の基準でカテゴリ値を作成し、顧客ID、売上金額合計とともに10件表示せよ。カテゴリ値は順に1〜4とする。
>
> - 最小値以上第1四分位未満 ・・・ 1を付与
> - 第1四分位以上第2四分位未満 ・・・ 2を付与
> - 第2四分位以上第3四分位未満 ・・・ 3を付与
> - 第3四分位以上 ・・・ 4を付与

In [66]:
df_055 = (
    df_receipt.group_by("customer_id")
    .agg(pl.col("amount").sum())
    .sort("customer_id")
    .with_columns(
        pl.when(pl.col("amount") < pl.col("amount").quantile(0.25))
        .then(1)
        .when(pl.col("amount") < pl.col("amount").quantile(0.50))
        .then(2)
        .when(pl.col("amount") < pl.col("amount").quantile(0.75))
        .then(3)
        .otherwise(4)
        .alias("pct_group")
    )
)

df_055.head(10)

customer_id,amount,pct_group
str,i64,i32
"""CS001113000004…",1298,2
"""CS001114000005…",626,2
"""CS001115000010…",3044,3
"""CS001205000004…",1988,3
"""CS001205000006…",3337,3
"""CS001211000025…",456,1
"""CS001212000027…",448,1
"""CS001212000031…",296,1
"""CS001212000046…",228,1
"""CS001212000070…",456,1


---
> P-056: 顧客データ（df_customer）の年齢（age）をもとに10歳刻みで年代を算出し、顧客ID（customer_id）、生年月日（birth_day）とともに10件表示せよ。ただし、60歳以上は全て60歳代とすること。年代を表すカテゴリ名は任意とする。

In [67]:
df_056 = df_customer.with_columns(
    ((pl.col("age") / 10).floor() * 10).cast(pl.Int8).alias("era")
).select(
    "customer_id",
    "birth_day",
    pl.when(pl.col("era") >= 60).then(60).otherwise(pl.col("era")).alias("era"),
)

df_056.head()

customer_id,birth_day,era
str,str,i8
"""CS021313000114…","""1981-04-29""",30
"""CS037613000071…","""1952-04-01""",60
"""CS031415000172…","""1976-10-04""",40
"""CS028811000001…","""1933-03-27""",60
"""CS001215000145…","""1995-03-29""",20


In [68]:
# map_elementsを使う方法

df_056 = df_customer.select(
    "customer_id",
    "birth_day",
    pl.col("age").map_elements(lambda x: math.floor(min(x, 60) / 10) * 10).alias("era"),
)

df_056.head()

customer_id,birth_day,era
str,str,i64
"""CS021313000114…","""1981-04-29""",30
"""CS037613000071…","""1952-04-01""",60
"""CS031415000172…","""1976-10-04""",40
"""CS028811000001…","""1933-03-27""",60
"""CS001215000145…","""1995-03-29""",20


In [69]:
# cutするためのseriesは勝手に並び替えられてしてしまう　いつか修正される？
# そのため、cutするまえに一度並び替えを行う
df_056_cut = df_customer.sort("age", descending=False).select(
    "customer_id",
    "birth_day",
    df_customer.select("age")
    .to_series()
    .cut(breaks=[10, 20, 30, 40, 50, 60])
    .alias("era"),
)

df_056_cut

customer_id,birth_day,era
str,str,cat
"""CS025115000002…","""2007-04-18""","""(30, 40]"""
"""CS002114000010…","""2007-06-03""","""(60, inf]"""
"""CS022103000002…","""2007-10-02""","""(40, 50]"""
"""CS002113000009…","""2007-09-17""","""(60, inf]"""
"""CS035114000004…","""2007-11-25""","""(20, 30]"""
"""CS004115000014…","""2007-08-09""","""(40, 50]"""
"""CS021103000002…","""2006-11-24""","""(40, 50]"""
"""CS027112000002…","""2007-01-29""","""(40, 50]"""
"""CS040113000005…","""2006-07-07""","""(60, inf]"""
"""CS004111000005…","""2006-04-07""","""(50, 60]"""


---
> P-057: 056の抽出結果と性別コード（gender_cd）により、新たに性別×年代の組み合わせを表すカテゴリデータを作成し、10件表示せよ。組み合わせを表すカテゴリの値は任意とする。

In [70]:
df_057 = df_customer.with_columns(
    pl.col("age").map_elements(lambda x: str(math.floor(min(x, 60) / 10) * 10)).alias("era")
).select(
    "customer_id",
    "birth_day",
    "era",
    # (pl.col("gender_cd") + pl.col("era")).alias("gender_era")
    pl.concat_str([pl.col("gender_cd"), pl.col("era")]).alias("gender_era"),
)  # pl.concat_str()で文字列の結合

df_057.head(10)

customer_id,birth_day,era,gender_era
str,str,str,str
"""CS021313000114…","""1981-04-29""","""30""","""130"""
"""CS037613000071…","""1952-04-01""","""60""","""960"""
"""CS031415000172…","""1976-10-04""","""40""","""140"""
"""CS028811000001…","""1933-03-27""","""60""","""160"""
"""CS001215000145…","""1995-03-29""","""20""","""120"""
"""CS020401000016…","""1974-09-15""","""40""","""040"""
"""CS015414000103…","""1977-08-09""","""40""","""140"""
"""CS029403000008…","""1973-08-17""","""40""","""040"""
"""CS015804000004…","""1931-05-02""","""60""","""060"""
"""CS033513000180…","""1962-07-11""","""50""","""150"""


---
> P-058: 顧客データ（df_customer）の性別コード（gender_cd）をダミー変数化し、顧客ID（customer_id）とともに10件表示せよ。

In [71]:
df_058 = df_customer.select(
    "customer_id",
    # *pl.get_dummies(df_customer.select(pl.col("gender_cd")))  # 古い書き方　to_dummys()に変更された
    *df_customer.select("gender_cd").to_dummies()
)  # 出力はDataFrame型なのでアンパックして展開する

df_058.head()

customer_id,gender_cd_0,gender_cd_1,gender_cd_9
str,u8,u8,u8
"""CS021313000114…",0,1,0
"""CS037613000071…",0,0,1
"""CS031415000172…",0,1,0
"""CS028811000001…",0,1,0
"""CS001215000145…",0,1,0


## 統計処理

---
> P-059: レシート明細データ（df_receipt）の売上金額（amount）を顧客ID（customer_id）ごとに合計し、売上金額合計を平均0、標準偏差1に標準化して顧客ID、売上金額合計とともに10件表示せよ。標準化に使用する標準偏差は、分散の平方根、もしくは不偏分散の平方根のどちらでも良いものとする。ただし、顧客IDが"Z"から始まるのものは非会員を表すため、除外して計算すること。

- 標準化
$$
x_{new}^i = \frac{x^i - \mu}{\sigma}
$$

In [72]:
df_059 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .with_columns(
        (
            (pl.col("amount") - pl.col("amount").mean()) / pl.col("amount").std(ddof=0)
        ).alias(  # ddof=1:標本標準偏差, ddof=0:母標準偏差
            "std_amount"
        )
    )
    .sort("customer_id")
)

df_059.head()

customer_id,amount,std_amount
str,i64,f64
"""CS001113000004…",1298,-0.459378
"""CS001114000005…",626,-0.70639
"""CS001115000010…",3044,0.182413
"""CS001205000004…",1988,-0.205749
"""CS001205000006…",3337,0.290114


---
> P-060: レシート明細データ（df_receipt）の売上金額（amount）を顧客ID（customer_id）ごとに合計し、売上金額合計を最小値0、最大値1に正規化して顧客ID、売上金額合計とともに10件表示せよ。ただし、顧客IDが"Z"から始まるのものは非会員を表すため、除外して計算すること。

- 正規化
$$
x_{new}^i = \frac{x^i - x_{min}}{x_{max} - x_{min}}
$$

In [73]:
df_060 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .with_columns(
        (
            (pl.col("amount") - pl.col("amount").min())
            / (pl.col("amount").max() - pl.col("amount").min())
        ).alias("scale_amount")
    )
    .sort("customer_id")
)

df_060.head()

customer_id,amount,scale_amount
str,i64,f64
"""CS001113000004…",1298,0.053354
"""CS001114000005…",626,0.024157
"""CS001115000010…",3044,0.129214
"""CS001205000004…",1988,0.083333
"""CS001205000006…",3337,0.141945


---
> P-061: レシート明細データ（df_receipt）の売上金額（amount）を顧客ID（customer_id）ごとに合計し、売上金額合計を常用対数化（底10）して顧客ID、売上金額合計とともに10件表示せよ。ただし、顧客IDが"Z"から始まるのものは非会員を表すため、除外して計算すること。

In [74]:
df_061 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .with_columns(pl.col("amount").log10().alias("log10_amount"))
    .sort("customer_id")
)

df_061.head()

customer_id,amount,log10_amount
str,i64,f64
"""CS001113000004…",1298,3.113275
"""CS001114000005…",626,2.796574
"""CS001115000010…",3044,3.483445
"""CS001205000004…",1988,3.298416
"""CS001205000006…",3337,3.523356


---
> P-062: レシート明細データ（df_receipt）の売上金額（amount）を顧客ID（customer_id）ごとに合計し、売上金額合計を自然対数化（底e）して顧客ID、売上金額合計とともに10件表示せよ。ただし、顧客IDが"Z"から始まるのものは非会員を表すため、除外して計算すること。

In [75]:
df_062 = (
    df_receipt.filter(pl.col("customer_id").str.starts_with("Z").not_())
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .with_columns(pl.col("amount").log().alias("log_amount"))
    .sort("customer_id")
)

df_062.head()

customer_id,amount,log_amount
str,i64,f64
"""CS001113000004…",1298,7.16858
"""CS001114000005…",626,6.43935
"""CS001115000010…",3044,8.020928
"""CS001205000004…",1988,7.594884
"""CS001205000006…",3337,8.112827


---
> P-063: 商品データ（df_product）の単価（unit_price）と原価（unit_cost）から各商品の利益額を算出し、結果を10件表示せよ。

In [76]:
df_063 = df_product.with_columns(
    (pl.col("unit_price") - pl.col("unit_cost")).alias("unit_profit")
)

df_063.head()

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost,unit_profit
str,str,str,str,i64,i64,i64
"""P040101001""","""04""","""0401""","""040101""",198,149,49
"""P040101002""","""04""","""0401""","""040101""",218,164,54
"""P040101003""","""04""","""0401""","""040101""",230,173,57
"""P040101004""","""04""","""0401""","""040101""",248,186,62
"""P040101005""","""04""","""0401""","""040101""",268,201,67


---
> P-064: 商品データ（df_product）の単価（unit_price）と原価（unit_cost）から、各商品の利益率の全体平均を算出せよ。ただし、単価と原価には欠損が生じていることに注意せよ。

In [77]:
df_064 = df_product.select(
    ((pl.col("unit_price") - pl.col("unit_cost")) / pl.col("unit_price"))
    .filter(pl.col("unit_price").is_not_null())
    .alias("unit_profit_rate")
    .mean()
)

df_064

unit_profit_rate
f64
0.249114


---
> P-065: 商品データ（df_product）の各商品について、利益率が30%となる新たな単価を求めよ。ただし、1円未満は切り捨てること。そして結果を10件表示させ、利益率がおよそ30％付近であることを確認せよ。ただし、単価（unit_price）と原価（unit_cost）には欠損が生じていることに注意せよ。

In [78]:
df_065 = (
    df_product.select("product_cd", "unit_price", "unit_cost")
    .filter((pl.col("unit_price").is_not_null()) & (pl.col("unit_cost").is_not_null()))
    .with_columns(
        (pl.col("unit_cost") / 0.7).floor().cast(pl.Int64).alias("new_price")
    )  # .floor()で切り捨て計算　.round(0)でもOK
    .with_columns(
        ((pl.col("new_price") - pl.col("unit_cost")) / pl.col("new_price")).alias(
            "new_profit_rate"
        )
    )
    .sort("product_cd")
)

df_065.head()

product_cd,unit_price,unit_cost,new_price,new_profit_rate
str,i64,i64,i64,f64
"""P040101001""",198,149,212,0.29717
"""P040101002""",218,164,234,0.299145
"""P040101003""",230,173,247,0.299595
"""P040101004""",248,186,265,0.298113
"""P040101005""",268,201,287,0.299652


In [79]:
df_product.head()

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost
str,str,str,str,i64,i64
"""P040101001""","""04""","""0401""","""040101""",198,149
"""P040101002""","""04""","""0401""","""040101""",218,164
"""P040101003""","""04""","""0401""","""040101""",230,173
"""P040101004""","""04""","""0401""","""040101""",248,186
"""P040101005""","""04""","""0401""","""040101""",268,201


---
> P-066: 商品データ（df_product）の各商品について、利益率が30%となる新たな単価を求めよ。今回は、1円未満を丸めること（四捨五入または偶数への丸めで良い）。そして結果を10件表示させ、利益率がおよそ30％付近であることを確認せよ。ただし、単価（unit_price）と原価（unit_cost）には欠損が生じていることに注意せよ。

In [80]:
df_066 = (
    df_product.drop_nulls()
    .with_columns(
        (pl.col("unit_cost") / 0.7).round(0).cast(pl.Int64).alias("new_price")
    )
    .with_columns(
        ((pl.col("new_price") - pl.col("unit_cost")) / pl.col("new_price")).alias(
            "new_profit_rate"
        )
    )
)

df_066.head()

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost,new_price,new_profit_rate
str,str,str,str,i64,i64,i64,f64
"""P040101001""","""04""","""0401""","""040101""",198,149,213,0.300469
"""P040101002""","""04""","""0401""","""040101""",218,164,234,0.299145
"""P040101003""","""04""","""0401""","""040101""",230,173,247,0.299595
"""P040101004""","""04""","""0401""","""040101""",248,186,266,0.300752
"""P040101005""","""04""","""0401""","""040101""",268,201,287,0.299652


---
> P-067: 商品データ（df_product）の各商品について、利益率が30%となる新たな単価を求めよ。今回は、1円未満を切り上げること。そして結果を10件表示させ、利益率がおよそ30％付近であることを確認せよ。ただし、単価（unit_price）と原価（unit_cost）には欠損が生じていることに注意せよ。

In [81]:
df_067 = (
    df_product.select("product_cd", "unit_price", "unit_cost")
    .filter((pl.col("unit_price").is_not_null()) & (pl.col("unit_cost").is_not_null()))
    .with_columns((pl.col("unit_cost") / 0.7).ceil().cast(pl.Int64).alias("new_price"))
    .with_columns(
        ((pl.col("new_price") - pl.col("unit_cost")) / pl.col("new_price")).alias(
            "new_profit_rate"
        )
    )
    .sort("product_cd")
)

df_067.head()

product_cd,unit_price,unit_cost,new_price,new_profit_rate
str,i64,i64,i64,f64
"""P040101001""",198,149,213,0.300469
"""P040101002""",218,164,235,0.302128
"""P040101003""",230,173,248,0.302419
"""P040101004""",248,186,266,0.300752
"""P040101005""",268,201,288,0.302083


---
> P-068: 商品データ（df_product）の各商品について、消費税率10％の税込み金額を求めよ。1円未満の端数は切り捨てとし、結果を10件表示せよ。ただし、単価（unit_price）には欠損が生じていることに注意せよ。

In [82]:
df_068 = df_product.filter(pl.col("unit_price").is_not_null()).select(
    "product_cd",
    "unit_price",
    (pl.col("unit_price") * 1.1).cast(pl.Int64).alias("tax_price"),
)

df_068.head()

product_cd,unit_price,tax_price
str,i64,i64
"""P040101001""",198,217
"""P040101002""",218,239
"""P040101003""",230,253
"""P040101004""",248,272
"""P040101005""",268,294


---
> P-069: レシート明細データ（df_receipt）と商品データ（df_product）を結合し、顧客毎に全商品の売上金額合計と、カテゴリ大区分コード（category_major_cd）が"07"（瓶詰缶詰）の売上金額合計を計算の上、両者の比率を求めよ。抽出対象はカテゴリ大区分コード"07"（瓶詰缶詰）の売上実績がある顧客のみとし、結果を10件表示せよ。

In [83]:
df_069 = (
    df_receipt.join(df_product, on="product_cd", how="inner")
    .with_columns(
        (
            pl.when(pl.col("category_major_cd").str.starts_with("07")).then(
                pl.col("unit_price") * pl.col("quantity")
            )
        )
        # .otherwise(pl.lit(0)))
        .alias("category_07_sell")
    )
    .group_by("customer_id")
    .agg(
        (pl.col("amount").sum()).alias("sum_all"),
        (pl.col("category_07_sell").sum()).alias("sum_07"),
    )
    .filter(pl.col("sum_07").is_not_null())
    .with_columns((pl.col("sum_07") / pl.col("sum_all")).alias("sales_rate"))
    .sort("customer_id")
)

df_069.head()

customer_id,sum_all,sum_07,sales_rate
str,i64,i64,f64
"""CS001113000004…",1298,1298,1.0
"""CS001114000005…",626,486,0.776358
"""CS001115000010…",3044,2694,0.88502
"""CS001205000004…",1988,346,0.174044
"""CS001205000006…",3337,2004,0.600539


In [84]:
# group_byで一気に計算（こっちの方がわかりやすいし、無駄なcolumnを作らなくて済む）
df_069 = (
    df_receipt.join(df_product, on="product_cd", how="inner")
    .group_by("customer_id")
    .agg(
        [
            (pl.col("quantity") * pl.col("unit_price")).sum().alias("sum_all"),
            (pl.col("quantity") * pl.col("unit_price"))
            .filter(pl.col("category_major_cd") == "07")
            .sum()
            .alias("sum_07"),
        ]
    )
    .filter(pl.col("sum_07").is_not_null())
    .with_columns((pl.col("sum_07") / pl.col("sum_all")).alias("sales_rate"))
    .sort("customer_id")
)

df_069.head()

customer_id,sum_all,sum_07,sales_rate
str,i64,i64,f64
"""CS001113000004…",1298,1298,1.0
"""CS001114000005…",626,486,0.776358
"""CS001115000010…",3044,2694,0.88502
"""CS001205000004…",1988,346,0.174044
"""CS001205000006…",3337,2004,0.600539


## Date型の計算

---
> P-070: レシート明細データ（df_receipt）の売上日（sales_ymd）に対し、顧客データ（df_customer）の会員申込日（application_date）からの経過日数を計算し、顧客ID（customer_id）、売上日、会員申込日とともに10件表示せよ（sales_ymdは数値、application_dateは文字列でデータを保持している点に注意）。

In [85]:
df_070 = (
    df_receipt.select("customer_id", "sales_ymd")
    .unique()
    .join(
        df_customer.select("customer_id", "application_date"),
        how="inner",
        on="customer_id",
    )
    .select(
        "customer_id",
        pl.col("sales_ymd").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d"),
        pl.col("application_date").str.strptime(pl.Date, "%Y%m%d"),
    )
    .with_columns(
        (pl.col("sales_ymd") - pl.col("application_date"))
        .dt.total_days()  # int型へ変換（別にしなくても良い）
        .alias("elapsed_date")
    )
    .filter(pl.col("elapsed_date").is_not_null())
    .sort("customer_id")
)

df_070.head()

customer_id,sales_ymd,application_date,elapsed_date
str,date,date,i64
"""CS001113000004…",2019-03-08,2015-11-05,1219
"""CS001114000005…",2019-07-31,2016-04-12,1205
"""CS001114000005…",2018-05-03,2016-04-12,751
"""CS001115000010…",2017-12-28,2015-04-17,986
"""CS001115000010…",2018-07-01,2015-04-17,1171


---
> P-071: レシート明細データ（df_receipt）の売上日（sales_ymd）に対し、顧客データ（df_customer）の会員申込日（application_date）からの経過月数を計算し、顧客ID（customer_id）、売上日、会員申込日とともに10件表示せよ（sales_ymdは数値、application_dateは文字列でデータを保持している点に注意）。1ヶ月未満は切り捨てること。

In [86]:
df_071 = (
    df_receipt.select("customer_id", "sales_ymd")
    .join(
        df_customer.select("customer_id", "application_date"),
        how="inner",
        on="customer_id",
    )
    .select(
        "customer_id",
        pl.col("sales_ymd").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d"),
        pl.col("application_date").str.strptime(pl.Date, "%Y%m%d"),
    )
    .with_columns(
        (
            pl.col("sales_ymd").dt.year() * 12
            - pl.col("application_date").dt.year() * 12
            + pl.col("sales_ymd").dt.month().cast(pl.Int64)
            - pl.col("application_date").dt.month().cast(pl.Int64)
        ).alias(  # Int型にしないとバグる
            "elapsed_month"
        )
    )
    .filter(pl.col("elapsed_month").is_not_null())
    .sort("customer_id")
)

df_071

customer_id,sales_ymd,application_date,elapsed_month
str,date,date,i64
"""CS001113000004…",2019-03-08,2015-11-05,40
"""CS001113000004…",2019-03-08,2015-11-05,40
"""CS001114000005…",2018-05-03,2016-04-12,25
"""CS001114000005…",2018-05-03,2016-04-12,25
"""CS001114000005…",2019-07-31,2016-04-12,39
"""CS001114000005…",2019-07-31,2016-04-12,39
"""CS001115000010…",2019-04-05,2015-04-17,48
"""CS001115000010…",2018-07-01,2015-04-17,39
"""CS001115000010…",2017-12-28,2015-04-17,32
"""CS001115000010…",2019-04-05,2015-04-17,48


---
> P-072: レシート明細データ（df_receipt）の売上日（df_customer）に対し、顧客データ（df_customer）の会員申込日（application_date）からの経過年数を計算し、顧客ID（customer_id）、売上日、会員申込日とともに10件表示せよ（sales_ymdは数値、application_dateは文字列でデータを保持している点に注意）。1年未満は切り捨てること。

In [87]:
df_072 = (
    df_receipt.select("customer_id", "sales_ymd")
    .join(
        df_customer.select("customer_id", "application_date"),
        how="inner",
        on="customer_id",
    )
    .select(
        "customer_id",
        pl.col("sales_ymd").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d"),
        pl.col("application_date").str.strptime(pl.Date, "%Y%m%d"),
    )
    .with_columns(
        ((pl.col("sales_ymd").dt.year() - pl.col("application_date").dt.year())).alias(
            "elapsed_year"
        )
    )
    .filter(pl.col("elapsed_year").is_not_null())
    .sort("customer_id")
)

df_072.head()

customer_id,sales_ymd,application_date,elapsed_year
str,date,date,i32
"""CS001113000004…",2019-03-08,2015-11-05,4
"""CS001113000004…",2019-03-08,2015-11-05,4
"""CS001114000005…",2018-05-03,2016-04-12,2
"""CS001114000005…",2018-05-03,2016-04-12,2
"""CS001114000005…",2019-07-31,2016-04-12,3


---
> P-073: レシート明細データ（df_receipt）の売上日（sales_ymd）に対し、顧客データ（df_customer）の会員申込日（application_date）からのエポック秒による経過時間を計算し、顧客ID（customer_id）、売上日、会員申込日とともに10件表示せよ（なお、sales_ymdは数値、application_dateは文字列でデータを保持している点に注意）。なお、時間情報は保有していないため各日付は0時0分0秒を表すものとする。

In [88]:
df_073 = (
    df_receipt.select("customer_id", "sales_ymd")
    .join(
        df_customer.select("customer_id", "application_date"),
        how="inner",
        on="customer_id",
    )
    .select(
        "customer_id",
        pl.col("sales_ymd").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d"),
        pl.col("application_date").str.strptime(pl.Date, "%Y%m%d"),
    )
    .with_columns(
        (
            (
                pl.col("sales_ymd").dt.epoch(time_unit="s")
                - pl.col("application_date").dt.epoch(time_unit="s")
            )  # defaultはμs(10e-6)
        ).alias("elapsed_epoch")
    )
    .filter(pl.col("elapsed_epoch").is_not_null())
    .sort("customer_id")
)

df_073.head()

customer_id,sales_ymd,application_date,elapsed_epoch
str,date,date,i64
"""CS001113000004…",2019-03-08,2015-11-05,105321600
"""CS001113000004…",2019-03-08,2015-11-05,105321600
"""CS001114000005…",2018-05-03,2016-04-12,64886400
"""CS001114000005…",2018-05-03,2016-04-12,64886400
"""CS001114000005…",2019-07-31,2016-04-12,104112000


---
> P-074: レシート明細データ（df_receipt）の売上日（sales_ymd）に対し、当該週の月曜日からの経過日数を計算し、売上日、直前の月曜日付とともに10件表示せよ（sales_ymdは数値でデータを保持している点に注意）。

In [89]:
from datetime import timedelta

df_074 = (
    df_receipt.select(pl.col("sales_ymd").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d"))
    .with_columns(
        (pl.col("sales_ymd").dt.weekday() - 1).alias(
            "elapsed_days"
        ),  # monday=1, sunday=9
        (pl.col("sales_ymd").map_elements(lambda x: x - timedelta(days=(x.weekday())))).alias(
            "monday"
        ),
    )
    .filter(pl.col("monday").is_not_null())
)

df_074.head()

sales_ymd,elapsed_days,monday
date,u32,date
2018-11-03,5,2018-10-29
2018-11-18,6,2018-11-12
2017-07-12,2,2017-07-10
2019-02-05,1,2019-02-04
2018-08-21,1,2018-08-20


In [90]:
from datetime import timedelta

df_074 = (
    df_receipt.select(pl.col("sales_ymd").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d"))
    .with_columns(
        (pl.col("sales_ymd").dt.weekday() - 1).alias("elapsed_days"),
        (pl.col("sales_ymd").dt.truncate("1w")).alias("monday"),
    )
    .filter(pl.col("monday").is_not_null())
)

df_074.head()

sales_ymd,elapsed_days,monday
date,u32,date
2018-11-03,5,2018-10-29
2018-11-18,6,2018-11-12
2017-07-12,2,2017-07-10
2019-02-05,1,2019-02-04
2018-08-21,1,2018-08-20


## ランダム抽出

---
> P-075:顧客データ（df_customer）からランダムに1%のデータを抽出し、先頭から10件表示せよ。

In [91]:
df_075 = df_customer.sample(fraction=0.01)  # ランダム抽出

df_075.head(10)

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,str,i64,str,str,str,str,str
"""CS040414000058…","""多部 遥""","""1""","""女性""","""1972-07-12""",46,"""226-0028""","""神奈川県横浜市緑区いぶき野*…","""S14040""","""20151023""","""F-20100504-E"""
"""CS028715000060…","""市村 育子""","""1""","""女性""","""1940-11-27""",78,"""246-0034""","""神奈川県横浜市瀬谷区南瀬谷*…","""S14028""","""20151224""","""0-00000000-0"""
"""CS034315000005…","""沢 さやか""","""1""","""女性""","""1980-05-11""",38,"""216-0003""","""神奈川県川崎市宮前区有馬**…","""S14034""","""20160110""","""3-20080219-5"""
"""CS028411000010…","""生田 コウ""","""1""","""女性""","""1977-04-19""",41,"""245-0016""","""神奈川県横浜市泉区和泉町**…","""S14028""","""20150614""","""D-20100904-A"""
"""CS030513000115…","""小田 花""","""1""","""女性""","""1966-11-20""",52,"""272-0031""","""千葉県市川市平田******…","""S12030""","""20150530""","""6-20090809-6"""
"""CS040402000007…","""小沼 明""","""0""","""男性""","""1971-02-13""",48,"""226-0021""","""神奈川県横浜市緑区北八朔町*…","""S14040""","""20150906""","""0-00000000-0"""
"""CS017514000077…","""竹内 結衣""","""1""","""女性""","""1966-08-23""",52,"""166-0003""","""東京都杉並区高円寺南****…","""S13017""","""20150723""","""5-20100307-7"""
"""CS040413000102…","""喜多 璃奈子""","""1""","""女性""","""1976-07-31""",42,"""226-0018""","""神奈川県横浜市緑区長津田みな…","""S14040""","""20151208""","""0-00000000-0"""
"""CS008712000067…","""大山 麻緒""","""1""","""女性""","""1947-07-12""",71,"""201-0002""","""東京都狛江市東野川*****…","""S13008""","""20180301""","""0-00000000-0"""
"""CS009314000084…","""竹村 まみ""","""1""","""女性""","""1985-02-24""",34,"""158-0096""","""東京都世田谷区玉川台****…","""S13009""","""20170127""","""0-00000000-0"""


---
> P-076: 顧客データ（df_customer）から性別コード（gender_cd）の割合に基づきランダムに10%のデータを層化抽出し、性別コードごとに件数を集計せよ。

In [92]:
_, df_tmp = train_test_split(
    df_customer, test_size=0.1, stratify=df_customer["gender_cd"]
)

df_076 = df_tmp.group_by("gender_cd").agg(pl.col("customer_id").count())

df_076.head()

gender_cd,customer_id
str,u32
"""0""",298
"""1""",1793
"""9""",107


## 外れ値と欠損値

---
> P-077: レシート明細データ（df_receipt）の売上金額を顧客単位に合計し、合計した売上金額の外れ値を抽出せよ。なお、外れ値は売上金額合計を対数化したうえで平均と標準偏差を計算し、その平均から3σを超えて離れたものとする（自然対数と常用対数のどちらでも可）。結果は10件表示せよ。

In [93]:
df_077 = (
    df_receipt.group_by("customer_id")
    .agg((pl.col("amount").sum()).alias("sum_amount"))
    .with_columns(pl.col("sum_amount").log().alias("log_amount"))
    .filter(
        (
            (pl.col("log_amount") - pl.col("log_amount").mean()).abs()
            > (pl.col("log_amount").std() * 3)
        ).alias("log_amount_ss")
    )
)

df_077.head()

customer_id,sum_amount,log_amount
str,i64,f64
"""ZZ000000000000…",12395003,16.332804


---
> P-078: レシート明細データ（df_receipt）の売上金額（amount）を顧客単位に合計し、合計した売上金額の外れ値を抽出せよ。ただし、顧客IDが"Z"から始まるのものは非会員を表すため、除外して計算すること。なお、ここでは外れ値を第1四分位と第3四分位の差であるIQRを用いて、「第1四分位数-1.5×IQR」を下回るもの、または「第3四分位数+1.5×IQR」を超えるものとする。結果は10件表示せよ。

In [94]:
pct25 = pl.col("sum_amount").quantile(0.25)
pct75 = pl.col("sum_amount").quantile(0.75)
iqr = pct75 - pct25


df_078 = (
    df_receipt.group_by("customer_id")
    .agg((pl.col("amount").sum()).alias("sum_amount"))
    .filter(
        (pl.col("customer_id").str.starts_with("Z").not_())
        & (pl.col("sum_amount") < (pct25 - 1.5 * iqr))
        | (pl.col("sum_amount") > (pct75 + 1.5 * iqr))
    )
    .sort("customer_id")
)

df_078.head(10)

customer_id,sum_amount
str,i64
"""CS001414000048…",8584
"""CS001605000009…",18925
"""CS002415000594…",9568
"""CS004414000181…",9584
"""CS005415000137…",8734
"""CS006414000001…",9156
"""CS006414000029…",9179
"""CS006415000105…",10042
"""CS006415000147…",12723
"""CS006415000157…",10648


---
> P-079: 商品データ（df_product）の各項目に対し、欠損数を確認せよ。

In [95]:
# df_ans = (
#     df_product
#     .select(pl.all().is_null().sum())
# )

# df_ans

df_product.null_count()

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost
u32,u32,u32,u32,u32,u32
0,0,0,0,7,7


---
> P-080: 商品データ（df_product）のいずれかの項目に欠損が発生しているレコードを全て削除した新たな商品データを作成せよ。なお、削除前後の件数を表示させ、079で確認した件数だけ減少していることも確認すること。

In [96]:
df_080 = df_product.drop_nulls()

print(len(df_product))
print(len(df_080))

10030
10023


---
> P-081: 単価（unit_price）と原価（unit_cost）の欠損値について、それぞれの平均値で補完した新たな商品データを作成せよ。なお、平均値については1円未満を丸めること（四捨五入または偶数への丸めで良い）。補完実施後、各項目について欠損が生じていないことも確認すること。

In [97]:
# df_ans = (
#     df_product
#     .select(pl.col("unit_price").fill_null(strategy="mean"),
#             pl.col("unit_cost").fill_null(strategy="mean"))
# )

df_081 = df_product.fill_null(strategy="mean")

df_081.null_count()

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost
u32,u32,u32,u32,u32,u32
0,0,0,0,0,0


---
> P-082: 単価（unit_price）と原価（unit_cost）の欠損値について、それぞれの中央値で補完した新たな商品データを作成せよ。なお、中央値については1円未満を丸めること（四捨五入または偶数への丸めで良い）。補完実施後、各項目について欠損が生じていないことも確認すること。

In [98]:
df_082 = df_product.select(
    pl.all().exclude("unit_cost", "unit_price"),
    pl.col("unit_cost").fill_null(pl.col("unit_cost").median().cast(pl.Int64)),
    pl.col("unit_price").fill_null(pl.median("unit_price").cast(pl.Int64)),
)

display(df_082.head())
df_082.null_count()

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_cost,unit_price
str,str,str,str,i64,i64
"""P040101001""","""04""","""0401""","""040101""",149,198
"""P040101002""","""04""","""0401""","""040101""",164,218
"""P040101003""","""04""","""0401""","""040101""",173,230
"""P040101004""","""04""","""0401""","""040101""",186,248
"""P040101005""","""04""","""0401""","""040101""",201,268


product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_cost,unit_price
u32,u32,u32,u32,u32,u32
0,0,0,0,0,0


---
> P-083: 単価（unit_price）と原価（unit_cost）の欠損値について、各商品のカテゴリ小区分コード（category_small_cd）ごとに算出した中央値で補完した新たな商品データを作成せよ。なお、中央値については1円未満を丸めること（四捨五入または偶数への丸めで良い）。補完実施後、各項目について欠損が生じていないことも確認すること。

In [99]:
df_083 = df_product.select(
    pl.exclude("unit_price", "unit_cost"),
    pl.coalesce(
        pl.col(
            "unit_price"
        ),  # pl.coalesce():上から順番にNull以外の値を入れていく-> "unit_cost"の値を入れていき、Nullにはpl.medean()を入れる
        pl.median("unit_price")
        .over(
            "category_small_cd"
        )  # SQLのウィンドウ関数のようなメソッド。"category_scall_cd"ごとに処理を行う。group_byの後から条件版みたいな感じ?
        .cast(pl.Int64),
    ),
    pl.coalesce(
        pl.col("unit_cost"),
        pl.median("unit_cost").over("category_small_cd").cast(pl.Int64),
    ),
)

display(df_083.head())
display(df_083.null_count())

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost
str,str,str,str,i64,i64
"""P040101001""","""04""","""0401""","""040101""",198,149
"""P040101002""","""04""","""0401""","""040101""",218,164
"""P040101003""","""04""","""0401""","""040101""",230,173
"""P040101004""","""04""","""0401""","""040101""",248,186
"""P040101005""","""04""","""0401""","""040101""",268,201


product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost
u32,u32,u32,u32,u32,u32
0,0,0,0,0,0


## 応用

---
> P-084: 顧客データ（df_customer）の全顧客に対して全期間の売上金額に占める2019年売上金額の割合を計算し、新たなデータを作成せよ。ただし、売上実績がない場合は0として扱うこと。そして計算した割合が0超のものを抽出し、結果を10件表示せよ。また、作成したデータに欠損が存在しないことを確認せよ。

In [100]:
df_084 = (
    df_customer.join(df_receipt, how="inner", on="customer_id")
    .select(
        "customer_id",
        "amount",
        pl.col("sales_ymd").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d").dt.year(),
    )
    .group_by("customer_id")
    .agg(
        pl.col("amount").sum().alias("amount_all"),
        pl.col("amount")
        .filter(pl.col("sales_ymd") == 2019)
        .sum()
        .fill_null(0)
        .alias("amount_2019"),
    )
    .with_columns((pl.col("amount_2019") / pl.col("amount_all")).alias("amount_rate"))
    .sort("customer_id")
)

display(df_084.head())
display(df_084.null_count())

customer_id,amount_all,amount_2019,amount_rate
str,i64,i64,f64
"""CS001113000004…",1298,1298,1.0
"""CS001114000005…",626,188,0.300319
"""CS001115000010…",3044,578,0.189882
"""CS001205000004…",1988,702,0.353119
"""CS001205000006…",3337,486,0.14564


customer_id,amount_all,amount_2019,amount_rate
u32,u32,u32,u32
0,0,0,0


---
> P-085: 顧客データ（df_customer）の全顧客に対し、郵便番号（postal_cd）を用いてジオコードデータ（df_geocode）を紐付け、新たな顧客データを作成せよ。ただし、1つの郵便番号（postal_cd）に複数の経度（longitude）、緯度（latitude）情報が紐づく場合は、経度（longitude）、緯度（latitude）の平均値を算出して使用すること。また、作成結果を確認するために結果を10件表示せよ。

In [101]:
df_085 = df_customer.join(
    df_geocode.group_by("postal_cd").agg(
        pl.col("longitude").mean(), pl.col("latitude").mean()
    ),
    how="inner",
    on="postal_cd",
).sort("customer_id")

df_085.head()

customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd,longitude,latitude
str,str,str,str,str,i64,str,str,str,str,str,f64,f64
"""CS001105000001…","""中島 利夫""","""0""","""男性""","""2000-01-14""",19,"""144-0056""","""東京都大田区西六郷*****…","""S13001""","""20170310""","""0-00000000-0""",139.70238,35.54137
"""CS001112000009…","""秦 美里""","""1""","""女性""","""2006-08-24""",12,"""143-0026""","""東京都大田区西馬込*****…","""S13001""","""20150703""","""0-00000000-0""",139.70386,35.5867
"""CS001112000019…","""門脇 莉沙""","""1""","""女性""","""2001-01-31""",18,"""143-0004""","""東京都大田区昭和島*****…","""S13001""","""20170207""","""0-00000000-0""",139.74687,35.57153
"""CS001112000021…","""長澤 麗奈""","""1""","""女性""","""2001-12-15""",17,"""144-0056""","""東京都大田区西六郷*****…","""S13001""","""20170612""","""0-00000000-0""",139.70238,35.54137
"""CS001112000023…","""戸田 一恵""","""1""","""女性""","""2004-01-26""",15,"""143-0004""","""東京都大田区昭和島*****…","""S13001""","""20170724""","""0-00000000-0""",139.74687,35.57153


---
> P-086: 085で作成した緯度経度つき顧客データに対し、会員申込店舗コード（application_store_cd）をキーに店舗データ（df_store）と結合せよ。そして申込み店舗の緯度（latitude）・経度情報（longitude)と顧客住所（address）の緯度・経度を用いて申込み店舗と顧客住所の距離（単位：km）を求め、顧客ID（customer_id）、顧客住所（address）、店舗住所（address）とともに表示せよ。計算式は以下の簡易式で良いものとするが、その他精度の高い方式を利用したライブラリを利用してもかまわない。結果は10件表示せよ。

$$
\begin{aligned}
& longitude(radian)：\phi \\
& latitude(radian)：\lambda \\
& distance, L = 6371 * \arccos(\sin \phi_1 * \sin \phi_2 + \cos \phi_1 * \cos \phi_2 * \cos(\lambda_1 − \lambda_2))
\end{aligned}
$$

In [102]:
df_086 = (
    df_085
    # .join(df_store.rename({"store_cd":"application_store_cd",
    #                        "longitude":"store_longitude",
    #                        "latitude":"store_latitude",
    #                        "address":"store_address"}),
    #       how="inner", on="application_store_cd")
    .join(
        df_store,
        how="inner",
        suffix="_store",
        left_on="application_store_cd",
        right_on="store_cd",
    )
    .select(
        "customer_id",
        pl.col("address").alias("customer_address"),
        "address_store",
        (
            6371
            * (
                (math.pi * pl.col("latitude") / 180).sin()
                * (math.pi * pl.col("latitude_store") / 180).sin()
                + (math.pi * pl.col("latitude") / 180).cos()
                * (math.pi * pl.col("latitude_store") / 180).cos()
                * (
                    math.pi * (pl.col("longitude") - pl.col("longitude_store")) / 180
                ).cos()
            ).arccos()
        ).alias("distance"),
    )
    .sort("customer_id")
)

df_086.head()

customer_id,customer_address,address_store,distance
str,str,str,f64
"""CS001105000001…","""東京都大田区西六郷*****…","""東京都大田区仲六郷二丁目""",1.479789
"""CS001112000009…","""東京都大田区西馬込*****…","""東京都大田区仲六郷二丁目""",4.020495
"""CS001112000019…","""東京都大田区昭和島*****…","""東京都大田区仲六郷二丁目""",3.783015
"""CS001112000021…","""東京都大田区西六郷*****…","""東京都大田区仲六郷二丁目""",1.479789
"""CS001112000023…","""東京都大田区昭和島*****…","""東京都大田区仲六郷二丁目""",3.783015


In [103]:
def distance_expr(lon1: str, lat1: str, lon2: str, lat2: str) -> pl.Expr:
    # radian = degrees * pi / 180
    lon1_rad = pl.col(lon1) * math.pi / 180
    lon2_rad = pl.col(lon2) * math.pi / 180
    lat1_rad = pl.col(lat1) * math.pi / 180
    lat2_rad = pl.col(lat2) * math.pi / 180

    return (
        6371
        * (
            lat1_rad.sin() * lat2_rad.sin()
            + lat1_rad.cos() * lat2_rad.cos() * (lon1_rad - lon2_rad).cos()
        ).arccos()
    )


df_086 = df_085.join(
    df_store,
    how="inner",
    suffix="_store",
    left_on="application_store_cd",
    right_on="store_cd",
).select(
    [
        "customer_id",
        "address",
        "address_store",
        distance_expr(
            "longitude", "latitude", "longitude_store", "latitude_store"
        ).alias("distance"),
    ]
)

df_086.head()

customer_id,address,address_store,distance
str,str,str,f64
"""CS001105000001…","""東京都大田区西六郷*****…","""東京都大田区仲六郷二丁目""",1.479789
"""CS001112000009…","""東京都大田区西馬込*****…","""東京都大田区仲六郷二丁目""",4.020495
"""CS001112000019…","""東京都大田区昭和島*****…","""東京都大田区仲六郷二丁目""",3.783015
"""CS001112000021…","""東京都大田区西六郷*****…","""東京都大田区仲六郷二丁目""",1.479789
"""CS001112000023…","""東京都大田区昭和島*****…","""東京都大田区仲六郷二丁目""",3.783015


---
> P-087: 顧客データ（df_customer）では、異なる店舗での申込みなどにより同一顧客が複数登録されている。名前（customer_name）と郵便番号（postal_cd）が同じ顧客は同一顧客とみなして1顧客1レコードとなるように名寄せした名寄顧客データを作成し、顧客データの件数、名寄顧客データの件数、重複数を算出せよ。ただし、同一顧客に対しては売上金額合計が最も高いものを残し、売上金額合計が同一もしくは売上実績がない顧客については顧客ID（customer_id）の番号が小さいものを残すこととする。

In [104]:
df_087 = (
    df_receipt.group_by("customer_id")
    .agg(pl.col("amount").sum())
    .join(df_customer, how="outer", on="customer_id")
    .filter(pl.col("customer_id").str.starts_with("Z").not_())
    .sort(
        ["amount", "customer_id"], descending=[True, False]
    )  # amountは大きい、coustomer_idは小さいの順
    .unique(
        subset=["customer_name", "postal_cd"], maintain_order=True
    )  # subset:識別する列を選択する, unique()は最初の行を残す
)
print(
    f"df_customer_cnt: {len(df_customer)}",
    f"df_customer_u_cnt: {len(df_087)}",
    f"diff: {len(df_customer) - len(df_087)}",
)
display(df_087.head())

df_customer_cnt: 21971 df_customer_u_cnt: 21941 diff: 30


customer_id,amount,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,i64,str,str,str,str,i64,str,str,str,str,str
"""CS017415000097…",23086,"""福士 千夏""","""1""","""女性""","""1973-04-03""",45,"""166-0014""","""東京都杉並区松ノ木*****…","""S13017""","""20151209""","""F-20101006-F"""
"""CS015415000185…",20153,"""岩淵 はるみ""","""1""","""女性""","""1973-09-19""",45,"""135-0043""","""東京都江東区塩浜******…","""S13015""","""20150322""","""F-20101014-F"""
"""CS031414000051…",19202,"""長澤 沙知絵""","""1""","""女性""","""1973-04-25""",45,"""151-0064""","""東京都渋谷区上原******…","""S13031""","""20150823""","""F-20101009-F"""
"""CS028415000007…",19127,"""紺野 あい""","""1""","""女性""","""1969-07-28""",49,"""246-0023""","""神奈川県横浜市瀬谷区阿久和東…","""S14028""","""20151212""","""F-20100922-F"""
"""CS001605000009…",18925,"""安部 耕司""","""0""","""男性""","""1952-10-22""",66,"""144-0035""","""東京都大田区南蒲田*****…","""S13001""","""20160203""","""F-20101019-E"""


---
> P-088: 087で作成したデータを元に、顧客データに統合名寄IDを付与したデータを作成せよ。ただし、統合名寄IDは以下の仕様で付与するものとする。
>
> - 重複していない顧客：顧客ID（customer_id）を設定
> - 重複している顧客：前設問で抽出したレコードの顧客IDを設定
> 
> 顧客IDのユニーク件数と、統合名寄IDのユニーク件数の差も確認すること。

In [105]:
df_088 = df_customer.join(
    df_087.select("customer_name", "postal_cd", "customer_id"),
    how="inner",
    on=["customer_name", "postal_cd"],
).rename({"customer_id_right": "integration_id"})

diff = len(df_088.get_column("customer_id").unique()) - len(
    df_088.get_column("integration_id").unique()
)

print("IDの差:", diff)
df_088.head()

IDの差: 30


customer_id,customer_name,gender_cd,gender,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd,integration_id
str,str,str,str,str,i64,str,str,str,str,str,str
"""CS021313000114…","""大野 あや子""","""1""","""女性""","""1981-04-29""",37,"""259-1113""","""神奈川県伊勢原市粟窪****…","""S14021""","""20150905""","""0-00000000-0""","""CS021313000114…"
"""CS037613000071…","""六角 雅彦""","""9""","""不明""","""1952-04-01""",66,"""136-0076""","""東京都江東区南砂******…","""S13037""","""20150414""","""0-00000000-0""","""CS037613000071…"
"""CS031415000172…","""宇多田 貴美子""","""1""","""女性""","""1976-10-04""",42,"""151-0053""","""東京都渋谷区代々木*****…","""S13031""","""20150529""","""D-20100325-C""","""CS031415000172…"
"""CS028811000001…","""堀井 かおり""","""1""","""女性""","""1933-03-27""",86,"""245-0016""","""神奈川県横浜市泉区和泉町**…","""S14028""","""20160115""","""0-00000000-0""","""CS028811000001…"
"""CS001215000145…","""田崎 美紀""","""1""","""女性""","""1995-03-29""",24,"""144-0055""","""東京都大田区仲六郷*****…","""S13001""","""20170605""","""6-20090929-2""","""CS001215000145…"


---
> P-089: 売上実績がある顧客を、予測モデル構築のため学習用データとテスト用データに分割したい。それぞれ8:2の割合でランダムにデータを分割せよ。

In [106]:
df_089 = (
    df_receipt.group_by("customer_id")
    .agg(pl.col("amount").sum())
    .filter(pl.col("amount") > 0)
)

df_train, df_test = train_test_split(df_089, test_size=0.2, random_state=71)

print(f"学習データ割合: {len(df_train)/len(df_089)}")
print(f"テストデータ割合: {len(df_test)/len(df_089)}")
display(df_test.head())

学習データ割合: 0.7999277717587576
テストデータ割合: 0.20007222824124232


customer_id,amount
str,i64
"""CS040314000065…",243
"""CS030614000005…",448
"""CS014315000058…",1348
"""CS005601000008…",466
"""CS020515000228…",2302


In [107]:
# polarsの機能でもOK
df_089 = (
    df_receipt.group_by("customer_id")
    .agg(pl.col("amount").sum())
    .filter(pl.col("amount") > 0)
    .with_row_count("index")  # with_row_countはpandasのindexみたいな列を作れる
    .with_columns(
        pl.col("index").shuffle(seed=71) < 0.8 * pl.col("index").len()
    )  # True, Falseを8:2の割合でシャッフルする
)

df_train, df_test = df_089.partition_by(by="index")

print(f"学習データ割合: {len(df_train)/len(df_089)}")
print(f"テストデータ割合: {len(df_test)/len(df_089)}")
display(df_089.head())
display(df_test.head())
print(type(df_test))

学習データ割合: 0.19995184783917178
テストデータ割合: 0.8000481521608283


index,customer_id,amount
bool,str,i64
False,"""CS019513000065…",1129
True,"""CS009413000056…",383
True,"""CS023211000003…",861
True,"""CS019315000036…",2024
True,"""CS010415000009…",4125


index,customer_id,amount
bool,str,i64
True,"""CS009413000056…",383
True,"""CS023211000003…",861
True,"""CS019315000036…",2024
True,"""CS010415000009…",4125
True,"""CS001413000156…",203


<class 'polars.dataframe.frame.DataFrame'>


---
> P-090: レシート明細データ（df_receipt）は2017年1月1日〜2019年10月31日までのデータを有している。売上金額（amount）を月次で集計し、学習用に12ヶ月、テスト用に6ヶ月の時系列モデル構築用データを3セット作成せよ。

In [108]:
df_090 = (
    df_receipt.select(
        pl.exclude("sales_ymd"),
        pl.col("sales_ymd")
        .cast(pl.Utf8)
        .str.slice(offset=0, length=6)
        .alias("sales_ym"),
    )
    .group_by("sales_ym")
    .agg(pl.col("amount").sum())
    .sort("sales_ym")
)


def split_data(
    df: pl.DataFrame,
    train_size: int,
    test_size: int,
    slide_window: int,
    start_point: int,
):
    train_start = start_point * slide_window
    test_start = train_start + train_size
    return df[train_start:test_start], df[test_start : test_start + test_size]


df_090_train1, df_090_test1 = split_data(
    df_090, train_size=12, test_size=6, slide_window=6, start_point=0
)

df_090_train2, df_090_test2 = split_data(
    df_090, train_size=12, test_size=6, slide_window=6, start_point=1
)

df_090_train3, df_090_test3 = split_data(
    df_090, train_size=12, test_size=6, slide_window=6, start_point=2
)

display(df_090_train1.head())

sales_ym,amount
str,i64
"""201701""",902056
"""201702""",764413
"""201703""",962945
"""201704""",847566
"""201705""",884010


In [109]:
display(df_090_test1.head())

sales_ym,amount
str,i64
"""201801""",944509
"""201802""",864128
"""201803""",946588
"""201804""",937099
"""201805""",1004438


In [110]:
df_090_train2.head()

sales_ym,amount
str,i64
"""201707""",959205
"""201708""",954836
"""201709""",902037
"""201710""",905739
"""201711""",932157


In [111]:
# scikit_learnを使った方法
tscv = TimeSeriesSplit(gap=0, max_train_size=12, n_splits=3, test_size=6)

series_list = []
for train_index, test_index in tscv.split(df_090):
    print(train_index)
    series_list.append((df_090[train_index], df_090[test_index]))

df_090_train1, df_090_test1 = series_list[0]
df_090_train2, df_090_test2 = series_list[1]
df_090_train3, df_090_test3 = series_list[2]

[ 4  5  6  7  8  9 10 11 12 13 14 15]
[10 11 12 13 14 15 16 17 18 19 20 21]
[16 17 18 19 20 21 22 23 24 25 26 27]


---
> P-091: 顧客データ（df_customer）の各顧客に対し、売上実績がある顧客数と売上実績がない顧客数が1:1となるようにアンダーサンプリングで抽出せよ。

In [112]:
df_091 = (
    df_receipt
    .group_by("customer_id")
    .agg(pl.col("amount").sum())
    .join(df_customer, how="outer", on="customer_id")
    .filter(pl.col("customer_id").str.starts_with("Z").not_())
    .with_columns(pl.col("amount").is_null().alias("is_buy_flag"))
)

df_091_sampling = df_091.group_by('is_buy_flag').map_groups(
    lambda x: x.sample(n=df_091.filter((pl.col('is_buy_flag') == 0)).height)
)
print(f"0の数: {len(df_091_sampling.filter(pl.col('is_buy_flag')==False))}")
print(f"1の数: {len(df_091_sampling.filter(pl.col('is_buy_flag')==True))}")

0の数: 8306
1の数: 8306


---
> P-092: 顧客データ（df_customer）の性別について、第三正規形へと正規化せよ。

- 第1正規形：1つのセルには1つの値しか含まれない　list型などで登録しない
- 第2正規形：部分関数従属を排除し、完全関数従属にする　Nullとか無くす
- 第3正規形：第2正規形のテーブルから、推移的関数従属している列が切り出されたもの　段階的な従属を無くす

In [113]:
df_092_gender = df_customer.select(
    "gender_cd", "gender"
).unique()  # genderはgender_cdの従属関係 -> 切り離す
df_092_customer = df_customer.drop("gender")

display(df_092_gender)

display(df_092_customer.head())

gender_cd,gender
str,str
"""0""","""男性"""
"""9""","""不明"""
"""1""","""女性"""


customer_id,customer_name,gender_cd,birth_day,age,postal_cd,address,application_store_cd,application_date,status_cd
str,str,str,str,i64,str,str,str,str,str
"""CS021313000114…","""大野 あや子""","""1""","""1981-04-29""",37,"""259-1113""","""神奈川県伊勢原市粟窪****…","""S14021""","""20150905""","""0-00000000-0"""
"""CS037613000071…","""六角 雅彦""","""9""","""1952-04-01""",66,"""136-0076""","""東京都江東区南砂******…","""S13037""","""20150414""","""0-00000000-0"""
"""CS031415000172…","""宇多田 貴美子""","""1""","""1976-10-04""",42,"""151-0053""","""東京都渋谷区代々木*****…","""S13031""","""20150529""","""D-20100325-C"""
"""CS028811000001…","""堀井 かおり""","""1""","""1933-03-27""",86,"""245-0016""","""神奈川県横浜市泉区和泉町**…","""S14028""","""20160115""","""0-00000000-0"""
"""CS001215000145…","""田崎 美紀""","""1""","""1995-03-29""",24,"""144-0055""","""東京都大田区仲六郷*****…","""S13001""","""20170605""","""6-20090929-2"""


---
> P-093: 商品データ（df_product）では各カテゴリのコード値だけを保有し、カテゴリ名は保有していない。カテゴリデータ（df_category）と組み合わせて非正規化し、カテゴリ名を保有した新たな商品データを作成せよ。

In [114]:
df_093 = df_product.join(
    df_category.select(
        "category_small_cd",
        "category_major_name",
        "category_medium_name",
        "category_small_name",
    ),
    how="left",
    on="category_small_cd",
)

df_093.head()

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost,category_major_name,category_medium_name,category_small_name
str,str,str,str,i64,i64,str,str,str
"""P040101001""","""04""","""0401""","""040101""",198,149,"""惣菜""","""御飯類""","""弁当類"""
"""P040101002""","""04""","""0401""","""040101""",218,164,"""惣菜""","""御飯類""","""弁当類"""
"""P040101003""","""04""","""0401""","""040101""",230,173,"""惣菜""","""御飯類""","""弁当類"""
"""P040101004""","""04""","""0401""","""040101""",248,186,"""惣菜""","""御飯類""","""弁当類"""
"""P040101005""","""04""","""0401""","""040101""",268,201,"""惣菜""","""御飯類""","""弁当類"""


## ファイル入出力

---
> P-094: 093で作成したカテゴリ名付き商品データを以下の仕様でファイル出力せよ。
>
> |ファイル形式|ヘッダ有無|文字エンコーディング|
> |:--:|:--:|:--:|
> |CSV（カンマ区切り）|有り|UTF-8|
> 
> ファイル出力先のパスは以下のようにすること
> 
> |出力先|
> |:--:|
> |./data|

In [115]:
df_093.write_csv(
    file="../data/P_df_093_UTF-9_header.csv",
    include_header=True,
    separator=","
)

---
> P-095: 093で作成したカテゴリ名付き商品データを以下の仕様でファイル出力せよ。
>
> |ファイル形式|ヘッダ有無|文字エンコーディング|
> |:--:|:--:|:--:|
> |CSV（カンマ区切り）|有り|CP932|
> 
> ファイル出力先のパスは以下のようにすること。
> 
> |出力先|
> |:--:|
> |./data|

In [116]:
# polarsはencoding:utf-8しか対応していない
# pandasに変換
# import pyarrow as pa
# pa.Table()
# df_093.to_pandas()
# write_csv(file="../data/P_df_093_CP932_header.csv", encoding="CP932", header=False
# )

---
> P-096: 093で作成したカテゴリ名付き商品データを以下の仕様でファイル出力せよ。
>
> |ファイル形式|ヘッダ有無|文字エンコーディング|
> |:--:|:--:|:--:|
> |CSV（カンマ区切り）|無し|UTF-8|
> 
> ファイル出力先のパスは以下のようにすること。
> 
> |出力先|
> |:--:|
> |./data|

In [117]:
df_093.write_csv(file="../data/P_df_093_UTF-9_noh.csv", include_header=False)

---
> P-097: 094で作成した以下形式のファイルを読み込み、データを3件を表示させて正しく取り込まれていることを確認せよ。
> 
> |ファイル形式|ヘッダ有無|文字エンコーディング|
> |:--:|:--:|:--:|
> |CSV（カンマ区切り）|有り|UTF-8|

In [118]:
df_097 = pl.read_csv("../data/P_df_093_UTF-9_header.csv")
df_097.head(3)

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost,category_major_name,category_medium_name,category_small_name
str,i64,i64,i64,i64,i64,str,str,str
"""P040101001""",4,401,40101,198,149,"""惣菜""","""御飯類""","""弁当類"""
"""P040101002""",4,401,40101,218,164,"""惣菜""","""御飯類""","""弁当類"""
"""P040101003""",4,401,40101,230,173,"""惣菜""","""御飯類""","""弁当類"""


---
> P-098: 096で作成した以下形式のファイルを読み込み、データを3件を表示させて正しく取り込まれていることを確認せよ。
> 
> |ファイル形式|ヘッダ有無|文字エンコーディング|
> |:--:|:--:|:--:|
> |CSV（カンマ区切り）|ヘッダ無し|UTF-8|

In [119]:
df_098 = pl.read_csv(
    source="../data/P_df_093_UTF-9_noh.csv",
    has_header=False,
    # new_columnsなしだと、column1, column2,...のようなカラム名になる
    new_columns=[
        "product_cd",
        "category_major_cd",
        "category_medium_cd",
        "category_small_cd",
        "unit_price",
        "unit_cost",
        "category_major_name",
        "category_medium_name",
        "category_small_name",
    ],
)

df_098.head(3)

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost,category_major_name,category_medium_name,category_small_name
str,i64,i64,i64,i64,i64,str,str,str
"""P040101001""",4,401,40101,198,149,"""惣菜""","""御飯類""","""弁当類"""
"""P040101002""",4,401,40101,218,164,"""惣菜""","""御飯類""","""弁当類"""
"""P040101003""",4,401,40101,230,173,"""惣菜""","""御飯類""","""弁当類"""


---
> P-099: 093で作成したカテゴリ名付き商品データを以下の仕様でファイル出力せよ。
>
> |ファイル形式|ヘッダ有無|文字エンコーディング|
> |:--:|:--:|:--:|
> |TSV（タブ区切り）|有り|UTF-8|
> 
> ファイル出力先のパスは以下のようにすること
> 
> |出力先|
> |:--:|
> |./data|

In [120]:
# separatorでタブで分割
df_093.write_csv("../data/P_df_093_UTF-9_header.tsv", include_header=True, separator="\t")

---
> P-100: 099で作成した以下形式のファイルを読み込み、データを3件を表示させて正しく取り込まれていることを確認せよ。
> 
> |ファイル形式|ヘッダ有無|文字エンコーディング|
> |:--:|:--:|:--:|
> |TSV（タブ区切り）|有り|UTF-8|

In [121]:
df_100 = pl.read_csv(
    source="../data/P_df_093_UTF-9_header.tsv", has_header=True, separator="\t"
)

df_100.head(3)

product_cd,category_major_cd,category_medium_cd,category_small_cd,unit_price,unit_cost,category_major_name,category_medium_name,category_small_name
str,i64,i64,i64,i64,i64,str,str,str
"""P040101001""",4,401,40101,198,149,"""惣菜""","""御飯類""","""弁当類"""
"""P040101002""",4,401,40101,218,164,"""惣菜""","""御飯類""","""弁当類"""
"""P040101003""",4,401,40101,230,173,"""惣菜""","""御飯類""","""弁当類"""


問題はここで終了です。お疲れ様でした。