In [None]:
import os

from google.colab import drive

# Attach Google Drive to the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the project's src folder on the mounted drive.
os.chdir('/content/drive/MyDrive/データ分析プロジェクト/src')

Mounted at /content/drive


## **1. ライブラリのインポート**

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import pickle

## **2. 初期定義**

In [None]:
# Path of the data folder as seen from this notebook file.
data_dir = '../data'
output_dir = data_dir + '/output'

# Preprocessed order data that this notebook reads.
input_file = output_dir + '/order_processed.csv'
# Clustering result table written at the end of the notebook.
output_file = output_dir + '/kmeans_result.csv'
# Pickled model file written at the end of the notebook.
model_file = output_dir + '/kmeans_model.pkl'

## **3. データの読み込み**

In [None]:
# Read the preprocessed order data and preview its first rows.
df = pd.read_csv(filepath_or_buffer=input_file)
df_preview = df.head()
display(df_preview)

Unnamed: 0,オーダーID,オーダー日,顧客ID,商品ID,数量,売上,出荷日,キャンセル日,ステータス,氏名,ひらがな,生年月日,性別,都道府県,商品名,カテゴリ,サブカテゴリ,色,サイズ,定価
0,201901010005-1,2019-01-01,C00002159,TB-BUNN,1,9800,2019-01-01,,配達済み,竹田 朋実,たけだ ともみ,1995-07-01,女,東京都,トートバッグ B,バッグ,Unisex,,,9800
1,201901010001-1,2019-01-01,C00000112,TO-AUNN,1,2280,2019-01-04,,配達済み,小川 幸枝,おがわ ゆきえ,1991-06-13,女,神奈川県,タオル A,小物,Unisex,,,2280
2,201901010005-2,2019-01-01,C00002159,PA-BMBS,1,20000,2019-01-02,,配達済み,竹田 朋実,たけだ ともみ,1995-07-01,女,東京都,Men Black パンツ B S,ボトムス,Men,Black,S,20000
3,201901010004-1,2019-01-01,C00001710,BP-BUNN,1,9800,2019-01-01,,配達済み,植田 陽一,うえだ よういち,1995-06-23,男,東京都,バックパック B,バッグ,Unisex,,,9800
4,201901020019-2,2019-01-02,C00001925,PA-AMBL,1,16500,2019-01-07,,配達済み,原田 由香,はらだ ゆか,1975-10-12,女,大阪府,Men Black パンツ A L,ボトムス,Men,Black,L,16500


## **4. メイン処理 ～AIモデル構築～**

#### **① 学習用にデータを加工しよう**

##### **データを機械学習の粒度に揃える**

In [None]:
# Keep only the customer ID, sales amount and product category columns.
use_cols = ["顧客ID", "売上", "カテゴリ"]
tg_df = df.loc[:, use_cols]

display(tg_df)

Unnamed: 0,顧客ID,売上,カテゴリ
0,C00002159,9800,バッグ
1,C00000112,2280,小物
2,C00002159,20000,ボトムス
3,C00001710,9800,バッグ
4,C00001925,16500,ボトムス
...,...,...,...
13665,C00000279,4800,小物
13666,C00001949,7900,トップス
13667,C00001779,16800,アウター
13668,C00001192,1200,小物


In [None]:
# Distinct category values, in order of first appearance.
category = pd.unique(tg_df['カテゴリ'])

print(category)

['バッグ' '小物' 'ボトムス' 'トップス' 'アウター']


In [None]:
# For each category, take that category's rows, rename the sales column to
# "売上_<category>" and add a "購買回数_<category>" column fixed at 1.
# Stacking the pieces leaves NaN in the columns that belong to other categories.
category_frames = [
    tg_df.loc[tg_df['カテゴリ'] == tg_category]
    .rename(columns={'売上': f'売上_{tg_category}'})
    .assign(**{f'購買回数_{tg_category}': 1})
    for tg_category in category
]
tg_df_yoko = pd.concat(category_frames, ignore_index=False)

display(tg_df_yoko)

Unnamed: 0,顧客ID,売上_バッグ,カテゴリ,購買回数_バッグ,売上_小物,購買回数_小物,売上_ボトムス,購買回数_ボトムス,売上_トップス,購買回数_トップス,売上_アウター,購買回数_アウター
0,C00002159,9800.0,バッグ,1.0,,,,,,,,
3,C00001710,9800.0,バッグ,1.0,,,,,,,,
5,C00001884,10200.0,バッグ,1.0,,,,,,,,
30,C00001482,7800.0,バッグ,1.0,,,,,,,,
37,C00001989,12900.0,バッグ,1.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
13585,C00000980,,アウター,,,,,,,,35200.0,1.0
13589,C00001835,,アウター,,,,,,,,16800.0,1.0
13600,C00001258,,アウター,,,,,,,,38900.0,1.0
13657,C00000946,,アウター,,,,,,,,18900.0,1.0


In [None]:
# Collapse to one row per customer by summing the per-category sales and
# purchase-count columns (all-NaN groups sum to 0).
# numeric_only=True keeps the string column "カテゴリ" out of the aggregation:
# recent pandas versions no longer drop non-numeric columns silently and would
# concatenate the category strings instead, leaving a non-numeric feature column.
X_df = tg_df_yoko.groupby("顧客ID").sum(numeric_only=True)

display(X_df)

Unnamed: 0_level_0,売上_バッグ,購買回数_バッグ,売上_小物,購買回数_小物,売上_ボトムス,購買回数_ボトムス,売上_トップス,購買回数_トップス,売上_アウター,購買回数_アウター
顧客ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C00000001,0.0,0.0,1200.0,1.0,20000.0,1.0,0.0,0.0,0.0,0.0
C00000002,0.0,0.0,3480.0,2.0,52200.0,3.0,67600.0,4.0,0.0,0.0
C00000003,0.0,0.0,10560.0,1.0,0.0,0.0,0.0,0.0,39600.0,1.0
C00000004,6800.0,1.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
C00000005,18600.0,2.0,0.0,0.0,45100.0,2.0,0.0,0.0,19800.0,1.0
...,...,...,...,...,...,...,...,...,...,...
C00002227,0.0,0.0,1200.0,1.0,40000.0,1.0,46100.0,2.0,20800.0,1.0
C00002228,16600.0,2.0,2180.0,1.0,36200.0,2.0,61000.0,4.0,0.0,0.0
C00002229,23400.0,1.0,0.0,0.0,59400.0,1.0,33600.0,3.0,52000.0,2.0
C00002230,9800.0,1.0,3380.0,2.0,95100.0,5.0,35900.0,3.0,95300.0,3.0


In [None]:
# Count missing values per feature column.
null_counts = X_df.isna().sum()
print(null_counts)

売上_バッグ       0
購買回数_バッグ     0
売上_小物        0
購買回数_小物      0
売上_ボトムス      0
購買回数_ボトムス    0
売上_トップス      0
購買回数_トップス    0
売上_アウター      0
購買回数_アウター    0
dtype: int64


##### **変数を整える**

In [None]:
# A later cell writes the cluster label into X_df as a "group" column.
# Dropping it here (if present) keeps this cell idempotent: re-running it after
# the labelling step must not feed the previous labels back in as a feature.
feature_df = X_df.drop(columns="group", errors="ignore")

# Rescale every feature column to the [0, 1] range.
scaler = MinMaxScaler()
X_df_scaled = scaler.fit_transform(feature_df)

print(X_df_scaled)

[[0.         0.         0.03262643 ... 0.         0.         0.        ]
 [0.         0.         0.09461664 ... 0.33333333 0.         0.        ]
 [0.         0.         0.28711256 ... 0.         0.16058394 0.2       ]
 ...
 [0.26086957 0.14285714 0.         ... 0.25       0.2108678  0.4       ]
 [0.10925307 0.14285714 0.09189777 ... 0.25       0.3864558  0.6       ]
 [0.         0.         0.27569331 ... 0.16666667 0.         0.        ]]


#### **② モデルの学習をしよう**

In [None]:
# Fit a 4-cluster KMeans on the scaled features with a fixed seed.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default makes the clustering depend
# on the installed version (and emits a FutureWarning on 1.2/1.3).
model = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X_df_scaled)

In [None]:
# Cluster index assigned to each row of the scaled feature matrix.
pred = model.predict(X=X_df_scaled)
print(pred)

[3 3 0 ... 0 2 2]


#### **③ モデルの評価をしよう**

In [None]:
# Append the predicted cluster index as a "group" column and preview the result.
X_df = X_df.assign(group=pred)
display(X_df.head())

Unnamed: 0_level_0,売上_バッグ,購買回数_バッグ,売上_小物,購買回数_小物,売上_ボトムス,購買回数_ボトムス,売上_トップス,購買回数_トップス,売上_アウター,購買回数_アウター,group
顧客ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C00000001,0.0,0.0,1200.0,1.0,20000.0,1.0,0.0,0.0,0.0,0.0,3
C00000002,0.0,0.0,3480.0,2.0,52200.0,3.0,67600.0,4.0,0.0,0.0,3
C00000003,0.0,0.0,10560.0,1.0,0.0,0.0,0.0,0.0,39600.0,1.0,0
C00000004,6800.0,1.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3
C00000005,18600.0,2.0,0.0,0.0,45100.0,2.0,0.0,0.0,19800.0,1.0,0


In [None]:
# Number of customers in each cluster, largest first.
group_sizes = X_df["group"].value_counts()
print(group_sizes)

3    958
0    626
1    333
2    314
Name: group, dtype: int64


#### **④ モデルの解釈をしよう**

In [None]:
# Mean of every feature within each cluster, used to read the clusters.
group_df = X_df.groupby("group").agg("mean")

display(group_df)

Unnamed: 0_level_0,売上_バッグ,購買回数_バッグ,売上_小物,購買回数_小物,売上_ボトムス,購買回数_ボトムス,売上_トップス,購買回数_トップス,売上_アウター,購買回数_アウター
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5780.766773,0.560703,2410.638978,0.738019,27781.15016,1.276358,22367.571885,1.407348,43161.980831,1.303514
1,31751.351351,2.516517,3967.867868,1.063063,39012.312312,1.927928,32416.816817,2.129129,13320.720721,0.444444
2,14670.191083,1.312102,8651.719745,2.070064,95954.458599,4.11465,65446.496815,3.910828,34287.898089,1.127389
3,4646.273486,0.468685,2237.620042,0.676409,29557.202505,1.352818,22546.033403,1.399791,0.0,0.0


In [None]:
# Rank the clusters within each feature column; rank 1 is the highest mean.
group_rank_df = group_df.rank(axis=0, ascending=False)
display(group_rank_df)

Unnamed: 0_level_0,売上_バッグ,購買回数_バッグ,売上_小物,購買回数_小物,売上_ボトムス,購買回数_ボトムス,売上_トップス,購買回数_トップス,売上_アウター,購買回数_アウター
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,1.0,1.0
1,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0
2,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0
3,4.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,4.0,4.0


## **5. データの出力**

In [None]:
# Write the per-customer feature table (index included) to the result CSV.
X_df.to_csv(path_or_buf=output_file)

In [None]:
# Serialize the fitted model. The context manager guarantees the file handle
# is flushed and closed even if pickling raises; the bare open() call leaked it.
# NOTE(review): the fitted `scaler` is not persisted; applying the saved model
# to new data would presumably need the same scaling - confirm whether it
# should be saved alongside.
with open(model_file, 'wb') as model_fp:
    pickle.dump(model, model_fp)