In [None]:
import os

from google.colab import drive

# Mount Google Drive so the project files are reachable from this Colab runtime.
drive.mount('/content/drive')

# Switch the working directory to the project's src folder; the relative
# '../data' path used later is resolved from here.
os.chdir('/content/drive/MyDrive/演習プロジェクト/src')

Mounted at /content/drive


## **1. ライブラリのインポート**

In [22]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import pickle

## **2. 初期定義**

In [23]:
# Folder layout, expressed relative to the directory this notebook runs from.
data_dir = '../data'
output_dir = data_dir + '/output'

# Input: the already-processed transaction table.
input_file = output_dir + '/transaction_processed.csv'

# Outputs: the clustering result table and the fitted model file.
output_file = output_dir + '/kmeans_result.csv'
model_file = output_dir + '/kmeans_model.pkl'

## **3. データの読み込み**

In [24]:
# Load the processed transaction table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=input_file)
display(df.head(n=5))

Unnamed: 0,transaction_id,transaction_date,customer_id,product_id,count,sales,shipping_date,cancellation_date,status,customer_name,birth_date,gender,prefectures,type,product_name,brand,category,color,price
0,201904010000-1,2019-04-01,C-001213,CANB3,3,449400,2019-04-12,,配達済み,押切 彩,1991-01-30,女,東京都,法人,Caps-A3,caps,ノートPC,Black,149800
1,201904010003-1,2019-04-01,C-001572,ID16,1,16300,2019-04-09,,配達済み,牟田 健,1980-10-21,男,愛知県,個人,Display-16,ディスプレイ,周辺機器,,16300
2,201904030001-1,2019-04-03,C-002610,INDB5,3,464400,2019-04-12,,配達済み,鈴木 博志,1983-09-11,男,大阪府,法人,Inter-W5,Inter,デスクトップ,Black,154800
3,201904030001-2,2019-04-03,C-002610,IKW,6,22686,2019-04-14,,配達済み,鈴木 博志,1983-09-11,男,大阪府,法人,ワイヤレスキーボード,キーボード,周辺機器,,3781
4,201904040006-1,2019-04-04,C-001267,ZENB9,6,1679400,2019-04-15,,配達済み,松浦 美菜子,2001-01-28,女,大阪府,個人,Zerace-J9,zerace,ノートPC,Black,279900


## **4. メイン処理 ～AIモデル構築～**

#### **① 学習用にデータを加工しよう**

##### **データを機械学習の粒度に揃える**

In [25]:
# Keep only the columns used for clustering: the customer key, the sales
# amount and the product category.
use_cols = ["customer_id", "sales", "category"]
tg_df = df.loc[:, use_cols]

display(tg_df)

Unnamed: 0,customer_id,sales,category
0,C-001213,449400,ノートPC
1,C-001572,16300,周辺機器
2,C-002610,464400,デスクトップ
3,C-002610,22686,周辺機器
4,C-001267,1679400,ノートPC
...,...,...,...
8378,C-002067,169800,ノートPC
8379,C-002067,0,ノートPC
8380,C-002417,1216600,ノートPC
8381,C-002821,2428200,ノートPC


In [26]:
# Distinct category values; these drive the per-category feature columns below.
category = tg_df.loc[:, 'category'].unique()

print(category)

['ノートPC' '周辺機器' 'デスクトップ' 'タブレット']


In [27]:
# Build one frame per category whose sales column and purchase-count column
# carry the category name, then stack the frames. Columns that belong to the
# other categories are filled with NaN by the concat.
category_frames = []
for tg_category in category:
    is_target = tg_df['category'] == tg_category
    tmp_df = (
        tg_df.loc[is_target]
        .rename(columns={'sales': f'sales_{tg_category}'})
        # Constant 1 per row, so summing it later yields a purchase count.
        .assign(**{f'購買回数_{tg_category}': 1})
    )
    category_frames.append(tmp_df)

# Original row labels are kept (concat does not renumber by default).
tg_df_yoko = pd.concat(category_frames)

display(tg_df_yoko)

Unnamed: 0,customer_id,sales_ノートPC,category,購買回数_ノートPC,sales_周辺機器,購買回数_周辺機器,sales_デスクトップ,購買回数_デスクトップ,sales_タブレット,購買回数_タブレット
0,C-001213,449400.0,ノートPC,1.0,,,,,,
4,C-001267,1679400.0,ノートPC,1.0,,,,,,
6,C-002604,299600.0,ノートPC,1.0,,,,,,
7,C-003126,259600.0,ノートPC,1.0,,,,,,
8,C-003247,296220.0,ノートPC,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...
8294,C-001851,,タブレット,,,,,,14970.0,1.0
8305,C-000314,,タブレット,,,,,,248684.0,1.0
8322,C-002679,,タブレット,,,,,,124342.0,1.0
8344,C-001253,,タブレット,,,,,,29940.0,1.0


In [28]:
# Collapse to one row per customer by summing every sales / purchase-count
# column. numeric_only=True keeps the text column `category` out of the sum:
# on pandas 2.x the default would concatenate its strings and the resulting
# frame could not be passed to the scaler. NaN cells are skipped by the sum,
# so categories a customer never bought come out as 0.
X_df = tg_df_yoko.groupby("customer_id").sum(numeric_only=True)

display(X_df)

Unnamed: 0_level_0,sales_ノートPC,購買回数_ノートPC,sales_周辺機器,購買回数_周辺機器,sales_デスクトップ,購買回数_デスクトップ,sales_タブレット,購買回数_タブレット
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C-000001,67800.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
C-000002,1080400.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
C-000003,363600.0,1.0,3690.0,1.0,0.0,0.0,0.0,0.0
C-000004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
C-000005,154800.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
C-003796,1159600.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
C-003797,682600.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
C-003798,559800.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
C-003799,0.0,0.0,0.0,0.0,2047800.0,3.0,0.0,0.0


In [29]:
# Count missing values per feature column (all zeros means the matrix is complete).
print(X_df.isna().sum())

sales_ノートPC     0
購買回数_ノートPC      0
sales_周辺機器      0
購買回数_周辺機器       0
sales_デスクトップ    0
購買回数_デスクトップ     0
sales_タブレット     0
購買回数_タブレット      0
dtype: int64


##### **変数を整える**

In [30]:
# Min-max scale every feature column so that sales amounts and purchase counts
# end up on a comparable range before clustering.
scaler = MinMaxScaler()

# A later cell adds the cluster label to X_df as a "group" column. Drop it here
# (if present) so that re-running this cell out of order never scales the label
# as if it were a feature; on a fresh top-to-bottom run this is a no-op.
X_df_scaled = scaler.fit_transform(X_df.drop(columns="group", errors="ignore"))

print(X_df_scaled)

[[0.00382354 0.08333333 0.         ... 0.         0.         0.        ]
 [0.06092857 0.16666667 0.         ... 0.         0.         0.        ]
 [0.02050502 0.08333333 0.01191476 ... 0.         0.         0.        ]
 ...
 [0.03156962 0.08333333 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.33333333 0.         0.        ]
 [0.05929877 0.16666667 0.         ... 0.         0.         0.        ]]


#### **② モデルの学習をしよう**

In [31]:
# Fit k-means with 4 clusters on the scaled features. n_init is pinned to 10
# (the library's former default) because newer scikit-learn releases switched
# the default to 'auto'; leaving it implicit raises a FutureWarning on some
# versions and can change the resulting clusters on others. random_state fixes
# the centroid initialisation so the run is reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X_df_scaled)

In [32]:
# Cluster label for every row of the scaled feature matrix (same row order).
pred = model.predict(X=X_df_scaled)
print(pred)

[0 0 3 ... 0 0 0]


#### **③ モデルの評価をしよう**

In [33]:
# Attach the predicted cluster label to each customer row as the "group" column.
X_df = X_df.assign(group=pred)
display(X_df.head())

Unnamed: 0_level_0,sales_ノートPC,購買回数_ノートPC,sales_周辺機器,購買回数_周辺機器,sales_デスクトップ,購買回数_デスクトップ,sales_タブレット,購買回数_タブレット,group
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C-000001,67800.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
C-000002,1080400.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0
C-000003,363600.0,1.0,3690.0,1.0,0.0,0.0,0.0,0.0,3
C-000004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
C-000005,154800.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [34]:
# Number of customers assigned to each cluster, largest cluster first.
print(X_df.loc[:, "group"].value_counts())

0    2482
3     555
2     489
1     206
Name: group, dtype: int64


#### **④ モデルの解釈をしよう**

In [35]:
# Per-cluster average of every feature, used to characterise each cluster.
group_df = X_df.groupby(by="group").mean()

display(group_df)

Unnamed: 0_level_0,sales_ノートPC,購買回数_ノートPC,sales_周辺機器,購買回数_周辺機器,sales_デスクトップ,購買回数_デスクトップ,sales_タブレット,購買回数_タブレット
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,469011.8,1.064464,0.0,0.0,387520.8,0.602337,0.0,0.0
1,2445074.0,3.427184,15556.81068,0.65534,3066971.0,3.174757,87131.184466,0.247573
2,392603.3,0.91411,1666.936605,0.137014,392504.7,0.603272,142097.97137,1.087935
3,399763.8,0.769369,21474.464865,1.111712,416231.8,0.535135,4926.111712,0.032432


In [36]:
# Rank the clusters within each feature column; rank 1 is the cluster with the
# highest average for that feature.
group_rank_df = group_df.rank(axis=0, ascending=False)
display(group_rank_df)

Unnamed: 0_level_0,sales_ノートPC,購買回数_ノートPC,sales_周辺機器,購買回数_周辺機器,sales_デスクトップ,購買回数_デスクトップ,sales_タブレット,購買回数_タブレット
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2.0,2.0,4.0,4.0,4.0,3.0,4.0,4.0
1,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0
2,4.0,3.0,3.0,3.0,3.0,2.0,1.0,1.0
3,3.0,4.0,1.0,1.0,2.0,4.0,3.0,3.0


## **5. データの出力**

In [37]:
# Write the per-customer features plus cluster label to the result CSV
# (customer_id is the index and is written as the first column).
X_df.to_csv(path_or_buf=output_file)

In [38]:
# Serialise the fitted k-means model. The context manager guarantees the file
# is flushed and closed even if pickling fails; the bare open() used before
# left the handle open.
# NOTE(review): the fitted `scaler` is not saved alongside the model; anything
# that loads this pickle to score new data needs the same scaling — confirm
# whether it should be persisted as well.
with open(model_file, 'wb') as model_fh:
    pickle.dump(model, model_fh)