In [45]:
import pandas as pd
import numpy as np

In [46]:
# Notebook display settings: show at most 50 rows, and never truncate columns.
for _option_name, _option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', None),
):
    pd.set_option(_option_name, _option_value)

### <font color="orange">1_mergeの実行</font>

In [47]:
# 自作関数の呼び出し
import sys
sys.path.append("../../../../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [48]:
# Run the upstream merge notebook through the project helper before loading data.
# NOTE(review): presumably this regenerates the merge_data.csv that is read in the
# next section — confirm against data_merge.ipynb.
notebook_path = "../1_merge/data_merge.ipynb"
execute_notebook(notebook_path)

### <font color="orange">データの読み込み</font>

In [49]:
# Load the merged dataset.
# NOTE(review): the dtype entry targets `review_comment_title`, which does not appear
# in the column listing shown later in this notebook — possibly a leftover; confirm.
data = pd.read_csv("../../tmp_data_folder/2_merge_data/merge_data.csv", dtype={'review_comment_title': 'str'})

### <font color="orange">特徴量生成</font>

In [50]:
# Derive day counts from timestamp columns and use them as features.

def calc_number_of_days(date_1, date_2):
    """Return the whole-day difference ``date_2 - date_1`` for two timestamp Series.

    Both arguments are parsed with ``pd.to_datetime`` and subtracted
    element-wise; the ``.dt.days`` component of the resulting timedeltas
    is returned.

    Parameters
    ----------
    date_1 : Series of timestamps (or timestamp strings) marking the start.
    date_2 : Series of timestamps (or timestamp strings) marking the end.

    Returns
    -------
    Series holding the day component of each difference. A negative
    result means ``date_2`` is earlier than ``date_1``.
    """
    start, end = (pd.to_datetime(stamps) for stamps in (date_1, date_2))
    elapsed = end - start
    return elapsed.dt.days

# Days from payment approval to delivery to the customer.
data["days_approved_deliverd"] = calc_number_of_days(data["order_approved_at"], data["order_delivered_customer_date"])

# Days between the estimated delivery date announced at purchase and the actual
# delivery date (delivered minus estimated, so negative means earlier than estimated).
data["days_estimated_deliverd"] = calc_number_of_days(data["order_estimated_delivery_date"], data["order_delivered_customer_date"])

In [51]:
# Preview the first rows, including the two day-count columns added above.
data.head()

Unnamed: 0,order_id,review_score,order_status,order_approved_at,order_delivered_customer_date,order_estimated_delivery_date,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,e481f51cbdc54678b7cc49136f2d6af7,4.0,delivered,2017-10-02 11:07:15,2017-10-10 21:25:13,2017-10-18 00:00:00,3.0,1.0,0.0,1.0,0.0,1.0,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40.0,268.0,4.0,500.0,1976.0,8.0,-8.0
1,53cdb2fc8bc7dce0b6741e2150273451,4.0,delivered,2018-07-26 03:24:27,2018-08-07 15:27:45,2018-08-13 00:00:00,1.0,0.0,1.0,0.0,0.0,0.0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29.0,178.0,1.0,400.0,4693.0,12.0,-6.0
2,47770eb9100c2d0c44946d9cf07ec65d,5.0,delivered,2018-08-08 08:55:23,2018-08-17 18:06:29,2018-09-04 00:00:00,1.0,1.0,0.0,0.0,0.0,3.0,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46.0,232.0,1.0,420.0,9576.0,9.0,-18.0
3,949d5b44dbf5de918fe9c16f97b45f8a,5.0,delivered,2017-11-18 19:45:59,2017-12-02 00:28:42,2017-12-15 00:00:00,1.0,1.0,0.0,0.0,0.0,1.0,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59.0,468.0,3.0,450.0,6000.0,13.0,-13.0
4,ad21c59c0840e6cb83a9ceb5573f8159,5.0,delivered,2018-02-13 22:20:29,2018-02-16 18:17:02,2018-02-26 00:00:00,1.0,1.0,0.0,0.0,0.0,1.0,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38.0,316.0,4.0,250.0,11475.0,2.0,-10.0


### <font color="orange">使用する説明変数の選択</font>

In [52]:
# List every column name so the explanatory variables to keep can be chosen.
data.columns.to_list()

['order_id',
 'review_score',
 'order_status',
 'order_approved_at',
 'order_delivered_customer_date',
 'order_estimated_delivery_date',
 'count_payment_sequential',
 'payment_type_credit_card',
 'payment_type_boleto',
 'payment_type_voucher',
 'payment_type_debit_card',
 'mean_credit_card_payment_installments',
 'payment_value',
 'seller_id',
 'product_category_name_english',
 'product_name_lenght',
 'product_description_lenght',
 'product_photos_qty',
 'product_weight_g',
 'product_volume',
 'days_approved_deliverd',
 'days_estimated_deliverd']

In [53]:
# Remove columns that are not needed: the order identifier and the raw timestamp
# columns whose information is already captured by the day-count features.
del_list = [
    "order_id",
    "order_approved_at",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# One in-place drop removes all listed columns from `data`.
data.drop(columns=del_list, inplace=True)

In [54]:
# Number of columns remaining in `data`.
data.shape[1]

18

In [55]:
# Preview the first rows of the frame with the remaining columns.
data.head()

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3.0,1.0,0.0,1.0,0.0,1.0,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40.0,268.0,4.0,500.0,1976.0,8.0,-8.0
1,4.0,delivered,1.0,0.0,1.0,0.0,0.0,0.0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29.0,178.0,1.0,400.0,4693.0,12.0,-6.0
2,5.0,delivered,1.0,1.0,0.0,0.0,0.0,3.0,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46.0,232.0,1.0,420.0,9576.0,9.0,-18.0
3,5.0,delivered,1.0,1.0,0.0,0.0,0.0,1.0,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59.0,468.0,3.0,450.0,6000.0,13.0,-13.0
4,5.0,delivered,1.0,1.0,0.0,0.0,0.0,1.0,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38.0,316.0,4.0,250.0,11475.0,2.0,-10.0


### <font color="orange">欠損値補完</font>

In [56]:
# Split by label availability: rows with a review_score become the training set,
# rows without one become the test set (whose review_score column is removed).
_has_score = data["review_score"].notna()

# Take explicit, independent copies: a bare boolean-mask selection can be flagged
# as a slice of `data`, and later column writes on it then raise
# SettingWithCopyWarning with ambiguous view/copy semantics.
train_data = data.loc[_has_score].copy()
test_data = data.loc[~_has_score].drop(columns="review_score")

# The combined frame is not used past this point.
del data

In [57]:
# Count missing values per column in the training split.
train_data.isna().sum()

review_score                                0
order_status                                0
count_payment_sequential                    4
payment_type_credit_card                    4
payment_type_boleto                         4
payment_type_voucher                        4
payment_type_debit_card                     4
mean_credit_card_payment_installments       4
payment_value                               4
seller_id                                 756
product_category_name_english            2223
product_name_lenght                      2201
product_description_lenght               2201
product_photos_qty                       2201
product_weight_g                          772
product_volume                            772
days_approved_deliverd                   2896
days_estimated_deliverd                  2882
dtype: int64

In [58]:
# Count missing values per column in the test split.
test_data.isna().sum()

order_status                               0
count_payment_sequential                   0
payment_type_credit_card                   0
payment_type_boleto                        0
payment_type_voucher                       0
payment_type_debit_card                    0
mean_credit_card_payment_installments      0
payment_value                              0
seller_id                                 19
product_category_name_english             31
product_name_lenght                       31
product_description_lenght                31
product_photos_qty                        31
product_weight_g                          19
product_volume                            19
days_approved_deliverd                   123
days_estimated_deliverd                  123
dtype: int64

In [59]:
# Imputation rules for the payment columns (as chosen by the notebook author):
#   - count_payment_sequential: filled with 1, described as the mode.
#   - payment_type_*: the credit-card flag is set to 1 and the other flags to 0,
#     credit card being described as the most frequent payment type.
#   - mean_credit_card_payment_installments: filled with a mean value.
#   - payment_value: filled with a mean value.

def missing_value_imputation_payment(data):
    """Return a copy of ``data`` with the payment-related columns imputed.

    Each column is reassigned from ``Series.fillna`` on a copy of the frame.
    The chained form ``data[col].fillna(..., inplace=True)`` is avoided because
    it works on an intermediate Series: pandas 2.x warns about it and, with
    Copy-on-Write, it leaves the frame unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``count_payment_sequential``, the four ``payment_type_*``
        flag columns, ``mean_credit_card_payment_installments`` and
        ``payment_value``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified. All statistics are
        computed from the frame that was passed in.
    """
    filled = data.copy()

    constant_fills = {
        "count_payment_sequential": 1,
        "payment_type_credit_card": 1,
        "payment_type_boleto": 0,
        "payment_type_voucher": 0,
        "payment_type_debit_card": 0,
    }
    for column, value in constant_fills.items():
        filled[column] = filled[column].fillna(value)

    # Mean installments over rows flagged as credit-card payments, taken after
    # the flag itself has been filled (same order as the original logic).
    is_credit_card = filled["payment_type_credit_card"] == 1
    installments_mean = filled.loc[
        is_credit_card, "mean_credit_card_payment_installments"
    ].mean()
    filled["mean_credit_card_payment_installments"] = (
        filled["mean_credit_card_payment_installments"].fillna(installments_mean)
    )

    # Column mean rounded to 2 decimals.
    payment_mean = np.round(filled["payment_value"].mean(), 2)
    filled["payment_value"] = filled["payment_value"].fillna(payment_mean)

    return filled

# Impute the payment columns of the training and test splits, each on its own.
train_data, test_data = (
    missing_value_imputation_payment(split) for split in (train_data, test_data)
)

In [60]:
# About seller_id:
# seller_id is considered an important variable because it lets the model learn
# the quality of each seller.
# There are 756 missing seller_id values; filling them with e.g. the mode would
# add noise to an existing seller_id.
# A missing id may well recur when the same seller sells again (for example, the
# seller never registered an id), so missing values get the placeholder string "Unkown".

def missing_value_imputation_seller_id(data):
    """Return a copy of ``data`` with missing categorical ids replaced by a placeholder.

    ``seller_id`` and ``product_category_name_english`` (filled for the same
    reason as ``seller_id``) have their missing values replaced with the
    literal ``"Unkown"``. The spelling is kept exactly as the notebook has
    always written it, so the saved category value does not change.

    Columns are reassigned from ``Series.fillna`` on a copy rather than using
    the chained ``data[col].fillna(..., inplace=True)`` form, which pandas 2.x
    warns about and which leaves the frame unchanged under Copy-on-Write.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seller_id`` and ``product_category_name_english``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    filled = data.copy()
    for column in ("seller_id", "product_category_name_english"):
        filled[column] = filled[column].fillna("Unkown")
    return filled

# Fill the missing seller / category identifiers in both splits.
train_data, test_data = (
    missing_value_imputation_seller_id(split) for split in (train_data, test_data)
)

In [61]:
# About product_name_lenght, product_description_lenght and product_photos_qty:
# the author observed that the minimum of each column is at least 1, so a missing
# value may originally have been 0; missing values are therefore filled with 0.

def missing_value_imputation_product_introduction(data):
    """Return a copy of ``data`` with the product-listing columns filled with 0.

    Columns are reassigned from ``Series.fillna`` on a copy rather than using
    the chained ``data[col].fillna(..., inplace=True)`` form, which pandas 2.x
    warns about and which leaves the frame unchanged under Copy-on-Write.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``product_name_lenght``, ``product_description_lenght``
        and ``product_photos_qty``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    filled = data.copy()
    for column in (
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
    ):
        filled[column] = filled[column].fillna(0)
    return filled

# Fill the product-listing columns in both splits.
train_data, test_data = (
    missing_value_imputation_product_introduction(split)
    for split in (train_data, test_data)
)

In [62]:
# product_weight_g and product_volume: missing values are filled with the column mean.
def missing_value_imputation_product_size(data):
    """Return a copy of ``data`` with the product size columns mean-imputed.

    Each column's mean is computed from the frame that was passed in.

    Columns are reassigned from ``Series.fillna`` on a copy rather than using
    the chained ``data[col].fillna(..., inplace=True)`` form, which pandas 2.x
    warns about and which leaves the frame unchanged under Copy-on-Write.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``product_weight_g`` and ``product_volume``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    filled = data.copy()
    for column in ("product_weight_g", "product_volume"):
        filled[column] = filled[column].fillna(filled[column].mean())
    return filled

# Mean-impute the product size columns in both splits.
train_data, test_data = (
    missing_value_imputation_product_size(split) for split in (train_data, test_data)
)

In [63]:
# days_approved_deliverd and days_estimated_deliverd:
# missing values are filled with the column mean.
def missing_value_imputation_days(data):
    """Return a copy of ``data`` with the two day-count columns mean-imputed.

    Each column's mean is computed from the frame that was passed in.

    Columns are reassigned from ``Series.fillna`` on a copy rather than using
    the chained ``data[col].fillna(..., inplace=True)`` form, which pandas 2.x
    warns about and which leaves the frame unchanged under Copy-on-Write.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``days_approved_deliverd`` and ``days_estimated_deliverd``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    filled = data.copy()
    for column in ("days_approved_deliverd", "days_estimated_deliverd"):
        filled[column] = filled[column].fillna(filled[column].mean())
    return filled

# Mean-impute the day-count columns in both splits.
train_data, test_data = (
    missing_value_imputation_days(split) for split in (train_data, test_data)
)
    

In [64]:
# Re-count missing values in the training split after imputation; every column should report 0.
train_data.isna().sum()

review_score                             0
order_status                             0
count_payment_sequential                 0
payment_type_credit_card                 0
payment_type_boleto                      0
payment_type_voucher                     0
payment_type_debit_card                  0
mean_credit_card_payment_installments    0
payment_value                            0
seller_id                                0
product_category_name_english            0
product_name_lenght                      0
product_description_lenght               0
product_photos_qty                       0
product_weight_g                         0
product_volume                           0
days_approved_deliverd                   0
days_estimated_deliverd                  0
dtype: int64

In [65]:
# Re-count missing values in the test split after imputation; every column should report 0.
test_data.isna().sum()

order_status                             0
count_payment_sequential                 0
payment_type_credit_card                 0
payment_type_boleto                      0
payment_type_voucher                     0
payment_type_debit_card                  0
mean_credit_card_payment_installments    0
payment_value                            0
seller_id                                0
product_category_name_english            0
product_name_lenght                      0
product_description_lenght               0
product_photos_qty                       0
product_weight_g                         0
product_volume                           0
days_approved_deliverd                   0
days_estimated_deliverd                  0
dtype: int64

In [None]:
# Convert every column that can be held as an integer to int.
def change_type(data):
    """Cast the numeric feature columns of ``data`` to ``int`` in place.

    Two groups of columns are handled:

    * columns cast directly with ``astype(int)``;
    * columns rounded with ``round(0)`` first and then cast.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain all thirteen columns listed below, with no missing
        values in them (``astype(int)`` cannot represent NaN).

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with the columns replaced.
    """
    cast_directly = (
        "count_payment_sequential",
        "payment_type_credit_card",
        "payment_type_boleto",
        "payment_type_voucher",
        "payment_type_debit_card",
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
    )
    round_then_cast = (
        "mean_credit_card_payment_installments",
        "product_weight_g",
        "product_volume",
        "days_approved_deliverd",
        "days_estimated_deliverd",
    )

    for name in cast_directly:
        data[name] = data[name].astype(int)

    for name in round_then_cast:
        data[name] = data[name].round(0).astype(int)

    return data

# Apply the integer casts to both splits.
train_data, test_data = (change_type(split) for split in (train_data, test_data))

NameError: name 'data' is not defined

### <font color="orange">データを保存</font>

In [None]:
import os
save_path = "../../tmp_data_folder/3_main_data/"

# Create the output folder if it is missing. `exist_ok=True` removes the gap
# between checking for the folder and creating it; the check is kept only to
# choose which message to print.
_folder_existed = os.path.isdir(save_path)
os.makedirs(save_path, exist_ok=True)
if _folder_existed:
    print(f'フォルダ {save_path} は既に存在します。')
else:
    print(f'フォルダ {save_path} を作成しました。')

# Build both output paths from `save_path` so the folder is defined in one place.
train_data.to_csv(os.path.join(save_path, "train_data.csv"), index=False)
test_data.to_csv(os.path.join(save_path, "test_data.csv"), index=False)

# Report the files actually written (the old message named a non-existent main_data.csv).
print("train_data.csv と test_data.csv を保存しました。")