# Configs
設定を保存していく

In [86]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error as RMSE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import optuna.integration.lightgbm as lgb


class Config:
    """Central place for the paths and hyper-parameters used by this notebook.

    All values are plain instance attributes set in ``__init__``; nothing is
    read from disk or from the environment.
    """

    def __init__(self):
        # Notebook/experiment number; used as a prefix for every output path below.
        self.NB = '001'

        # Input data.
        self.path_train_data = '../datas/train_raw.csv'
        self.path_test_data = '../datas/test_raw.csv'

        # Output locations (all derived from NB).
        self.path_submission = f'../submissions/submission{self.NB}.csv'
        self.docs_pathname = f'../docs/{self.NB}'
        self.img_pathname = f'../images/{self.NB}'
        self.data_pathname = f'../datas/{self.NB}'
        # Note the trailing underscore: code that appends '_train.csv' /
        # '_test.csv' to this prefix ends up with a double underscore.
        self.modified_data_pathname = f'../datas/{self.NB}modified_'

        # NOTE(review): 'num_sold' differs from pred_col ('Price') -- possibly
        # left over from another dataset; confirm before writing a submission.
        self.submit_col = ['id', 'num_sold']
        # Name of the column that is predicted.
        self.pred_col = 'Price'

        # Model / cross-validation parameters.
        self.n_estimators = 100
        self.n_splits = 5
        self.early_stopping_rounds = 150
        self.cv_folds = 5
        # Seed. The misspelled name is kept because existing code reads
        # `config.ramdom_state`; prefer the `random_state` property in new code.
        self.ramdom_state = 1234

        # NOTE(review): these column names (store/product/country, date parts)
        # are not used by the cells visible in this notebook -- verify they
        # are still needed.
        self.settings = {
            'ratio_cols': ['store_sold_ratio', 'product_sold_ratio', 'country_sold_ratio'],
            'pred_cols': ['store', 'product', 'country'],
            'train_col': ['store', 'product', 'country', 'month','day','year','weekday','week','quarter']
        }

    @property
    def random_state(self):
        """Correctly spelled alias for ``ramdom_state`` (same underlying value)."""
        return self.ramdom_state

    @random_state.setter
    def random_state(self, value):
        # Write through to the legacy attribute so both names always agree.
        self.ramdom_state = value

config = Config()

ModuleNotFoundError: 
Could not find `optuna-integration` for `lightgbm`.
Please run `pip install optuna-integration[lightgbm]`.

# Processing

In [83]:
# Feature Engineering
def one_hot_encoding(test_df:pd.DataFrame,train_df:pd.DataFrame, cols:list)->tuple[pd.DataFrame, pd.DataFrame]:
    """One-hot encode ``cols`` in both frames using categories learned from ``train_df``.

    Args:
        test_df: Frame to transform with the encoder fitted on ``train_df``.
        train_df: Frame the encoder is fitted on (and which is also transformed).
        cols: Names of the columns to encode; they are dropped from the result
            and replaced by the encoder's output columns.

    Returns:
        ``(test_df, train_df)`` -- note the order -- as new frames. The inputs
        are not modified.
    """
    oe = OneHotEncoder(sparse_output=False)
    oe.fit(train_df[cols])
    feature_names = oe.get_feature_names_out(cols)

    # Build the encoded blocks on the *same index* as their source frame.
    # With a default RangeIndex here, pd.concat(axis=1) would align on labels
    # and produce NaN-padded rows for any frame whose index is not 0..n-1.
    encoded_train = pd.DataFrame(oe.transform(train_df[cols]), columns=feature_names, index=train_df.index)
    encoded_test = pd.DataFrame(oe.transform(test_df[cols]), columns=feature_names, index=test_df.index)

    train_df = pd.concat([train_df.drop(cols, axis=1), encoded_train], axis=1)
    test_df = pd.concat([test_df.drop(cols, axis=1), encoded_test], axis=1)
    return test_df, train_df

def label_encoding(test_df:pd.DataFrame,train_df:pd.DataFrame, cols:list)->tuple[pd.DataFrame, pd.DataFrame]:
    """Replace each column in ``cols`` by integer labels learned from ``train_df``.

    Args:
        test_df: Frame transformed with the labels fitted on ``train_df``.
        train_df: Frame each column's encoder is fitted on (and which is also
            transformed).
        cols: Names of the columns to encode.

    Returns:
        ``(test_df, train_df)`` -- note the order -- as encoded copies. The
        frames passed in are left untouched, so re-running the calling cell
        starts from the same inputs.
    """
    # Work on copies so the caller's frames are not silently overwritten.
    test_df = test_df.copy()
    train_df = train_df.copy()
    for col in cols:
        # Fresh encoder per column; the fit is always on the training values.
        le = LabelEncoder()
        le.fit(train_df[col])
        test_df[col] = le.transform(test_df[col])
        train_df[col] = le.transform(train_df[col])
    return test_df, train_df

def holdout_target_encoding(train_df:pd.DataFrame, test_df:pd.DataFrame, target_col:str,
                            pred_col:str=None, n_splits:int=None, random_state:int=None)->tuple[pd.DataFrame, pd.DataFrame]:
    """Add an out-of-fold mean-target encoding of ``target_col`` to both frames.

    For the training frame, each row's encoding is the mean of ``pred_col``
    per category computed on the *other* folds, so a row never sees its own
    label. For the test frame, the per-category mean over the whole training
    frame is used. The new column is named ``f'{target_col}_holdout_te'``.

    Args:
        train_df: Training frame; must contain ``target_col`` and ``pred_col``.
        test_df: Test frame; must contain ``target_col``.
        target_col: Name of the categorical column to encode (despite the name,
            this is the feature being encoded, not the label).
        pred_col: Label column to average. Defaults to ``config.pred_col``.
        n_splits: Number of KFold splits. Defaults to ``config.n_splits``.
        random_state: Seed for the shuffled KFold. Defaults to
            ``config.ramdom_state``.

    Returns:
        ``(train_df, test_df)`` as copies with the encoded column added. The
        frames passed in are not modified. Values of ``target_col`` that have
        no match in the fitted means are left as NaN by ``Series.map``.
    """
    # Fall back to the notebook-wide configuration when not given explicitly.
    pred_col = config.pred_col if pred_col is None else pred_col
    n_splits = config.n_splits if n_splits is None else n_splits
    random_state = config.ramdom_state if random_state is None else random_state

    train_df = train_df.copy()
    test_df = test_df.copy()
    encoded_col = f'{target_col}_holdout_te'

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Every position is overwritten below because the folds partition the rows.
    out_of_fold = np.full(len(train_df), np.nan)
    for fit_idx, encode_idx in kf.split(train_df):
        # Means come from the fitting folds only ...
        fold_mean = train_df.iloc[fit_idx].groupby(target_col)[pred_col].mean()
        # ... and are applied to the held-out fold.
        out_of_fold[encode_idx] = train_df[target_col].iloc[encode_idx].map(fold_mean).to_numpy()
    train_df[encoded_col] = out_of_fold

    # The test frame is encoded with means from the full training frame.
    full_mean = train_df.groupby(target_col)[pred_col].mean()
    test_df[encoded_col] = test_df[target_col].map(full_mean)
    return train_df, test_df

# Load the raw train / test tables.
train_datas = pd.read_csv(config.path_train_data)
test_datas = pd.read_csv(config.path_test_data)

# Interaction categories: Brand combined with Material and with Style.
for partner in ('Material', 'Style'):
    combined = f'Brand-{partner}'
    train_datas[combined] = train_datas['Brand'] + '-' + train_datas[partner]
    test_datas[combined] = test_datas['Brand'] + '-' + test_datas[partner]

# Out-of-fold target encoding, applied in this order so the new columns are
# appended in the same order as before.
target_encoded_cols = ['Brand', 'Material', 'Style', 'Color', 'Brand-Material', 'Brand-Style']
for category_col in target_encoded_cols:
    train_datas, test_datas = holdout_target_encoding(train_datas, test_datas, category_col)

# The raw categorical columns are no longer needed once encoded.
train_datas = train_datas.drop(columns=target_encoded_cols)
test_datas = test_datas.drop(columns=target_encoded_cols)

# Remaining categoricals become integer labels (note the (test, train) order).
label_encoded_cols = ['Size', 'Waterproof', 'Laptop Compartment']
test_datas, train_datas = label_encoding(test_datas, train_datas, label_encoded_cols)

# Persist the engineered tables and preview the training one.
test_datas.to_csv(f'{config.modified_data_pathname}_test.csv', index=False)
train_datas.to_csv(f'{config.modified_data_pathname}_train.csv', index=False)
display(train_datas.head())

Unnamed: 0,id,Size,Compartments,Laptop Compartment,Waterproof,Weight Capacity (kg),Price,Brand_holdout_te,Material_holdout_te,Style_holdout_te,Color_holdout_te,Brand-Material_holdout_te,Brand-Style_holdout_te
0,0,1,7.0,1,0,11.611723,112.15875,81.819558,80.455028,81.521997,80.501769,80.855173,81.694854
1,1,2,10.0,1,1,27.078537,68.88056,81.769874,82.140962,81.353154,82.417498,82.341131,81.969576
2,2,2,2.0,1,0,16.64376,39.1732,82.068684,80.340901,81.353154,80.886046,80.756973,81.985421
3,3,2,8.0,1,0,12.93722,80.60793,81.202773,80.95239,81.353154,82.417498,80.601946,81.321276
4,4,1,1.0,1,1,17.749338,86.02312,80.592686,82.097783,81.491078,82.447494,81.334032,80.76798


In [None]:
# Reload the engineered training table from the '<modified_data_pathname>_train.csv' file.
train_datas = pd.read_csv(f'{config.modified_data_pathname}_train.csv')

# Split off 20% of the rows with a fixed seed.
# NOTE(review): this rebinds `test_datas` to the hold-out part of the training
# table, replacing whatever `test_datas` referred to before this cell ran --
# confirm that is intended, or use a distinct name for the hold-out split.
train_datas, test_datas = train_test_split(train_datas, test_size=0.2, random_state=config.ramdom_state)



## EDA

In [75]:
import itertools
import seaborn as sns

# EDA用の関数を格納する

def visualize_statistical_relationships(train_df, cols, stats, **kwargs):
    """Save one heatmap figure per pair of columns, with one panel per statistic.

    For every 2-combination of ``cols`` the label column (``config.pred_col``)
    is pivoted by the pair and aggregated with each entry of ``stats``; the
    resulting tables are drawn side by side and written to
    ``f'{config.img_pathname}_heatmap_{a}-{b}.png'``.

    Args:
        train_df: Frame containing ``cols`` and ``config.pred_col``.
        cols: Column names to pair up.
        stats: Aggregation names/functions accepted by ``pivot_table(aggfunc=...)``.
        **kwargs: Accepted for call compatibility; currently unused.
    """
    if not stats:
        # Nothing to draw; plt.subplots cannot create a grid with zero columns.
        return

    for pair in itertools.combinations(cols, 2):
        # squeeze=False always yields a 2-D array of axes, so a single
        # statistic works the same way as several.
        fig, axes = plt.subplots(1, len(stats), figsize=(10, 5), squeeze=False)
        # Pair panels with statistics by position (robust to duplicate entries).
        for panel, stat in zip(axes[0], stats):
            analysis_df = train_df[[config.pred_col]+list(pair)].pivot_table(index=pair[0], columns=pair[1], values=config.pred_col, aggfunc=stat)
            sns.heatmap(analysis_df, annot=True, fmt=".0f", ax=panel)
            panel.set_title(f'train {stat}')
        fig.tight_layout()
        fig.savefig(f'{config.img_pathname}_heatmap_{pair[0]}-{pair[1]}.png')
        # Close this specific figure so figures do not accumulate in the loop.
        plt.close(fig)

def visualize_statistical_relationships_compare(train_df, cols, stats, **kwargs):
    """Save one figure per statistic showing every column pair on a shared colour scale.

    For each entry of ``stats`` the label column (``config.pred_col``) is
    pivoted by every 2-combination of ``cols``; all heatmaps of that statistic
    share ``vmin``/``vmax`` so they can be compared directly. Figures are
    written to ``f'{config.img_pathname}_heatmap_compare_{stat}.png'``.

    Args:
        train_df: Frame containing ``cols`` and ``config.pred_col``.
        cols: Column names to pair up.
        stats: Aggregation names/functions accepted by ``pivot_table(aggfunc=...)``.
        **kwargs: Accepted for call compatibility; currently unused.
    """
    pairs = list(itertools.combinations(cols, 2))
    if not pairs:
        # Fewer than two columns: there is nothing to compare.
        return

    # Two panels per row; the row count follows the number of pairs
    # (5 rows / 10x15 inches for the 10 pairs of a 5-column input).
    n_grid_cols = 2
    n_grid_rows = (len(pairs) + n_grid_cols - 1) // n_grid_cols

    for stat in stats:
        tables = {
            pair: train_df[[config.pred_col]+list(pair)].pivot_table(index=pair[0], columns=pair[1], values=config.pred_col, aggfunc=stat)
            for pair in pairs
        }
        # Shared colour range across all pairs; nan-aware so a table that is
        # entirely NaN does not poison the range.
        vmin = np.nanmin([table.min().min() for table in tables.values()])
        vmax = np.nanmax([table.max().max() for table in tables.values()])

        fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(10, 3 * n_grid_rows), squeeze=False)
        panels = axes.ravel()  # row-major, i.e. panel i sits at [i // 2, i % 2]
        for panel, table in zip(panels, tables.values()):
            sns.heatmap(table, ax=panel, vmin=vmin, vmax=vmax, annot=True, fmt=".0f")
        # Hide the spare panel left over when the number of pairs is odd.
        for panel in panels[len(pairs):]:
            panel.set_visible(False)

        fig.suptitle(f'{stat}', fontsize=16)
        fig.tight_layout()
        fig.savefig(f'{config.img_pathname}_heatmap_compare_{stat}.png')
        plt.close(fig)

In [62]:
def write_eda_report(df, path):
    """Write a plain-text overview of ``df`` to ``path``.

    The report contains, in order: ``df.info()``, ``df.describe()``,
    ``df.head()``, the null count per column, the number of unique values per
    column, and the unique values of every column.

    Args:
        df: DataFrame to describe.
        path: Destination text file; it is overwritten if it exists.
    """
    rule = '------------------------------------\n'

    def section(f, title, closing_rule=True):
        # Section header: rule, title and (for most sections) a closing rule.
        f.write(rule)
        f.write(title)
        if closing_rule:
            f.write(rule)

    def gap(f):
        # Two line breaks separate consecutive sections.
        f.write('\n')
        f.write('\n')

    with open(path, 'w') as f:
        section(f, 'Datas Info\n')
        df.info(buf=f)
        gap(f)

        section(f, 'Statical Values \n')
        df.describe().to_string(buf=f)
        gap(f)

        section(f, 'Heads \n')
        df.head().to_string(buf=f)
        gap(f)

        section(f, 'How many nulls \n')
        df.isnull().sum().to_string(buf=f)
        gap(f)

        # The last two sections have no closing rule in the report format.
        section(f, 'Unique Values \n', closing_rule=False)
        df.nunique().to_string(buf=f)
        gap(f)

        section(f, 'Columns feature \n', closing_rule=False)
        for col in df.columns:
            f.write('\n')
            f.write(f'{col}\n')
            f.write(f'{df[col].unique()}\n')


# Same report for the raw train and test tables.
train_datas = pd.read_csv(config.path_train_data)
write_eda_report(train_datas, f'{config.docs_pathname}_eda_train.txt')

test_datas = pd.read_csv(config.path_test_data)
write_eda_report(test_datas, f'{config.docs_pathname}_eda_test.txt')

In [19]:
# Preview the first rows of the training table.
display(train_datas.head())

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [22]:
# Keep a copy of the table as it is before rows with missing values are removed.
train_datas1 = train_datas.copy()
# From here on `train_datas` holds only the rows without any missing value.
train_datas = train_datas1.dropna()
# NOTE(review): `grouped` is built from `train_datas1` (the copy taken before
# dropna), not from the filtered `train_datas`, and is not used by the cells
# shown here -- confirm which frame was intended.
grouped = train_datas1.groupby(['Brand', 'Material', 'Laptop Compartment', 'Style', 'Color'])

In [76]:
import itertools
# Categorical columns to pair up, and the statistics drawn for each pair.
cols = ['Brand', 'Material', 'Laptop Compartment', 'Style', 'Color']
stats = ['mean', 'std', 'count']

# First the per-pair figures, then the per-statistic comparison figures.
for make_heatmaps in (visualize_statistical_relationships,
                      visualize_statistical_relationships_compare):
    make_heatmaps(train_datas, cols, stats)