# Configs
設定を保存していく

In [40]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error as RMSE
from sklearn.preprocessing import OneHotEncoder

class Config:
    """Settings container for this notebook.

    Holds the notebook number, the input/output path templates derived from
    it, column names, and numeric settings (presumably consumed by model
    training further down the notebook - TODO confirm).

    ``random_state`` is the correctly spelled seed attribute. The original
    misspelled name ``ramdom_state`` is kept as a read/write alias so that
    existing code using it keeps working.
    """

    def __init__(self):
        # Notebook identifier; interpolated into every output path below.
        self.NB = '001'
        # Raw input CSVs.
        self.path_train_data = '../datas/train_raw.csv'
        self.path_test_data = '../datas/test_raw.csv'
        # Output locations. The *_pathname values are path prefixes: callers
        # append their own suffix (e.g. '_eda_train.txt', '_heatmap_….png').
        self.path_submission = f'../submissions/submission{self.NB}.csv'
        self.docs_pathname = f'../docs/{self.NB}'
        self.img_pathname = f'../images/{self.NB}'
        self.data_pathname = f'../datas/{self.NB}'
        self.modified_data_pathname = f'../modified_datas/{self.NB}'
        # NOTE(review): 'num_sold' (and the store/product/country names in
        # `settings`) do not appear among the training columns displayed in
        # this notebook, whose target is 'Price' - these look carried over
        # from another dataset; confirm before relying on them.
        self.submit_col = ['id', 'num_sold']
        # Target column; used as the aggregated value in the EDA heatmaps.
        self.pred_col = 'Price'
        self.n_estimators = 100
        # NOTE(review): larger than n_estimators; if both feed the same
        # boosting model, early stopping could never trigger - confirm.
        self.early_stopping_rounds = 150
        self.cv_folds = 5
        # Seed value. Also reachable through the legacy `ramdom_state` alias.
        self.random_state = 1234
        self.settings = {
            'ratio_cols': ['store_sold_ratio', 'product_sold_ratio', 'country_sold_ratio'],
            'pred_cols': ['store', 'product', 'country'],
            'train_col': ['store', 'product', 'country', 'month','day','year','weekday','week','quarter']
        }

    @property
    def ramdom_state(self):
        """Backward-compatible alias for the misspelled attribute name."""
        return self.random_state

    @ramdom_state.setter
    def ramdom_state(self, value):
        # Forward writes so both spellings always hold the same value.
        self.random_state = value

# Shared settings instance; the cells below read paths and column names from it.
config = Config()

# Processing

In [18]:
def one_hot_encoding(df:pd.DataFrame, cols:list)->pd.DataFrame:
    """Replace the columns in ``cols`` with their one-hot encoded indicators.

    Args:
        df: Input frame; it is not modified.
        cols: Names of the columns to encode.

    Returns:
        A new frame with ``cols`` dropped and one indicator column per
        encoded category appended on the right, named by
        ``OneHotEncoder.get_feature_names_out``.
    """
    oe = OneHotEncoder(sparse_output=False)
    encoded_data = oe.fit_transform(df[cols])
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and inject NaN rows) whenever df has a
    # non-default index, e.g. after dropna() or filtering.
    df_encoded = pd.DataFrame(
        encoded_data,
        columns=oe.get_feature_names_out(cols),
        index=df.index,
    )
    return pd.concat([df.drop(cols, axis=1), df_encoded], axis=1)

# Load the raw training CSV from the path held in the shared config.
train_datas = pd.read_csv(config.path_train_data)
# Columns to one-hot encode in this cell.
cols = ['Brand', 'Material']
encoded_df = one_hot_encoding(train_datas, cols)
# Show the first rows to check the encoded columns.
display(encoded_df.head())

Unnamed: 0,id,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Brand_nan,Material_Canvas,Material_Leather,Material_Nylon,Material_Polyester,Material_nan
0,0,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## EDA

In [None]:
import itertools
import seaborn as sns

def visualize_statistical_relationships(df, cols, stats):
    """Save one heatmap figure per pair of columns in ``cols``.

    For every 2-combination of ``cols`` a figure is created with one subplot
    per entry of ``stats``. Each subplot is a heatmap of ``config.pred_col``
    pivoted with the first column of the pair as rows and the second as
    columns, aggregated with that statistic. The figure is written to
    ``f'{config.img_pathname}_heatmap_{pair}.png'`` and then closed.

    Args:
        df: DataFrame containing ``config.pred_col`` and every name in ``cols``.
        cols: Column names to pair up.
        stats: Aggregations passed one at a time as ``aggfunc`` to
            ``pivot_table``. If empty, nothing is plotted.
    """
    if not stats:
        # plt.subplots rejects zero columns; there is nothing to draw anyway.
        return

    for pair in itertools.combinations(cols, 2):
        # squeeze=False always yields a 2-D axes array, so a single statistic
        # no longer returns a bare Axes that cannot be indexed.
        fig, axes = plt.subplots(1, len(stats), figsize=(10, 5), squeeze=False)
        # Pair each statistic with its own axis by position; looking the axis
        # up via stats.index() drew duplicates onto the same subplot.
        for ax, stat in zip(axes[0], stats):
            analysis_df = df[[config.pred_col]+list(pair)].pivot_table(index=pair[0], columns=pair[1], values=config.pred_col, aggfunc=stat)
            sns.heatmap(analysis_df, annot=True, fmt=".0f", ax=ax)
            ax.set_title(f'{stat}')
        fig.tight_layout()
        fig.savefig(f'{config.img_pathname}_heatmap_{pair}.png')
        # Close this specific figure rather than whatever is "current".
        plt.close(fig)

In [11]:
def _write_eda_report(df, path):
    """Write a plain-text EDA summary of ``df`` to ``path``.

    Sections, in order: ``df.info()``, ``describe()``, ``head()``, null
    counts, unique-value counts, and the unique values of every column.
    The file is overwritten if it exists.

    Args:
        df: DataFrame to summarise.
        path: Destination text file.
    """
    rule = '------------------------------------\n'
    # (header line, table to dump, whether a rule follows the header).
    # Header strings are kept exactly as the original report wrote them.
    tables = [
        ('Statical Values \n', df.describe(), True),
        ('Heads \n', df.head(), True),
        ('How many nulls \n', df.isnull().sum(), True),
        ('Unique Values \n', df.nunique(), False),
    ]

    # Explicit encoding so non-ASCII cell values do not depend on the
    # platform's default codec.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(rule)
        f.write('Datas Info\n')
        f.write(rule)
        df.info(buf=f)
        f.write('\n\n')

        for header, table, rule_after_header in tables:
            f.write(rule)
            f.write(header)
            if rule_after_header:
                f.write(rule)
            table.to_string(buf=f)
            f.write('\n\n')

        f.write(rule)
        f.write('Columns feature \n')
        for col in df.columns:
            # Name, its unique values, then a blank separator line. The train
            # and test reports previously disagreed on where the blank went.
            f.write(f'{col}\n')
            f.write(f'{df[col].unique()}\n')
            f.write('\n')


train_datas = pd.read_csv(config.path_train_data)
_write_eda_report(train_datas, f'{config.docs_pathname}_eda_train.txt')

test_datas = pd.read_csv(config.path_test_data)
_write_eda_report(test_datas, f'{config.docs_pathname}_eda_test.txt')

In [19]:
# Preview the first rows of the training data loaded above.
display(train_datas.head())

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [22]:
# Keep a copy of the training data as loaded, then rebind `train_datas`
# to only the rows that have no missing value in any column.
train_datas1 = train_datas.copy()
train_datas = train_datas1.dropna()
# NOTE(review): this groups the unfiltered copy (`train_datas1`), not the
# NaN-free `train_datas` created on the previous line - confirm that is intended.
grouped = train_datas1.groupby(['Brand', 'Material', 'Laptop Compartment', 'Style', 'Color'])

In [53]:
import itertools
# Columns whose pairwise combinations are plotted against config.pred_col.
cols = ['Brand', 'Material', 'Laptop Compartment', 'Style', 'Color']

# One heatmap subplot is drawn per aggregation.
stats = ['mean', 'std','count']

# The helper's signature is (df, cols, stats); the DataFrame argument was
# missing, which raised TypeError. `train_datas` is the training frame in
# scope at this point.
visualize_statistical_relationships(train_datas, cols, stats)
