In [124]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt


In [125]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [126]:
# Load brand_keyword_cnt.csv; the path is relative, so the notebook must be run
# from the directory that contains ./data.
brand_keyword = pd.read_csv('./data/brand_keyword_cnt.csv')

In [127]:
# Inspect the loaded table: one '브랜드' column followed by one column per date.
brand_keyword

Unnamed: 0,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-00001,0.84131,0.91383,1.450530,2.422390,1.871190,1.581080,1.232950,1.174930,1.145920,...,0.319110,0.391640,0.377130,0.49318,0.072520,0.29010,0.31911,0.232080,0.333620,0.44966
1,B002-00002,12.64868,20.27850,15.332170,12.750210,13.562510,13.707570,11.937910,15.564250,14.084710,...,10.269790,11.966920,10.646930,10.41485,10.487380,9.48651,9.28343,10.429350,11.154620,11.38671
2,B002-00003,0.33362,0.43516,0.362630,0.174060,0.217580,0.464170,0.420650,0.290100,0.377130,...,0.536690,0.696250,0.449660,0.39164,1.029880,0.49318,0.91383,0.797790,1.015370,0.88482
3,B002-00005,1.07339,1.71163,2.016240,1.914700,1.987230,2.146790,1.682620,1.378000,1.421520,...,2.219320,2.509420,2.872060,2.37888,2.030750,1.53756,1.34899,1.261960,2.320850,2.30635
4,B002-00006,0.00000,0.00000,0.188558,0.246574,0.246574,0.246574,0.377139,0.087012,0.261084,...,0.072526,0.290103,0.087012,0.00000,0.130542,0.00000,0.00000,0.072526,0.217577,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3165,B002-03794,2.32085,2.98810,3.611830,4.061500,3.669850,3.771390,3.031620,2.988100,3.133150,...,2.422390,2.422390,2.756010,2.32085,2.088770,1.98723,1.07339,1.929210,2.509420,1.78416
3166,B002-03795,0.14505,0.00000,0.087030,0.072520,0.087030,0.101530,0.072520,0.130540,0.116040,...,0.000000,0.072520,0.000000,0.10153,0.101530,0.00000,0.00000,0.000000,0.000000,0.00000
3167,B002-03796,0.00000,0.00000,0.000000,0.000000,0.000000,0.101530,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.00000,0.000000,0.072520,0.07252
3168,B002-03798,0.14505,0.00000,0.116040,0.072520,0.116040,0.275600,0.217580,0.116040,0.101530,...,0.101530,0.087030,0.145050,0.17406,0.188560,0.11604,0.11604,0.087030,0.174060,0.10153


### 각 행마다 브랜드 평균내기

In [128]:
# Average every brand's values across all date columns of brand_keyword.

# Position of the brand column inside brand_keyword (column 0).
brand_column_index = 0

# Split the frame into the brand labels and the remaining value columns.
brands = brand_keyword.iloc[:, brand_column_index]
data_values = brand_keyword.iloc[:, 1:]

# axis=1 -> one mean per row, i.e. per brand.
row_means_brand_keyword = data_values.mean(axis=1)

# Put the brand label next to its mean; both Series share brand_keyword's index.
result_df = pd.concat(
    [brands.rename('Brand'), row_means_brand_keyword.rename('RowMean')],
    axis=1,
)

In [129]:
# First column of brand_keyword, i.e. the brand codes (same selection as `brands` above).
brand_list = brand_keyword.iloc[:,0]
brand_list

0       B002-00001
1       B002-00002
2       B002-00003
3       B002-00005
4       B002-00006
           ...    
3165    B002-03794
3166    B002-03795
3167    B002-03796
3168    B002-03798
3169    B002-03799
Name: 브랜드, Length: 3170, dtype: object

In [130]:
# Show the per-brand means: columns 'Brand' and 'RowMean'.
result_df

Unnamed: 0,Brand,RowMean
0,B002-00001,0.709779
1,B002-00002,11.609882
2,B002-00003,0.732911
3,B002-00005,1.879088
4,B002-00006,0.158759
...,...,...
3165,B002-03794,3.664386
3166,B002-03795,0.068289
3167,B002-03796,0.011534
3168,B002-03798,0.147356


### train.csv 불러오기

In [131]:
# Load the training table and discard the 'ID' and '제품' columns right away.
train_data = pd.read_csv('./data/train.csv').drop(columns=['ID', '제품'])

In [132]:
# Peek at the first row from column position 4 onward (the date columns;
# 459 entries in the recorded output).
train_data.iloc[0,4:]

2022-01-01    0
2022-01-02    0
2022-01-03    0
2022-01-04    0
2022-01-05    0
             ..
2023-03-31    0
2023-04-01    0
2023-04-02    0
2023-04-03    0
2023-04-04    0
Name: 0, Length: 459, dtype: object

In [133]:
# Column position 3 of train_data is the brand column ('브랜드' in the recorded output).
# NOTE(review): a later cell rebinds `target_brand_list` to a DataFrame slice
# (`train_data.iloc[:,3:]`), so the meaning of this name depends on execution order.
target_brand_list = train_data.iloc[:,3]
target_brand_list

0        B002-00001
1        B002-00002
2        B002-00002
3        B002-00002
4        B002-00003
            ...    
15885    B002-03799
15886    B002-03799
15887    B002-03799
15888    B002-03799
15889    B002-03799
Name: 브랜드, Length: 15890, dtype: object

In [134]:
# Run-wide settings, gathered in one place.
CFG = dict(
    TRAIN_WINDOW_SIZE=90,  # train on 90 days of history
    PREDICT_SIZE=21,       # predict the following 21 days
    EPOCHS=10,             # number of training epochs
    LEARNING_RATE=1e-4,
    BATCH_SIZE=1024,
    SEED=41,               # fixed random seed
)

In [135]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Parameters
    ----------
    seed : int
        Value applied to Python's ``random``, the ``PYTHONHASHSEED`` environment
        variable, NumPy's global RNG and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN into deterministic mode so repeated runs with the same
    seed pick the same kernels.
    """
    random.seed(seed)
    # Hash randomization is fixed at interpreter start-up, so this only affects
    # Python processes launched after this point (e.g. DataLoader workers).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and select algorithms at run time,
    # which can differ between runs and defeats the purpose of fixing the seed.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED'])  # apply the seed configured in CFG

In [136]:
# Approach 2: min-max scaling.
# Each row of train_data is rescaled on its own, using that row's minimum and
# maximum over the numeric columns (axis=1 -> per row, not per column).
# NOTE(review): this overwrites train_data in place; running the cell a second
# time would recompute min/max from already-scaled values.
numeric_cols = train_data.columns[4:]

# Per-row extremes of the numeric columns.
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)

# Width of each row's value range; a constant row (max == min) gets 1 as its
# divisor so the division below stays defined.
ranges = max_values - min_values
ranges = ranges.where(ranges != 0, 1)

# (value - row_min) / row_range, written back into the same columns.
train_data[numeric_cols] = train_data[numeric_cols].sub(min_values, axis=0).div(ranges, axis=0)

# Keep each row's original min and max, keyed by row index.
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

 ### 브랜드 평균값을 각 행에 곱하기 (해당 브랜드의 행에만 곱해야 함)
 
 이중 for문을 돌린다: 브랜드를 리스트로 변환해 하나씩 꺼내고,
 train 데이터의 브랜드 열과 행마다 비교해서, 일치하는 행에만 곱한다:
 for i in brand_list:
     for j in target_brand_list:
         if i == j:
             # train data의 해당 행에 있는 값에 row_means_brand_keyword 전부 곱하기
             

In [137]:
# Position of the brand column inside train_data.
# NOTE(review): `brand_column_index` was set to 0 in an earlier cell for
# brand_keyword; rebinding it here makes later cells depend on execution order.
brand_column_index = 3

# Extract the brand column from the train data.
train_brands = train_data.iloc[:, brand_column_index]
train_brands

0        B002-00001
1        B002-00002
2        B002-00002
3        B002-00002
4        B002-00003
            ...    
15885    B002-03799
15886    B002-03799
15887    B002-03799
15888    B002-03799
15889    B002-03799
Name: 브랜드, Length: 15890, dtype: object

In [138]:
# for i, brand in enumerate(brand_list):
#     if brand in target_brand_list:
#         brand_rows_indices = np.where(train_brands == brand)[0]
#         print(brand_rows_indices)
        
        

In [223]:
# Multiply every daily value of each train_data row by the same-day value of
# that row's brand in brand_keyword.
# NOTE(review): the section heading talks about the brand *mean*, but the
# original loop multiplied by the per-day keyword values; that behavior is
# kept here - confirm which one is intended.

# Brand column followed by the daily columns of train_data.
target_brand_list = train_data.iloc[:, 3:]

# Select the brand column by name so the cell does not depend on whichever
# value `brand_column_index` currently holds.
train_brands = train_data['브랜드']
date_columns = target_brand_list.columns[1:]

# One keyword row per brand, indexed by brand code. Keeping the first
# occurrence mirrors the `.iloc[0]` lookup of the row-by-row version.
keyword_by_brand = brand_keyword.drop_duplicates(subset='브랜드').set_index('브랜드')

# Fail with a readable message instead of an opaque positional IndexError.
missing_brands = train_brands[~train_brands.isin(keyword_by_brand.index)].unique()
if len(missing_brands) > 0:
    raise KeyError(
        f"{len(missing_brands)} brand(s) in train_data have no row in brand_keyword, "
        f"e.g. {missing_brands[:10].tolist()}"
    )

# Line the keyword rows up with train_data: one row per train row (brands
# repeat) and the same date columns in the same order.
keyword_aligned = keyword_by_brand.reindex(index=train_brands.to_numpy(), columns=date_columns)

# A single element-wise product replaces the per-row filter + multiply loop.
scaled_values = (
    target_brand_list[date_columns].to_numpy(dtype=float)
    * keyword_aligned.to_numpy(dtype=float)
)

columns = target_brand_list.columns.tolist()
scaled_df = pd.DataFrame(scaled_values, columns=date_columns)
scaled_df.insert(0, '브랜드', train_brands.to_numpy())

# List-of-rows form that the loop version produced; kept in case other cells read it.
scaled_train_data = scaled_df.to_numpy().tolist()

scaled_df.head()

          브랜드  2022-01-01  2022-01-02  2022-01-03  2022-01-04  2022-01-05  \
0  B002-00001         0.0         0.0         0.0         0.0         0.0   
1  B002-00002         0.0         0.0         0.0         0.0         0.0   
2  B002-00002         0.0         0.0         0.0         0.0         0.0   
3  B002-00002         0.0         0.0         0.0         0.0         0.0   
4  B002-00003         0.0         0.0         0.0         0.0         0.0   

   2022-01-06  2022-01-07  2022-01-08  2022-01-09  ...  2023-03-26  \
0         0.0         0.0         0.0         0.0  ...         0.0   
1         0.0         0.0         0.0         0.0  ...         0.0   
2         0.0         0.0         0.0         0.0  ...         0.0   
3         0.0         0.0         0.0         0.0  ...         0.0   
4         0.0         0.0         0.0         0.0  ...         0.0   

   2023-03-27  2023-03-28  2023-03-29  2023-03-30  2023-03-31  2023-04-01  \
0         0.0         0.0    0.000000  

In [200]:
# Re-inspect the keyword table.
# NOTE(review): this cell's execution count (In [200]) is lower than the cell
# above it (In [223]), so the notebook was not run top to bottom.
brand_keyword

Unnamed: 0,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-00001,0.84131,0.91383,1.450530,2.422390,1.871190,1.581080,1.232950,1.174930,1.145920,...,0.319110,0.391640,0.377130,0.49318,0.072520,0.29010,0.31911,0.232080,0.333620,0.44966
1,B002-00002,12.64868,20.27850,15.332170,12.750210,13.562510,13.707570,11.937910,15.564250,14.084710,...,10.269790,11.966920,10.646930,10.41485,10.487380,9.48651,9.28343,10.429350,11.154620,11.38671
2,B002-00003,0.33362,0.43516,0.362630,0.174060,0.217580,0.464170,0.420650,0.290100,0.377130,...,0.536690,0.696250,0.449660,0.39164,1.029880,0.49318,0.91383,0.797790,1.015370,0.88482
3,B002-00005,1.07339,1.71163,2.016240,1.914700,1.987230,2.146790,1.682620,1.378000,1.421520,...,2.219320,2.509420,2.872060,2.37888,2.030750,1.53756,1.34899,1.261960,2.320850,2.30635
4,B002-00006,0.00000,0.00000,0.188558,0.246574,0.246574,0.246574,0.377139,0.087012,0.261084,...,0.072526,0.290103,0.087012,0.00000,0.130542,0.00000,0.00000,0.072526,0.217577,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3165,B002-03794,2.32085,2.98810,3.611830,4.061500,3.669850,3.771390,3.031620,2.988100,3.133150,...,2.422390,2.422390,2.756010,2.32085,2.088770,1.98723,1.07339,1.929210,2.509420,1.78416
3166,B002-03795,0.14505,0.00000,0.087030,0.072520,0.087030,0.101530,0.072520,0.130540,0.116040,...,0.000000,0.072520,0.000000,0.10153,0.101530,0.00000,0.00000,0.000000,0.000000,0.00000
3167,B002-03796,0.00000,0.00000,0.000000,0.000000,0.000000,0.101530,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.00000,0.000000,0.072520,0.07252
3168,B002-03798,0.14505,0.00000,0.116040,0.072520,0.116040,0.275600,0.217580,0.116040,0.101530,...,0.101530,0.087030,0.145050,0.17406,0.188560,0.11604,0.11604,0.087030,0.174060,0.10153


pandas.core.frame.DataFrame