# 1. 개발 환경 소개

- Google Colab CPU 환경에서 진행했습니다.
- 개발 환경은 다음과 같습니다.

In [None]:
# Print a short description of the machine this notebook is running on.
import platform
print('- os:',platform.platform())
# Only the label is printed here (no newline); the shell command below supplies the value.
print('- 운영체제:', end="")
# IPython shell escape (not plain Python): prints the contents of /etc/issue.net.
!cat /etc/issue.net
print('- Process information:', platform.processor())
print('- Process Architecture:', platform.machine())
# NOTE(review): the RAM figure is a hard-coded literal, not measured at runtime.
print("- RAM: 12.68GB")

- os: Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic
- 운영체제:Ubuntu 18.04.6 LTS
- Process information: x86_64
- Process Architecture: x86_64
- RAM: 12.68GB


# 2. 라이브러리 불러오기 및 경로 지정

### 라이브러리 불러오기

In [None]:
# 전처리 라이브러리
import pandas as pd
import os
import pickle
from datetime import datetime
import random

# 분석 라이브러리
from sklearn.preprocessing import StandardScaler

# 분석에 문제가 없는 경고 메시지 숨김
import warnings
warnings.filterwarnings('ignore')

### 라이브러리 버전

In [None]:
# Print the interpreter and pandas versions used by this notebook.
# The bullet prefix is printed without a newline so the shell output continues on the same line.
print('- ', end="")
# IPython shell escape (not plain Python): asks the `python` executable for its version.
!python --version
print('- pandas:', pd.__version__)

- Python 3.7.13
- pandas: 1.3.5


### 경로 설정 및 데이터 불러오기

In [None]:
# Connect Google Drive: mount it at /content/drive so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


코드 실행을 위해 경로를 설정합니다.

In [None]:
# Base directory on the mounted Drive; the cells below read from its data/ and output/ subfolders
# and write results to output/. The trailing slash matters because paths are built with `+`.
data_dir = '/content/drive/MyDrive/롯데멤버스_경진대회/3. 안다비젼_ 데이터 및 모델 세이브 파일/'

In [None]:
# Load the data
pdde = pd.read_csv(data_dir+'data/LPOINT_BIG_COMP_02_PDDE.csv', parse_dates=['de_dt']) # Product purchase info - retailer product purchase history (de_dt parsed as dates)
pd_clac = pd.read_csv(data_dir+'data/LPOINT_BIG_COMP_04_PD_CLAC.csv') # Product classification info - retailer product category master
data_reord = pd.read_csv(data_dir+'output/data_reord.csv') # Product reorder info

# 3. 데이터 전처리 및 분리

* 모델 입력을 위해 온라인 채널 데이터만 추린 뒤 구매 금액을 표준화하고, 범주형(텍스트) 값을 정수 인덱스로 인코딩합니다.

In [None]:
# 0. Select the data
# Keep only the rows with chnl_dv == 2 (online data only), then drop the now-constant column.
data_reord = data_reord[data_reord['chnl_dv']==2].drop(['chnl_dv'], axis=1)


# 1. Scale the purchase amount
# NOTE(review): the scaler is fitted on every remaining row, i.e. before any train/test
# split is made — confirm this is intended.
scaler = StandardScaler()
scaler.fit(data_reord[['buy_am']])
data_reord['buy_am'] = scaler.transform(data_reord[['buy_am']])
data_reord['buy_am'] = round(data_reord['buy_am'], 10)


# 2. Encoding
def _index_maps(values):
  """Return ``(ids, encoded, decoded)`` for the distinct values of a Series.

  ``ids`` lists the distinct values in order of first appearance, ``encoded``
  maps value -> position in ``ids`` and ``decoded`` maps position -> value,
  so ``decoded[encoded[v]] == v`` for every value.
  """
  ids = values.unique().tolist()
  encoded = {value: index for index, value in enumerate(ids)}
  decoded = dict(enumerate(ids))
  return ids, encoded, decoded


# User index encoding.
# Fix: the decoded map used to be a copy of the encoded map (value -> index); it is now
# index -> value like every other *_decoded dict.
user_ids, user2user_encoded, user2user_decoded = _index_maps(data_reord["user_id"])

# Order index encoding (same fix as for the user map).
order_ids, order2order_encoded, order2order_decoded = _index_maps(data_reord["order_id"])

# Product index encoding
product_ids, product2product_encoded, product2product_decoded = _index_maps(data_reord["product_id"])

# Product name encoding
pd_name_ids, pd_name2pd_name_encoded, pd_name2pd_name_decoded = _index_maps(data_reord["product_name"])

# Product top-level category (department) name encoding
dept_name_ids, dept_name2dept_name_encoded, dept_name2dept_name_decoded = _index_maps(data_reord["department"])

# Product sub-category (aisle) name encoding
aisle_name_ids, aisle_name2aisle_name_encoded, aisle_name2aisle_name_decoded = _index_maps(data_reord["aisle"])

# Gender encoding
gender_ids, gender2gender_encoded, gender2gender_decoded = _index_maps(data_reord["ma_fem_dv"])

# Age group encoding
age_ids, age2age_encoded, age2age_decoded = _index_maps(data_reord["ages"])

# Region encoding
reg_ids, reg2reg_encoded, reg2reg_decoded = _index_maps(data_reord["zon_hlv"])

# Add the encoded columns. No encoded column is added for department / aisle here;
# only their lookup tables are built.
data_reord["user"] = data_reord["user_id"].map(user2user_encoded)
data_reord["product"] = data_reord["product_id"].map(product2product_encoded)
data_reord["order"] = data_reord["order_id"].map(order2order_encoded)
data_reord["pd_name"] = data_reord["product_name"].map(pd_name2pd_name_encoded)
data_reord["gender"] = data_reord["ma_fem_dv"].map(gender2gender_encoded)
# Unlike the others, "ages" is overwritten in place with its encoded value.
data_reord["ages"] = data_reord["ages"].map(age2age_encoded)
data_reord["region"] = data_reord["zon_hlv"].map(reg2reg_encoded)

* 2021-06-01을 기준으로 학습/테스트 데이터를 분리하고, 전체 대비 학습 데이터의 비율을 확인합니다.

In [None]:
# 3. Split the data and check the ratio
# Orders dated on or before the cutoff form the training set; strictly later
# orders form the test set.
split_cutoff = datetime(2021,6,1)
data_reord['order_date'] = pd.to_datetime(data_reord['order_date'])
order_dates = data_reord['order_date']
data_train = data_reord[order_dates <= split_cutoff]
data_test = data_reord[order_dates > split_cutoff]

# Fraction of all rows that ended up in the training set.
train_fraction = len(data_train)/len(data_reord)
print(train_fraction)

0.8052629982193741


* 분리한 데이터에 대해 유저별로 그룹화한 데이터프레임을 준비합니다.

In [None]:
# 4. Reshape the data per user
def generate_hist(df, typ):
  """Create per-user history frames and publish them as module-level globals.

  For each entry below a DataFrame with one row per ``user`` is built, whose
  second column holds that user's values collected into a list, and stored
  under the global name ``<prefix><typ>``:

  - ``order_hist_``: distinct ``order`` values
  - ``product_hist_``, ``order_dow_hist_``, ``order_date_hist_``,
    ``order_hour_of_day_hist_``, ``buyam_hist_``: every value of the
    matching column, duplicates kept
  - ``days_since_prior_order_hist_``: every value of ``day_since_prior_order``

  Args:
    df: frame containing ``user`` and the columns listed above.
    typ: suffix appended to every global name (e.g. ``'train'``).

  Returns:
    None; the results are only reachable through ``globals()``.
  """
  module_vars = globals()
  per_user = df.groupby(['user'])

  # Orders are de-duplicated per user; every other history keeps repeats.
  module_vars['order_hist_'+typ] = per_user['order'].unique().apply(list).reset_index()

  list_histories = (
      ('product_hist_', 'product'),
      ('order_dow_hist_', 'order_dow'),
      ('order_date_hist_', 'order_date'),
      ('order_hour_of_day_hist_', 'order_hour_of_day'),
      ('days_since_prior_order_hist_', 'day_since_prior_order'),
      ('buyam_hist_', 'buy_am'),
  )
  for prefix, column in list_histories:
    module_vars[prefix+typ] = per_user[column].apply(list).reset_index()

# Build the per-user history globals for each split; the second argument becomes the
# suffix of every global name the function creates (e.g. order_hist_train, order_hist_test).
generate_hist(data_train, 'train')
generate_hist(data_test, 'test')

* 위에서 준비한 데이터프레임을 합쳐 하나의 그룹화된 데이터프레임을 생성합니다.

In [None]:
# 5. Build the per-user dataset and its labels
def generate_user_data(df, typ):
  """Assemble ``user_data_<typ>`` (one row per user) as a module-level global.

  The ``user`` / ``user_id`` columns of ``df`` are left-merged, in turn, with
  the globals ``order_hist_<typ>``, ``product_hist_<typ>``,
  ``order_dow_hist_<typ>``, ``order_hour_of_day_hist_<typ>``,
  ``days_since_prior_order_hist_<typ>`` and ``buyam_hist_<typ>``, which must
  already exist. ``order_date_hist_<typ>`` is not merged. The result is then
  reduced to the first row per ``user`` and a ``predict_labels`` column is added.

  Args:
    df: frame with at least the ``user`` and ``user_id`` columns.
    typ: suffix selecting which globals to read and which one to write.

  Returns:
    None; the result is stored in ``globals()['user_data_' + typ]``.

  Raises:
    KeyError: if one of the required globals (including ``data_<typ>``) is missing.
  """
  namespace = globals()
  hist_prefixes = (
      'order_hist_',
      'product_hist_',
      'order_dow_hist_',
      'order_hour_of_day_hist_',
      'days_since_prior_order_hist_',
      'buyam_hist_',
  )

  # No `on=` is given, so each merge joins on the columns shared by both sides.
  user_data = df[['user','user_id']]
  for prefix in hist_prefixes:
    user_data = user_data.merge(namespace[prefix+typ], how='left')
  user_data = user_data.drop_duplicates('user')

  # The upper bound comes from the global data_<typ> (not from `df`). It is the same for
  # every row, so compute it once instead of rescanning the column inside the lambda.
  max_product = namespace['data_'+typ]['product'].max()

  # Labels are generated separately per split - prevents future data leakage.
  # NOTE(review): each label is a random integer drawn from [0, max_product], unseeded and
  # independent of the user's history — presumably a placeholder; confirm with the model code.
  user_data['predict_labels'] = user_data['product'].apply(lambda _: int(random.uniform(0, max_product)))

  namespace['user_data_'+typ] = user_data

# Build user_data_train and user_data_test. Each call reads the *_hist_<typ> globals and
# data_<typ>, so the per-user history frames must have been generated first.
generate_user_data(data_train, 'train')
generate_user_data(data_test, 'test')

In [None]:
# 6. Check the results
# Row/column count of the training split, then its first three rows
# (the bare last expression is what the notebook displays).
print(data_train.shape)
data_train.head(3)

(345056, 21)


Unnamed: 0,order_id,user_id,order_dow,order_date,order_hour_of_day,day_since_prior_order,product_id,reordered,product_name,aisle,...,buy_am,ma_fem_dv,ages,zon_hlv,user,product,order,pd_name,gender,region
4243,E06000117783,M023350361,5,2021-01-01,8,0.0,PD1115,0.0,여아원피스,여아의류전신,...,0.408321,여성,0,Z14,0,0,0,0,0,0
4244,E06000117783,M023350361,5,2021-01-01,8,0.0,PD1101,0.0,여아티셔츠/탑,여아의류상의,...,0.275923,여성,0,Z14,0,1,0,1,0,0
4245,E06000117783,M023350361,5,2021-01-01,8,0.0,PD1101,0.0,여아티셔츠/탑,여아의류상의,...,0.275923,여성,0,Z14,0,1,0,1,0,0


In [None]:
# Row/column count of the test split, then its first three rows.
print(data_test.shape)
data_test.head(3)

(83445, 21)


Unnamed: 0,order_id,user_id,order_dow,order_date,order_hour_of_day,day_since_prior_order,product_id,reordered,product_name,aisle,...,buy_am,ma_fem_dv,ages,zon_hlv,user,product,order,pd_name,gender,region
2023688,E02026230783,M562512968,3,2021-06-02,13,13.0,PD0116,1.0,김자반/부각/튀각,해조류,...,-0.137748,남성,0,Z17,1356,243,77844,243,1,2
2023698,E02026230783,M562512968,3,2021-06-02,13,0.0,PD1156,0.0,일반우유,우유,...,-0.181881,남성,0,Z17,1356,26,77844,26,1,2
2023699,E02026230783,M562512968,3,2021-06-02,13,0.0,PD0324,0.0,냉동디저트,냉동간편식,...,-0.187765,남성,0,Z17,1356,197,77844,197,1,2


In [None]:
# Shape and first three rows of the per-user training frame (one row per user).
print(user_data_train.shape)
user_data_train.head(3)

(8751, 9)


Unnamed: 0,user,user_id,order,product,order_dow,order_hour_of_day,day_since_prior_order,buy_am,predict_labels
0,0,M023350361,"[0, 371, 899, 1308, 1508, 1741, 1753, 6037, 90...","[0, 1, 1, 223, 130, 189, 141, 294, 167, 88, 88...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, ...","[8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 20, 20, 20, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.4083210904, 0.2759228176, 0.2759228176, -0....",819
3,1,M042338805,"[1, 160, 325, 1149, 2144, 19468, 22290]","[2, 148, 341, 259, 173, 187, 221, 343, 122, 16...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[8, 15, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3.0430467198, 0.0376059265, -0.1389251039, -0...",1160
4,2,M218261740,"[2, 1550, 1759, 1804, 1891, 3563, 3687, 3751, ...","[3, 3, 467, 648, 365, 261, 88, 98, 98, 191, 24...","[5, 5, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, ...","[8, 8, 10, 10, 15, 15, 15, 14, 14, 14, 14, 14,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0285932099, -0.1536360231, 0.0773254084, 0...",929


In [None]:
# Shape and first three rows of the per-user test frame (one row per user).
print(user_data_test.shape)
user_data_test.head(3)

(4074, 9)


Unnamed: 0,user,user_id,order,product,order_dow,order_hour_of_day,day_since_prior_order,buy_am,predict_labels
0,1356,M562512968,"[77844, 78338, 78354, 78526, 79279, 79337, 801...","[243, 26, 197, 141, 141, 701, 141, 11, 324, 17...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, ...","[13, 13, 13, 13, 13, 13, 13, 13, 13, 10, 10, 1...","[13.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-0.1377482304, -0.181880988, -0.1877653557, -...",951
8,5364,M935338625,"[77845, 78027, 78029, 78087, 78088, 78597, 796...","[111, 895, 170, 174, 56, 210, 32, 26, 56, 54, ...","[3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, ...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...","[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 103.0...","[-0.1097974839, -0.0227088422, -0.2230715618, ...",671
16,664,M023794546,"[77846, 78350, 79028, 79139, 79617, 85605, 899...","[895, 88, 26, 119, 49, 522, 287, 266, 266, 170...","[3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[12, 12, 12, 12, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0374197614, -0.1965919072, -0.2126268091, ...",45


In [None]:
# 7. Save the data
# Each frame is written under data_dir without its index column.
csv_outputs = (
    ('output/model_data_train.csv', data_train),
    ('output/model_data_test.csv', data_test),
    ('output/user_data_train.csv', user_data_train),
    ('output/user_data_test.csv', user_data_test),
)
for csv_name, frame in csv_outputs:
  frame.to_csv(data_dir+csv_name, index=False)

# 8. Save the decoding information
# One pickle file per lookup table, all under output/pkl/ (the directory must already exist).
path_dir = data_dir+'output/pkl/'
pkl_outputs = (
    ('user2user_decoded.pkl', user2user_decoded),
    ('order2order_decoded.pkl', order2order_decoded),
    ('product2product_decoded.pkl', product2product_decoded),
    ('pd_name2pd_name_decoded.pkl', pd_name2pd_name_decoded),
    ('dept_name2dept_name_decoded.pkl', dept_name2dept_name_decoded),
    ('aisle_name2aisle_name_decoded.pkl', aisle_name2aisle_name_decoded),
    ('gender2gender_decoded.pkl', gender2gender_decoded),
    ('age2age_decoded.pkl', age2age_decoded),
    ('reg2reg_decoded.pkl', reg2reg_decoded),
)
for pkl_name, mapping in pkl_outputs:
  with open(path_dir+pkl_name,'wb') as fw:
    pickle.dump(mapping, fw)