In [5]:
import os
import re
import json
import h5py

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
!pip install h5py

Collecting h5py
  Downloading h5py-3.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
Installing collected packages: h5py
Successfully installed h5py-3.8.0


In [6]:
# Display the installed pandas version.
pd.__version__

'1.3.5'

# 데이터 불러오기

- 참고 : https://github.com/lime-robot/categories-prediction/blob/1cdaf3797285bfc8b9dc1118adc6a5230a394b97/code/preprocess.ipynb

In [8]:
# Directory that holds the raw data downloaded from Kakao.
RAW_DATA_DIR = "./input/raw_data"

# Chunk file names per split: nine train chunks, one dev chunk, two test chunks.
train_file_list = [f"train.chunk.{idx:02d}" for idx in range(1, 10)]
dev_file_list = ["dev.chunk.01"]
test_file_list = [f"test.chunk.{idx:02d}" for idx in range(1, 3)]

# Paths of the chunk files, each joined onto RAW_DATA_DIR.
train_path_list = [os.path.join(RAW_DATA_DIR, name) for name in train_file_list]
dev_path_list = [os.path.join(RAW_DATA_DIR, name) for name in dev_file_list]
test_path_list = [os.path.join(RAW_DATA_DIR, name) for name in test_file_list]

In [12]:
def get_column_data(path_list, div, col):
    """Read one column from every HDF5 file in ``path_list`` and concatenate it.

    Args:
        path_list: Paths of the HDF5 files to read; values are concatenated in
            this order.
        div: Key looked up on the opened file (the notebook passes 'train').
        col: Key looked up under ``div``; its full contents are read.

    Returns:
        The result of ``np.concatenate`` over the per-file arrays.

    Raises:
        OSError: Propagated from ``h5py.File`` when a file cannot be opened.
        KeyError: When ``div`` or ``col`` is missing from a file.
        ValueError: From ``np.concatenate`` when ``path_list`` is empty.
    """
    col_data = []
    for path in path_list:
        # The context manager closes the file even when the lookup or the
        # read raises, so no handle is leaked on a partially bad input set.
        with h5py.File(path, 'r') as h:
            col_data.append(h[div][col][:])
    return np.concatenate(col_data)


def get_dataframe(path_list, div):
    """Collect the columns needed for training from ``path_list`` into a DataFrame.

    Each column is read with ``get_column_data(path_list, div, col=...)``.

    Args:
        path_list: Paths of the HDF5 files to read.
        div: Key passed through to ``get_column_data``.

    Returns:
        A DataFrame with the columns pid, product, brand, maker, model, price,
        updttm, bcateid, mcateid, scateid and dcateid, in that order. The
        pid, product, brand, maker, model and updttm values are decoded from
        UTF-8 bytes to str; the remaining columns are kept as read.
    """
    column_names = (
        'pid', 'product', 'brand', 'maker', 'model', 'price',
        'updttm', 'bcateid', 'mcateid', 'scateid', 'dcateid',
    )
    # Columns whose values are byte strings and get decoded below.
    bytes_columns = ('pid', 'product', 'brand', 'maker', 'model', 'updttm')

    df = pd.DataFrame({
        name: get_column_data(path_list, div, col=name)
        for name in column_names
    })

    for name in bytes_columns:
        df[name] = df[name].map(lambda raw: raw.decode('utf-8'))

    return df

In [13]:
# Read every train chunk (key 'train' inside each file) into one DataFrame.
df = get_dataframe(train_path_list, 'train')

# NOTE(review): 8134818 is presumably the row count of the full train set — confirm.
df.head() #8134818

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = './input/raw_data/train.chunk.01', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
# Load the mapping between category names and category IDs.
# The context manager closes the file handle, and the explicit encoding keeps
# the decoded names independent of the platform's default encoding.
with open(os.path.join(RAW_DATA_DIR, 'cate1.json'), encoding='utf-8') as cate_file:
    cate_json = json.load(cate_file)

# Each level is stored as name -> ID; invert it into an ID -> name dict.
bid2nm = {cid: name for name, cid in cate_json['b'].items()}  # big category
mid2nm = {cid: name for name, cid in cate_json['m'].items()}  # medium category
sid2nm = {cid: name for name, cid in cate_json['s'].items()}  # small category
did2nm = {cid: name for name, cid in cate_json['d'].items()}  # detail category

In [None]:
# Add a category-name column next to each category-ID column.
for name_col, id_col, id2name in (
    ('bcatenm', 'bcateid', bid2nm),
    ('mcatenm', 'mcateid', mid2nm),
    ('scatenm', 'scateid', sid2nm),
    ('dcatenm', 'dcateid', did2nm),
):
    df[name_col] = df[id_col].map(id2name)

# Parse the update timestamp strings (year, month, day, hour, minute, second).
df['updttm'] = pd.to_datetime(df['updttm'], format='%Y%m%d%H%M%S')

In [None]:
# Drop the category-ID columns. Rebinding the result (instead of
# inplace=True) together with errors='ignore' keeps this cell re-runnable:
# a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['bcateid', 'mcateid', 'scateid', 'dcateid'], errors='ignore')

# 디지털 / 가전 관련 카테고리 

In [None]:
# Digital / home-appliance category names; rows whose 'bcatenm' is in this
# list are the ones kept for the digital product subset.

bctgr_list = ['계절가전/에어컨/온열기기',
              '내비/블랙박스/하이패스',
              '노트북/태블릿PC',
              '데스크탑/모니터/PC부품',
              '디카/캠코더/주변기기',
              '생활가전/세탁기/청소기',
              '영상가전/TV/홈시어터',
              '음향가전/스피커/전자사전',
              '주방가전/냉장고/전기밥솥',
              '프린터/PC주변/사무기기']

In [None]:
# Keep only the rows whose big-category name is in bctgr_list, as an
# independent copy with a fresh 0..n-1 index.
digital_df = (
    df[df['bcatenm'].isin(bctgr_list)]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Persist the filtered frame to disk as a pickle file.
digital_df.to_pickle('./input/digital_prd_list.pkl')

In [None]:
# Display the filtered frame.
digital_df

# 데이터 전처리

In [None]:
# Load the pickled digital-product frame from disk.
# NOTE: unpickling executes code embedded in the file; only load pickles from a trusted source.
digital_df = pd.read_pickle('./input/digital_prd_list.pkl')

In [None]:
def remove_brackets(text):
    """Replace ``[...]`` and ``(...)`` groups with their space-padded contents.

    Only non-empty groups that contain no further bracket of the same kind
    are unwrapped, and each kind is processed in a single pass, so the outer
    pair of a nested group is left in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the matched brackets removed, their contents surrounded
        by one space on each side, and leading/trailing whitespace stripped.
    """
    # Raw strings keep the regex escapes out of Python's string-escape
    # handling. The negated classes exclude only the bracket characters: a
    # '^' after the first position would be a literal caret and would stop
    # groups such as "[a^b]" from being unwrapped.
    text = re.sub(r'\[([^\[\]]+)\]', lambda m: f" {m.group(1)} ", text)
    text = re.sub(r'\(([^()]+)\)', lambda m: f" {m.group(1)} ", text)
    return text.strip()


def add_space(text):
    """Put a space on both sides of every run of Hangul syllables.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each run of characters in the range 가-힣 surrounded by
        single spaces, with leading/trailing whitespace stripped.
    """
    padded = re.sub('([가-힣]+)', lambda hit: ' ' + hit.group(1) + ' ', text)
    return padded.strip()


# Reference (password-style regular expressions): https://heeya7.tistory.com/37
# A whole string matches when it is at least 3 characters long, consists only
# of a-z, digits, '-', '_' and '.', and mixes at least two of these kinds:
# letter + digit, letter + punctuation, or digit + punctuation.
# Raw strings are used so that '\d', '\-', '\_' and '\.' reach the regex
# engine unchanged instead of being invalid Python string escapes.
prd_code_re = re.compile(
    r"(^(?=.*[a-z])(?=.*\d)[a-z\d\-\_\.]{3,}$)" + "|" +
    r"(^(?=.*[a-z])(?=.*[\-\_\.])[a-z\d\-\_\.]{3,}$)" + "|" +
    r"(^(?=.*\d)(?=.*[\-\_\.])[a-z\d\-\_\.]{3,}$)"
)

def get_prd_code(string):
    """Return ``string`` when it matches ``prd_code_re``, otherwise ``None``.

    Args:
        string: A single term to test against the product-code pattern.

    Returns:
        The matched text, or ``None`` when the pattern does not match.
    """
    hit = prd_code_re.match(string)
    return hit.group() if hit else None

In [None]:
# Lower-case the product titles (the product-code pattern only accepts a-z letters).
digital_df['product'] = digital_df['product'].str.lower()

In [None]:
# Inspect the current state of the product column.
digital_df['product']

In [None]:
# Unwrap bracketed text in the product titles.
digital_df.loc[:, 'product'] = digital_df.loc[:, 'product'].map(remove_brackets)
# Replace '/' and '+' with a space. The pattern is a raw string because '\+'
# is not a valid Python string escape.
digital_df.loc[:, 'product'] = digital_df.loc[:, 'product'].str.replace(r'/|\+', ' ', regex=True)

In [None]:
# Put spaces around every run of Hangul characters.
digital_df.loc[:, 'product'] = digital_df.loc[:, 'product'].map(add_space)
# Collapse each run of whitespace into a single space. The pattern is a raw
# string because '\s' is not a valid Python string escape.
digital_df.loc[:, 'product'] = digital_df.loc[:, 'product'].str.replace(r'\s+', ' ', regex=True)

In [None]:
# Inspect the current state of the product column.
digital_df['product']

In [None]:
# Extract model-name candidates: split each product title on whitespace into a list of terms.
digital_df.loc[:, 'product_terms'] = digital_df.loc[:, 'product'].str.split()

In [None]:
# Inspect the term lists.
digital_df.loc[:, 'product_terms']

In [None]:
# Keep only the terms for which get_prd_code returns a match.
digital_df.loc[:, 'product_terms'] = digital_df.loc[:, 'product_terms'].map(
    lambda terms: [term for term in terms if get_prd_code(term)])

In [None]:
# Compare the first few product titles with their extracted terms.
digital_df[['product', 'product_terms']].head()

In [None]:
# Apply the same cleaning steps to the 'model' column as to 'product'.
# The regex patterns are raw strings because '\+' and '\s' are not valid
# Python string escapes.
digital_df['model'] = digital_df['model'].str.lower()

# Unwrap bracketed text, then replace '/' and '+' with a space.
digital_df.loc[:, 'model'] = digital_df.loc[:, 'model'].map(remove_brackets)
digital_df.loc[:, 'model'] = digital_df.loc[:, 'model'].str.replace(r'/|\+', ' ', regex=True)

# Put spaces around Hangul runs, then collapse whitespace runs to one space.
digital_df.loc[:, 'model'] = digital_df.loc[:, 'model'].map(add_space)
digital_df.loc[:, 'model'] = digital_df.loc[:, 'model'].str.replace(r'\s+', ' ', regex=True)

# Split into terms and keep only those for which get_prd_code returns a match.
digital_df.loc[:, 'model_terms'] = digital_df.loc[:, 'model'].str.split()
digital_df.loc[:, 'model_terms'] = digital_df.loc[:, 'model_terms'].map(lambda x: list(filter(get_prd_code, x)))

In [None]:
# Compare product titles, product terms, model terms and model values for the first rows.
digital_df[['product', 'product_terms', 'model_terms', 'model']].head()

In [None]:
# For each row, label every product term True when it also appears in that
# row's model terms and False otherwise, giving a {term: bool} dict per row.
# The row is indexed by column label: integer keys on a label-indexed row
# depend on pandas' positional fallback, which newer pandas versions deprecate.
digital_df.loc[:, 'dataset'] = digital_df[['product_terms', 'model_terms']].apply(
    lambda row: {term: term in row['model_terms'] for term in row['product_terms']}, axis=1)

In [None]:
# Inspect the per-row {term: label} dicts next to the columns they were built from.
digital_df[['product', 'product_terms', 'model_terms', 'dataset']].head()

In [None]:
from collections import defaultdict, Counter
from itertools import chain

In [None]:
# term -> list of boolean observations collected for that term.
model_name_dict = defaultdict(list)

# Every product-title term contributes the label stored in its row's dict.
for labelled_terms in digital_df['dataset'].values:
    for term, is_model in labelled_terms.items():
        model_name_dict[term].append(is_model)

# Every term found in the model column contributes two True observations.
# NOTE(review): the doubled True looks like deliberate 2x weighting of
# model-column terms — confirm it is not an accidentally duplicated line.
for term in chain.from_iterable(digital_df['model_terms']):
    model_name_dict[term].extend((True, True))

In [None]:
# Keep terms with at least 10 observations; a term is labelled True when at
# least half of its observations are True.
datasets = [
    (term, sum(votes) / len(votes) >= 0.5)
    for term, votes in model_name_dict.items()
    if len(votes) >= 10
]

In [None]:
# Print "<number of True-labelled terms> / <total number of terms>".
print(sum(is_model for _, is_model in datasets), "/", len(datasets))

In [None]:
# Display the (term, label) pairs.
# NOTE(review): this prints the whole list; consider slicing (e.g. datasets[:20]) if it is large.
datasets

In [None]:
# Build normalized copies of brand / product / maker as '<column>_norm':
# lower-cased, with spaces and round brackets removed.
# - regex=False makes the space replacement an explicit literal replace, so
#   it does not depend on the pandas version's default for `regex`.
# - The bracket pattern is a raw string because '\(' and '\)' are not valid
#   Python string escapes.
for source_col in ('brand', 'product', 'maker'):
    digital_df[f'{source_col}_norm'] = (
        digital_df[source_col]
        .str.lower()
        .str.replace(' ', '', regex=False)
        .str.strip()
        .str.replace(r'\(|\)', '', regex=True)
    )

In [None]:
# Rows that have at least one labelled term (non-empty 'dataset' dict),
# restricted to the columns used for the model-name dataset.
# .copy() makes this an independent frame, so the column assignments made on
# model_name_df afterwards do not trigger SettingWithCopyWarning.
model_name_df = digital_df.loc[
    digital_df['dataset'] != {}, 
    ['product', 'dataset', 'brand', 'maker']].copy()

In [None]:
# Turn each {term: label} dict into a list of (term, label) pairs.
# A materialized list (rather than a dict_items view) is indexable and
# picklable, and no longer references the original dict.
model_name_df['dataset'] = model_name_df['dataset'].map(lambda labelled: list(labelled.items()))

In [None]:
# Display the frame.
model_name_df

In [None]:
# Expand the 'dataset' column so that each item of a row's collection gets
# its own row; the other columns are repeated for every item.
model_name_df = model_name_df.explode('dataset')

In [None]:
# .str[i] takes element i of each 'dataset' value.
# NOTE(review): assumes each value is a (term, label) pair at this point — confirm.
model_name_df['model_name'] = model_name_df['dataset'].str[0]
model_name_df['label'] = model_name_df['dataset'].str[1]

In [None]:
# model_name_df.to_pickle('model_name_dataset.pkl')