In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import matplotlib.font_manager as fm
import matplotlib
import itertools

In [2]:
fm.get_fontconfig_fonts()
font_location = 'C:\\WINDOWS\\Fonts\\malgun.ttf' # For Windows
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

The get_fontconfig_fonts function was deprecated in Matplotlib 3.5 and will be removed two minor releases later.
  fm.get_fontconfig_fonts()


In [3]:
FILES_DIR = '../data/a1/'
log_df = pd.read_csv(FILES_DIR + 'log_csv.csv')

In [92]:
# null값들을 un_** 으로 채워서 확인해보기
log_drop_null = log_df
log_drop_null['geoip_city_name'].fillna('un_city_name', inplace = True)
log_drop_null['category3'].fillna('un_category', inplace = True)
log_drop_null['uid'].fillna('un_name', inplace = True)
log_drop_null.dropna(axis = 0, inplace = True)
print(log_drop_null.isnull().sum().sort_values(ascending=False)) # null값 퍼센트로 표현

Unnamed: 0         0
uid                0
action_type        0
category1          0
category2          0
category3          0
collect_time       0
geoip_city_name    0
useragent_os       0
viewrate           0
viewtime           0
rating             0
dtype: int64


In [93]:
log_drop_null['rating'] = log_drop_null['action_type']
re_name = {'rating' : {'View':1, 'Highlight':2, 'Basket':4, 'Copy':3, 'SINF':0}}
log_drop_null = log_drop_null.replace(re_name)
log_drop_null = log_drop_null[['uid', 'category2', 'rating']]
log_drop_null                                        

Unnamed: 0,uid,category2,rating
0,avpKyWMn/xEADNlb,그릇·홈세트,1
1,0+5vEmGkOuAAB5j2,매트,4
2,r8DXy2OtSl8AAojG,커튼·부자재,1
3,dC9Fl2Ly9lcADrTn,플라워·식물,2
4,09gKcGLg0I8ACyQy,진열장·책장,1
...,...,...,...
5659536,PUpxWWMWdcIACR2B,칼·커팅기구,1
5659537,PVMstmNg4I4ADq9R,진열장·책장,4
5659538,diRXdmN49dUABrVo,테이블·식탁·책상,1
5659539,dusFzWJvp5cADhZW,계절가전,1


In [94]:
log = log_drop_null.drop_duplicates()
log

Unnamed: 0,uid,category2,rating
0,avpKyWMn/xEADNlb,그릇·홈세트,1
1,0+5vEmGkOuAAB5j2,매트,4
2,r8DXy2OtSl8AAojG,커튼·부자재,1
3,dC9Fl2Ly9lcADrTn,플라워·식물,2
4,09gKcGLg0I8ACyQy,진열장·책장,1
...,...,...,...
5659504,eiIedGNsz7MAC4Wi,선반,1
5659515,AeCSamOsMCsACJDD,옷걸이,1
5659530,0rIeEmOs/WwABY+T,수납장·옷장,1
5659536,PUpxWWMWdcIACR2B,칼·커팅기구,1


In [105]:
log['uid'].nunique()

828075

In [103]:
log_sample = log.sample(frac=0.01, random_state=42)
log_sample = log_sample.drop_duplicates()
log_sample['uid'].nunique()

16867

In [104]:
log_sample

Unnamed: 0,uid,category2,rating
2905050,3HSqnWOmn7oADJH4,테이블·식탁·책상,1
5267375,AdbYGmI78zwABqxk,리빙박스·바구니,1
274023,3Z1LXGOQI+4ACHnS,매트,1
673227,2p4mBGGl+OgACASX,가벽·파티션,1
1888515,amWBqWOoOccABzk1,욕실용품,1
...,...,...,...
5636345,cKrDh2Otlu8ADAqe,침대,1
1622670,0yzXAmOgAB4AAD/h,생활·건강가전,1
1146452,fbkZ4GFY1f8AC9u4,화장대·테이블정리,1
2474365,DjNnamGfMSsAAdr4,이불·이불솜,1


In [106]:
train_df, test_df = train_test_split(log, test_size=0.2, random_state=42)

print(train_df.shape)
print(test_df.shape)

(1408698, 3)
(352175, 3)


In [109]:
train_df = train_df.drop_duplicates()
train_df['category2'].unique()

array(['소파', '홈패브릭', '그릇·홈세트', '거실장·TV장', '꾸미기팁', '매트리스·토퍼', '컵·잔·텀블러',
       '서랍·수납장', '매트·안전용품', '침대', '파티·완구', '플라워·식물', '생활·건강가전', '청소기',
       '베개·베개커버', '수납장·옷장', '매트', '보관·용기·도시락', '시공정보', '테이블·식탁·책상',
       '계절가전', '러그·카페트', '의자', '이불·이불솜', '안전·방범용품', '커튼·부자재', '토퍼·패드',
       '디퓨저·캔들', '진열장·책장', '주방수납·정리', '무드등·장식조명', '홈갤러리', '화장대·콘솔',
       '스킨케어', '장스탠드', '수도', '옷정리·이불정리', '냉동·냉장·간편식', '주방잡화', '주방패브릭',
       '야외가구', '리빙박스·바구니', '데스크·디자인문구', '생활정보', '인테리어소품', '선반', '반찬·장류',
       '강아지', '지식백과', '공간별수납정리', '가구', '주방가전', '전기', 'O!SelectShop',
       '크리스마스', '청소용품', '스킨케어·목욕용품', '수납', '음향가전', '세탁·세제용품', '커피·티용품',
       '바디케어', '수저·커트러리', '행거·옷장', '생활잡화', '벽지·시트지', '거울', '게임기·드론',
       '냉장고', '태블릿PC', '현관·신발정리', '고양이', '식기건조대', '청소세제·세정제', '단스탠드',
       '책상', '냄비·프라이팬·솥', '캠핑가구', '가벽·파티션', '형광등·조명부속품', '블라인드·롤스크린',
       '컴퓨터·노트북', '후크·수납걸이', '조리도구·도마', '화장대·테이블정리', '월데코·장식', 'TV',
       'LED 등', '키친', '유아동가구', '시계', '욕실용품', '매트리스커버', '천장등', '주방용품',
       '외출용품', 'D

In [108]:
sparse_matrix = train_df.groupby('category2').apply(lambda x: pd.Series(x['rating'].values, index=x['uid'])).unstack()
sparse_matrix.index.name = 'category2'

sparse_matrix

ValueError: Index contains duplicate entries, cannot reshape