In [1]:
import shutil, os, warnings, re, datetime, multiprocessing
warnings.filterwarnings(action='ignore')
import numpy as np
import pandas as pd
import tensorflow as tf
import _pickle as pickle
from gensim.models import Doc2Vec
from itertools import chain, combinations
from sklearn.model_selection import train_test_split
from collections import namedtuple, Counter
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings(action='default')

In [2]:
def ItemOneHotGenerator(data, threshold = 30):
    """
    data : Data format must be pandas Series with rows having list of items.
              reset_index() required.
              example : '6601000010,6601000024,....,6601000001'
    threshold : To exclude super sparse case of cross columns
    dependencies : collections.Counter
                              sklearn.feature_extraction.text.CountVectorizer
    """    
    cross_cnt = Counter()
    for ii, jj in data.iteritems():
        litem = sorted(set(jj.split(',')))
        cross_cnt.update(litem)
    target_lcross = set({x : cross_cnt[x] for x in cross_cnt if cross_cnt[x] > threshold}.keys()) 
    
    cross_feature = []
    for ii, jj in data.iteritems():
        litem = sorted(set(jj.split(',')))
        lcross = set(litem).intersection(target_lcross)
        cross_feature.append(lcross)
        
    lcross_feature = map(lambda x:' '.join(x), cross_feature)
    vect = CountVectorizer(tokenizer=str.split)
    mcross_feature = vect.fit_transform(lcross_feature)
    with open('./ItemOneHotGenerator.pkl','wb') as f:
        pickle.dump(vect, f)
    return pd.DataFrame(mcross_feature.todense(), columns=vect.get_feature_names())

def CrossColumnGenerator(data, threshold = 5):
    """
    data : Data format must be pandas Series with rows having list of items.
              reset_index() required.
              example : '6601000010,6601000024,....,6601000001'
    threshold : To exclude super sparse case of cross columns
    dependencies : collections.Counter
                              sklearn.feature_extraction.text.CountVectorizer
    """
    cross_cnt = Counter()
    for ii, jj in data.iteritems():
        litem = sorted(set(jj.split(',')))
        cross_cnt.update([str(kk + '_' + ll) for kk, ll in combinations(litem, 2)])
        
    target_lcross = set({x : cross_cnt[x] for x in cross_cnt if cross_cnt[x] >= threshold}.keys()) 
    
    cross_feature = []
    for ii, jj in data.iteritems():
        litem = sorted(set(jj.split(',')))
        lcross = set([str(kk + '_' + ll) for kk, ll in combinations(litem, 2)]).intersection(target_lcross)
        cross_feature.append(lcross)

    lcross_feature = map(lambda x:' '.join(x), cross_feature)
    vect = CountVectorizer(tokenizer=str.split)
    mcross_feature = vect.fit_transform(lcross_feature)
    with open('./CrossColumnGenerator.pkl','wb') as f:
        pickle.dump(vect, f)
    return pd.DataFrame(mcross_feature.todense(), columns=vect.get_feature_names())

def myDoc2Vec(data, d2v_size, model_loc, epoch=None, trainTF = False):
    """
    data : DataFrame([['DocID','Item']])
    """
    data.columns = ['docID','item']
    data1 = [(str(row['item']).split(), row['docID']) for idx, row in data.iterrows()]
    data2 = namedtuple('TaggedDocument', 'words tags')
    tagged_data2 = [data2(d, [c]) for d, c in data1]
    
    if trainTF:
        model = Doc2Vec(
            dm = 0,  # 0 : PV-DBOW
            dbow_words = 0,  # 0 : train doc-vec only(faster)
            window = 8, vector_size = d2v_size, alpha = 0.025, min_alpha = 0.025, seed = 0, sample= 1e-5, min_count=3, 
            workers=multiprocessing.cpu_count(), hs = 0, negative = 10)
        model.build_vocab(tagged_data2)
        print('New model training started')
        model.train(documents  = tagged_data2, total_examples = data.shape[0], epochs = epoch)
        model.save(model_loc)
        print('New model training done. check--',model_loc)
    else:
        model=Doc2Vec.load(model_loc)
        embedding_df = pd.DataFrame(
            data = [model.infer_vector(doc.words) for doc in tagged_data2], 
            columns = ["d2v"+str(i) for i in range(d2v_size)])
        return embedding_df
    
def train_eval_filesplit(filenames, train_rate = 0.8):
    trainfiles = []
    evalfiles = []
    filecnt = len(filenames)
    trainloop = int(round(filecnt/(filecnt - int(filecnt*train_rate)),0))
    for ii in range(filecnt):
        if ii % trainloop == 0:
            evalfiles.append(filenames[ii])
        else:
            trainfiles.append(filenames[ii])
    filedict = {'train_files':trainfiles,'eval_files':evalfiles}
    return filedict

In [5]:
# start = datetime.datetime.now()
# 주문_2014 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문_2014.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문_2015 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문_2015.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문_2016 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문_2016.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문_2017 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문_2017.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문_2018 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문_2018.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품_2014 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품_2014.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품_2015 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품_2015.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품_2016 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품_2016.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품_2017 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품_2017.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품_2018 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품_2018.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품출하_2014 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품출하_2014.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품출하_2015 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품출하_2015.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품출하_2016 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품출하_2016.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품출하_2017 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품출하_2017.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 주문상품출하_2018 = pd.read_csv('C:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가5차_180920/주문상품출하_2018.csv', encoding = 'utf-8', index_col=0, dtype = 'str')
# 상품마스터 = pd.read_csv('c:/Users/kim85/OneDrive/Project_AgileSoda/원본/추가6차_181005/상품마스터.csv',encoding ='utf-8',dtype = 'str')
# 조직마스터 = pd.read_csv('c:/users/kim85/OneDrive/Project_AgileSoda/원본/추가6차_181005/조직마스터.csv', encoding ='utf-8', dtype ='str')
# 사업장= pd.read_csv('c:/users/kim85/OneDrive/Project_AgileSoda/원본/추가6차_181005/사업장.csv', encoding ='utf-8', dtype ='str')
# 공사유형 = pd.read_csv('c:/users/kim85/OneDrive/Project_AgileSoda/원본/추가6차_181005/공사유형.csv', encoding ='utf-8', dtype ='str')

# 주문 = pd.concat([주문_2014, 주문_2015, 주문_2016, 주문_2017, 주문_2018])
# 주문상품 = pd.concat([주문상품_2014, 주문상품_2015, 주문상품_2016, 주문상품_2017, 주문상품_2018])
# 주문상품출하 = pd.concat([주문상품출하_2014, 주문상품출하_2015, 주문상품출하_2016, 주문상품출하_2017, 주문상품출하_2018])

# # 주문 처리
# 주문 = 주문.reset_index()[['ORDE_IDEN_NUMB','CONS_IDEN_NAME','GROUPID','CLIENTID','BRANCHID',
#                        'DELI_AREA_CODE','REGI_DATE_TIME','ORDE_USER_ID']].drop_duplicates(keep='first')
# 주문상품 = 주문상품.reset_index()[['ORDE_IDEN_NUMB','ORDE_SEQU_NUMB','GOOD_IDEN_NUMB',
#                            'ORDE_REQU_QUAN']].drop_duplicates(keep='first')
# 주문상품 = 주문상품.groupby(['ORDE_IDEN_NUMB','GOOD_IDEN_NUMB'])['ORDE_REQU_QUAN'].agg('sum').reset_index()
# 주문상품출하 =  주문상품출하.reset_index()[['ORDE_IDEN_NUMB', 'DELI_STAT_FLAG']].drop_duplicates(keep='first')
# 주문상품출하 = 주문상품출하[주문상품출하.DELI_STAT_FLAG=='70'].ORDE_IDEN_NUMB.unique()

# 주문상품_주문 = pd.merge(주문상품, 주문, on = 'ORDE_IDEN_NUMB', how = 'left')
# 주문상품_주문_출하 = 주문상품_주문[주문상품_주문.ORDE_IDEN_NUMB.isin(주문상품출하)]
# 주문전체 = 주문상품_주문_출하[주문상품_주문_출하.GROUPID != '101'].drop_duplicates(keep='first')

# # 상품 처리
# 상품마스터 = 상품마스터[['good_iden_numb','cate_id','good_name','good_spec','good_type','repre_good']].drop_duplicates(keep='first')
# 상품마스터 = 상품마스터.rename(columns = {'good_iden_numb':'GOOD_IDEN_NUMB'})
# # repre_good - Y : 옵션대표상품, N : 단품, P : 옵션상품 // good_type - 10 : 일반, 20 : 지정,60 : 공구, 70 : 안전,80 : 보안

# 상품주문전체 = pd.merge(주문전체, 상품마스터, on = 'GOOD_IDEN_NUMB', how = 'left')
# 상품주문전체 = 상품주문전체[상품주문전체.repre_good=='N'] # 옵션상품 제외

# # 사업장 처리
# 사업장 = pd.concat([
#         사업장[['BRANCHID','AREATYPE','BRANCHBUSITYPE','BRANCHBUSICLAS','WORKID']].rename(columns={'BRANCHID':'BORGID'}), 
#         사업장[['BRANCHCD','AREATYPE','BRANCHBUSITYPE','BRANCHBUSICLAS','WORKID']].rename(columns={'BRANCHCD':'BORGID'})
#     ], axis=0).drop_duplicates(keep='first')

# 공사유형_사업장 = pd.merge(
#     사업장, 
#     공사유형[['WORKID','WORKNM']].drop_duplicates(keep='first'), 
#     how = 'left', on = 'WORKID')

# 조직마스터 = 조직마스터[(조직마스터.BORGTYPECD == 'BCH') & (조직마스터.SVCTYPECD == 'BUY')] # 사업장레벨 및 구매사만
# 조직마스터 = pd.concat([
#         조직마스터[['BORGID','BORGNM']], 
#         조직마스터[['BORGCD','BORGNM']].rename(columns={'BORGCD':'BORGID'})
#     ], axis=0).drop_duplicates(keep='first')

# 조직전체 = pd.merge(조직마스터, 공사유형_사업장, how = 'left', on = 'BORGID').rename(columns={'BORGID':'BRANCHID'})

# df = pd.merge(상품주문전체,조직전체,how = 'left',on = 'BRANCHID')
# df = df.drop([
#         'ORDE_REQU_QUAN','WORKNM','DELI_AREA_CODE','GROUPID','CLIENTID','ORDE_USER_ID','BORGNM',
#         'good_name','good_spec','repre_good'], axis = 1)
# df = df.drop_duplicates(keep='first')
# df.columns = ['OrderNum','ProductCode','Constructionnm','BpID','OrderTime','ProductCategory','ProductClass','Region',
#               'BpType','BpClass','ConstructionType']
# duration = datetime.datetime.now()-start
# m, s = divmod(duration.seconds, 60);h, m = divmod(m, 60);print("[%02d:%02d:%02d]" %(h, m, s))

# 이상 데이터 불러오기 및 필요 데이터 필터링 부분.
# df.to_csv('./Datasets/df.csv', index = False)

In [4]:
# # 데이터 불러오기 및 필요 데이터 필터링 이후부터 작업
# df = pd.read_csv('./Datasets/df.csv', dtype='str', encoding = 'utf-8')

# # 181019
# # 일기준 작업
# df1 = df[['BpID','Region','BpType','BpClass','ConstructionType','OrderTime','OrderNum','ProductCode']].drop_duplicates(keep='first')
# df1['OrderTime'] = pd.to_datetime(df1['OrderTime'])
# df1['OrderTime'] = df1['OrderTime'].dt.date
# df1 = df1.sort_values(['BpID','OrderTime']).reset_index(drop = True)
# df2 = df1.groupby(['BpID','OrderTime']).agg({
#         'OrderNum': lambda x: list(x), 
#         'ProductCode' : lambda x: list(x)}).reset_index()

# nx =  3 # 이전 기록 갯수(X)
# ny = 1 # 이후 기록 갯수(Y)
# df2_1 = {
#     'BpID':[], 
# # 추후 해당정보 필요할 수 있으니 일단 keep
# #     'OrderTime':[], 'x_OrderNums':[], 'y_OrderNums':[], 
#     'x_ProductCodes':[], 
#     'y_ProductCodes':[]
# }
# print('Loop start!!')
# start = datetime.datetime.now()
# for ii in range(len(df2)-nx):
#     targetdata = df2.iloc[ii:ii+nx+ny,].reset_index(drop=True)
#     if len(targetdata['BpID'].unique())==1 and len(targetdata) == nx+ny: # 앞 nx+ny 가 모두 같은 사업장이고 사이즈가 nx+ny인 경우
#         tmpdata1 = targetdata['OrderNum'].tolist()
#         tmpdata2 = targetdata['ProductCode'].tolist()
#         df2_1['BpID'].append(targetdata['BpID'][0])
# # 추후 해당정보 필요할 수 있으니 일단 keep
# #         df2_1['OrderTime'].append(targetdata['OrderTime'][nx-ny])
# #         df2_1['x_OrderNums'].append([','.join(list(chain.from_iterable(tmpdata1[0:nx])))])
# #         df2_1['y_OrderNums'].append([','.join(list(chain.from_iterable(tmpdata1[nx:nx+ny])))])
#         df2_1['x_ProductCodes'].append(','.join(list(chain.from_iterable(tmpdata2[0:nx]))))
#         df2_1['y_ProductCodes'].append(','.join(list(chain.from_iterable(tmpdata2[nx:nx+ny]))))
#     if ii % 10000 == 0:
#         duration = datetime.datetime.now()-start;m, s = divmod(duration.seconds, 60)
#         h, m = divmod(m, 60);print("[%d]/[%d]-----[%02d:%02d:%02d]" %(ii, len(df2)-nx, h, m, s)) 
# df3 =pd.merge(
#     pd.DataFrame(df2_1), 
#     df1[['BpID','Region','BpType','BpClass','ConstructionType']].drop_duplicates(keep='first'),
#     on = 'BpID')

# # 이상 데이터 전처리
# # df3.to_csv('./Datasets/df3.csv', index = False)

In [3]:
# # 데이터 전처리 이후부터 작업
# df3 = pd.read_csv('./Datasets/df3.csv', dtype='str', encoding = 'utf-8')

# # item multihot vectorizer
# x_item_embed = ItemOneHotGenerator(df3.x_ProductCodes, threshold=50) # 30 이상 구매건#######

# # n구매이력아이템 제거->row정리in X_data, x_item_embed
# drop_target_index = x_item_embed[x_item_embed.sum(axis = 1) ==0].index

# X_data = df3.drop(drop_target_index, axis = 0).reset_index(drop=True)
# x_item_embed = x_item_embed.drop(drop_target_index, axis = 0).reset_index(drop=True)

# # 크로스 컬럼 생성
# x_cross_embed = CrossColumnGenerator(X_data.x_ProductCodes, threshold=200) #100 이상 공동구매##### 

# # print(' X_data:', X_data.shape,  '\n', 'x_item:', x_item_embed.shape, '\n', 
# #       'x_cross:', x_cross_embed.shape)

# # # Doc2Vec 임베딩 생성
# x_d2v_embed = myDoc2Vec(data=X_data[['BpID','x_ProductCodes']], d2v_size=300, trainTF=False,
#                         model_loc='./Trained_models/Models_Doc2Vec/BpID_300_181022.model')
# # print('x_d2v:', x_d2v_embed.shape)

# # Label 처리
# label_1 = [ii.split(',') for ii in X_data.y_ProductCodes]

# buy_cnt = []
# buy_items = []
# for ii, jj in pd.Series(label_1).iteritems():
#     buy_cnt.append(len(jj))
#     buy_items.extend(jj)

# label_df = pd.DataFrame(buy_items, columns = ['label'])
# # 1 label - multiple x's
# X_data = X_data.drop(['BpID','x_ProductCodes','y_ProductCodes'], axis = 1)
# X_data_1=pd.DataFrame(np.repeat(X_data.values,buy_cnt,axis=0),columns=X_data.columns)
# x_item_embed_1=pd.DataFrame(np.repeat(x_item_embed.values,buy_cnt,axis=0),columns=x_item_embed.columns)
# x_cross_embed_1=pd.DataFrame(np.repeat(x_cross_embed.values,buy_cnt,axis=0),columns=x_cross_embed.columns)
# x_d2v_embed_1=pd.DataFrame(np.repeat(x_d2v_embed.values,buy_cnt,axis=0),columns=x_d2v_embed.columns)

# # print(X_data_1.shape)
# # print(x_item_embed_1.shape)
# # print(x_cross_embed_1.shape)
# # print(x_d2v_embed_1.shape)
# # print(label_df.shape)

# # 레이블의 비대칭성 개선 --- y값 threshold 적용
# y_threshold = 10 # 10번 미만 구매 이력의 경우 제외
# y_target1 = label_df.groupby('label')['label'].count().sort_values(ascending = False)
# y_target2 = y_target1[y_target1.values>=y_threshold]
# y_target3 = label_df.label.isin(y_target2.index)

# label_df = label_df[y_target3].reset_index(drop=True)
# X_data_1 = X_data_1[y_target3].reset_index(drop=True)
# x_item_embed_1 = x_item_embed_1[y_target3].reset_index(drop=True)
# x_cross_embed_1 = x_cross_embed_1[y_target3].reset_index(drop=True)
# x_d2v_embed_1 = x_d2v_embed_1[y_target3].reset_index(drop=True)

# # print(X_data_1.shape)
# # print(x_item_embed_1.shape)
# # print(x_cross_embed_1.shape)
# # print(x_d2v_embed_1.shape)
# # print(label_df.shape)

# # Label encoding -> all strings
# le_BpType = LabelEncoder()
# BpType_embed = le_BpType.fit_transform(X_data_1.BpType.tolist())
# with open('./le_BpType.pkl','wb') as f:
#     pickle.dump(le_BpType, f)

# le_BpClass = LabelEncoder()
# BpClass_embed = le_BpClass.fit_transform(X_data_1.BpClass.tolist())
# with open('./le_BpClass.pkl','wb') as g:
#     pickle.dump(le_BpClass, g)
    
# X_data_1['BpType'] = np.core.defchararray.add('bt', BpType_embed.astype('str'))
# X_data_1['BpClass'] =np.core.defchararray.add('bc', BpClass_embed.astype('str'))
# X_data_1['Region'] =np.core.defchararray.add('rg', X_data_1['Region'].values.astype('str'))
# X_data_1['ConstructionType'] =np.core.defchararray.add('ct', X_data_1['ConstructionType'].values.astype('str'))

# # csv 디폴트 컬럼 값 만들기
# np_matrix = pd.concat([X_data_1.iloc[:1,:],x_item_embed_1.iloc[:1,:],
#                        x_cross_embed_1.iloc[:1,:],x_d2v_embed_1.iloc[:1,:],
#                        label_df.iloc[:1,:]], axis =1)
# _csv_col_defaults = []
# for ii in np_matrix.dtypes:
#     if ii == 'object':
#         _csv_col_defaults.append([""])
#     elif ii == 'int64':
#         _csv_col_defaults.append([0])
#     elif ii == 'float64':
#         _csv_col_defaults.append([0.0])
#     else:
#         continue
        
# iK = {
#     'df_size':int(1e4),
#     'df_cnt':int(label_df.shape[0]/int(1e4)),
#     'featurenm':np_matrix.columns[:-1],
#     'x_item_colnm':x_item_embed_1.columns,
#     'x_cross_colnm':x_cross_embed_1.columns,
#     'x_d2v_colnm':x_d2v_embed_1.columns,
#     'x_else_colnm':X_data_1.columns,
#     'ConstructionType':X_data_1.ConstructionType.unique(),
#     'BpType':X_data_1.BpType.unique(),
#     'BpClass':X_data_1.BpClass.unique(),
#     'Region':X_data_1.Region.unique(),
#     'y_classes':tuple(label_df.label.unique()),
#     '_csv_col_defaults':_csv_col_defaults
# }
# with open('./iKeys.pkl','wb') as f:
#     pickle.dump(iK, f)
    
# # Slice and save_CSV files
# for ii in range(iK['df_cnt']):
#     filenm = "./Datasets/dataset"+str(ii)+".csv"
#     if ii == iK['df_cnt']-1:
#         np_matrix = pd.concat([
#             X_data_1.iloc[ii*iK['df_size']:,:],
#             x_item_embed_1.iloc[ii*iK['df_size']:,:],
#             x_cross_embed_1.iloc[ii*iK['df_size']:,:],
#             x_d2v_embed_1.iloc[ii*iK['df_size']:,:],
#             label_df.iloc[ii*iK['df_size']:,:]], axis =1).values
#     else:
#         np_matrix = pd.concat([
#             X_data_1.iloc[ii*iK['df_size']:ii*iK['df_size']+iK['df_size'],:],
#             x_item_embed_1.iloc[ii*iK['df_size']:ii*iK['df_size']+iK['df_size'],:],
#             x_cross_embed_1.iloc[ii*iK['df_size']:ii*iK['df_size']+iK['df_size'],:],
#             x_d2v_embed_1.iloc[ii*iK['df_size']:ii*iK['df_size']+iK['df_size'],:],
#             label_df.iloc[ii*iK['df_size']:ii*iK['df_size']+iK['df_size'],:]], axis =1).values
#     np.savetxt(fname=filenm, X=np_matrix, delimiter=',',encoding='utf-8',fmt="%s")
#     print("[%d]/[%d]" %(ii, iK['df_cnt']))
# print('CSV-OUT JOB DONE')

# 모델평가
# 여기서부터 시작

In [122]:
import shutil, os, warnings, re, time, itertools, datetime, multiprocessing
warnings.filterwarnings(action='ignore')
import _pickle as pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from itertools import combinations
from gensim.models import Doc2Vec
from collections import namedtuple, Counter
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings(action='default')

model_dir = "./Trained_models/Models_WnD/m1031/"
    
with open('./iKeys.pkl','rb') as f:
    iK = pickle.load(f)
print(iK.keys())

dict_keys(['df_size', 'featurenm', 'x_cross_colnm', 'Region', 'y_classes', 'x_item_colnm', 'x_d2v_colnm', 'BpClass', 'x_else_colnm', 'BpType', '_csv_col_defaults', 'df_cnt', 'ConstructionType'])


In [125]:
def input_fn(data_files, num_epochs, batch_size, predict=False):
    def parse_csv(value):
        columns = tf.decode_csv(value, record_defaults=iK['_csv_col_defaults'])
        f_str = dict(zip(iK['featurenm'][:4],columns[:4]))
        f_int = {'item_cross':tf.stack(columns[4:-301],axis=-1)}#item + cross
        f_d2v = {'d2v': tf.stack(columns[-301:-1],axis=-1)}
        label = columns[-1]
        out = {**f_str, **f_int, **f_d2v}, label
        return out
    
    dataset = tf.data.TextLineDataset(data_files)
    if predict==False:
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(buffer_size = 400000, count = num_epochs))
    dataset = dataset.apply(tf.contrib.data.map_and_batch(map_func=parse_csv, 
                                                          batch_size=batch_size, 
                                                          num_parallel_calls = 16))
    dataset = dataset.cache()
    dataset = dataset.prefetch(buffer_size = 50)
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels

In [166]:
filenames = ["./Datasets/"+f_name for f_name in os.listdir('./Datasets/') if f_name.startswith('dataset') and f_name.endswith('.csv')]
filename_split = train_eval_filesplit(filenames = filenames, train_rate=0.8)
predict_filenames = filename_split['eval_files'][:3]

def train_input_fn():
    return input_fn(data_files=filename_split['train_files'],num_epochs=int(1e6), batch_size=64)

def eval_input_fn():
    return input_fn(data_files=filename_split['eval_files'], num_epochs=1, batch_size=1)

def pred_input_fn():
    return input_fn(data_files=predict_filenames,num_epochs=1,batch_size=1,predict=True)

print("Train files===","\n",filename_split['train_files'])
print("Eval files===","\n",filename_split['eval_files'])
print("Predict files===","\n",predict_filenames)

Train files=== 
 ['./Datasets/dataset1.csv', './Datasets/dataset2.csv', './Datasets/dataset3.csv', './Datasets/dataset4.csv', './Datasets/dataset6.csv', './Datasets/dataset7.csv', './Datasets/dataset8.csv', './Datasets/dataset9.csv', './Datasets/dataset11.csv', './Datasets/dataset12.csv', './Datasets/dataset13.csv', './Datasets/dataset14.csv', './Datasets/dataset16.csv', './Datasets/dataset17.csv', './Datasets/dataset18.csv', './Datasets/dataset19.csv', './Datasets/dataset21.csv', './Datasets/dataset22.csv', './Datasets/dataset23.csv', './Datasets/dataset24.csv', './Datasets/dataset26.csv', './Datasets/dataset27.csv', './Datasets/dataset28.csv', './Datasets/dataset29.csv', './Datasets/dataset31.csv', './Datasets/dataset32.csv', './Datasets/dataset33.csv', './Datasets/dataset34.csv', './Datasets/dataset36.csv', './Datasets/dataset37.csv', './Datasets/dataset38.csv', './Datasets/dataset39.csv', './Datasets/dataset41.csv']
Eval files=== 
 ['./Datasets/dataset0.csv', './Datasets/dataset5.c

In [127]:
item_dim = len(iK['x_item_colnm']) + len(iK['x_cross_colnm'])
item_cross = tf.contrib.layers.real_valued_column("item_cross", dimension=item_dim, dtype=tf.int64)
item_cross_buckets = tf.contrib.layers.bucketized_column(item_cross, boundaries=[1])

d2v = tf.contrib.layers.real_valued_column("d2v", dimension=300, dtype=tf.float64)

BpClass=tf.contrib.layers.sparse_column_with_hash_bucket("BpClass", hash_bucket_size=300)
BpType=tf.contrib.layers.sparse_column_with_hash_bucket("BpType", hash_bucket_size=300)
ConstructionType=tf.contrib.layers.sparse_column_with_hash_bucket("ConstructionType", hash_bucket_size=100)
Region=tf.contrib.layers.sparse_column_with_hash_bucket("Region", hash_bucket_size=100)

wide_columns = [
    item_cross_buckets, 
    ConstructionType]
deep_columns = [
    d2v, 
    tf.contrib.layers.embedding_column(BpClass, dimension=128), 
    tf.contrib.layers.embedding_column(BpType, dimension=128), 
    tf.contrib.layers.embedding_column(ConstructionType, dimension=128), 
    tf.contrib.layers.embedding_column(Region, dimension=128)]



  _get_logger().warn(msg, *args, **kwargs)


In [128]:
# model config setup for speed up (reference : tensorflow performance guide)
config = tf.ConfigProto()
config.intra_op_parallelism_threads = 32 # 8->16->32->64....
config.inter_op_parallelism_threads = 0 # 0으로 고정
_config = tf.contrib.learn.RunConfig(
    session_config=config, 
    save_checkpoints_secs=None, 
    save_checkpoints_steps=1000)

In [129]:
tot_model = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=model_dir, 
    linear_feature_columns=wide_columns, 
    dnn_feature_columns=deep_columns, 
    dnn_hidden_units= [1000, 500, 200], dnn_dropout = 0.5, dnn_optimizer= tf.train.AdamOptimizer,
    n_classes = len(iK['y_classes']), label_vocabulary=iK['y_classes'], batch_norm = True,
    config = _config)

INFO:tensorflow:Using config: {'_task_type': None, '_keep_checkpoint_max': 5, '_environment': 'local', '_eval_distribute': None, '_save_summary_steps': 100, '_task_id': 0, '_save_checkpoints_secs': None, '_session_config': intra_op_parallelism_threads: 32
, '_model_dir': './Trained_models/Models_WnD/m1031/', '_protocol': None, '_num_worker_replicas': 0, '_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7e501ba860>, '_num_ps_replicas': 0, '_train_distribute': None, '_log_step_count_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': 1000, '_evaluation_master': '', '_device_fn': None, '_tf_random_seed': None, '_is_chief': True, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
}


In [173]:
# https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier
p_result = tot_model.predict(input_fn=pred_input_fn, yield_single_examples=False)

In [167]:
true_label = []
for ii in range(len(predict_filenames)):
    print(predict_filenames[ii],'is in progress...')
    tmp = np.loadtxt(fname=predict_filenames[ii],delimiter=',',dtype='str')
    true_label.extend(tmp[:,-1].tolist())
print('process done')

./Datasets/dataset0.csv is in progress...
./Datasets/dataset5.csv is in progress...
./Datasets/dataset10.csv is in progress...
process done


In [174]:
p_result_final = []
for num, item in enumerate(p_result):
    if num%10000==0:
        print(num)
    if num>=len(true_label):
        break
    else:
        p_result_final.append(item['probabilities'])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./Trained_models/Models_WnD/m1031/model.ckpt-0


NotFoundError: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

Key dnn/hiddenlayer_2/batchnorm_2/beta not found in checkpoint
	 [[{{node save/RestoreV2}} = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]

Caused by op 'save/RestoreV2', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 345, in run_forever
    self._run_once()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 1312, in _run_once
    handle._run()
  File "/usr/lib/python3.5/asyncio/events.py", line 125, in _run
    self._callback(*self._args)
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2817, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2843, in _run_cell
    return runner(coro)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3018, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3183, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-174-0458ed75f68b>", line 2, in <module>
    for num, item in enumerate(p_result):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 567, in predict
    hooks=all_hooks) as mon_sess:
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
    stop_grace_period_secs=stop_grace_period_secs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
    self._sess = _RecoverableSession(self._coordinated_creator)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
    _WrappedSession.__init__(self, self._create_session())
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
    return self._sess_creator.create_session()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 800, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 557, in create_session
    self._scaffold.finalize()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 213, in finalize
    self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 878, in _get_saver_or_default
    saver = Saver(sharded=True, allow_empty=True)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1094, in __init__
    self.build()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1106, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1143, in _build
    build_save=build_save, build_restore=build_restore)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 781, in _build_internal
    restore_sequentially, reshape)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 459, in _AddShardedRestoreOps
    name="restore_shard"))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 406, in _AddRestoreOps
    restore_sequentially)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 854, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 1466, in restore_v2
    shape_and_slices=shape_and_slices, dtypes=dtypes, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3272, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1768, in __init__
    self._traceback = tf_stack.extract_stack()

NotFoundError (see above for traceback): Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

Key dnn/hiddenlayer_2/batchnorm_2/beta not found in checkpoint
	 [[{{node save/RestoreV2}} = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]


In [169]:
list_TopN=[3,5,10]
for ii in range(len(list_TopN)):
    TopN = list_TopN[ii]
    calc_rate = []
    for ii in range(len(true_label)):
        sort_idx = (np.argsort(np.argsort(p_result_final[ii]))>=(len(p_result_final[ii][0])-TopN)).reshape(-1)
        calc_rate.append(int(true_label[ii] in np.array(iK['y_classes'])[sort_idx].tolist()))

    print("적중률 : %.2f",%(np.mean(calc_rate)))
    print("[%d]개를 추천하면 그 중에 [%.2f]개를 구매함" %(TopN, np.mean(calc_rate)*TopN))

SyntaxError: invalid syntax (<ipython-input-169-041adab53335>, line 9)

In [None]:
# 181030 - m1026_iter56000모델로 예측한 결과------------------------
# 적중률 :  0.0817
# [3]개를 추천하면 그 중에 [0.25]개를 구매함
# 적중률 :  0.1241
# [5]개를 추천하면 그 중에 [0.62]개를 구매함
# 적중률 :  0.1966
# [10]개를 추천하면 그 중에 [1.97]개를 구매함
# 그러나 X에 대한 진짜 답이 p_result인지 아직 모른다.
# 맞다고 한다면 아직은 절망적인 성능을 보임.

# 181030 - m1029_iter12000모델로 예측한 결과------------------------
# 적중률 :  0.074
# [3]개를 추천하면 그 중에 [0.22]개를 구매함
# 적중률 :  0.1099
# [5]개를 추천하면 그 중에 [0.55]개를 구매함
# 적중률 :  0.1788
# [10]개를 추천하면 그 중에 [1.79]개를 구매함
# 그러나 X에 대한 진짜 답이 p_result인지 아직 모른다.
# 맞다고 한다면 아직은 절망적인 성능을 보임.

In [194]:
# # 정답 불러오기.
# eval_f= filename_split['eval_files']
# for ii in range(len(eval_f)):
#     print(ii, len(eval_f)-1)
#     tmp = np.loadtxt(fname=eval_f[ii],delimiter=',',dtype='str')
#     if ii == 0:
#         home1 = tmp.copy()
#     else:
#         home1 = np.concatenate((home1,tmp), axis =0)
# print(home1.shape)
# # 정상확인되면 나중에는 y값만 가져오도록 구정해도됨.