In [3]:
import numpy as np
import pandas as pd

In [4]:
import warnings
# Suppress every warning for the rest of the kernel session. This also hides
# deprecation warnings from pandas/numpy, so re-enable them while debugging.
warnings.filterwarnings(action='ignore') # switch to action='default' to show warnings again

In [5]:
# Score tables consumed by preprocessing(), which reads them by position:
# [0] mid-category scores, [1] brand scores, [2] limited-expression scores.
# NOTE: read_pickle unpickles the files, so only load artifacts from a trusted source.
score_lists = [
    pd.read_pickle(pickle_name)
    for pickle_name in ('mid_cat_score.pkl', 'brand_score.pkl', 'limit_score.pkl')
]

# Keep the individual names available for interactive inspection.
mid_cat_score, brand_score, limit_score = score_lists

In [14]:
def preprocessing(input_df,score_lists,cat_encoder):
    """Build the model feature frame from a raw input frame and encode it.

    Steps, in order: strip the last six characters from every column name,
    map booleans to 1/0, derive calendar and cyclical time features, summarise
    the per-row price lists, pick the three best-scoring mid-categories /
    brands / limited expressions per row, drop the raw columns, and finally
    run the frame through ``cat_encoder.transform``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw frame. After the six-character suffix is removed it must expose
        the columns ``pgm``, ``showhost``, ``date``, ``start_time``,
        ``end_time``, ``price``, ``midcat``, ``brand`` and ``expression``.
        ``price``, ``midcat``, ``brand`` and ``expression`` hold one non-empty
        list per row. The frame is NOT modified; a copy is used.
    score_lists : sequence of pandas.DataFrame
        Three four-column score tables, read by position: [0] keyed by
        '상품중분류명', [1] by '브랜드명', [2] by '한정표현구분'. Each must contain a
        '방송등장횟수' column and have the score as its last column.
    cat_encoder : object with a ``transform(DataFrame)`` method
        Applied to the finished feature frame.

    Returns
    -------
    Whatever ``cat_encoder.transform`` returns for the feature frame.

    Raises
    ------
    IndexError
        If a row's ``midcat`` / ``brand`` / ``expression`` list is empty.
    ValueError
        Propagated from ``cat_encoder.transform`` — presumably when the number
        of columns produced here differs from what the encoder was fitted on
        (TODO confirm against the encoder's training frame).

    NOTE(review): earlier revisions tested membership against the score
    table's index instead of its values, which could place the same value in
    two top-N slots. If the training features were generated with that
    behaviour, regenerate them with this version before comparing.
    """
    # Work on a copy so the caller's frame keeps its original column names;
    # renaming in place would strip six more characters on every repeated call.
    df = input_df.copy()

    df.columns = [col[:-6] for col in df.columns]
    df = df.replace({True:1, False:0})
    df['pgm'] = df['pgm'].astype('str')
    # str(list) looks like "['a', 'b']": trim the outer "['" / "']" and drop the
    # remaining quotes so the hosts read "a, b".
    df['showhost'] = df['showhost'].apply(lambda x: str(x)[2:-2].replace("'",''))

    # Calendar parts of the broadcast date.
    df['year'] = df['date'].apply(lambda x: x.year)
    df['month'] = df['date'].apply(lambda x: x.month)
    df['day'] = df['date'].apply(lambda x: x.day)
    df['weekday'] = df['date'].apply(lambda x: x.weekday())

    # Cyclical (sin/cos) encoding; the divisor is the period of each quantity
    # (start_time / end_time are assumed to be hours of the day — TODO confirm).
    for name, period in (('month',12),('weekday',7),('start_time',24),('end_time',24)):
        angle = 2*np.pi*df[name]/period
        df[name+'_sin'] = np.sin(angle)
        df[name+'_cos'] = np.cos(angle)

    # Each price cell is a list of prices; keep only summary statistics.
    df['price_min'] = df['price'].apply(lambda x: min(x))
    df['price_max'] = df['price'].apply(lambda x: max(x))
    df['price_mean'] = df['price'].apply(lambda x: np.mean(x))


    def make_top_col(brand_list,score_df,col_name):
        """Return the distinct values of ``brand_list`` ordered best score first.

        Values missing from ``score_df[col_name]`` are ranked with zero counts
        and the mean of the score (last) column.
        """
        # De-duplicate while keeping input order so ties resolve reproducibly.
        wanted = list(dict.fromkeys(brand_list))
        # Compare against the column VALUES; `x in series` would test the index.
        known = set(score_df[col_name])
        unseen = [value for value in wanted if value not in known]

        ranked = score_df
        if unseen:
            fallback_score = score_df[score_df.columns[-1]].mean()
            filler = pd.DataFrame([[value,0,0,fallback_score] for value in unseen],
                                  columns=score_df.columns)
            # One concat instead of DataFrame.append per value (removed in pandas 2.0).
            ranked = pd.concat([score_df, filler], ignore_index=True)

        ranked = ranked.sort_values(by=[ranked.columns[-1],'방송등장횟수'],ascending=False).reset_index(drop=True)
        return list(ranked[ranked[col_name].isin(wanted)][col_name])


    # (list-valued source column, its score table, key column of that table)
    ranking_specs = (
        ('midcat', score_lists[0], '상품중분류명'),
        ('brand', score_lists[1], '브랜드명'),
        ('expression', score_lists[2], '한정표현구분'),
    )
    for source_col, score_df, key_col in ranking_specs:
        top_col = 'top_' + source_col
        df[top_col] = df[source_col].apply(lambda a: make_top_col(a,score_df,key_col))
        # Rows with fewer than two / three distinct values repeat the best one.
        df[source_col+'1'] = df[top_col].apply(lambda a: a[0])
        df[source_col+'2'] = df[top_col].apply(lambda a: a[1] if len(a)>=2 else a[0])
        df[source_col+'3'] = df[top_col].apply(lambda a: a[2] if len(a)>=3 else a[0])

    # Raw and intermediate columns that must not reach the encoder.
    drop_cols = ['date','year','month','weekday','start_time','end_time',
                 'midcat','brand','expression','price','top_midcat','top_brand','top_expression']
    df = df.drop(drop_cols, axis=1)

    df = cat_encoder.transform(df)

    return df

In [15]:
# Example input frame; it is passed to preprocessing() further down as a smoke test.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
test_df = pd.read_pickle('/Users/beomso0/Documents/GH/Hyundai/deploy_st/df_example.pkl')

In [16]:
import joblib
# Encoder used below via ce.transform(...) and handed to preprocessing() as cat_encoder.
# NOTE(review): absolute, machine-specific path; joblib.load unpickles the file, so only
# load artifacts from a trusted source.
ce = joblib.load('/Users/beomso0/Documents/GH/Hyundai/deploy_st/model/cat_encoder.pkl')

In [20]:
# Feature frame loaded from model/X_train.pkl; the following cells run it through ce.transform(...).
# NOTE(review): absolute, machine-specific path.
X_train = joblib.load('/Users/beomso0/Documents/GH/Hyundai/deploy_st/model/X_train.pkl')

In [24]:
# Encode X_train with the loaded encoder and persist the result to the working directory.
joblib.dump(ce.transform(X_train), 'X_train_encoded.pkl')

['X_train_encoded.pkl']

In [26]:
# Display the encoded frame to inspect the columns the encoder works with.
# NOTE(review): this recomputes the transform already performed for the joblib.dump cell.
ce.transform(X_train)

Unnamed: 0,PGM코드,상품구분,쇼호스트명,방송길이,기온,주말제외공휴일,MONTH_SIN,MONTH_COS,WEEKDAY_SIN,WEEKDAY_COS,...,상품판매가_mean,상품중분류1,상품중분류2,상품중분류3,브랜드1,브랜드2,브랜드3,한정표현1,한정표현2,한정표현3
0,214593,1,0.319366,75.0,-4.0,1,0.5,0.866025,0.974928,-0.222521,...,7.900000e+04,-0.283239,-0.173159,-0.263667,-2.697490e-01,-2.419711e-01,-2.697490e-01,1.321259,1.302166,1.315214
1,207666,1,0.583478,60.0,-4.0,1,0.5,0.866025,0.974928,-0.222521,...,3.990000e+05,0.097698,0.103472,0.090846,5.963074e-01,5.963074e-01,5.963074e-01,0.417602,0.475696,0.427348
2,214169,1,0.880863,130.0,0.0,1,0.5,0.866025,0.974928,-0.222521,...,1.829800e+05,0.520538,0.959223,-0.312095,6.566157e-01,6.273121e-02,1.734912e-01,1.321259,1.302166,1.315214
3,215079,1,0.460996,75.0,0.0,1,0.5,0.866025,0.974928,-0.222521,...,7.990000e+04,0.308409,0.288156,0.280859,1.074697e+00,1.074697e+00,1.074697e+00,-0.312687,-0.312687,-0.312687
4,210587,1,0.143426,60.0,2.0,1,0.5,0.866025,0.974928,-0.222521,...,2.464625e+06,-0.244164,-0.200151,-0.159222,-1.115078e-01,-5.780563e-02,-1.115078e-01,0.417602,0.475696,0.427348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35349,223752,1,0.307739,60.0,2.0,1,0.5,0.866025,0.000000,1.000000,...,5.390000e+05,0.097698,0.103472,0.090846,7.332021e-16,7.332021e-16,7.332021e-16,-0.312687,-0.312687,-0.312687
35350,218842,1,-0.005757,65.0,2.0,1,0.5,0.866025,0.000000,1.000000,...,8.450000e+04,0.306895,0.251360,0.257683,5.231001e-01,5.193873e-01,5.231001e-01,-0.312687,-0.312687,-0.312687
35351,223755,1,-0.238707,60.0,1.0,1,0.5,0.866025,0.000000,1.000000,...,2.646667e+05,-0.015643,0.015940,0.004168,-8.015341e-01,-8.015341e-01,-8.015341e-01,0.417602,0.475696,0.427348
35352,207938,1,0.042303,70.0,1.0,1,0.5,0.866025,0.000000,1.000000,...,2.091500e+06,-0.472861,-0.404565,-0.459613,-2.657121e-01,-2.289412e-01,-2.571353e-01,-0.312687,-0.312687,-0.312687


In [17]:
# Smoke-test the preprocessing pipeline on the example frame.
# NOTE(review): per this cell's recorded output the call currently raises
# "ValueError: Unexpected input dimension 27, expected 32" — presumably from the
# encoder, i.e. the frame handed to ce.transform has 27 columns where 32 are expected.
preprocessing(test_df, score_lists, ce)

ValueError: Unexpected input dimension 27, expected 32