In [2]:
# 기본
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.externals import joblib 
from sklearn.metrics import make_scorer

# 시계열
# from fbprophet import Prophet
from datetime import datetime as dt
from statsmodels.tsa.arima_model import ARIMA
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.api import SimpleExpSmoothing, Holt, ExponentialSmoothing

# 회귀분석
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Deep Neural Network
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import time

# 설정
%matplotlib inline

Using TensorFlow backend.


In [3]:
# 평가산식

def mae(prediction, correct):
    """Return the mean absolute error between two array-likes.

    Parameters
    ----------
    prediction : array-like
        Predicted values.
    correct : array-like
        Ground-truth values; must be broadcastable against ``prediction``.

    Returns
    -------
    numpy floating scalar
        Mean of ``|correct - prediction|`` over every element.
    """
    # Both inputs are converted to ndarrays so plain lists are accepted too.
    return np.abs(np.array(correct) - np.array(prediction)).mean()

# make_scorer() treats the metric as "higher is better" unless told otherwise.
# MAE is an error, so it is declared as a loss: scikit-learn negates the value
# internally, which makes GridSearchCV / cross_val_score prefer the candidate
# with the smallest MAE. Scores reported through this scorer are therefore <= 0.
mae_scorer = make_scorer(mae, greater_is_better=False)
mae_scorer

make_scorer(mae)

In [95]:
# Single place to point the notebook at the competition data; edit only this
# line when the files live somewhere else.
DATA_DIR = 'C:/Users/KIHyuk/Desktop/dacon_data/Data_펀다'

df_train = pd.read_csv(DATA_DIR + '/funda_train.csv')
df_sub = pd.read_csv(DATA_DIR + '/submission.csv')

# Parse the transaction date and use it as the index so the frame can be
# resampled by calendar day.
df_train['transacted_date'] = pd.to_datetime(df_train['transacted_date'])
df_train = df_train.set_index('transacted_date')

print(df_train.shape)

(6556613, 8)


In [96]:
# Work on a single store (store_id == 1) while prototyping the daily aggregation.
df_num = df_train.loc[df_train['store_id'] == 1]

In [97]:
# Cards that paid more than twice at this store are treated as regulars.
# The value_counts() Series is filtered directly so the result does not depend
# on the column names that reset_index() would generate.
visits_per_card = df_num['card_id'].value_counts()
revisit_idx = visits_per_card[visits_per_card > 2].index.values

# Daily number of payments made by regular cards.
revisit_ct = (
    df_num.loc[df_num['card_id'].isin(revisit_idx), 'card_id']
    .resample(rule='D')
    .count()
    .rename('num_of_visit')
)
# Daily number of payments overall.
count_cols = df_num['card_id'].resample(rule='D').count().rename('num_of_pay')
# Daily totals of installment months and sales amount.
sum_cols = df_num[['installment_term', 'amount']].resample(rule='D').sum()

In [98]:
# Put the three daily series/frames side by side, aligned on the date index.
df_num_day = pd.concat(
    [count_cols, revisit_ct, sum_cols],
    axis='columns',
)

In [99]:
# Rows of store 1, looked up once and reused for both constant columns.
store_rows = df_num[df_num.store_id == 1]

df_num_day.insert(0, 'store_id', 1)  # add the store_id column
df_num_day.insert(4, 'region', store_rows.region.unique()[0])  # add the region
df_num_day.insert(5, 'type_of_business', store_rows.type_of_business.unique()[0])  # add the business type

In [100]:
# Start from an empty frame that the per-store daily frames are appended to.
df_day = pd.DataFrame()

In [101]:
# Append the store's daily rows below what is already in df_day.
# Note: df_day is rebound to the result, so re-running this cell appends the
# same rows again.
df_day = pd.concat([df_day, df_num_day])

In [102]:
# Calendar features derived from the date index (Monday == 0 ... Sunday == 6).
df_day.insert(1, 'day_of_week', df_day.index.dayofweek)
# Weekday -> 1, weekend -> 0. Monday-Friday are 0-4, so one comparison covers
# every weekday without needing a per-day lookup table.
df_day.insert(2, 'business_day', (df_day.index.dayofweek < 5).astype('int64'))
# Days without any regular-customer payment have no count; treat them as 0.
# Assigning the result back (instead of an in-place fillna on an attribute
# access) guarantees the frame itself is updated.
df_day['num_of_visit'] = df_day['num_of_visit'].fillna(0)

In [103]:
# Display the daily frame assembled for store 1 (one row per calendar day).
df_day

Unnamed: 0_level_0,store_id,day_of_week,business_day,num_of_pay,num_of_visit,installment_term,region,type_of_business,amount
transacted_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-06-01,1,2,1,6,5,0,,,6500.000000
2016-06-02,1,3,1,8,6,0,,,8357.142857
2016-06-03,1,4,1,4,2,0,,,5500.000000
2016-06-04,1,5,0,0,0,0,,,0.000000
2016-06-05,1,6,0,0,0,0,,,0.000000
...,...,...,...,...,...,...,...,...,...
2019-02-24,1,6,0,0,0,0,,,0.000000
2019-02-25,1,0,1,7,6,0,,,3928.571429
2019-02-26,1,1,1,6,2,0,,,5785.714286
2019-02-27,1,2,1,7,6,0,,,6071.428571


In [None]:
def resample_day(train_df):
    """Aggregate transaction rows into one row per store per calendar day.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Transaction-level data indexed by a ``DatetimeIndex`` (required by
        ``resample``) with the columns ``store_id``, ``card_id``,
        ``installment_term``, ``amount``, ``region`` and ``type_of_business``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by day (dates repeat across stores) with the columns, in order:
        ``store_id``, ``day_of_week``, ``business_day``, ``num_of_pay``,
        ``num_of_revisit``, ``installment_term``, ``region``,
        ``type_of_business``, ``amount``. An input with no rows yields an
        empty frame with these columns.
    """
    output_columns = [
        'store_id', 'day_of_week', 'business_day', 'num_of_pay', 'num_of_revisit',
        'installment_term', 'region', 'type_of_business', 'amount',
    ]

    daily_frames = []
    # sort=False keeps the stores in order of first appearance, the same order
    # that iterating over store_id.unique() gives, while splitting the frame
    # once instead of re-filtering it for every store.
    for store_id, df_num in train_df.groupby('store_id', sort=False):
        # Number of payments per day.
        count_cols = df_num['card_id'].resample(rule='D').count().rename('num_of_pay')

        # A card that paid more than twice at this store counts as a regular.
        # Filtering the value_counts() Series directly avoids relying on the
        # column names reset_index() generates.
        visits_per_card = df_num['card_id'].value_counts()
        revisit_idx = visits_per_card[visits_per_card > 2].index
        # Number of payments per day made by those regular cards.
        revisit_ct = (
            df_num.loc[df_num['card_id'].isin(revisit_idx), 'card_id']
            .resample(rule='D')
            .count()
            .rename('num_of_revisit')
        )

        # Installment months and sales amount are summed per day.
        sum_cols = df_num[['installment_term', 'amount']].resample(rule='D').sum()

        # Daily payments / daily regular payments / daily installment months and amount.
        df_num_day = pd.concat([count_cols, revisit_ct, sum_cols], axis=1)

        df_num_day.insert(0, 'store_id', store_id)  # add the store_id column
        # The first value seen for the store is used for both constant columns.
        df_num_day.insert(4, 'region', df_num['region'].iloc[0])  # add the region
        df_num_day.insert(5, 'type_of_business', df_num['type_of_business'].iloc[0])  # add the business type

        daily_frames.append(df_num_day)

    if not daily_frames:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once at the end; growing a frame inside the loop copies all
    # accumulated rows on every iteration.
    df_day = pd.concat(daily_frames, axis=0)

    day_of_week = df_day.index.dayofweek  # Monday == 0 ... Sunday == 6
    df_day.insert(1, 'day_of_week', day_of_week)
    # Weekday -> 1, weekend -> 0.
    df_day.insert(2, 'business_day', (day_of_week < 5).astype('int64'))
    # Days with no regular-customer payment have no count; treat them as 0.
    # fillna returns a new frame, so no chained in-place update is involved.
    df_day = df_day.fillna({'num_of_revisit': 0})

    return df_day