In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import math
import seaborn as sns
import warnings
import statsmodels.api as sm
import matplotlib.pyplot as plt
import itertools
import scipy.stats as st
from datetime import datetime, timedelta
warnings.filterwarnings("ignore")  # specify to ignore warning messages

sns.set(color_codes=True)

# NOTE(review): absolute, machine-specific paths; the notebook only runs where these files exist.
test = pd.read_csv('C:/Users/KIHyuk/Desktop/dacon_data/Data_상점매출/test.csv')
submission = pd.read_csv('C:/Users/KIHyuk/Desktop/dacon_data/Data_상점매출/submission.csv')

# Work on a copy so the raw `test` frame is left as loaded.
df_copy = test.copy()
df_copy['date'] = pd.to_datetime(df_copy['date'])

# Combine the calendar date with the time-of-day string into one timestamp per row.
df_copy['date'] = pd.to_datetime(
    df_copy['date'].astype(str) + " " + df_copy['time'],
    format='%Y-%m-%d %H:%M:%S',
)

In [2]:
# Display the prepared transaction frame; `date` now holds the combined date + time timestamp.
df_copy

Unnamed: 0,store_id,date,time,card_id,amount,installments,days_of_week,holyday
0,0,2016-08-01 00:28:15,00:28:15,bf33518373,125,,0,0
1,0,2016-08-01 01:09:58,01:09:58,7a19a3a92f,90,,0,0
2,0,2016-08-01 01:47:24,01:47:24,6f9fd7e241,150,,0,0
3,0,2016-08-01 17:54:43,17:54:43,8bcf1d61b2,362,,0,0
4,0,2016-08-01 18:48:53,18:48:53,6a722ce674,125,,0,0
...,...,...,...,...,...,...,...,...
473387,199,2018-03-30 14:17:59,14:17:59,300d7bc922,65,,4,0
473388,199,2018-03-30 19:01:54,19:01:54,3ab757718b,65,,4,0
473389,199,2018-03-30 20:08:03,20:08:03,2d8e9e421c,65,,4,0
473390,199,2018-03-30 20:11:58,20:11:58,22daeb334e,200,,4,0


In [3]:
# Net cancellations (negative amounts) against earlier purchases.
# Without this step a store's daily sales total can come out negative.

def reduce_noise_by_removing_neg_vals(df_copy):
    """Net every cancellation (negative amount) against one earlier purchase.

    For each negative-amount row, the remaining positive rows are searched for
    purchases with the same ``store_id`` and ``card_id``, dated at or before
    the cancellation, whose amount is at least the cancelled amount:

    * if a purchase of exactly the cancelled amount exists, the most recent
      such purchase is dropped (full refund);
    * otherwise the most recent larger purchase is reduced by the cancelled
      amount (partial refund);
    * if nothing qualifies, the cancellation is ignored.

    Exactly one purchase is consumed per cancellation, even when several
    candidates share the same timestamp.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Needs ``store_id``, ``card_id``, ``date`` and ``amount`` columns.
        Index labels are assumed to be unique. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The positive-amount rows after netting; zero- and negative-amount
        rows are not part of the result.
    """
    # Explicit copy: df_pos is edited below and must not alias the input.
    df_pos = df_copy[df_copy['amount'] > 0].copy()
    df_neg = df_copy[df_copy['amount'] < 0]

    # Read fields by column name so the logic does not depend on column order.
    refunds = zip(df_neg['store_id'], df_neg['card_id'], df_neg['date'], df_neg['amount'])
    for store_i, card_i, date_i, amt_i in refunds:
        refund_abs = abs(amt_i)
        candidates = df_pos[
            (df_pos['store_id'] == store_i)
            & (df_pos['card_id'] == card_i)
            & (df_pos['amount'] >= refund_abs)
            & (df_pos['date'] <= date_i)
        ]
        if candidates.empty:
            # No earlier purchase can absorb this cancellation.
            continue

        exact = candidates[candidates['amount'] == refund_abs]
        pool = candidates if exact.empty else exact
        # Most recent purchase in the pool; on a timestamp tie take a single
        # row so one cancellation never offsets more than one purchase.
        target = pool.index[pool['date'] == pool['date'].max()][-1]

        if exact.empty:
            # Partial refund: keep the purchase, reduced by the refunded amount.
            df_pos.at[target, 'amount'] += amt_i
        else:
            # Full refund: the purchase disappears entirely.
            df_pos = df_pos.drop(index=target)
    return df_pos

# Net cancellations against earlier purchases; the result keeps positive-amount rows only.
df_pos = reduce_noise_by_removing_neg_vals(df_copy)

In [5]:
def adf_test(y): # stationary test function
    """Run the Augmented Dickey-Fuller test on `y` and print a labelled summary.

    Prints a header line followed by a Series holding the test statistic,
    p-value, number of lags, number of observations and one entry per
    critical value. Returns None.
    """
    print('Results of Augmented Dickey-Fuller test:')
    statistic, p_value, used_lags, n_obs, critical_values = adfuller(y, autolag='AIC')[:5]
    labels = ['test statistic', 'p-value', '# of lags', '# of observations']
    summary = pd.Series([statistic, p_value, used_lags, n_obs], index=labels)
    # Append one row per significance level reported by adfuller.
    for level, threshold in critical_values.items():
        summary['Critical Value ({})'.format(level)] = threshold
    print(summary)

In [6]:
def ts_diagnostics(y, lags=None, title='', filename='', window=2):
    '''
    Plot diagnostics for a time series and print an Augmented Dickey-Fuller test.

    Draws a 3x2 figure: the series with its rolling mean and rolling standard
    deviation (top row, spanning both columns), ACF and PACF (middle row), and
    a QQ plot plus a histogram (bottom row). After showing the figure, prints
    the ADF test statistic, p-value, lag count, observation count and
    critical values.

    Parameters
    ----------
    y : pandas.Series or array-like
        Series to analyse; wrapped in a ``pandas.Series`` if it is not one.
    lags : optional
        Passed through unchanged to ``plot_acf`` and ``plot_pacf``.
    title : str
        Title of the time-series panel.
    filename : str
        Accepted for backward compatibility; not used by this function.
    window : int
        Number of observations in the rolling mean / std window (default 2).

    Returns
    -------
    None
    '''
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    # Rolling statistics over `window` consecutive observations.
    rolling_mean = y.rolling(window=window).mean()
    rolling_std = y.rolling(window=window).std()

    plt.figure(figsize=(14, 12))
    layout = (3, 2)
    ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
    acf_ax = plt.subplot2grid(layout, (1, 0))
    pacf_ax = plt.subplot2grid(layout, (1, 1))
    qq_ax = plt.subplot2grid(layout, (2, 0))
    hist_ax = plt.subplot2grid(layout, (2, 1))

    # time series plot
    y.plot(ax=ts_ax, label='observed')
    rolling_mean.plot(ax=ts_ax, color='crimson',
                      label='rolling mean (window={})'.format(window))
    rolling_std.plot(ax=ts_ax, color='darkslateblue',
                     label='rolling std (window={})'.format(window))
    # Attach the legend to the time-series axes explicitly: plt.legend() would
    # act on the current axes, which is the last panel created (the histogram).
    ts_ax.legend(loc='best')
    ts_ax.set_title(title, fontsize=24)

    # acf and pacf
    # NOTE(review): in statsmodels `alpha` is the significance level of the
    # confidence band, so 0.5 draws 50% intervals; confirm this is intended
    # rather than the conventional 0.05.
    plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
    plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)

    # qq plot
    sm.qqplot(y, line='s', ax=qq_ax)
    qq_ax.set_title('QQ Plot')

    # hist plot
    y.plot(ax=hist_ax, kind='hist', bins=25)
    hist_ax.set_title('Histogram')
    plt.tight_layout()
    plt.show()

    # perform Augmented Dickey Fuller test
    print('Results of Dickey-Fuller test:')
    dftest = adfuller(y, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['test statistic', 'p-value', '# of lags', '# of observations'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

# Aggregate the cleaned transactions per (date, store_id) pair.
# NOTE(review): `date` carries the time of day at this point, so rows are grouped
# per exact timestamp rather than per calendar day; confirm that is intended.
df = df_pos.copy()
# Select the value columns with a list: indexing a GroupBy with a bare tuple
# ('amount', 'holyday') is deprecated and raises on current pandas.
test_groupby_date_store = df.groupby(['date', 'store_id'])[['amount', 'holyday']].sum()
test_groupby_date_store = test_groupby_date_store.reset_index()

# Index by timestamp while keeping store_id as a regular column.
test_groupby_date_store = test_groupby_date_store.set_index('date')
store_list = test_groupby_date_store.store_id.unique()

# unique() returns values in order of appearance; sort them in place.
store_list.sort()