## import and load

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import notebook
import datetime

#한글 깨짐방지
plt.rc('font',family = 'Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

In [2]:
path = './data/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
submission  = pd.read_csv(path + 'submission.csv')

## data setting

In [3]:
#time, installment, days_of_week, card_id, holyday는 필요없다 생각하여 글쓴이는 제거
train = train.drop(columns=['time', 'installments', 'days_of_week', 'card_id', 'holyday'])
test = test.drop(columns=['time', 'installments', 'days_of_week', 'card_id', 'holyday'])

In [4]:
# 'date','store_id'로 결합
train = train.groupby(['date','store_id']).agg({'amount':'sum'}).reset_index()
test = test.groupby(['date','store_id']).agg({'amount':'sum'}).reset_index()

In [5]:
#이러한 형태로 변환됨
train.head(1)

Unnamed: 0,date,store_id,amount
0,2016-08-01,2,2372


In [6]:
#date전환
train['date'] = pd.to_datetime(train['date'],infer_datetime_format= True)
test['date'] = pd.to_datetime(test['date'],infer_datetime_format= True)

In [7]:
# 나중을 위한 복제
train['temp_date'] = train['date']
test['temp_date'] = test['date']

In [8]:
# 날짜를 인덱스로 설정
train.set_index("date",inplace=True)
test.set_index("date",inplace=True)

### 일부제거

#### train
1. 거래기록이 160일 이하인 경우 삭제 -> 대회 목적이 100일 이후를 예측하는 것이므로 60일은 train(X), 100일은 predict(y)에 사용
    * 160일이 가장 이상적이라고 고려함
2. 2018-07-31 주변 5일 내 거래가 없으면 폐업으로 추정, 2018-07-31은 꼭 있어야하는 데이터

In [9]:
counter = 0
limit = 160

print("Before removing stores (due to limit): ", train.shape)

#160 이상이면 카운트를 올리고, 아니라면 제외함
for store_id in notebook.tqdm(range(train['store_id'].max()+1)):
    if train.loc[train['store_id'] == store_id, 'store_id' ].count() <= limit:
        counter += 1
    else :
        train = train[train.store_id !=store_id]

print(f"Total # of stores that exceeds {limit} is {counter}")
print("After removing stores (due to limit): ", train.shape)

Before removing stores (due to limit):  (480160, 3)


HBox(children=(FloatProgress(value=0.0, max=1800.0), HTML(value='')))


Total # of stores that exceeds 160 is 806
After removing stores (due to limit):  (47969, 3)


In [10]:
#  2018-07-31 주변 5일 내 거래가 없으면 폐업으로 추정, 2018-07-31은 꼭 있어야하는 데이터
def keep_alive_store(df):
    store_id_list = df.store_id.unique()
    yes, no = 0, 0
    
    t2 = datetime.datetime.strptime('2018-07-31', '%Y-%m-%d')
    
    for store_id in store_id_list:
        if df.loc[df.store_id == store_id,'temp_date'].iloc[-1] == t2:
            yes += 1
        else:
            t1 = df.loc[df.store_id == store_id,'temp_date'].iloc[-1]
            difference = t2 - t1
            if difference.days <=5:
                yes += 1
            else :
                no += 1
                df = df[df.store_id != store_id]
    print(f"# of train store open/out of business: {yes}, {no}")
    return df

In [11]:
train = keep_alive_store(train)

# of train store open/out of business: 399, 382


#### test
1. 2018-03-31 주변 7일 거래기록이 없으면 폐업으로 가정 
  - 197번 매장을 폐업으로 추정

In [15]:
store_id_list = test.store_id.unique()
yes, no = 0, 0
closed_test_store = []
t2 = datetime.datetime.strptime('2018-03-01',"%Y-%m-%d")

for store_id in store_id_list:    
    t1 = test[test.store_id == store_id]['temp_date'].iloc[-1]
    differ = t2 - t1
    
    if differ.days <= 7 :
        yes += 1 
    else :
        no += 1
        print(test[test.store_id == store_id].iloc[-1]['temp_date'])
        closed_test_store.append(store_id)
print(f"# of test store open/out of business: {yes}, {no}")

2018-01-17 00:00:00
# of test store open/out of business: 199, 1


### reform

In [32]:
def reform_data(df, isTrain):
    store_id_list = df.store_id.unique()
    x_array = []
    for store_id in store_id_list:
        store = df[df.store_id == store_id]
        store = store.asfreq('D',fill_value = 0) #일자별로 열추가 
        store['temp_date'] = store.index
        store['store_id'] = store_id
        
        # 7일, 15일, 30일간의 이동평균 계산 
        store['MA7'] = store['amount'].rolling('7D').mean()
        store['MA15'] = store['amount'].rolling('15D').mean()
        store['MA30'] = store['amount'].rolling('30D').mean()
        
        # train set인 경우 마지막 100일은 y값으로 저장되어야함
        # store_x는 100일전까지 기간 데이터 
        if isTrain:
            store_y = store.last("100D")
            y = store_y.amount.sum()
            store_x = store[store.temp_date < store_y.iloc[0].temp_date]
        #test는 전체
        else : 
            y = 0
            store_x = store[:]
            
        new_data = []
        new_data.append(store_id)
        new_data.append(y)
        
        new_data.append(store_x.amount.mean())
        new_data.append(store_x.amount.median())
        
        new_data.append(store_x.last('7D').amount.mean())
        new_data.append(store_x.last('15D').amount.mean())
        new_data.append(store_x.last('30D').amount.mean())
        
        new_data.append(store_x.last('7D').amount.median())
        new_data.append(store_x.last('15D').amount.median())
        new_data.append(store_x.last('30D').amount.median())
        
        new_data.append(store_x.last('7D').amount.sum())
        new_data.append(store_x.last('15D').amount.sum())
        new_data.append(store_x.last('30D').amount.sum())
        
        new_data.append(store_x.last('7D').MA7.mean())
        new_data.append(store_x.last('15D').MA7.mean())
        new_data.append(store_x.last('30D').MA7.mean())
        
        new_data.append(store_x.last('7D').MA15.mean())
        new_data.append(store_x.last('15D').MA15.mean())
        new_data.append(store_x.last('30D').MA15.mean())
        
        new_data.append(store_x.last('7D').MA30.mean())
        new_data.append(store_x.last('15D').MA30.mean())
        new_data.append(store_x.last('30D').MA30.mean())
        
        x_array.append(new_data)
    
    return x_array

In [33]:
reformed_train = reform_data(train, True) # train data with new values
reformed_test = reform_data(test, False) # test data with new values

In [37]:
#컬럼이름 부여
train_fin = pd.DataFrame(reformed_train, columns=['store_id', 'y', 'mean', 'median', '7mean', '15mean', '30mean', 
                                         '7median', '15median', '30median',  '7sum', '15sum', '30sum', 
                                         '7ma7mean', '15ma7mean', '30ma7mean',  '7ma15mean', '15ma15mean',
                                         '30ma15mean',  '7ma30mean', '15ma30mean', '30ma30mean'])
test_fin = pd.DataFrame(reformed_test, columns=['store_id', 'y', 'mean', 'median', '7mean', '15mean', '30mean', 
                                         '7median', '15median', '30median',  '7sum', '15sum', '30sum', 
                                         '7ma7mean', '15ma7mean', '30ma7mean',  '7ma15mean', '15ma15mean',
                                         '30ma15mean',  '7ma30mean', '15ma30mean', '30ma30mean'])

In [44]:
train_fin.shape, test_fin.shape

((399, 22), (200, 22))

### modeling
* xgboost를 활용, test분할은 0.1aks

In [35]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

  config.update(yaml.load(text) or {})


In [42]:
col = [i for i in train_fin.columns if i not in ['store_id','y']]
y = train_fin['y'].values

In [43]:
train_X, test_X, train_y, test_y = train_test_split(train_fin[col],y, test_size=0.1, random_state=2018)

In [68]:
def XGB_regressor(train_X, train_y, test_X, test_y, metric, feature_name = None, seed_val = 2018, num_rounds = 30):
    params = {'objective' : 'reg:squarederror',
              'eta' : 0.05,
              'max_depth' : 10,
              'eval_metric': metric,#??? rmse가 아니고??
              'min_child_weight': 1,
              'subsample': 0.7,
              'colsample_bytree' : 0.7,
              'seed': seed_val}
    plst = list(params.items())
    
    xgtrain = xgb.DMatrix(train_X, label = train_y)
    
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label = test_y)
        watchlist = [ (xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds= 300,verbose_eval= 10)
    else :
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
        
    return model

In [69]:
model = XGB_regressor(train_X, train_y, test_X, test_y, 'mae')

[0]	train-mae:31435.27539	test-mae:31416.20703
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 300 rounds.
[10]	train-mae:21438.35352	test-mae:20587.20703
[20]	train-mae:16275.93750	test-mae:16815.99414
[29]	train-mae:13466.90234	test-mae:16067.39551


In [70]:
model = XGB_regressor(train_X, train_y, test_X, test_y, 'rmse')

[0]	train-rmse:59720.36328	test-rmse:44749.51562
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 300 rounds.
[10]	train-rmse:50741.58594	test-rmse:34072.02734
[20]	train-rmse:46506.29297	test-rmse:29934.97852
[29]	train-rmse:44591.16406	test-rmse:28305.98047


In [72]:
##??? sorting을 이제한다고???
test_fin = test_fin.sort_values(by='store_id')

In [75]:
#몇가지 조정을 더 거친후 제출할 답변임
y_pred = model.predict(xgb.DMatrix(test_fin[col]), ntree_limit = model.best_ntree_limit)

In [81]:
score = 28305.98047

In [83]:
# 이상해서 한번 제출해봄 -> 오버피팅 과도하게되고 결과 안 좋음

score = 28305.98047
submission['total_sales'] = y_pred

today = datetime.datetime.today().date()
os.makedirs('./submission/',exist_ok=True)
submission.to_csv(f'./submission/submission_{today}_xgb_{score}.csv',index=False)

### 추가조정
* 몇몇 상점은 2~3월에 폐업했을 것임

In [84]:
store_id_list = test.store_id.unique()
store_id_list.sort()

In [87]:
feb_march = []
for store_id in store_id_list:
    mini = test[test.store_id == store_id]
    march = mini[mini.temp_date <= datetime.datetime(2018,3,1)]['temp_date'].count()
    feb = mini[mini.temp_date <= datetime.datetime(2018,2,1)]['temp_date'].count() - march # 1월은???
    feb_march.append((march+feb)/2/31) #???

In [92]:
#2-3월 확인 전에 이전에 발견한 폐점한 곳 0으로 적용
for c in closed_test_store:
    y_pred[c] = 0

In [None]:
# At last the 'y_test' predicted from the trained model is adjusted with 'feb_march'.
#
# The last number 0.72 is an optimized number variable to make sure that no prediction exceeds the answer.
# This is required as penalties are given.

In [93]:
for x in range(200):
    y_pred[x] = y_pred[x] * feb_march[x] * 0.72

In [95]:
# ??? 성적 구린데?? 뭐지?? 다시해봐야하나

score = 28305.98047
submission['total_sales'] = y_pred

today = datetime.datetime.today().date()
os.makedirs('./submission/',exist_ok=True)
submission.to_csv(f'./submission/submission_{today}_xgb_{score}_adjust.csv',index=False)