## import / load data

In [20]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm_notebook

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm
import xgboost

In [2]:
path = './data/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
submission  = pd.read_csv(path + 'submission.csv')

## Data setting

In [3]:
# train/test 결합
train['test'] = 0
test['test'] = 1
merge_df = pd.concat([train,test])

In [4]:
# 날짜
merge_df['date'] = pd.to_datetime(merge_df.date)
merge_df['year'] = merge_df['date'].apply(lambda x : x.year)
merge_df['month'] = merge_df['date'].apply(lambda x : x.month)

In [5]:
# 평균소비액
daily_amount = merge_df[merge_df.amount>0].groupby('date')['amount'].mean()

# 거래취소의 경우 
merge_df.loc[merge_df.amount <0, 'refund'] = 1
merge_df.loc[merge_df.amount >0, 'refund'] = 0
# merge_df.loc[merge_df.amount <0, 'amount'] = 0

In [6]:
merge_df.head(2)

Unnamed: 0,store_id,date,time,card_id,amount,installments,days_of_week,holyday,test,year,month,refund
0,0,2016-12-14,18:05:31,d297bba73f,5,,2,0,0,2016,12,0.0
1,0,2016-12-14,18:05:54,d297bba73f,-5,,2,0,0,2016,12,1.0


### train_data setting
* store_id가 연속적이진 않다 (11과 같은 숫자는 없음)

In [31]:
# 전체 날짜의 요일, 휴일
all_dates = merge_df.drop_duplicates('date').sort_values('date')
all_dates = all_dates[['date','days_of_week','holyday']]
all_dates.reset_index(drop=True, inplace=True)

In [32]:
train_df = merge_df[merge_df.test == 0]

# 날짜범위 확인
train_fin = train_df.groupby('store_id')['date'].agg(['min','max'])
train_fin.reset_index(inplace=True)
train_fin['last_day'] = train_fin['max'] - datetime.timedelta(100)

#최초거래일자부터 마지막 거래일자까지의 기간
train_fin['day_gap'] = train_fin['last_day'] - train_fin['min']
train_fin['day_gap'] = train_fin['day_gap'].apply(lambda x : x.days)

# 마지막 날짜에서 100일전 이전의 거래내역이 없으면 학습대상이 아님
train_fin.loc[train_fin['min'] > train_fin['last_day'],'outbound'] = 1
train_fin.loc[~(train_fin['min'] > train_fin['last_day']),'outbound'] = 0

del_list = train_fin[train_fin.outbound == 1].index

train_fin.drop(del_list,inplace= True)
del train_fin['outbound']
train_fin.reset_index(drop = True,inplace=True)

# 일자별 결제
check_amount = train_df.groupby(['store_id','date'])['amount'].agg(['sum','count']).reset_index()
refund_df = train_df.groupby(['store_id','date'])['refund'].sum().reset_index()
installlment_df = train_df.groupby(['store_id','date'])['installments'].sum().reset_index()

In [33]:
before_amount_ls = []
after_amount_ls = []
transfer_count_ls = []

before_holyday_ls = []
before_week_dict_ls =[]
refund_ls = []
installment_ls = []

for i in tqdm_notebook(train_fin.index):
    last_day = train_fin.loc[train_fin.index == i,'last_day'].iloc[0]
    store_id = train_fin.loc[train_fin.index == i,'store_id'].iloc[0]
    one = check_amount.loc[(check_amount.store_id == store_id)].reset_index(drop=True)
    two = refund_df.loc[(refund_df.store_id == store_id)].reset_index(drop=True)
    three = installlment_df.loc[(installlment_df.store_id == store_id)].reset_index(drop=True)
    
    try :
        last_index = one.loc[one.date < last_day].index[-1]
        
        before = one[0:last_index+1]
        after = one[last_index+1:]
        
        transfer_count = before['count'].sum()
        before_amount = before['sum'].sum()
        after_amount = after['sum'].sum()
        
        
        #요일, 휴일
        before_holyday = pd.merge(before,all_dates)['holyday'].sum()
        before_week = pd.merge(before,all_dates)['days_of_week'].value_counts().to_dict()

        after_holyday = pd.merge(after,all_dates)['holyday'].sum()
        after_week = pd.merge(after,all_dates)['days_of_week'].value_counts().to_dict()
        
        #환불횟수, 할부총기간
        refund = two[0:last_index+1]['refund'].sum()
        installment = three[0:last_index+1]['installments'].sum()
        
        #저장
        transfer_count_ls.append(transfer_count)
        before_amount_ls.append(before_amount)
        after_amount_ls.append(after_amount)
        
        before_holyday_ls.append(before_holyday)
        before_week_dict_ls.append(before_week)
        
        refund_ls.append(refund)
        installment_ls.append(installment)
        
    except :
        transfer_count_ls.append(np.nan)
        before_amount_ls.append(np.nan)
        after_amount_ls.append(np.nan)
        
        before_holyday_ls.append(np.nan)
        before_week_dict_ls.append({})
        refund_ls.append(np.nan)
        installment_ls.append(np.nan)
    
        
train_fin['transfer_count'] = transfer_count_ls
train_fin['before_amount'] = before_amount_ls
train_fin['after_amount'] = after_amount_ls
train_fin['before_holyday'] = before_holyday_ls
train_fin['refund_sum'] = refund_ls
train_fin['installlment_sum'] = installment_ls

# 요일 추가
week_df = pd.DataFrame(before_week_dict_ls)
week_df = week_df[[0,1,2,3,4,5,6]]

train_fin = pd.concat([train_fin,week_df],axis= 1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=1641.0), HTML(value='')))




In [34]:
train_fin.head(2)

Unnamed: 0,store_id,min,max,last_day,day_gap,transfer_count,before_amount,after_amount,before_holyday,refund_sum,installlment_sum,0,1,2,3,4,5,6
0,0,2016-12-14,2018-07-31,2018-04-22,494,56725,4361581,873030,25,499.0,526.0,70.0,70.0,71.0,70.0,70.0,70.0,69.0
1,1,2016-12-21,2018-07-02,2018-03-24,458,35,120650,27300,0,0.0,64.0,10.0,9.0,7.0,3.0,2.0,,


### test_data setting

In [35]:
test_df = merge_df[merge_df.test == 1]

# 날짜범위 확인
test_fin = test_df.groupby('store_id')['date'].agg(['min','max'])
test_fin.reset_index(inplace=True)

#최초거래일자부터 마지막 거래일자까지의 기간
test_fin['day_gap'] = test_fin['max'] - test_fin['min']
test_fin['day_gap'] = test_fin['day_gap'].apply(lambda x : x.days)

# 일자별 결제
check_amount = test_df.groupby(['store_id'])['amount'].agg(['sum','count']).reset_index()
check_amount = check_amount.rename(columns={'sum':'before_amount','count':'transfer_count'})
test_fin = pd.merge(test_fin,check_amount)

refund_df = test_df.groupby(['store_id','date'])['refund'].sum().reset_index()
installlment_df = test_df.groupby(['store_id','date'])['installments'].sum().reset_index()

In [36]:
check_amount = train_df.groupby(['store_id','date'])['amount'].sum().reset_index()

before_holyday_ls = []
before_week_dict_ls =[]

refund_ls = []
installment_ls = []

for store_id in range(0, test_fin.store_id.nunique()):
    one = check_amount.loc[(check_amount.store_id == store_id)].reset_index(drop=True)
    two = refund_df.loc[(refund_df.store_id == store_id)].reset_index(drop=True)
    three = installlment_df.loc[(installlment_df.store_id == store_id)].reset_index(drop=True)
    
    refund = two['refund'].sum()
    installment = three['installments'].sum()
    
    holyday = pd.merge(one['date'],all_dates)['holyday'].sum()
    before_week = pd.merge(one['date'],all_dates)['days_of_week'].value_counts().to_dict()
    
    before_holyday_ls.append(holyday)
    before_week_dict_ls.append(before_week)
    
    refund_ls.append(refund)
    installment_ls.append(installment)
    
test_fin['holyday'] =  before_holyday_ls
test_fin['refund_sum'] =  refund_ls
test_fin['installment_sum'] = installment_ls

# 요일 추가
week_df = pd.DataFrame(before_week_dict_ls)
week_df = week_df[[0,1,2,3,4,5,6]]

test_fin = pd.concat([test_fin,week_df],axis= 1)

In [37]:
test_fin.head(2)

Unnamed: 0,store_id,min,max,day_gap,before_amount,transfer_count,holyday,refund_sum,installment_sum,0,1,2,3,4,5,6
0,0,2016-08-01,2018-03-31,607,638257,4215,31,14.0,2.0,85.0,85.0,85.0,84.0,84.0,84.0,84.0
1,1,2016-08-02,2018-03-30,605,427806,435,0,14.0,192.0,13.0,10.0,7.0,3.0,2.0,1.0,


### 최종전처리

In [38]:
y = train_fin['after_amount']

train_fin.drop(['min','max','last_day','after_amount','store_id'],axis = 1,inplace=True)
test_fin.drop(['min','max','store_id'],axis = 1,inplace=True)

In [39]:
train_fin.shape, test_fin.shape

((1641, 13), (200, 13))

In [40]:
#로그 변환
train_fin.before_amount = np.log(1+train_fin.before_amount)
test_fin.before_amount = np.log(1+test_fin.before_amount)

In [41]:
train_fin.fillna(0, inplace=True)
test_fin.fillna(0, inplace=True)

In [42]:
def info_df(data):
    '''data의 type, null_count, null_rate를 알려주는 함수
    
    Parameter
    ---------
    data(DataFrame) : 파악하고자하는 Data
       
    '''
    info_df = pd.DataFrame({"type":data.dtypes,
                            'null_count':data.isnull().sum(),
                           'null_rate':data.isnull().sum()/data.isnull().count() * 100})  
    info_df['uni_count'] = data.apply(lambda x : x.nunique())
    
    return info_df

### train_test split

In [43]:
X_train, X_test, y_train, y_test = train_test_split(train_fin,y,test_size =0.3, random_state = 42)

### modeling 

#### rf

In [44]:
rf = RandomForestRegressor(random_state=42, n_estimators= 1000)
rf.fit(X_train,y_train)
rf_y_pred = rf.predict(X_test)

np.sqrt(mean_squared_error(y_test,rf_y_pred))

28301.77525590252

In [45]:
fi_df = pd.DataFrame(rf.feature_importances_*100,index = train_fin.columns)
fi_df

Unnamed: 0,0
day_gap,10.512951
transfer_count,1.320584
before_amount,39.410583
before_holyday,0.825858
refund_sum,34.840304
installlment_sum,0.579917
0,0.674267
1,3.201963
2,1.946996
3,2.71295


#### lgbm

In [46]:
lgbm_c = lightgbm.LGBMRegressor(random_state=42, learning_rate=0.001,n_estimators= 1000)
lgbm_c.fit(X_train,y_train)
lgbm_y_pred = lgbm_c.predict(X_test)

np.sqrt(mean_squared_error(y_test,lgbm_y_pred))

29635.983986422652

In [48]:
fi_df = pd.DataFrame(lgbm_c.feature_importances_,index = train_fin.columns)
fi_df

Unnamed: 0,0
day_gap,9801
transfer_count,1154
before_amount,13040
before_holyday,620
refund_sum,988
installlment_sum,883
0,769
1,671
2,432
3,258


### final

In [54]:
# #lgbm
# lgbm_c.fit(train_fin,y)
# y_pred_fin = lgbm_c.predict(test_fin)

# rf
rf.fit(train_fin,y)
y_pred_fin = rf.predict(test_fin)

In [55]:
submission['total_sales'] = y_pred_fin

In [56]:
submission.to_csv('submission_1st_rf.csv',index=False)