In [3]:
# 출처 https://www.kaggle.com/javi23ruiz/eda-with-plotly-useful-conclusions/notebook
import numpy as np #
import pandas as pd 
import math

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [4]:
# Load the competition CSV files into DataFrames; all later cells work on these names.
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays_events = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')

## Data Information

**train,store 데이터 확인**

In [5]:
# Headline figures of the training set, rendered as a two-column Plotly table.
kpi_labels = ['Number of Stores', 'Number of Different Products',
              'Window Start Date', 'Window End Date',
              '#Rows in training set', '#Date Points in Train Dataset']
kpi_values = [train['store_nbr'].nunique(), train['family'].nunique(),
              train['date'].min(), train['date'].max(),
              train.shape[0], train['date'].nunique()]

kpi_table = go.Table(header=dict(values=['KPI', 'Value']),
                     cells=dict(values=[kpi_labels, kpi_values]))
fig = go.Figure(data=[kpi_table])

fig.update_layout({"title": 'BASIC KPIS of TRAIN DATA'}, height=500, width=500)
fig.show()

In [6]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
train.info()
# Missing values per column, the column index, and the first rows.
print(train.isnull().sum())
print(train.columns)
print(train.head())

In [7]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
stores.info()
# Missing values per column of the stores table.
print(stores.isnull().sum())

In [8]:
# Show the first rows of the train and stores tables one after the other.
print(train.head())
print(stores.head())

**결측값 확인**

In [9]:
# Count missing values per column in the three auxiliary tables.
print(holidays_events.isnull().sum())
print(oil.isnull().sum())
print(transactions.isnull().sum())

# oil contains missing values

## EDA

**holiday** 휴일인 날과 아닌 날 구분

In [10]:
# Flag every calendar entry as a string: "1" when it was not transferred, "0" otherwise.
not_transferred = holidays_events['transferred'].eq(False)
holidays_events['holiday'] = not_transferred.map({True: "1", False: "0"})
holidays_events.head()
# Days that were moved to a working day (i.e. no longer a holiday) get holiday "0"

In [11]:
# The holiday calendar may list several events on the same date. A plain left
# merge on 'date' would then repeat every matching train row once per event,
# inflating the training set. Keep one calendar row per date (the first one)
# so the merged frame has exactly one row per train row.
holidays_by_date = holidays_events.drop_duplicates(subset='date', keep='first')
train_holiday = train.merge(holidays_by_date, on='date', how='left', validate='many_to_one')
assert len(train_holiday) == len(train), "holiday merge changed the number of train rows"
train_holiday.head()

**oil data**

In [12]:
# Column dtypes and non-null counts of the oil table.
oil.info()

In [13]:
# Line chart of the oil table with the date column as the x axis.
oil_by_date = oil.set_index('date')
ax = oil_by_date.plot(figsize=(16, 8))
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel("Crude Oil", fontsize='large')

**전체 sales**

In [14]:

# Source: https://www.kaggle.com/yuxuanliu0626/eda-feature-engineering-model-error-analysis
# Mean sales per calendar date over all rows of train, drawn as one red trace.
daily_sales = train.groupby('date', as_index=False)['sales'].mean()

sales_trace = go.Scatter(
    x=daily_sales['date'],
    y=daily_sales['sales'],
    marker_color='red',
    text="sales",
)
fig = go.Figure(data=sales_trace)
fig.show()

**oil, sales, transactions의 상관관계**

In [15]:
# Attach sales and transactions to the oil table by matching on the date value.
# Plain column assignment aligns on the row index, which pairs row i of one
# table with row i of another even though the tables cover different dates
# (and transactions has several rows per date), so the correlation was
# computed on unrelated rows.
sales_by_date = daily_sales.set_index('date')['sales']
# transactions is reduced to one value per date (total over all rows of that date).
transactions_by_date = transactions.groupby('date')['transactions'].sum()

oil['sales'] = oil['date'].map(sales_by_date)
oil['transactions'] = oil['date'].map(transactions_by_date)
print(oil.head())

# Correlate only the numeric columns; the string 'date' column cannot take part.
oil[['dcoilwtico', 'sales', 'transactions']].corr()
# Original note: no relationship was visible. Re-check this conclusion now that
# the rows are aligned by date.

**family에 따른 판매량**

In [16]:
# Total sales per product family as a bar chart.
# 'sales' is selected before aggregating so that only this column is summed
# (summing every column first also touched the string 'date' column), and the
# chart is drawn on the axes created here instead of an implicit one.
# matplotlib.pyplot is already imported as plt in the first cell.
fig, ax = plt.subplots(figsize=(15, 7))
train.groupby("family")['sales'].sum().plot(kind='bar', ax=ax)

**holiday에 따른 판매량**

In [17]:
# Keep only the dates that appear in the holiday calendar and plot their mean
# sales, with marker size following sales and colour following the holiday flag.
holiday_sales = daily_sales.merge(holidays_events, how='inner', on='date')
fig = px.scatter(
    holiday_sales,
    x='date',
    y='sales',
    size='sales',
    color='holiday',
)
fig.show()

**월에 따른 판매량**

In [18]:
# Extract date features. The date strings are parsed once and reused; parsing
# the full column separately for every feature repeated the same work four times.
train_dates = pd.to_datetime(train['date'])
train['year'] = train_dates.dt.year
train['month'] = train_dates.dt.month
train['day'] = train_dates.dt.day
train['day_of_week'] = train_dates.dt.day_name()

# Mean sales by month
by_month = train.groupby(['month'])['sales'].mean().reset_index()
fig = px.bar(by_month, x='month', y='sales', color='sales', color_continuous_scale="darkmint")
fig.show()
# December has the highest sales

**일에 따른 판매량**

In [19]:
# Mean sales for each day of the month, drawn as a single red trace.
by_day = train.groupby('day', as_index=False)['sales'].mean()

day_trace = go.Scatter(
    x=by_day['day'],
    y=by_day['sales'],
    marker_color='red',
    text="sales",
)
fig = go.Figure(data=day_trace)
fig.show()
# Sales peak at the start and at the end of the month

**요일에 따른 판매량**

In [20]:
# Mean sales per weekday name, reordered from Monday to Sunday for plotting.
new_order_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
by_weekday = train.groupby('day_of_week')['sales'].mean()
by_weekday_df = by_weekday.reindex(new_order_week).reset_index()

fig = px.bar(
    by_weekday_df,
    x='day_of_week',
    y='sales',
    color='sales',
    color_continuous_scale="darkmint",
)
fig.show()
# Demand is highest on Saturday and Sunday

In [21]:
# Source: https://www.kaggle.com/howoojang/first-kaggle-notebook-following-ts-tutorial
print(holidays_events['type'].unique())
print(holidays_events['type'].value_counts())

# .copy() gives an independent frame, so the date conversion below does not
# write into a slice of holidays_events (SettingWithCopyWarning).
day_type = holidays_events[['date', 'type']].copy()
avg_sales = train.groupby('date').agg({'sales': 'mean'}).reset_index()

day_type['date'] = pd.to_datetime(day_type['date'])
avg_sales['date'] = pd.to_datetime(avg_sales['date'])

# Exact date join: a calendar entry only receives sales recorded on that very
# date. merge_asof matched each entry to the most recent earlier date with
# sales, so entries on dates without sales data silently borrowed another
# day's figure.
df = day_type.merge(avg_sales, on='date', how='inner')
df = df.dropna().reset_index(drop=True)

# Mean of the daily average sales per calendar entry type ('sales' column only).
df_1 = df.groupby('type')['sales'].mean()
# Unweighted mean of the per-type means.
average_holiday_sales = df_1.mean()

print(f'average holiday sales is {average_holiday_sales}')

df_1.plot(kind='bar', figsize=(12, 6)).set_title('average holiday sales')
# Original note: the average holiday sales are similar to the weekend sales

In [22]:
# Distinct values of the holiday 'locale' column and how often each occurs.
print(holidays_events['locale'].unique())
print(holidays_events['locale'].value_counts())

## Feature Engineering

In [23]:
# Fill missing oil prices and rename the price column.
# bfill() copies the next valid value backwards; it replaces
# fillna(method='backfill'), whose `method` argument is deprecated in recent pandas.
oil = oil.bfill()
print(oil.isna().sum())
oil = oil.rename(columns={'dcoilwtico': 'oil_price'})

oil['date'] = pd.to_datetime(oil['date'])

In [24]:
# First values of the per-date mean sales computed earlier.
daily_sales['sales'].head()

In [25]:
# First rows of the transactions table.
transactions.head()

In [26]:
# Remove the helper columns that were added to oil for the correlation check.
correlation_helpers = ['sales', 'transactions']
oil = oil.drop(columns=correlation_helpers)

In [27]:
# Open question: could the holiday locale be combined with the store location?
# Drop the holiday columns that are not needed any further.
unused_holiday_columns = ['locale_name', 'description', 'transferred']
train_holiday = train_holiday.drop(columns=unused_holiday_columns)
train_holiday.head()

In [28]:
# Add the store attributes to every row; train1 is a second name for the same frame.
train_store = pd.merge(train_holiday, stores, how='left', on='store_nbr')
train1 = train_store
train_store.head()
# stores, holidays and train are now combined in one frame

In [29]:
# Coarser product groups: every family listed on the right is renamed to the
# group label on the left; families that are not listed keep their name.
FAMILY_GROUPS = {
    'Tools': ['AUTOMOTIVE', 'HARDWARE', 'LAWN AND GARDEN', 'PLAYERS AND ELECTRONICS'],
    'LifeStyle': ['BEAUTY', 'LINGERIE', 'LADIESWEAR', 'PERSONAL CARE', 'CELEBRATION',
                  'MAGAZINES', 'BOOKS', 'BABY CARE'],
    'Home': ['HOME APPLIANCES', 'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME CARE',
             'SCHOOL AND OFFICE SUPPLIES'],
    'Food': ['GROCERY II', 'PET SUPPLIES', 'SEAFOOD', 'LIQUOR,WINE,BEER'],
    'Daily': ['DELI', 'EGGS'],
}
family_to_group = {
    family: group
    for group, families in FAMILY_GROUPS.items()
    for family in families
}

# One mapping applied to both frames keeps train and test grouped identically.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on frame['family'] is chained assignment, which pandas warns about and which
# stops updating the parent frame once copy-on-write is active.
for frame in (train1, test):
    frame['family'] = frame['family'].replace(family_to_group)

In [30]:
# Family labels that remain after grouping.
train1['family'].unique()
# There were too many family categories, so they were grouped together

In [31]:
#https://www.kaggle.com/sudipg411/eda-feature-engineering-visualization
def feature_eng(data):
    """Add calendar features derived from the 'date' column.

    The frame is modified in place: 'date' is converted with pd.to_datetime and
    the columns dayofweek, quarter, month, year, dayofyear and dayofmonth are
    added (or overwritten) from it.

    Args:
        data: DataFrame with a 'date' column that pd.to_datetime can parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    data['date'] = pd.to_datetime(data['date'])
    stamps = data['date'].dt
    # new column name -> attribute of the datetime accessor it is read from
    calendar_parts = {
        'dayofweek': 'dayofweek',
        'quarter': 'quarter',
        'month': 'month',
        'year': 'year',
        'dayofyear': 'dayofyear',
        'dayofmonth': 'day',
    }
    for column, attribute in calendar_parts.items():
        data[column] = getattr(stamps, attribute)
    return data
    
# feature_eng adds the columns to the frame it receives and returns that same
# frame, so test1 refers to the same object as test.
train1 = feature_eng(train1)
test1 = feature_eng(test)
train1.head()

In [32]:
# TODO: how should promotions be handled?

# TODO: label encoding before modeling
# TODO: lag features

**XGBoost의 장점**
1. 강력한 병렬 처리로 학습과 처리가 빠름
(Gradient Boosting Model 대비 빠른 것)
2. Greedy-algorithm을 이용, 자동 가지치기(Pruning)가 가능
과적합(Overfitting) 방지
3. 자체 교차검증 알고리즘 탑재
4. 결측치 자체 처리
5. Early Stopping 
100번으로 설정 시 100번 동안 성능이 좋아지지 않는다면 그 전에 생성된 최고로 좋은 모델을 선택
6. CART (Classification And Regression Tree)기반으로 분류, 회귀 둘다 구현

 

**앙상블**
* **배깅**:훈련데이터를 다르게 랜덤 복원 샘플링
* **부스팅**:모델을 변화시킴 <br>
    1.오류에 가중치
    2.오차를 훈련데이터에 투입 -**XGBoost**

**XGBoost**<br>
이전 모델의 실제값과 예측값의 오차(loss)를 다음 모델의 훈련 데이터로 투입하고, gradient를 이용하여 오류를 보완하는 방식을 사용한다.
<br>
* 방법 1 : 파이썬 래퍼 XGBoost 모듈<br>(import xgboost as xgb)
* 방법 2 : 사이킷런 래퍼 XGBoost 모듈<br>(from xgboost import XGBClassifier)

<br>
from xgboost import XGBClassifier

#객체 생성<br>
model = XGBClassifier(파라미터들)<br>
xgb_model = model.fit(X_train, y_train, early_stopping_rounds=100, 
                        eval_metric='logloss',eval_set=[(X_val, y_val)])

#예측하기<br>
y_pre = xgb_model.predict(X_test)
y_pred_probs = xgb_model.predict_proba(X_test)[:,1]

#특성 중요도 시각화<br>
fig, ax = plt.subplots(figsize=(10,12))
plot_importance(xgb_model, ax=ax)

**파라미터**
* General Parameters<br>
 booster [default = 'gbtree'] <br>
(gbtree : 트리 기반 모델  gblinear : 선형 모델)<br>
 silent [default = 0]
 <br>nthread [default = 전체 다 사용]
* Booster Parameters<br>
* Learning Task Parameters 

**GridSearchCV**

## Modeling
xgb, linear, ridge

In [33]:
from sklearn import preprocessing

# Integer-encode the product family. The encoder learns its classes from train1
# and the same fitted encoder is then applied to test.
enc = preprocessing.LabelEncoder()
train1['family'] = enc.fit_transform(train1['family'])
test['family'] = enc.transform(test['family'])

In [34]:
from sklearn.model_selection import train_test_split

features = ['date', 'store_nbr', 'family', 'onpromotion', 'type_x', 'city', 'state', 'type_y', 'cluster']
# Columns of `features` that hold text labels rather than numbers.
categorical_features = ['type_x', 'city', 'state', 'type_y']

# Work on a copy so the encoding below never modifies train1 itself.
X = train1[features].copy()

# The estimators fitted in the following cells need numeric input, so the raw
# datetime and text columns are converted here:
# - 'date' becomes the number of days since the earliest date in the frame;
# - each text column becomes integer category codes (missing values, e.g.
#   type_x on dates without a calendar entry, are coded as -1).
dates = pd.to_datetime(X['date'])
X['date'] = (dates - dates.min()).dt.days
for column in categorical_features:
    X[column] = X[column].astype('category').cat.codes

y = train1.sales

# NOTE(review): this is a random split of time-ordered rows, so hold-out dates
# are interleaved with training dates. Consider a chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

# Later cells refer to the hold-out part as X_val / y_val; expose both names.
X_val, y_val = X_test, y_test

In [35]:
import xgboost as xgb

# Baseline model with default settings, fitted on the training split only.
# Fitting on every column of train1 except 'sales' fed the row id and the raw,
# unencoded columns to the model and also trained on the hold-out rows.
# X_train must contain numeric columns only.
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

In [None]:
# XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error

model = xgb.XGBRegressor(objective='reg:squaredlogerror', n_estimators=200)
model.fit(X_train, y_train)

# The hold-out part of the train_test_split cell is named X_test / y_test;
# X_val / y_val were never defined there.
y_val_pred = model.predict(X_test)
# The metric is undefined for negative values, so predictions are floored at 0.
y_val_pred = np.clip(y_val_pred, 0, None)
print('Root Mean Squared Logaritmic Error:', np.sqrt(mean_squared_log_error(y_test, y_val_pred)))

In [None]:
# Linear Regressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

# `normalize` is no longer accepted by LinearRegression in current scikit-learn.
# For unpenalized least squares, rescaling the features does not change the
# predictions, so the argument is dropped rather than replaced by a scaler.
reg = LinearRegression().fit(X_train, y_train)

# The hold-out part of the train_test_split cell is named X_test / y_test;
# X_val / y_val were never defined there.
y_val_pred = reg.predict(X_test)
# The metric is undefined for negative values, so predictions are floored at 0.
y_val_pred = np.clip(y_val_pred, 0, None)
print('Root Mean Squared Logaritmic Error:', np.sqrt(mean_squared_log_error(y_test, y_val_pred)))

In [None]:
# Visualize the data
# NOTE(review): `valid` (with 'sales' and 'Predictions' columns) is not defined
# in any visible cell, so this cell cannot run as written. TODO: build it from
# the hold-out targets and the model predictions before plotting.

plt.figure(figsize=(20,5))
# Leftover from a stock-price template, kept for reference:
#title = stock + ' Model Forecast'
#ylabel = stock + ' Close Price USD ($)'

# Training sales are plotted against the row index of train, not against dates,
# although the x axis is labelled 'Date'.
plt.plot(train['sales'])
plt.plot(valid[['sales','Predictions']])

plt.xlabel('Date', fontsize=14)
plt.ylabel('Sales ($)', fontsize=14)
plt.title('Prediction using Linear Model', fontsize=16)
plt.grid()
plt.legend(['Training Data', 'Validated Data', 'Predicted Data'], loc='upper left')

plt.show()

## submission

In [None]:
# Read the sample submission from the same input directory as the other
# competition files (`datafolder` was never defined), fill in the predicted
# sales and write the file to submit.
# NOTE(review): `sales_pred` is not defined in any visible cell. It is presumably
# meant to hold the model predictions for the rows of test.csv, in the same
# order as the sample submission. TODO: create it before running this cell.
My_submission = pd.read_csv(
    '/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv',
    index_col='id',
)
My_submission['sales'] = sales_pred.values
My_submission.to_csv('submission.csv', index=True)

In [None]:
# Display the submission table that was just written.
My_submission