### 라이브러리 선언

In [2]:
# 데이터 조작 라이브러리 및 시각화 라이브러리
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

In [3]:
import tensorflow as tf

In [4]:
### 사이킷런 전처리 함수들
from sklearn.preprocessing import LabelEncoder

In [5]:
from tensorflow import keras

In [6]:
from tensorflow.keras import Sequential

In [7]:
from tensorflow.keras.layers import Dense, Dropout

In [8]:
from tensorflow.keras import losses

In [9]:
from tensorflow.keras import metrics

### 데이터 불러오기

In [10]:
featuresData = pd.read_csv("../dataset/feature_regression_example.csv")

In [11]:
featuresData.head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442


In [12]:
featuresData.shape

(105, 12)

In [13]:
featuresData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   REGIONID      105 non-null    object 
 1   PRODUCTGROUP  105 non-null    object 
 2   PRODUCT       105 non-null    object 
 3   ITEM          105 non-null    object 
 4   YEARWEEK      105 non-null    int64  
 5   YEAR          105 non-null    int64  
 6   WEEK          105 non-null    int64  
 7   QTY           105 non-null    int64  
 8   HOLIDAY       105 non-null    object 
 9   HCLUS         105 non-null    int64  
 10  PROMOTION     105 non-null    object 
 11  PRO_PERCENT   105 non-null    float64
dtypes: float64(1), int64(5), object(6)
memory usage: 10.0+ KB


In [14]:
featuresData.describe()

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT
count,105.0,105.0,105.0,105.0,105.0,105.0
mean,201576.27619,2015.495238,26.752381,1210.238095,2.742857,0.20578
std,52.254278,0.502375,15.229514,820.097819,1.587382,0.128636
min,201501.0,2015.0,1.0,15.0,0.0,0.0
25%,201527.0,2015.0,14.0,542.0,1.0,0.208155
50%,201553.0,2015.0,27.0,1139.0,4.0,0.209442
75%,201626.0,2016.0,40.0,1753.0,4.0,0.280258
max,201652.0,2016.0,53.0,4035.0,4.0,0.421888


### 1. 데이터 타입 통합 및 특성 숫자 변환

#### 1-1 데이터 타입 통합

#### 1-2 특성값 숫자 컬럼 변경

In [15]:
### np.where or loc or ynMap

In [16]:
### 컬럼 종류 확인하는 방법
featuresData.HOLIDAY.drop_duplicates().shape[0]

2

In [17]:
### 1. np.where : 컬럼의 종류가 2건 or 3건 정도에 사용하면 좋다

In [18]:
featuresData["HOLIDAY_LE"] = np.where(featuresData.HOLIDAY == "Y", 1, 0)

In [19]:
### 2. ynMap : 컬럼의 종류가 3건 이상 7건정도 이하인 경우

In [20]:
ynMap = {"Y":1, "N":0}

In [21]:
featuresData.HOLIDAY.map(ynMap)

0      1
1      0
2      0
3      1
4      0
      ..
100    1
101    0
102    0
103    0
104    1
Name: HOLIDAY, Length: 105, dtype: int64

In [22]:
### 3. loc

In [23]:
featuresData.loc[featuresData.HOLIDAY == "Y", "HOLIDAY_LE"] = 1
featuresData.loc[featuresData.HOLIDAY == "N", "HOLIDAY_LE"] = 0

In [24]:
### 4. 7건 이상인 경우 LableEncoder를 쓴다 -> 자동으로 인코딩을 붙여줌

In [25]:
### 객체 생성
holidayLe = LabelEncoder()

In [26]:
### 알아서 라벨링을 붙여줌
holidayLe.fit_transform( featuresData.HOLIDAY )

array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1])

In [27]:
featuresData["HOLIDAY_LE"] = holidayLe.fit_transform( featuresData.HOLIDAY )

In [28]:
featuresData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HOLIDAY_LE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201648,2016,48,2412,Y,0,Y,0.421888,1
101,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201649,2016,49,1955,N,4,Y,0.421888,0
102,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201650,2016,50,1800,N,4,Y,0.352361,0
103,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201651,2016,51,1173,N,4,Y,0.352361,0


In [29]:
### 라벨링 풀어주는 방법
holidayLe.inverse_transform(featuresData["HOLIDAY_LE"])

array(['Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'Y'], dtype=object)

#### 실습

In [30]:
promotionLe = LabelEncoder()

In [31]:
promotionLe.fit_transform(featuresData.PROMOTION)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [32]:
featuresData["PROMOTION_LE"] = promotionLe.fit_transform(featuresData.PROMOTION)

#### 1-3 데이터 선택

In [33]:
featuresData = featuresData.loc[(featuresData.YEARWEEK >= 201601) & (featuresData.YEARWEEK <= 201652)]

### 2. 특성선정 및 데이터 분리(*단 특성선정은 제외)

#### 2-1 머신러닝용

In [34]:
corrDf = featuresData.corr()

In [35]:
stdCorr = 0.5

In [36]:
corrDf.loc[(abs(corrDf.QTY) >= stdCorr) & (corrDf.QTY != 1)]

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HOLIDAY_LE,PROMOTION_LE
PRO_PERCENT,0.565697,,0.565697,0.758393,-0.51521,1.0,0.437308,0.868357
PROMOTION_LE,0.216634,,0.216634,0.671958,-0.260641,0.868357,0.227542,1.0


In [37]:
list(corrDf.loc[(abs(corrDf.QTY) >= stdCorr) & (corrDf.QTY != 1)].index)

['PRO_PERCENT', 'PROMOTION_LE']

### --------------------------------------------------------------------------

In [38]:
### 합집합, 차집합 쉽게 구할 수 있는 자료형
set([1,2,3,4]) - set([2,3])

{1, 4}

In [39]:
firstSet = set([1,2,3,4])

In [40]:
### instersection : 교집합
firstSet.intersection(set([1,2]))

{1, 2}

In [41]:
### difference : 차집합
firstSet.difference(set([1,2,5,6]))

{3, 4}

In [42]:
### union : 합집합
firstSet.union(set([1,2,5,6]))

{1, 2, 3, 4, 5, 6}

### --------------------------------------------------------------------------

In [43]:
features = \
    set(corrDf.loc[(abs(corrDf.QTY) >= stdCorr) & (corrDf.QTY != 1)].index)

In [44]:
features

{'PROMOTION_LE', 'PRO_PERCENT'}

#### 2-2 딥러닝용

In [45]:
### 숫자형 컬럼만 추출
featuresData.select_dtypes(np.number)

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HOLIDAY_LE,PROMOTION_LE
53,201601,2016,1,711,4,0.237768,0,1
54,201602,2016,2,16,4,0.0,0,0
55,201603,2016,3,15,2,0.0,1,0
56,201604,2016,4,424,4,0.237768,0,1
57,201605,2016,5,1139,4,0.237768,0,1
58,201606,2016,6,1504,4,0.237768,0,1
59,201607,2016,7,1552,1,0.237768,1,1
60,201608,2016,8,896,4,0.237768,0,1
61,201609,2016,9,219,4,0.0,0,0
62,201610,2016,10,753,4,0.209442,0,1


In [46]:
featuresData.select_dtypes(np.number).columns

Index(['YEARWEEK', 'YEAR', 'WEEK', 'QTY', 'HCLUS', 'PRO_PERCENT', 'HOLIDAY_LE',
       'PROMOTION_LE'],
      dtype='object')

In [47]:
allColumns = \
    list(featuresData.select_dtypes(np.number).columns)

In [48]:
allColumns

['YEARWEEK',
 'YEAR',
 'WEEK',
 'QTY',
 'HCLUS',
 'PRO_PERCENT',
 'HOLIDAY_LE',
 'PROMOTION_LE']

In [49]:
label = ["QTY"]

In [50]:
features = set(allColumns) - set(label)

In [51]:
features

{'HCLUS',
 'HOLIDAY_LE',
 'PROMOTION_LE',
 'PRO_PERCENT',
 'WEEK',
 'YEAR',
 'YEARWEEK'}

In [52]:
### training data 와 test data로 분리
stdRatio = 0.7

In [53]:
sortKey = ["YEARWEEK"]

In [54]:
featuresData = featuresData.sort_values(sortKey).reset_index(drop=True)

In [55]:
stdIndex = int(featuresData.shape[0] * stdRatio)

In [56]:
stdYearWeek = featuresData.loc[stdIndex].YEARWEEK

In [57]:
trainingDataFeatures = \
    featuresData.loc[featuresData.YEARWEEK <= stdYearWeek, features]
trainingDataLable = \
    featuresData.loc[featuresData.YEARWEEK <= stdYearWeek, label]
testDataFeatures = \
    featuresData.loc[featuresData.YEARWEEK > stdYearWeek, features]
testDataLable = \
    featuresData.loc[featuresData.YEARWEEK > stdYearWeek, label]

In [58]:
print(trainingDataFeatures.shape)
print(trainingDataLable.shape)
print(testDataFeatures.shape)
print(testDataLable.shape)

(37, 7)
(37, 1)
(15, 7)
(15, 1)


### 3. 모델 생성

In [59]:
baseModelUnits = 32

In [60]:
# 모델 초기화
model = Sequential()

In [61]:
trainingDataFeatures.loc[0]

PRO_PERCENT          0.237768
YEARWEEK        201601.000000
YEAR              2016.000000
WEEK                 1.000000
PROMOTION_LE         1.000000
HOLIDAY_LE           0.000000
HCLUS                4.000000
Name: 0, dtype: float64

In [62]:
inputShape = trainingDataFeatures.loc[0].shape

In [63]:
model.add( Dense( units=baseModelUnits, activation="relu", input_shape = inputShape ) )

In [64]:
model.add( Dense( units=baseModelUnits, activation="relu" ) )
model.add( Dense( units=baseModelUnits, activation="relu" ) )
model.add( Dense( units=baseModelUnits, activation="relu" ) )

In [65]:
model.add( Dense( units=1, activation="relu" ) )

In [66]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                256       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 3,457
Trainable params: 3,457
Non-trainable params: 0
_________________________________________________________________


###### unit에 8 넣었을 때
###### 7*8 + 8 => 56 + 8 => 64
###### 8*8 + 8 => 64 + 8 => 72
###### 8*1 + 1 => 8 + 1 => 9

### 4. 모델 컴파일 (*딥러닝 추가 : Loss function[손실함수] 정의 및 optimizer[교수] 정의)

In [67]:
model.compile( optimizer="adam",
               loss="mean_squared_error",
               metrics=["mean_absolute_error"] )

### 5. 모델 훈련 (* callback 함수 등장!!! tensorboard, modelcheckpoint, earlystopping)

In [68]:
model.fit(x=trainingDataFeatures,
          y=trainingDataLable,
          epochs=2000,
          batch_size=32)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

<tensorflow.python.keras.callbacks.History at 0x1f6afa173a0>

### 6. 모델 추론

In [72]:
userInput = 2
userHolidayInput = 1
userPromotionInput = 1
userProPercent = 0.7

In [73]:
testData = pd.DataFrame( [[userInput, userHolidayInput, userPromotionInput, userProPercent]] )

In [74]:
model.predict(testData)

ValueError: in user code:

    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1569 predict_function  *
        return step_function(self, iterator)
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1559 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1552 run_step  **
        outputs = model.predict_step(data)
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1525 predict_step
        return self(x, training=False)
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:1013 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\AnJungHoon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:251 assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer sequential is incompatible with the layer: expected axis -1 of input shape to have value 7 but received input with shape (None, 4)


In [None]:
### 훈련모델 저장
model.save_weights("./2016weights.h5")

### 7. 모델 예측

### 8. 데이터 정리