In [15]:
import gc
import uuid
from collections import defaultdict

import numpy as np
import pandas as pd
import pyarrow as pa
import tensorflow as tf
from pyarrow import parquet as pq
from sklearn.preprocessing import LabelEncoder

In [12]:
# Load the wide sales table and key every row by its `id` column.
# The categorical columns (item_id, dept_id, cat_id, store_id, state_id) are
# deliberately kept here; they are separated from the numeric columns later.
train = (
    pd.read_csv('../Data/Train/sales_train_validation.csv')
    .set_index('id')
)

In [30]:
# To-Do
# 1. Index -> label-encode -> stack into the generator
# 2. Calendar-event generator for a given date
# 3. Sell-price generator for a given date
# 4. Generator that stacks multiple features

class feature_engineering(object):
    """Split a DataFrame into categorical dimensions and windowed numeric data.

    The columns named in ``dimList`` are treated as categorical dimensions and
    can be label-encoded with :meth:`pandas_to_categorical_encode`. Every other
    column is kept as a NumPy array (``arr``) whose rows are cut into sliding
    windows by :meth:`np_to_time_tensor_generator`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    dimList : list of str
        Names of the categorical columns; they are dropped from ``arr``.
    encoder : class, optional
        Encoder *class* (not an instance) exposing ``fit_transform``. A fresh
        instance is created for every encoded column. Defaults to
        ``LabelEncoder``.

    Attributes
    ----------
    arr : numpy.ndarray
        Values of all columns of ``df`` that are not in ``dimList``.
    indexList : list
        Index labels of ``df``.
    encodeDict : dict or None
        ``{column: fitted encoder}``; ``None`` until
        :meth:`pandas_to_categorical_encode` has been called.
    """

    def __init__ (self,df,dimList,encoder = LabelEncoder):
        super().__init__()
        self._df = df
        self._dimList = dimList
        self.arr = df.drop(dimList,axis =1).values
        self.indexList = df.index.tolist()
        self._encoder = encoder
        self.encodeDict = None

    def _rolling_window(self, a, window):
        """Return every length-``window`` slice along the last axis of ``a``.

        The result has shape ``a.shape[:-1] + (n - window + 1, window)`` where
        ``n = a.shape[-1]``. It is a strided *view* (no data is copied), so
        neighbouring windows share memory; it is therefore returned read-only.

        Raises
        ------
        ValueError
            If ``window`` is smaller than 1 or larger than the last axis.
        """
        n_windows = a.shape[-1] - window + 1
        if window < 1 or n_windows < 1:
            raise ValueError(
                "window must be between 1 and the series length "
                "(got window=%d for length %d)" % (window, a.shape[-1]))
        shape = a.shape[:-1] + (n_windows, window)
        # Re-use the last-axis stride so each window starts one element later.
        strides = a.strides + (a.strides[-1],)
        # writeable=False: windows overlap in memory, so an in-place write
        # through the view would silently corrupt several windows at once.
        return np.lib.stride_tricks.as_strided(
            a, shape=shape, strides=strides, writeable=False)

    def _get_time_tensor(self, arr, window_size):
        """Build supervised (X, y) windows from a 1-D series.

        Each sample uses ``window_size`` consecutive values as input and the
        value immediately after them as the target.

        Returns
        -------
        tuple of numpy.ndarray
            ``X`` with shape ``(n_samples, window_size, 1)`` and ``y`` with
            shape ``(n_samples, 1)``, where
            ``n_samples = len(arr) - window_size``.
        """
        # One extra column per window: the last column is the target.
        windows = self._rolling_window(arr, window_size + 1)
        Xtensor = windows[:, :-1]
        Ytensor = windows[:, -1]
        return (Xtensor.reshape(-1, window_size, 1), Ytensor.reshape(-1, 1))

    def np_to_time_tensor_generator(self, windowSize):
        """Yield one ``(X, y)`` pair (see :meth:`_get_time_tensor`) per series.

        When ``self.arr`` has more than one dimension each row is windowed
        separately; a 1-D ``self.arr`` yields a single pair.
        """
        if np.ndim(self.arr) > 1:
            # Iterate over the instance array (the original referenced an
            # undefined local name here).
            for series in self.arr:
                yield self._get_time_tensor(series, windowSize)
        else:
            yield self._get_time_tensor(self.arr, windowSize)

    def _label_encode(self, arr):
        """Fit a new encoder on ``arr``.

        Returns
        -------
        tuple
            ``(encoded_values, fitted_encoder)``. The fitted *instance* is
            returned so that it can later be used to transform new data or to
            invert the encoding.
        """
        encoder = self._encoder()
        enc_arr = encoder.fit_transform(arr)
        return enc_arr, encoder

    def pandas_to_categorical_encode(self):
        """Encode every column listed in ``dimList``.

        Side effect: ``self.encodeDict`` is replaced by a dict mapping each
        column name to its fitted encoder.

        Returns
        -------
        dict
            ``{column: encoded values}``.
        """
        encodeDict = {}
        labelDict = {}
        for col in self._dimList:
            enc_arr, encoder = self._label_encode(self._df[col])
            encodeDict[col] = encoder
            labelDict[col] = enc_arr
        self.encodeDict = encodeDict
        return labelDict

    def _get_item_id(self, fullIndex):
        """Return the first three ``_``-separated tokens of ``fullIndex``.

        Example: ``'HOBBIES_1_001_CA_1_validation'`` -> ``'HOBBIES_1_001'``.
        """
        tokens = fullIndex.split('_')
        return '_'.join(tokens[:3])

    def _get_store_id(self, fullIndex):
        """Return the fourth and fifth ``_``-separated tokens of ``fullIndex``.

        Example: ``'HOBBIES_1_001_CA_1_validation'`` -> ``'CA_1'``.
        """
        tokens = fullIndex.split('_')
        return '_'.join(tokens[3:5])

    def _get_cate_info(self, fullIndex, cateInfoDir):
        """Read the parquet rows matching the item and store of ``fullIndex``.

        The item and store ids are parsed from ``fullIndex`` with
        :meth:`_get_item_id` / :meth:`_get_store_id` and passed to
        ``pandas.read_parquet`` as equality filters on the ``item_id`` and
        ``store_id`` columns.
        """
        item_id = self._get_item_id(fullIndex)
        store_id = self._get_store_id(fullIndex)
        return pd.read_parquet(cateInfoDir,filters = [("item_id",'=',str(item_id)),("store_id",'=',str(store_id))])

    def _get_events(self, calendar_dir, sdate, edate):
        """Return the calendar rows whose day number lies in ``[sdate, edate]``.

        The CSV at ``calendar_dir`` must have a ``d`` column of strings such
        as ``'d_17'``; the numeric part is stored in a new ``d_num`` column.
        Both bounds are inclusive.
        """
        df = pd.read_csv(calendar_dir)
        df['d_num'] = df['d'].str.replace('d_', '', regex=False).astype('int')
        return df[df['d_num'].between(sdate, edate)]

    def _get_future_events(self, calendar_dir, sdate, duration):
        """Return the calendar rows for ``duration`` days starting at ``sdate``.

        Equivalent to :meth:`_get_events` with
        ``edate = sdate + duration - 1``.
        """
        edate = sdate + duration - 1
        return self._get_events(calendar_dir, sdate, edate)



In [31]:
# Columns of `train` that are categorical dimensions rather than sales values.
categorical_dims = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

fe = feature_engineering(train, categorical_dims)
# Last expression of the cell: displays the {column: encoded array} mapping.
fe.pandas_to_categorical_encode()

{'item_id': array([1437, 1438, 1439, ..., 1434, 1435, 1436]),
 'dept_id': array([3, 3, 3, ..., 2, 2, 2]),
 'cat_id': array([1, 1, 1, ..., 0, 0, 0]),
 'store_id': array([0, 0, 0, ..., 9, 9, 9]),
 'state_id': array([0, 0, 0, ..., 2, 2, 2])}

# Model Training

In [30]:
# Univariate training pipeline built from the first N_SERIES rows of `train`.
WINDOW_SIZE = 100  # time steps per input window
N_SERIES = 100     # number of rows of `train` used

# The categorical columns are dropped by feature_engineering, so only the
# numeric sales values reach the generator.
fe_univariate = feature_engineering(
    train.head(N_SERIES),
    ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])


def univariate_window_generator():
    """Yield one (X, y) pair of sliding windows per row of ``fe_univariate.arr``.

    X has shape (n_windows, WINDOW_SIZE, 1) and y has shape (n_windows, 1),
    so every yielded element already is a batch.
    """
    for series in fe_univariate.arr:
        yield fe_univariate._get_time_tensor(series, WINDOW_SIZE)


# Unique file prefix for the on-disk tf.data cache of this run.
cacheFile = str(uuid.uuid4())
train_univariate = tf.data.Dataset.from_generator(
    univariate_window_generator,
    (tf.float32, tf.float32),
    output_shapes=(tf.TensorShape([None, WINDOW_SIZE, 1]),
                   tf.TensorShape([None, 1])))
# cache -> repeat -> prefetch: prefetch goes last so it overlaps producing the
# next element with the training step that consumes the current one.
train_univariate = (
    train_univariate
    .cache(cacheFile)
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [31]:
# Small sequence regressor: LSTM over a (100, 1) window, a 10-unit ELU layer,
# and a single linear output; trained with Adam on mean absolute error.
simple_lstm_model = tf.keras.models.Sequential()
simple_lstm_model.add(tf.keras.layers.LSTM(units=29, input_shape=(100, 1)))
simple_lstm_model.add(tf.keras.layers.Dense(units=10, activation='elu'))
simple_lstm_model.add(tf.keras.layers.Dense(units=1))
simple_lstm_model.compile(loss='mae', optimizer='adam')

In [2]:
EPOCHS = 10
EVALUATION_INTERVAL = 2000  # dataset elements consumed per epoch

# NOTE(review): validation_data is the same dataset object that is used for
# training, so the reported validation loss is not a held-out estimate.
simple_lstm_model.fit(
    x=train_univariate,
    validation_data=train_univariate,
    steps_per_epoch=EVALUATION_INTERVAL,
    validation_steps=50,
    epochs=EPOCHS,
)

In [None]:
a  # NOTE(review): `a` is not defined anywhere in the visible notebook; this looks like a leftover scratch cell and would raise NameError on a fresh kernel. Consider deleting it.