In [14]:
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [15]:
# Load Dataset

def load_data(data_dir='../Data'):
    """Read the four CSV tables and apply basic cleaning to sales and sku.

    Parameters
    ----------
    data_dir : str or path-like, default '../Data'
        Directory containing geo_params.csv, sales.csv, sku.csv and test.csv.

    Returns
    -------
    tuple
        ``(geo_params, sales, sku, test)`` as DataFrames.

        * ``sales``: 'date' parsed to datetime; nulls in 'price' and 'sales'
          replaced by 0, then both columns min-max scaled column-wise.
        * ``sku``: rows with a null 'Category' removed and the 'Units'
          column dropped.
        * ``geo_params`` and ``test`` are returned exactly as read (``test``
          is NOT cleaned or scaled here).

    Raises
    ------
    FileNotFoundError
        If any of the CSV files is missing from ``data_dir``.
    KeyError
        If an expected column (e.g. 'Units' in sku.csv) is absent.
    """
    geo_params = pd.read_csv('{}/geo_params.csv'.format(data_dir))
    sales = pd.read_csv('{}/sales.csv'.format(data_dir))
    sku = pd.read_csv('{}/sku.csv'.format(data_dir))
    test = pd.read_csv('{}/test.csv'.format(data_dir))

    ## Transform dataset

    sales['date'] = pd.to_datetime(sales['date'])

    # Fill null values with 0.
    # Assign the result back instead of calling fillna(..., inplace=True) on
    # sales['price']: the chained in-place form operates on a temporary
    # column object, is deprecated in recent pandas, and does nothing once
    # Copy-on-Write is enabled.
    value_cols = ['price', 'sales']
    sales[value_cols] = sales[value_cols].fillna(0)

    # Normalize data (each column scaled independently)
    sales[value_cols] = minmax_scale(sales[value_cols])

    # Build a cleaned sku frame without mutating in place.
    sku = sku.dropna(subset=['Category'])
    sku = sku.drop(columns=['Units'])  # Null field

    return geo_params, sales, sku, test

In [43]:
geo_params, sales, sku, test = load_data()

# Distinct product categories found in sku; the notebook iterates over these
Categories = sku['Category'].unique()

In [44]:
# Preview the first five rows of geo_params

print(geo_params.head(5))

   geoCluster  cityId
0          21       1
1          47       1
2          48       1
3          92       1
4         112       1


In [45]:
# Preview the first five rows of sales (price and sales are already scaled)

print(sales.head(5))

           ID  geoCluster    SKU       date     price     sales
0  RR27956459          92  32485 2021-07-05  0.005284  0.004994
1  RR27956474          92  32549 2021-07-05  0.012335  0.000624
2  RR27956489         112  32485 2021-05-27  0.006263  0.004619
3  RR27956490         112  32485 2021-05-28  0.000000  0.000000
4  RR27956491         112  32485 2021-05-29  0.000000  0.000000


In [46]:
# Preview the first five rows of sku

print(sku.head(5))

     SKU                  Category  \
0     24               Pomegranate   
1    208          Water, sparkling   
2   1008  Water, import, sparkling   
3  16649              Water, still   
4  20872          Water, sparkling   

                                            Type  brandId  lagerUnitQuantity  \
0            Tropical fruit — Pomegranate— Plain      NaN                1.0   
1  Therapeutic-table water — PET — from 1 to 2 L   1241.0                1.5   
2       Therapeutic-table water — Import — Glass   1241.0                0.5   
3      Table water — PET — from 1 to 2 L — Still   1241.0                1.5   
4  Therapeutic-table water — PET — from 1 to 2 L   2693.0                1.5   

   trademark  countryOfOrigin            Group  
0        NaN              NaN  Tropical fruits  
1     3670.0              1.0    Mineral water  
2     4970.0             14.0    Mineral water  
3     1323.0              1.0    Mineral water  
4     4384.0              1.0    Mineral wate

In [47]:
# Preview the first five rows of test

print(test.head(5))

           ID  geoCluster    SKU        date  price_filled  sales
0  RR27956447          21  32485  2021-07-08         39.69    NaN
1  RR27956448          21  32485  2021-07-09         39.69    NaN
2  RR27956449          21  32485  2021-07-10         39.69    NaN
3  RR27956450          21  32485  2021-07-11         39.69    NaN
4  RR27956451          21  32485  2021-07-12         39.69    NaN


In [48]:
# Attach the sku attributes to every sales row.
# Inner join on SKU: sales rows without a matching SKU in sku are dropped.

df = sales.merge(sku, how='inner', on='SKU')
df.head()

Unnamed: 0,ID,geoCluster,SKU,date,price,sales,Category,Type,brandId,lagerUnitQuantity,trademark,countryOfOrigin,Group
0,RR27956459,92,32485,2021-07-05,0.005284,0.004994,Banana,Banana — Plain,,1.0,,,Tropical fruits
1,RR27956489,112,32485,2021-05-27,0.006263,0.004619,Banana,Banana — Plain,,1.0,,,Tropical fruits
2,RR27956490,112,32485,2021-05-28,0.0,0.0,Banana,Banana — Plain,,1.0,,,Tropical fruits
3,RR27956491,112,32485,2021-05-29,0.0,0.0,Banana,Banana — Plain,,1.0,,,Tropical fruits
4,RR27956492,112,32485,2021-05-30,0.0,0.0,Banana,Banana — Plain,,1.0,,,Tropical fruits


In [49]:
# Column dtypes of the merged sales/sku frame

print(df.dtypes)

ID                           object
geoCluster                    int64
SKU                           int64
date                 datetime64[ns]
price                       float64
sales                       float64
Category                     object
Type                         object
brandId                     float64
lagerUnitQuantity           float64
trademark                   float64
countryOfOrigin             float64
Group                        object
dtype: object


In [59]:
# Load train and test set

def load_sets(data):
    """Split a frame into train/test features and targets without shuffling.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns 'date', 'price' and 'sales'.

    Returns
    -------
    tuple
        ``(trainX, trainY, testX, testY)``. Features are the 'date' and
        'price' columns; the target is a single-column 'sales' frame. The
        final 10% of rows (by position) form the test part.
    """
    features = data.loc[:, ['date', 'price']].copy()
    target = data.loc[:, ['sales']].copy()

    # shuffle=False keeps the incoming row order, so the split is positional.
    # NOTE(review): rows are not sorted by 'date' here — confirm the caller's
    # ordering if a chronological hold-out is intended.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.1, shuffle=False
    )
    return features_train, target_train, features_test, target_test

In [60]:
# Build the train/test split for each category.
# NOTE(review): no model is trained inside this loop yet, and trainX/trainY/
# testX/testY are overwritten on every iteration, so only the last category's
# split survives after the loop.

for category in Categories:
    data = df[df['Category'].eq(category)]

    trainX, trainY, testX, testY = load_sets(data)

    #print(category, trainX.shape, trainY.shape, testX.shape, testY.shape)

Pomegranate (66049, 2) (66049, 1) (7339, 2) (7339, 1)
Water, sparkling (212250, 2) (212250, 1) (23584, 2) (23584, 1)
Water, import, sparkling (185738, 2) (185738, 1) (20638, 2) (20638, 1)
Water, still (207763, 2) (207763, 1) (23085, 2) (23085, 1)
Semi-hard coarse-pored cheese (343400, 2) (343400, 1) (38156, 2) (38156, 1)
Banana (115188, 2) (115188, 1) (12799, 2) (12799, 1)
Avocado (229132, 2) (229132, 1) (25460, 2) (25460, 1)
Orange (103050, 2) (103050, 1) (11451, 2) (11451, 1)
Grapefruit (133115, 2) (133115, 1) (14791, 2) (14791, 1)
Lemon (103980, 2) (103980, 1) (11554, 2) (11554, 1)
Water, slightly sparkling (84467, 2) (84467, 1) (9386, 2) (9386, 1)
Mango (75942, 2) (75942, 1) (8439, 2) (8439, 1)
Semi-hard bulk cheese (82992, 2) (82992, 1) (9222, 2) (9222, 1)
Milk cakes (67931, 2) (67931, 1) (7548, 2) (7548, 1)
Kiwi (88196, 2) (88196, 1) (9800, 2) (9800, 1)
Lime (90837, 2) (90837, 1) (10093, 2) (10093, 1)
Plain long loaf, own production (43676, 2) (43676, 1) (4853, 2) (4853, 1)
Plain

In [None]:
# Convert the train/test frames to numpy arrays and add a time-step axis.
# NOTE(review): `train` is not defined in any cell shown in this notebook, and
# `test` (as far as the visible cells show) is the raw test.csv frame returned
# by load_data() — confirm which frames were intended before running this.
# NOTE(review): this reassigns trainX/trainY/testX/testY, replacing the values
# left behind by the per-category split loop.

# Features are every column except the last; the target is the last column.
trainX = train.iloc[:, :-1].to_numpy()
trainY = train.iloc[:, -1].to_numpy()

testX = test.iloc[:, :-1].to_numpy()
testY = test.iloc[:, -1].to_numpy()


# reshape input to be [samples, time steps, features]
# (one time step per sample; the feature count is taken from the array)
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))



In [None]:
# create and fit the LSTM network

# Kept for any later cell that refers to it; no longer used for the input shape.
look_back = 1

model = Sequential()
# Take (time steps, features) from the training array itself. trainX is
# expected to be 3-D [samples, time steps, features]; hard-coding
# (1, look_back) only matched arrays with exactly one feature and made
# fit() fail on a shape mismatch otherwise.
model.add(LSTM(4, input_shape=trainX.shape[1:]))
model.add(Dense(1))  # single regression output
model.compile(loss='mean_squared_error', optimizer='adam')
# NOTE(review): batch_size=1 for 100 epochs performs one weight update per
# sample per epoch — expect very long run times on large training sets.
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

In [None]:
# NOTE(review): neither `lstm_model` nor `train` is defined in the cells shown
# in this notebook; this call raises NameError on a fresh kernel unless they
# are defined elsewhere — confirm or remove.
lstm_model(train, test)