In [120]:
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [147]:
# Load Dataset
def load_data():
    geo_params = pd.read_csv('../Data/geo_params.csv')
    sales = pd.read_csv('../Data/sales.csv')
    sku = pd.read_csv('../Data/sku.csv')
    test = pd.read_csv('../Data/test.csv')
    
    ## Transform dataset
    
    sales['date'] = pd.to_datetime(sales['date'])
    
    # Fill null values with 0
    sales['price'].fillna(0, inplace=True)
    sales['sales'].fillna(0, inplace=True)
    
    # Normalize data
    sales[['price','sales']] = minmax_scale(sales[['price','sales']])
    
    sku.dropna(subset = ['Category'], inplace=True)
    sku['Category'] = [c.split(',') for c in sku['Category']]
    #print(sku['Category'])
    
    del sku['Units'] # Null field
    
    return geo_params, sales, sku, test

In [149]:
[geo_params,sales,sku,test] = load_data()
# Categories to train the model
cat_aux = (np.concatenate(sku['Category'].to_numpy()).ravel())
Categories = np.char.strip(np.unique(cat_aux))
print(Categories)

['import' 'light' 'own production' 'parbaking' 'plain'
 'slightly sparkling' 'sparkling' 'still' 'toast-type' 'white mold'
 'Avocado' 'Banana' 'Ciabatta' 'Fancy strudel with poppy stuff'
 'Flavored baguette loaf baked in the hearth'
 'Flavored wheat bread baked in the hearth'
 'Grain bread roll baked in the hearth' 'Grapefruit' 'Kiwi' 'Lemon' 'Lime'
 'Mandarin' 'Mango' 'Milk cakes' 'Milk desserts' 'Orange'
 'Plain baguette loaf' 'Plain baguette loaf baked in the hearth'
 'Plain bread roll baked in the hearth' 'Plain croissant'
 'Plain fancy cake' 'Plain long loaf' 'Plain rye-wheat bread'
 'Plain wheat bread' 'Pomegranate'
 'Rye-wheat grain bread baked in the hearth' 'Semi-hard bulk cheese'
 'Semi-hard coarse-pored cheese' 'Semi-hard layered cheese'
 'Small fancy bread with berry stuff' 'Small flavored bread roll'
 'Soft cheese' 'Sweet bun' 'Water' 'Wheat bran bread baked in the hearth'
 'Yoghurts']


In [97]:
# Geo_params Headers
print(geo_params.head())

   geoCluster  cityId
0          21       1
1          47       1
2          48       1
3          92       1
4         112       1


In [98]:
# Sales Headers
print(sales.head())

           ID  geoCluster    SKU       date     price     sales
0  RR27956459          92  32485 2021-07-05  0.005284  0.004994
1  RR27956474          92  32549 2021-07-05  0.012335  0.000624
2  RR27956489         112  32485 2021-05-27  0.006263  0.004619
3  RR27956490         112  32485 2021-05-28  0.000000  0.000000
4  RR27956491         112  32485 2021-05-29  0.000000  0.000000


In [99]:
# Sku Headers
print(sku.head())

     SKU                      Category  \
0     24                 [Pomegranate]   
1    208           [Water,  sparkling]   
2   1008  [Water,  import,  sparkling]   
3  16649               [Water,  still]   
4  20872           [Water,  sparkling]   

                                            Type  brandId  lagerUnitQuantity  \
0            Tropical fruit — Pomegranate— Plain      NaN                1.0   
1  Therapeutic-table water — PET — from 1 to 2 L   1241.0                1.5   
2       Therapeutic-table water — Import — Glass   1241.0                0.5   
3      Table water — PET — from 1 to 2 L — Still   1241.0                1.5   
4  Therapeutic-table water — PET — from 1 to 2 L   2693.0                1.5   

   trademark  countryOfOrigin            Group  
0        NaN              NaN  Tropical fruits  
1     3670.0              1.0    Mineral water  
2     4970.0             14.0    Mineral water  
3     1323.0              1.0    Mineral water  
4     4384.0         

In [100]:
# Test Headers
print(test.head())

           ID  geoCluster    SKU        date  price_filled  sales
0  RR27956447          21  32485  2021-07-08         39.69    NaN
1  RR27956448          21  32485  2021-07-09         39.69    NaN
2  RR27956449          21  32485  2021-07-10         39.69    NaN
3  RR27956450          21  32485  2021-07-11         39.69    NaN
4  RR27956451          21  32485  2021-07-12         39.69    NaN


In [153]:
# Merge table sales and sku

df = pd.merge(sales, sku, on='SKU', how='inner')
df = df.explode('Category')
df['Category'] = df['Category'].str.strip()

In [154]:
# Types of columns

print(df.dtypes)

ID                           object
geoCluster                    int64
SKU                           int64
date                 datetime64[ns]
price                       float64
sales                       float64
Category                     object
Type                         object
brandId                     float64
lagerUnitQuantity           float64
trademark                   float64
countryOfOrigin             float64
Group                        object
dtype: object


In [155]:
# Load train and test set

def load_sets(data):
    X = data[['date', 'price']].copy()
    y = data[['sales']].copy()
    trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.1, shuffle=False)
    return trainX, trainY, testX, testY

In [109]:
# Train 1 model for each category
for category in Categories:
    print(category.strip())
    print(df['Category'])
    data = df.loc[df['Category'] == category]
    
    trainX, trainY, testX, testY = load_sets(data)
    
    
#print(category, trainX.shape, trainY.shape, testX.shape, testY.shape)
#print(Categories)

import
0                                     [Banana]
1                                     [Banana]
2                                     [Banana]
3                                     [Banana]
4                                     [Banana]
                          ...                 
4121838    [Plain fancy cake,  own production]
4121839    [Plain fancy cake,  own production]
4121840    [Plain fancy cake,  own production]
4121841    [Plain fancy cake,  own production]
4121842    [Plain fancy cake,  own production]
Name: Category, Length: 4121843, dtype: object


ValueError: With n_samples=0, test_size=0.1 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [29]:
trainX = train.iloc[:, :-1].to_numpy()
trainY = train.iloc[:, -1].to_numpy()

testX = test.iloc[:, :-1].to_numpy()
testY = test.iloc[:, -1].to_numpy()


# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))



NameError: name 'train' is not defined

In [None]:
# create and fit the LSTM network

look_back = 1
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

In [None]:
lstm_model(train, test)