In [169]:
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [170]:
# Load Dataset
def load_data():
    geo_params = pd.read_csv('../Data/geo_params.csv')
    sales = pd.read_csv('../Data/sales.csv')
    sku = pd.read_csv('../Data/sku.csv')
    test = pd.read_csv('../Data/test.csv')
    
    ## Transform dataset
    sales['date'] = pd.to_datetime(sales['date'])
    
    # Fill null values with 0
    sales['price'].fillna(0, inplace=True)
    sales['sales'].fillna(0, inplace=True)
    
    # Normalize data
    sales[['price','sales']] = minmax_scale(sales[['price','sales']])
    
    sku.dropna(subset = ['Category'], inplace=True)
    sku['Category'] = [c.split(',') for c in sku['Category']]
    #print(sku['Category'])
    
    sales.dropna(subset = ['geoCluster'], inplace=True)
    
    del sku['Units'] # Null field
    
    return geo_params, sales, sku, test

In [171]:
[geo_params,sales,sku,test] = load_data()
# Categories to train the model
cat_aux = (np.concatenate(sku['Category'].to_numpy()).ravel())
Categories = np.char.strip(np.unique(cat_aux))
GeoCluster = np.unique(sales['geoCluster'])

In [172]:
# Geo_params Headers
print(geo_params.head())

   geoCluster  cityId
0          21       1
1          47       1
2          48       1
3          92       1
4         112       1


In [173]:
# Sales Headers
print(sales.head())

           ID  geoCluster    SKU       date     price     sales
0  RR27956459          92  32485 2021-07-05  0.005284  0.004994
1  RR27956474          92  32549 2021-07-05  0.012335  0.000624
2  RR27956489         112  32485 2021-05-27  0.006263  0.004619
3  RR27956490         112  32485 2021-05-28  0.000000  0.000000
4  RR27956491         112  32485 2021-05-29  0.000000  0.000000


In [174]:
# Sku Headers
print(sku.head())

     SKU                      Category  \
0     24                 [Pomegranate]   
1    208           [Water,  sparkling]   
2   1008  [Water,  import,  sparkling]   
3  16649               [Water,  still]   
4  20872           [Water,  sparkling]   

                                            Type  brandId  lagerUnitQuantity  \
0            Tropical fruit — Pomegranate— Plain      NaN                1.0   
1  Therapeutic-table water — PET — from 1 to 2 L   1241.0                1.5   
2       Therapeutic-table water — Import — Glass   1241.0                0.5   
3      Table water — PET — from 1 to 2 L — Still   1241.0                1.5   
4  Therapeutic-table water — PET — from 1 to 2 L   2693.0                1.5   

   trademark  countryOfOrigin            Group  
0        NaN              NaN  Tropical fruits  
1     3670.0              1.0    Mineral water  
2     4970.0             14.0    Mineral water  
3     1323.0              1.0    Mineral water  
4     4384.0         

In [175]:
# Test Headers
print(test.head())

           ID  geoCluster    SKU        date  price_filled  sales
0  RR27956447          21  32485  2021-07-08         39.69    NaN
1  RR27956448          21  32485  2021-07-09         39.69    NaN
2  RR27956449          21  32485  2021-07-10         39.69    NaN
3  RR27956450          21  32485  2021-07-11         39.69    NaN
4  RR27956451          21  32485  2021-07-12         39.69    NaN


In [176]:
# Merge table sales and sku

df = pd.merge(sales, sku, on='SKU', how='inner')
df = df.explode('Category')
df['Category'] = df['Category'].str.strip()

In [177]:
# Types of columns
print(df.dtypes)

ID                           object
geoCluster                    int64
SKU                           int64
date                 datetime64[ns]
price                       float64
sales                       float64
Category                     object
Type                         object
brandId                     float64
lagerUnitQuantity           float64
trademark                   float64
countryOfOrigin             float64
Group                        object
dtype: object


In [155]:
# Load train and test set

def load_sets(data):
    X = data[['date', 'price']].copy()
    y = data[['sales']].copy()
    trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.1, shuffle=False)
    return trainX, trainY, testX, testY

In [None]:
# Train 1 model for each category
for geocluster in GeoCluster:
    print(df.loc[df['geoCluster']])
    for category in Categories:
        #data = [df.loc[df['Category'] == category], df.loc[df['geoCluster'] == geocluster)]
        print()
        #trainX, trainY, testX, testY = load_sets(data)

#print(testY)

              ID  geoCluster    SKU       date  price  sales Category  \
92    RR27956641         131  32485 2021-06-10    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
...          ...         ...    ...        ...    ...    ...      ...   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   

                Type  brandId  lagerUnitQuantity  trademark  countryOfOrigin  \
92    Banana — Plain      NaN              

              ID  geoCluster    SKU       date  price  sales Category  \
92    RR27956641         131  32485 2021-06-10    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
...          ...         ...    ...        ...    ...    ...      ...   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   

                Type  brandId  lagerUnitQuantity  trademark  countryOfOrigin  \
92    Banana — Plain      NaN              

              ID  geoCluster    SKU       date  price  sales Category  \
92    RR27956641         131  32485 2021-06-10    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
112   RR27956661         131  32485 2021-06-30    0.0    0.0   Banana   
...          ...         ...    ...        ...    ...    ...      ...   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   
3209  RR27972848        1894  32485 2020-12-20    0.0    0.0   Banana   

                Type  brandId  lagerUnitQuantity  trademark  countryOfOrigin  \
92    Banana — Plain      NaN              

In [29]:
trainX = train.iloc[:, :-1].to_numpy()
trainY = train.iloc[:, -1].to_numpy()

testX = test.iloc[:, :-1].to_numpy()
testY = test.iloc[:, -1].to_numpy()


# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))



NameError: name 'train' is not defined

In [None]:
# create and fit the LSTM network

look_back = 1
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

In [None]:
lstm_model(train, test)