In [61]:
import tensorflow as tf

In [62]:
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [63]:
# Load Dataset

def load_data():
    geo_params = pd.read_csv('../Data/geo_params.csv')
    sales = pd.read_csv('../Data/sales.csv')
    sku = pd.read_csv('../Data/sku.csv')
    test = pd.read_csv('../Data/test.csv')
    
    return [geo_params, sales, sku, test]

In [64]:
[geo_params,sales,sku,test] = load_data()

In [65]:
# Geo_params Headers

print(geo_params.head())

   geoCluster  cityId
0          21       1
1          47       1
2          48       1
3          92       1
4         112       1


In [66]:
# Sales Headers

print(sales.head())

           ID  geoCluster    SKU        date  price  sales
0  RR27956459          92  32485  2021-07-05  38.29    4.0
1  RR27956474          92  32549  2021-07-05  89.39    0.5
2  RR27956489         112  32485  2021-05-27  45.39    3.7
3  RR27956490         112  32485  2021-05-28    NaN    NaN
4  RR27956491         112  32485  2021-05-29    NaN    NaN


In [67]:
# Sku Headers
del sku['Units'] # Null field
print(sku.head())

     SKU                  Category  \
0     24               Pomegranate   
1    208          Water, sparkling   
2   1008  Water, import, sparkling   
3  16649              Water, still   
4  20872          Water, sparkling   

                                            Type  brandId  lagerUnitQuantity  \
0            Tropical fruit — Pomegranate— Plain      NaN                1.0   
1  Therapeutic-table water — PET — from 1 to 2 L   1241.0                1.5   
2       Therapeutic-table water — Import — Glass   1241.0                0.5   
3      Table water — PET — from 1 to 2 L — Still   1241.0                1.5   
4  Therapeutic-table water — PET — from 1 to 2 L   2693.0                1.5   

   trademark  countryOfOrigin            Group  
0        NaN              NaN  Tropical fruits  
1     3670.0              1.0    Mineral water  
2     4970.0             14.0    Mineral water  
3     1323.0              1.0    Mineral water  
4     4384.0              1.0    Mineral wate

In [68]:
# Test Headers

print(test.head())

           ID  geoCluster    SKU        date  price_filled  sales
0  RR27956447          21  32485  2021-07-08         39.69    NaN
1  RR27956448          21  32485  2021-07-09         39.69    NaN
2  RR27956449          21  32485  2021-07-10         39.69    NaN
3  RR27956450          21  32485  2021-07-11         39.69    NaN
4  RR27956451          21  32485  2021-07-12         39.69    NaN


In [69]:
df = pd.merge(sales, sku, on='SKU', how='inner')
df.head()
df['geoCluster'] = df['geoCluster'].astype('float32')
df['SKU'] = df['SKU'].astype('float32')
df['price'] = df['price'].astype('float32')
df['sales'] = df['sales'].astype('float32')
# geoCluste, date, price, sales, category, type, 

In [70]:
#print(dt[dt.trademark == 'NaN'])
print(df['countryOfOrigin'].isna().sum())
print(len(df))
print(df.dtypes)

2329612
4605985
ID                    object
geoCluster           float32
SKU                  float32
date                  object
price                float32
sales                float32
Category              object
Type                  object
brandId              float64
lagerUnitQuantity    float64
trademark            float64
countryOfOrigin      float64
Group                 object
dtype: object


In [54]:
# Fill null values with 0

df.fillna(0)

Unnamed: 0,ID,geoCluster,SKU,date,price,sales,Category,Type,brandId,lagerUnitQuantity,trademark,countryOfOrigin,Group
0,RR27956459,92.0,32485,2021-07-05,38.290001,4.0,Banana,Banana — Plain,0.0,1.0,0.0,0.0,Tropical fruits
1,RR27956489,112.0,32485,2021-05-27,45.389999,3.7,Banana,Banana — Plain,0.0,1.0,0.0,0.0,Tropical fruits
2,RR27956490,112.0,32485,2021-05-28,0.000000,0.0,Banana,Banana — Plain,0.0,1.0,0.0,0.0,Tropical fruits
3,RR27956491,112.0,32485,2021-05-29,0.000000,0.0,Banana,Banana — Plain,0.0,1.0,0.0,0.0,Tropical fruits
4,RR27956492,112.0,32485,2021-05-30,0.000000,0.0,Banana,Banana — Plain,0.0,1.0,0.0,0.0,Tropical fruits
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4605980,RR55905027,3209.0,362721,2021-07-01,36.990002,1.0,"Plain fancy cake, own production",Cupcake,0.0,300.0,9666.0,1.0,Bakery
4605981,RR55905028,3209.0,362721,2021-07-02,36.990002,2.0,"Plain fancy cake, own production",Cupcake,0.0,300.0,9666.0,1.0,Bakery
4605982,RR55905029,3209.0,362721,2021-07-03,0.000000,0.0,"Plain fancy cake, own production",Cupcake,0.0,300.0,9666.0,1.0,Bakery
4605983,RR55905030,3209.0,362721,2021-07-04,36.990002,8.0,"Plain fancy cake, own production",Cupcake,0.0,300.0,9666.0,1.0,Bakery


In [55]:
# normalize the dataset

df[['price','sales']] = minmax_scale(df[['price','sales']])

In [56]:
train, test = train_test_split(df, test_size=0.1)

In [57]:
trainX = train.iloc[:, :-1].to_numpy()
trainY = train.iloc[:, -1].to_numpy()

testX = test.iloc[:, :-1].to_numpy()
testY = test.iloc[:, -1].to_numpy()


# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))



In [58]:
# create and fit the LSTM network

look_back = 1
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
lstm_model(train, test)