# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from keras.layers import Input, Dense
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


# Load Data

In [2]:
df_product_demand = pd.read_csv("C:\github\EECS731\world-wide-products\Data\Historical Product Demand.csv")
df_product_demand

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500
...,...,...,...,...,...
1048570,Product_1791,Whse_J,Category_006,2016/4/27,1000
1048571,Product_1974,Whse_J,Category_006,2016/4/27,1
1048572,Product_1787,Whse_J,Category_006,2016/4/28,2500
1048573,Product_0901,Whse_J,Category_023,2016/10/7,50


# Transform Data

Remove NaN values

In [3]:
df_product_demand = df_product_demand.dropna()

In [4]:
print(len(df_product_demand['Warehouse'].unique()))
print(len(df_product_demand['Product_Code'].unique()))
print(len(df_product_demand['Product_Category'].unique()))

4
2160
33


Find the product with the most data

In [5]:
df_product_demand['Product_Code'].mode()

0    Product_1359
dtype: object

Remove data that isn't product_1359

In [6]:
df_product_1359 = df_product_demand[df_product_demand.Product_Code == 'Product_1359']
df_product_1359

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
276,Product_1359,Whse_J,Category_019,2012/4/18,80000
282,Product_1359,Whse_J,Category_019,2012/5/21,70000
289,Product_1359,Whse_J,Category_019,2012/6/26,80000
292,Product_1359,Whse_J,Category_019,2012/7/17,100000
296,Product_1359,Whse_J,Category_019,2012/8/24,100000
...,...,...,...,...,...
1046514,Product_1359,Whse_J,Category_019,2016/10/3,20000
1046515,Product_1359,Whse_J,Category_019,2016/9/16,10000
1046516,Product_1359,Whse_J,Category_019,2016/11/1,30000
1046517,Product_1359,Whse_J,Category_019,2016/12/1,20000


In [7]:
df_product_1359['Date'].iloc[0].split('/')

['2012', '4', '18']

In [8]:
year = []
month = []
day = []
for date in df_product_1359['Date']:
    y,m,d = date.split('/')
    year.append(y)
    month.append(m)
    day.append(d)
df_product_1359['Year'] = year
df_product_1359['Month'] = month
df_product_1359['Day'] = day
df_product_1359 = df_product_1359.drop(columns=['Date','Product_Code','Product_Category'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [9]:
df_product_1359

Unnamed: 0,Warehouse,Order_Demand,Year,Month,Day
276,Whse_J,80000,2012,4,18
282,Whse_J,70000,2012,5,21
289,Whse_J,80000,2012,6,26
292,Whse_J,100000,2012,7,17
296,Whse_J,100000,2012,8,24
...,...,...,...,...,...
1046514,Whse_J,20000,2016,10,3
1046515,Whse_J,10000,2016,9,16
1046516,Whse_J,30000,2016,11,1
1046517,Whse_J,20000,2016,12,1


In [10]:
df_product_1359['Warehouse'].unique()

array(['Whse_J'], dtype=object)

In [11]:
df_product_1359 = df_product_1359.drop(columns=['Warehouse'])
df_product_1359

Unnamed: 0,Order_Demand,Year,Month,Day
276,80000,2012,4,18
282,70000,2012,5,21
289,80000,2012,6,26
292,100000,2012,7,17
296,100000,2012,8,24
...,...,...,...,...
1046514,20000,2016,10,3
1046515,10000,2016,9,16
1046516,30000,2016,11,1
1046517,20000,2016,12,1


In [12]:
data = df_product_1359

In [13]:
print('Processing Product Demand')
for i in data.index:
    val = data.at[i,'Order_Demand'].strip()
    if not val.isdigit():
        val = val[1:-1]
    data.at[i,'Order_Demand'] = int(val)
print('Processing Year')
for i in data.index:
    val = data.at[i,'Year'].strip()
    if not val.isdigit():
        val = val[1:-1]
    data.at[i,'Year'] = int(val)
print('Processing Month')
for i in data.index:
    val = data.at[i,'Month'].strip()
    if not val.isdigit():
        val = val[1:-1]
    data.at[i,'Month'] = int(val)
print('Processing Day')
for i in data.index:
    val = data.at[i,'Day'].strip()
    if not val.isdigit():
        val = val[1:-1]
    data.at[i,'Day'] = int(val)

Processing Product Demand
Processing Year
Processing Month
Processing Day


In [14]:
data = data.astype('int32')
data

Unnamed: 0,Order_Demand,Year,Month,Day
276,80000,2012,4,18
282,70000,2012,5,21
289,80000,2012,6,26
292,100000,2012,7,17
296,100000,2012,8,24
...,...,...,...,...
1046514,20000,2016,10,3
1046515,10000,2016,9,16
1046516,30000,2016,11,1
1046517,20000,2016,12,1


In [15]:
data = data.sort_values(by=['Year','Month','Day'])
data

Unnamed: 0,Order_Demand,Year,Month,Day
2577,2000,2012,1,5
4875,25000,2012,1,5
7940,100000,2012,1,5
14860,300000,2012,1,5
20412,150000,2012,1,5
...,...,...,...,...
870423,3000,2016,12,28
871104,10000,2016,12,28
877287,3000,2016,12,28
943424,50000,2016,12,28


In [16]:
data = data.groupby(['Year','Month','Day'],as_index=False).sum()
data

Unnamed: 0,Year,Month,Day,Order_Demand
0,2012,1,5,702000
1,2012,1,6,676000
2,2012,1,10,783000
3,2012,1,11,200000
4,2012,1,12,284000
...,...,...,...,...
1227,2016,12,22,180000
1228,2016,12,26,286000
1229,2016,12,27,90000
1230,2016,12,28,140000


In [17]:
data[data.Order_Demand.between(0,100000)]['Order_Demand']

8        29000
72       85000
78       13000
84       25000
87       10000
         ...  
1203     66000
1208     71000
1219     68000
1229     90000
1231    100000
Name: Order_Demand, Length: 121, dtype: int32

# Build Models

In [18]:
x = np.array(data.drop(columns=['Order_Demand']))
y = np.array(data['Order_Demand'])

In [19]:
x

array([[2012,    1,    5],
       [2012,    1,    6],
       [2012,    1,   10],
       ...,
       [2016,   12,   27],
       [2016,   12,   28],
       [2017,    1,    6]], dtype=int64)

In [20]:
y

array([702000, 676000, 783000, ...,  90000, 140000, 100000])

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)
print(len(x_train),len(y_train),len(x_test),len(y_test))

862 862 370 370


In [22]:
model = Sequential()
model.add(Dense(64,input_dim=3,activation='tanh'))
model.add(Dense(1))
model.compile(optimizer='sgd', loss='mean_absolute_error', metrics=['accuracy'])

In [23]:
model.fit(x=x_train,y=y_train,epochs=10,validation_split=.2,batch_size=32)

Train on 689 samples, validate on 173 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x27abbd1bba8>

In [24]:
predictions = model.predict(np.array(x_test))
predictions = list(map(lambda a: a[0],predictions))
error = abs(y_test-predictions)

In [25]:
average_error = sum(error)/len(error)
average_correct_value = sum(y_test)/len(y_test)
print("Average Error:",average_error)
print("Average Percent Error:",100*average_error/average_correct_value)

Average Error: 369184.4262840477
Average Percent Error: 99.9621205452599


The neural network is not working well because neural networks are not very good at exact outputs. They are much more successful with classification problems. They do do work in field such as regression with things like object detection but that requires very powerful feature networks with tons of training data which is something we do not have.

In [26]:
rf = RandomForestClassifier(n_estimators = 100, verbose=1)
rf.fit(x_train, y_train)
preds = rf.predict(x_test)
error = preds-y_test
print("% error:",sum(abs(error))*100/(average_correct_value))

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.4s finished


% error: 5215.376516355654


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
  """


The random forest classifier performs much worse than the neural network but let's try a random forest regressor to see if it performs better because this is closer to a regression problem than a classification problem.

In [27]:
rr = RandomForestRegressor(n_estimators = 100, verbose=1)
rr.fit(x_train, y_train)
preds = rr.predict(x_test)
error = preds-y_test
print("Average error:", sum(abs(error)))
print("% error:",sum(abs(error))*100/(average_correct_value))

Average error: 86080490.0
% error: 23307.560409806076


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


23307% error is obviously not a good model. The best model is then is the neural network.