In [1]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np
import datetime

# Vizualization Libraries
# import plotly.express as px
# import seaborn as sns
import matplotlib.pyplot as plt

# pre-processing
from sklearn.preprocessing import StandardScaler

# ML model Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import cross_val_score
# from sklearn.tree import DecisionTreeClassifier
# from sklearn import metrics
# from sklearn.naive_bayes import GaussianNB
# from sklearn.linear_model import SGDClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import mean_squared_error

#sklearn
from sklearn.preprocessing import MinMaxScaler

# #statsmodels
# import statsmodels.api as sm
# from statsmodels.tsa.stattools import acf
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# from statsmodels.tsa.stattools import adfuller
# from statsmodels.tsa.arima_model import ARMA
# from statsmodels.tsa.arima_model import ARIMA
# from statsmodels.tsa.arima_process import ArmaProcess
# from statsmodels.tsa.statespace.sarimax import SARIMAX
# !pip install pmdarima
# from pmdarima.arima import auto_arima

## for Deep-learing:
import tensorflow.keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import SGD 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Flatten
from tensorflow.keras.layers import Dropout

import warnings
warnings.filterwarnings('ignore')

In [2]:
retail = pd.read_csv('Train.csv')
retail_test = pd.read_csv('Test.csv')
retail_data = retail.copy()
retail_test_data = retail_test.copy()

#Drop Duplicate rows
retail_data.drop_duplicates(subset=None, keep='first', inplace=True)
#Only dropped one outlier
retail_data.drop(retail_data.loc[retail_data['UnitPrice']>35000,:].index,inplace=True)

# #No missing values
# import missingno as msno
# msno.matrix(retail_test)
# print(retail_test.isna().sum())
#Seperate Categorical and Numerical Columns
cat_cols = retail_data.select_dtypes(include=['object','category']).columns.tolist()
print(cat_cols)

num_cols = retail_data.select_dtypes(include=['int64','float64']).columns.tolist()
print(num_cols)

retail_data.head()

['InvoiceDate']
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'UnitPrice', 'CustomerID', 'Country']


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6141,1583,144,3,2011-05-06 16:54:00,3.75,14056.0,35
1,6349,1300,3682,6,2011-05-11 07:35:00,1.95,13098.0,35
2,16783,2178,1939,4,2011-11-20 13:20:00,5.95,15044.0,35
3,16971,2115,2983,1,2011-11-22 12:07:00,0.83,15525.0,35
4,6080,1210,2886,12,2011-05-06 09:00:00,1.65,13952.0,35


In [3]:
def drop_irrelavant_columns(df):
  df.drop(columns=['InvoiceNo','Description','Quantity','CustomerID','Country'],inplace=True)

def sample_by_hour_set_index(df):
  df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
  df.set_index('InvoiceDate',inplace=True)
  df.index = pd.to_datetime(df.index.strftime('%Y-%m-%d %H:%M:%S'))
  df.sort_index(inplace=True)
  #df.resample(rule='H').mean().fillna(method ='ffill', inplace = True)

def convert_InvoiceDate_to_features(df):
  df['year'] = df.index.year
  df['month'] = df.index.month
  df['day'] = df.index.day
  df['hour'] = df.index.hour
  df['minute'] = df.index.minute
  return df

counts = pd.DataFrame({'StockCodeCounts' : retail_data['StockCode'].value_counts()})
test_counts = pd.DataFrame({'StockCodeCounts' : retail_test_data['StockCode'].value_counts()})
test_index = test_counts.index
filtered_columns = counts[counts['StockCodeCounts']>400].index
common_columns = []
for col in filtered_columns:
    if col in test_index:
        common_columns.append(col)

def convert_column_to_dummies(df,colname):
  
  dummies = pd.get_dummies(df[colname])
  global common_columns

  final_cols = common_columns if colname == 'StockCode' else dummies.columns
  for col in final_cols:
    df[col] = dummies[col]

  df.drop(columns=[colname],inplace=True)

def convert_unit_price_to_supervised(df):
  reframed = series_to_supervised(df['UnitPrice'].values, n_in=7, n_out=1, dropnan=True)
  for col in reframed.columns:
        df[col] = reframed[col].values
  return df

def series_to_supervised(data, n_in=7, n_out=1, dropnan=True):
    n_vars = 1
    dff = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(dff.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    # for i in range(0, n_out):
    # 	cols.append(dff.shift(-i))
    # 	if i == 0:
    # 		names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
    # 	else:
    # 		names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.fillna(method='bfill', inplace=True)
    return agg

In [4]:
retail_data.shape

(282158, 8)

In [5]:
drop_irrelavant_columns(retail_data)
drop_irrelavant_columns(retail_test_data)

sample_by_hour_set_index(retail_data)
sample_by_hour_set_index(retail_test_data)

supervised_combined = convert_unit_price_to_supervised(retail_data)

supervised_combined = convert_InvoiceDate_to_features(supervised_combined)
retail_test_data = convert_InvoiceDate_to_features(retail_test_data)

convert_column_to_dummies(supervised_combined,'StockCode')
convert_column_to_dummies(retail_test_data,'StockCode')

convert_column_to_dummies(supervised_combined,'year')
convert_column_to_dummies(retail_test_data,'year')

convert_column_to_dummies(supervised_combined,'month')
convert_column_to_dummies(retail_test_data,'month')

supervised_combined.reset_index(inplace=True)
supervised_combined.drop(columns=['InvoiceDate'],inplace=True)
supervised_combined.dropna(inplace=True)
retail_test_data.dropna(inplace=True)

# scaler = MinMaxScaler(feature_range=(0, 1))
# supervised_combined = scaler.fit_transform(supervised_combined)
# model_ready_test_data = scaler.fit_transform(retail_test_data)

model_ready_test_data = retail_test_data.iloc[:,0:].values
to_predict_data = model_ready_test_data.reshape((model_ready_test_data.shape[0], 1, model_ready_test_data.shape[1]))

In [6]:
supervised_combined

Unnamed: 0,UnitPrice,var1(t-7),var1(t-6),var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),day,hour,...,3,4,5,6,7,8,9,10,11,12
0,3.39,3.39,3.39,3.39,3.39,3.39,3.39,3.39,1,8,...,0,0,0,0,0,0,0,0,0,1
1,2.75,3.39,3.39,3.39,3.39,3.39,3.39,3.39,1,8,...,0,0,0,0,0,0,0,0,0,1
2,3.39,3.39,3.39,3.39,3.39,3.39,3.39,2.75,1,8,...,0,0,0,0,0,0,0,0,0,1
3,7.65,3.39,3.39,3.39,3.39,3.39,2.75,3.39,1,8,...,0,0,0,0,0,0,0,0,0,1
4,4.25,3.39,3.39,3.39,3.39,2.75,3.39,7.65,1,8,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282153,4.15,2.95,8.95,7.08,3.75,4.15,3.75,1.95,9,12,...,0,0,0,0,0,0,0,0,0,1
282154,1.95,8.95,7.08,3.75,4.15,3.75,1.95,4.15,9,12,...,0,0,0,0,0,0,0,0,0,1
282155,2.10,7.08,3.75,4.15,3.75,1.95,4.15,1.95,9,12,...,0,0,0,0,0,0,0,0,0,1
282156,4.95,3.75,4.15,3.75,1.95,4.15,1.95,2.10,9,12,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# split into train and test sets
n_train_time = int(supervised_combined.shape[0]*0.8)
supervised_combined[:n_train_time]
train = supervised_combined[:n_train_time].values
test = supervised_combined[n_train_time:].values

##test = values[n_train_time:n_test_time, :]
# split into input and outputs
train_X, train_y = train[:,1:], train[:,0]
test_X, test_y = test[:,1:], test[:,0]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape) 

(225726, 1, 119) (225726,) (56432, 1, 119) (56432,)


In [8]:
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(None, train_X.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

epochs = 5
batch = 128
lr = 0.0001

# lstm_model = Sequential()
# lstm_model.add(LSTM(10, input_shape=(train_X.shape[1], train_X.shape[2]), return_sequences=True))
# lstm_model.add(LSTM(6, activation='relu', return_sequences=True))
# lstm_model.add(LSTM(1, activation='relu'))
# lstm_model.add(Dense(10, kernel_initializer='glorot_normal', activation='relu'))
# lstm_model.add(Dense(10, kernel_initializer='glorot_normal', activation='relu'))
# lstm_model.add(Dense(1))
# lstm_model.summary()

# adam = optimizers.Adam(lr)
# lstm_model.compile(loss='mse', optimizer=adam)

# fit network
history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch, validation_data=(test_X, test_y), verbose=2, shuffle=False)

# make a prediction
nyhat = model.predict(to_predict_data)

Epoch 1/5
1764/1764 - 5s - loss: 948.1352 - val_loss: 274.1926
Epoch 2/5
1764/1764 - 5s - loss: 936.0707 - val_loss: 277.4747
Epoch 3/5
1764/1764 - 5s - loss: 935.5571 - val_loss: 273.3329
Epoch 4/5
1764/1764 - 4s - loss: 939.2935 - val_loss: 274.1361
Epoch 5/5
1764/1764 - 4s - loss: 940.6198 - val_loss: 272.5091


ValueError: in user code:

    /root/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1462 predict_function  *
        return step_function(self, iterator)
    /root/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1452 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /root/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /root/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /root/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /root/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1445 run_step  **
        outputs = model.predict_step(data)
    /root/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1418 predict_step
        return self(x, training=False)
    /root/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:976 __call__
        self.name)
    /root/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/input_spec.py:227 assert_input_compatibility
        ', found shape=' + str(shape))

    ValueError: Input 0 is incompatible with layer sequential: expected shape=(None, None, 119), found shape=[None, 1, 112]


In [None]:
UnitPrice.shape

In [None]:
len(nyhat)

In [64]:
testmeankk = pd.read_csv('kkmean10.csv')
testmeankk.shape

(122049, 1)

In [86]:
UnitPrice = pd.DataFrame(nyhat,columns=['UnitPrice'])
UnitPrice['UnitPrice'] = np.round(UnitPrice['UnitPrice'].astype(float),decimals=2)
UnitPrice.to_csv('Nov15lstmkk03'+'.csv', index=False, index_label=None)