In [None]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw train/test CSVs and keep untouched originals next to working copies.
retail = pd.read_csv('/content/Train.csv')
retail_test = pd.read_csv('/content/Test.csv')
retail_data, retail_test_data = retail.copy(), retail_test.copy()

# Remove exact duplicate rows, keeping the first occurrence of each.
retail_data.drop_duplicates(inplace=True)

# Remove rows whose UnitPrice is above 35000 (author's note: this drops a single outlier).
retail_data.drop(index=retail_data[retail_data['UnitPrice'] > 35000].index, inplace=True)

# Missing-value check kept for reference (author's note: no missing values):
# import missingno as msno
# msno.matrix(retail_test)
# print(retail_test.isna().sum())

# Separate categorical and numerical column names.
cat_cols = list(retail_data.select_dtypes(include=['object', 'category']).columns)
print(cat_cols)

num_cols = list(retail_data.select_dtypes(include=['int64', 'float64']).columns)
print(num_cols)

retail_data.head()

In [None]:
def drop_irrelavant_columns(df):
  """Drop the InvoiceNo, Description, Quantity, CustomerID and Country columns.

  The frame is modified in place and nothing is returned. pandas raises
  ``KeyError`` if any of the listed columns is absent.
  """
  unused = ['InvoiceNo', 'Description', 'Quantity', 'CustomerID', 'Country']
  df.drop(labels=unused, axis=1, inplace=True)

def sample_by_hour_set_index(df):
  """Index the frame by its parsed ``InvoiceDate`` and sort it chronologically.

  The frame is modified in place and nothing is returned:

  * ``InvoiceDate`` is parsed to datetimes and becomes the index.
  * The index is rebuilt from a ``'%Y-%m-%d %H:%M:%S'`` rendering, which
    discards any sub-second component.
  * Rows are sorted by that index.

  NOTE(review): despite the name, no hourly resampling is applied. The
  original body ended with
  ``df.resample(rule='H').mean().fillna(method='ffill', inplace=True)``,
  whose result was discarded, so ``df`` was never resampled. That statement
  is removed because it only cost time and, on pandas >= 2.0, raises when
  ``mean()`` meets a non-numeric column. Actually resampling would drop
  non-numeric columns such as ``StockCode`` that later cells still use, so
  it should be a deliberate, separate change.
  """
  df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
  df.set_index('InvoiceDate', inplace=True)
  # Round-trip through strings to truncate timestamps to whole seconds.
  df.index = pd.to_datetime(df.index.strftime('%Y-%m-%d %H:%M:%S'))
  df.sort_index(inplace=True)

def convert_InvoiceDate_to_features(df):
  """Expand the frame's datetime index into calendar feature columns.

  Adds 'year', 'month', 'day', 'hour' and 'minute' columns (in that order),
  each read from the matching attribute of ``df.index``. The frame is
  modified in place and the same object is returned.
  """
  for part in ('year', 'month', 'day', 'hour', 'minute'):
    df[part] = getattr(df.index, part)
  return df


def convert_column_to_dummies(df,colname):
  """One-hot encode ``df[colname]`` in place.

  One indicator column is added per distinct value of ``colname`` (named
  after that value), then the source column is dropped. Nothing is returned.
  """
  indicators = pd.get_dummies(df[colname])

  # Attach each indicator under the label pandas generated for it.
  for label, indicator in indicators.items():
    df[label] = indicator

  df.drop(labels=[colname], axis=1, inplace=True)

def convert_unit_price_to_supervised(df, n_in=7):
  """Add lagged ``UnitPrice`` columns to ``df`` and return it.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing a ``UnitPrice`` column. It is modified in place.
  n_in : int, default 7
      Number of lag columns to create (passed to ``series_to_supervised``).

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, now carrying one extra column per lag, named
      as produced by ``series_to_supervised``.

  The previous implementation called ``df.assign(col=...)`` and discarded
  the result. ``assign`` returns a new frame, and the keyword names the
  column literally ``"col"``, so no lag column was ever added.
  """
  reframed = series_to_supervised(df['UnitPrice'].values, n_in=n_in, n_out=1, dropnan=True)
  for col in reframed.columns:
    # Assign raw values: ``reframed`` is built from a bare array and has its
    # own default index, so index alignment against ``df`` must be bypassed.
    df[col] = reframed[col].values
  return df

def series_to_supervised(data, n_in=7, n_out=1, dropnan=True):
    """Build lagged copies of a series as feature columns.

    Parameters
    ----------
    data : array-like
        Observations in time order. Anything ``pandas.DataFrame`` accepts:
        a 1-D sequence (one variable) or a 2-D array (one column per variable).
    n_in : int, default 7
        Number of lags to generate, from ``t-n_in`` down to ``t-1``.
    n_out : int, default 1
        Unused. Forecast columns (t, t+1, ...) are not generated; the
        parameter is kept so existing calls keep working.
    dropnan : bool, default True
        Despite the name, no rows are dropped. When true, the NaNs created
        by shifting (and any other NaNs) are back-filled from the next valid
        value in the same column, so the result keeps one row per input row.

    Returns
    -------
    pandas.DataFrame
        ``n_in * n_vars`` columns named ``'var<j>(t-<i>)'``, oldest lag first,
        with as many rows as ``data``.

    Raises
    ------
    ValueError
        If ``n_in`` is smaller than 1 (there would be nothing to concatenate).
    """
    if n_in < 1:
        raise ValueError('n_in must be at least 1, got %r' % (n_in,))

    dff = pd.DataFrame(data)
    # Derive the variable count from the data instead of hard-coding 1, so
    # multi-column input gets a matching number of column names.
    n_vars = dff.shape[1]

    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(dff.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # Back-fill rather than drop, preserving the row count of the input.
    if dropnan:
        agg = agg.bfill()
    return agg

In [None]:
# (rows, columns) of the working training frame at this point in the notebook.
retail_data.shape

In [None]:
# --- Training frame: prune columns, index by timestamp, add lag/date/dummy features ---
drop_irrelavant_columns(retail_data)
sample_by_hour_set_index(retail_data)

# Both helpers return the frame they were given, so `supervised_combined`
# refers to the same object as `retail_data`.
supervised_combined = convert_InvoiceDate_to_features(
    convert_unit_price_to_supervised(retail_data)
)
convert_column_to_dummies(supervised_combined, 'StockCode')

# --- Test frame: same preparation, minus the UnitPrice lag step ---
drop_irrelavant_columns(retail_test_data)
sample_by_hour_set_index(retail_test_data)
retail_test_data = convert_InvoiceDate_to_features(retail_test_data)
convert_column_to_dummies(retail_test_data, 'StockCode')

# Discard incomplete rows, then add a length-1 middle axis so the array is
# shaped (rows, 1, columns).
retail_test_data.dropna(inplace=True)
model_ready_test_data = retail_test_data.values
to_predict_data = model_ready_test_data[:, np.newaxis, :]

In [None]:
# Inspect the prepared training frame built in the preprocessing cell.
supervised_combined

In [None]:
# Number of rows that make up the first 80% of the prepared frame.
n_train_time = int(supervised_combined.shape[0]*0.8)
# Preview that leading portion. The bound is a row count, so slice by
# position with .iloc; label-based .loc would try to interpret the integer
# as an index label, and this frame is indexed by timestamp.
supervised_combined.iloc[:n_train_time, :]

In [None]:
# Positional 80/20 split of the prepared matrix: leading rows train, trailing rows test.
values = supervised_combined.values
n_train_time = int(len(supervised_combined)*0.8)
train, test = values[:n_train_time, :], values[n_train_time:, :]

# Column 0 is taken as the target; every remaining column is an input.
# NOTE(review): this relies on the target being the first column — confirm
# against the column order of `supervised_combined`.
train_y, train_X = train[:, 0], train[:, 1:]
test_y, test_X = test[:, 0], test[:, 1:]

# Insert a length-1 middle axis: [samples, timesteps, features].
train_X = train_X[:, np.newaxis, :]
test_X = test_X[:, np.newaxis, :]

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)