# Store Dataset Preparation

This notebook briefly applies a short data engineering to the dataset which will be used for the actual model training

In [1]:
import pandas as pd

In [2]:
# Run this cell when running on Colab
!cp /content/drive/MyDrive/mynt/train.csv .

In [3]:
data = pd.DataFrame(pd.read_csv("train.csv"))

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
data

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,4,2015-04-30,6228,650,1,1,0,0
1,2,4,2015-04-30,6884,716,1,1,0,0
2,3,4,2015-04-30,9971,979,1,1,0,0
3,4,4,2015-04-30,16106,1854,1,1,0,0
4,5,4,2015-04-30,6598,729,1,1,0,0
...,...,...,...,...,...,...,...,...,...
914624,1111,2,2013-01-01,0,0,0,0,a,1
914625,1112,2,2013-01-01,0,0,0,0,a,1
914626,1113,2,2013-01-01,0,0,0,0,a,1
914627,1114,2,2013-01-01,0,0,0,0,a,1


In [5]:
# Slice the date and retain Month and Day
date = [item for item in data['Date'].values]

# Print sample sizes from the date
print(date[0:5])

['2015-04-30', '2015-04-30', '2015-04-30', '2015-04-30', '2015-04-30']


In [6]:
# Get the month and day
month = [month.split("-")[1] for month in date]
day = [day.split("-")[2] for day in date]

In [7]:
# Check for the retrieved items and check length from original to verify
print(month[0:5])
print(day[0:5])
print("Length date:", len(date), "\nLength Month:", len(month), "\nLength Day:", len(day))

['04', '04', '04', '04', '04']
['30', '30', '30', '30', '30']
Length date: 914629 
Length Month: 914629 
Length Day: 914629


In [8]:
# Remove the date feature from the dataset
new_data = data.drop(['Date'], axis=1)
new_data.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,4,6228,650,1,1,0,0
1,2,4,6884,716,1,1,0,0
2,3,4,9971,979,1,1,0,0
3,4,4,16106,1854,1,1,0,0
4,5,4,6598,729,1,1,0,0


In [9]:
# Append the month and week features to the intended dataset
new_data["Month"] = month
new_data["Day"] = day

In [10]:
# Check the new dataset
new_data.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Month,Day
0,1,4,6228,650,1,1,0,0,4,30
1,2,4,6884,716,1,1,0,0,4,30
2,3,4,9971,979,1,1,0,0,4,30
3,4,4,16106,1854,1,1,0,0,4,30
4,5,4,6598,729,1,1,0,0,4,30


In [12]:
# Transform strings to numerical index from the dataset (e.g. a=1)
state_holiday = [state for state in new_data["StateHoliday"]]

set(state_holiday)

{0, '0', 'a', 'b', 'c'}

In [13]:
# Iterate through all items and represent to numerical
new_state_holiday = []
for state in state_holiday:
  if state == 'a':
    new_state_holiday.append(1)
  elif state == 'b':
    new_state_holiday.append(2)
  elif state == 'c':
    new_state_holiday.append(3)
  else:
    new_state_holiday.append(0)

In [14]:
new_data["StateHoliday"] = new_state_holiday

In [15]:
# Visualize the updated dataset
new_data

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Month,Day
0,1,4,6228,650,1,1,0,0,04,30
1,2,4,6884,716,1,1,0,0,04,30
2,3,4,9971,979,1,1,0,0,04,30
3,4,4,16106,1854,1,1,0,0,04,30
4,5,4,6598,729,1,1,0,0,04,30
...,...,...,...,...,...,...,...,...,...,...
914624,1111,2,0,0,0,0,1,1,01,01
914625,1112,2,0,0,0,0,1,1,01,01
914626,1113,2,0,0,0,0,1,1,01,01
914627,1114,2,0,0,0,0,1,1,01,01


In [16]:
# Save the new_data and begin training with it
new_data.to_csv("train_modified.csv", encoding="utf-8", index=False)