## Loading the data

In [42]:
import numpy as np

# Location of the raw avocado dataset, relative to the notebook.
filepath = 'avocado.csv'

# One dtype per column that is kept, in file order: the date label,
# nine numeric columns, then the two category labels (type, region).
types = ["|S10"] + [float] * 9 + ["|S20", "|S20"]

# Column positions (out of 14) that are not loaded from the file.
columnsToIgnore = [0, 12]
columnsToLoad = np.setdiff1d(range(14), columnsToIgnore)

# names=True takes the field names from the first line of the file.
rawData = np.genfromtxt(
    filepath,
    delimiter=',',
    usecols=columnsToLoad,
    dtype=types,
    names=True,
)

# Show the first three records as the cell output.
rawData[:3]

array([('2015-12-27', 1.33,  64236.62, 1036.74,  54454.85,  48.16, 8696.87, 8603.62,  93.25, 0., 'conventional', 'Albany'),
       ('2015-12-20', 1.35,  54876.98,  674.28,  44638.81,  58.33, 9505.56, 9408.07,  97.49, 0., 'conventional', 'Albany'),
       ('2015-12-13', 0.93, 118220.22,  794.7 , 109149.67, 130.5 , 8145.35, 8042.21, 103.14, 0., 'conventional', 'Albany')],
      dtype=[('Date', 'S10'), ('AveragePrice', '<f8'), ('Total_Volume', '<f8'), ('4046', '<f8'), ('4225', '<f8'), ('4770', '<f8'), ('Total_Bags', '<f8'), ('Small_Bags', '<f8'), ('Large_Bags', '<f8'), ('XLarge_Bags', '<f8'), ('type', 'S20'), ('region', 'S20')])

In [43]:
from datetime import datetime
from __future__ import division

# np.lib.recfunctions is a submodule that a plain `import numpy` does not load,
# so import it explicitly instead of relying on an earlier cell having done so.
from numpy.lib import recfunctions as rfn

# Parse every date label once; both derived columns below reuse the result.
# NOTE(review): 'Date' is loaded as a byte string ('S10'); on Python 3 it would
# have to be decoded before strptime accepts it.
parsedDates = [datetime.strptime(dateLabel, '%Y-%m-%d') for dateLabel in rawData['Date']]

# Convert date to epoch time (whole seconds elapsed since 1970-01-01)
epochDT = datetime(1970,1,1)
reformattedDates = np.array([(parsedDate - epochDT).total_seconds()
                             for parsedDate in parsedDates]).astype(int)

# Category-features will remain as labels until training time, and will be stored in the database as labels.
# This is a design decision I would be keen to discuss
monthNames = np.array(['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
# Months are numbered from 1, hence the -1 to index into monthNames.
monthColumn = monthNames[np.array([parsedDate.month for parsedDate in parsedDates], dtype=int) - 1]

# Replace the raw 'Date' field with the two derived fields.
extendedData = rfn.drop_fields(rawData, 'Date')
extendedData = rfn.append_fields(extendedData, names=['Epoch_time', 'Month'], data=[reformattedDates, monthColumn], dtypes=[int, '|S20'])

## Split into train/test
I'm sampling at evenly spaced intervals from the dataset (roughly one row in ten goes to the test set) to make sure the features are evenly represented, and that every category of the categorical features is present in the training data.

In [86]:
numSamples = len(extendedData)

# One tenth of the records (rounded down) go to the test set, picked at
# evenly spaced positions between the first and the last record.
numTestSamples = numSamples // 10
lastIndex = numSamples - 1
testingIndices = np.linspace(0, lastIndex, num=numTestSamples, dtype=int)

# Every record that was not picked for testing is used for training.
everyIndex = range(numSamples)
trainingIndices = np.setdiff1d(everyIndex, testingIndices)

## Save dataset as .csv

In [85]:
# Both splits are written the same way: comma-separated, one record per
# line, every field formatted with '%s'.
csvOptions = dict(fmt='%s', delimiter=',')

np.savetxt('avocadoCleanedTrain.csv', extendedData[trainingIndices], **csvOptions)
np.savetxt('avocadoCleanedTest.csv', extendedData[testingIndices], **csvOptions)