In [1]:
#meta: 6/23/2022 Ch 4. Pipeline - 0_DataSetup for Autoregressive and Automated Methods with Azure ML
#book: Machine Learning for Time Series Forecasting with Python  
#author: Francesca Lazzeri, PhD.  
#forked from FrancescaLazzeri https://github.com/FrancescaLazzeri/Machine-Learning-for-Time-Series-Forecasting  

#infra: AML anya-ml
#env default AzureML with py 3.8.3
#numpy 1.19.0, pandas 1.1.5


#history
#6/23/2022 MY DATA PIPELINE FOR AZURE AUTOML
#      Get data once, preprocess and use tidy data from this point on.


# References
#How to read a binary file
# refer to https://www.stackvidhya.com/python-read-binary-file/

In [2]:
#$my python version 
import sys
print(sys.version)

3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]


In [3]:
#import resources and packages
from datetime import datetime
import pandas as pd

#import azureml.core
from azureml.core import Dataset, Experiment, Workspace

In [4]:
#-------------- GLOBAL VARS ----------------------------
FILE_RAW_CSV = 'data/NYC_energy_raw.csv'
FILE_TRAIN_CSV = 'data/NYC_energy_train.csv'
FILE_TEST_CSV = 'data/NYC_energy_test.csv'

## 0. Load Data
NYC energy demand dataset (refer to http://mis.nyiso.com/public/P-58Blist.htm) stored in a tabular format.  
Ds includes energy demand and numerical weaher features at an hourly freqquency.

A good ds for ts forecasting: predict the energy demand in NYC for the next 24 hours by building a forecasting solution that leverages historical energy data from the same geographical region.

In [5]:
#define the target column
target_column_name = "demand"
#temporal structure
time_column_name = "timeStamp"

ts_data = Dataset.Tabular.from_delimited_files( #class azureml.data.tabular_dataset.TabularDataset #class azureml.data.tabular_dataset.TabularDataset
    path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/nyc_energy.csv"
).with_timestamp_columns(fine_grain_timestamp=time_column_name)

ts_data.take(5).to_pandas_dataframe().reset_index(drop=True)

Unnamed: 0,timeStamp,demand,precip,temp
0,2012-01-01 00:00:00,4937.5,0.0,46.13
1,2012-01-01 01:00:00,4752.1,0.0,45.89
2,2012-01-01 02:00:00,4542.6,0.0,45.04
3,2012-01-01 03:00:00,4357.7,0.0,45.03
4,2012-01-01 04:00:00,4275.5,0.0,42.61


### 0.1 Persist raw data

In [6]:
#ts_data.take(5).to_pandas_dataframe().to_csv(FILE_DF, index=False)
ts_data.to_pandas_dataframe().to_csv(FILE_RAW_CSV, index=False)

## 1. Prep Data
Define and prepare ts data for forecasting with AutoML

Ds is missing energy demand values for all datetimes later than 2017/08/10 05:00 -> reduce the ds by deleting rows with missing values from the end of ds.

In [7]:
#delete rows due to many NaN values
ts_data = ts_data.time_before(datetime(2017,8,10,5))

In [8]:
#train test split
train = ts_data.time_before(datetime(2017, 8, 8, 5), include_boundary=True)
train.to_pandas_dataframe().reset_index(drop=True).sort_values(time_column_name).tail(5)

test = ts_data.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))
test.to_pandas_dataframe().reset_index(drop=True).head(5)

Unnamed: 0,timeStamp,demand,precip,temp
0,2017-08-08 06:00:00,5590.992,0.0,66.17
1,2017-08-08 07:00:00,6147.033,0.0,66.29
2,2017-08-08 08:00:00,6592.425,0.0,66.72
3,2017-08-08 09:00:00,6874.533,0.0,67.37
4,2017-08-08 10:00:00,7010.542,0.0,68.3


### 1.1 Persist tidy data
Get data once, preprocess and use tidy data from this point on.

In [9]:

train.to_pandas_dataframe().reset_index(drop=True).sort_values(time_column_name).to_csv(FILE_TRAIN_CSV, index=False)
test.to_pandas_dataframe().reset_index(drop=True).to_csv(FILE_TEST_CSV, index=False)

### 1.2 Check persisted data

In [10]:
#preview saved dataframes
df = pd.read_csv(FILE_RAW_CSV)
df_train = pd.read_csv(FILE_TRAIN_CSV)
df_test = pd.read_csv(FILE_TEST_CSV)
print(df.shape, df_train.shape, df_test.shape)

print(df.dtypes)
print(df_train.dtypes)
print(df_test.dtypes)

#preview
df.head()

(49205, 4) (49108, 4) (48, 4)
timeStamp     object
demand       float64
precip       float64
temp         float64
dtype: object
timeStamp     object
demand       float64
precip       float64
temp         float64
dtype: object
timeStamp     object
demand       float64
precip       float64
temp         float64
dtype: object


Unnamed: 0,timeStamp,demand,precip,temp
0,2012-01-01 00:00:00,4937.5,0.0,46.13
1,2012-01-01 01:00:00,4752.1,0.0,45.89
2,2012-01-01 02:00:00,4542.6,0.0,45.04
3,2012-01-01 03:00:00,4357.7,0.0,45.03
4,2012-01-01 04:00:00,4275.5,0.0,42.61


In [11]:
#check missing data in raw dataset
df[df['timeStamp']> '2017-08-10 05:00:00'].head()

Unnamed: 0,timeStamp,demand,precip,temp
49156,2017-08-10 06:00:00,,0.003,67.06
49157,2017-08-10 07:00:00,,0.0,67.22
49158,2017-08-10 08:00:00,,0.0,69.5
49159,2017-08-10 09:00:00,,0.0,73.69
49160,2017-08-10 10:00:00,,0.0,78.07


In [12]:
mystop

NameError: name 'mystop' is not defined

## Xtra

In [None]:
#persist file - binary file stream
#$not - way easier to save as pandas df and will be ready to read

FILE_DIR = 'data/NYC_energy'
FILE_BINARY = 'data/NYC_energy/part-00000'
FILE_BINARY_SMALL = 'data/NYC_energy/NYC_energy_small'
#ts_data.to_csv_files().download(FILE_DIR) #file stream

In [None]:
#read file
# How to read a binary file
#refer to https://www.stackvidhya.com/python-read-binary-file/
f = open(FILE_BINARY_SMALL,'rb') #class _io.BufferedReader

lines = f.readlines()

for line in lines:
    print(line.rstrip())
