In [1]:
import glob
import json
import os
import sys

import pandas as pd
import numpy as np

import dask
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd

In [88]:
def get_segment_id_from_path(df, path):
    """
    Turn the file-path values of the ``segment_id`` column into integer ids.

    Every literal occurrence of ``path`` and of ".csv" is removed from the
    values of ``segment_id`` and the remainder is cast to ``np.int64``.
    Replacement is done with ``regex=False`` so that neither the dot in
    ".csv" nor any regex metacharacter contained in ``path`` is interpreted
    as a pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``segment_id`` column.
    path : str
        Literal text to strip from the ``segment_id`` values.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``segment_id`` column holds ``np.int64`` values.
        The frame passed in is left unmodified.

    Raises
    ------
    ValueError
        If what is left after stripping cannot be parsed as an integer.
    """
    segment_ids = (
        df["segment_id"]
        .str.replace(path, "", regex=False)
        .str.replace(".csv", "", regex=False)
        .astype(np.int64)
    )

    # assign() builds a new frame instead of writing into the caller's one
    return df.assign(segment_id=segment_ids)

def append_time_column(df):
    """
    Add a ``time`` column holding 0, 1, ..., len(df) - 1 in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to which the running counter is added.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional (or replaced) ``time`` column.
        The frame passed in is left unmodified.
    """
    # assign() works on a copy, so the caller's frame is not written to
    return df.assign(time=range(len(df)))

# JSON file holding the storage account name and key
CREDENTIALS_PATH = "/opt/vssexclude/personal/workshops/feature_engineering_with_tsfresh/notebooks/account_details.json"

with open(CREDENTIALS_PATH) as f:
    credentials = json.load(f)

# Passed as `storage_options=` to the Dask read/write calls in this notebook
storage_options = dict(
    account_name=credentials["ACCOUNT_NAME"],
    account_key=credentials["ACCOUNT_KEY"],
)

In [38]:
# Filesystem locations used by this notebook
DATA_DIR = "/opt/vssexclude/personal/kaggle/volcano/data/raw/train"
FEATURE_PATH = "/opt/vssexclude/personal/workshops/feature_engineering_with_tsfresh/data/features"

# Column dtypes for the CSV reader: sensor_1 ... sensor_10 are all 32-bit floats
data_types = {f"sensor_{sensor_no}": np.float32 for sensor_no in range(1, 11)}



### Start Dask Client

<img src="../images/dask_architechture_diagram.png" width="600" height="200" style="border-style: solid;">

In [4]:
# Local Dask cluster: 10 single-threaded workers with a 2GB memory limit each,
# scheduler on port 8786
cluster_settings = dict(
    n_workers=10,
    threads_per_worker=1,
    scheduler_port=8786,
    memory_limit='2GB',
)
cluster = LocalCluster(**cluster_settings)

# Client bound to that cluster
client = Client(cluster)

In [5]:
# Show the client summary (scheduler address, dashboard link, worker totals)
client

0,1
Client  Scheduler: tcp://127.0.0.1:8786  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 10  Cores: 10  Memory: 20.00 GB


### Read Data

In [10]:
# Count the CSV files under DATA_DIR whose names start with "140".
# NOTE(review): this counts local files, while the data is read from
# abfs://volcano/raw/140*.csv — confirm the two locations hold the same files.
! ls -lrt {DATA_DIR}/140*.csv | wc -l

31


In [61]:
%%time
# Read every "140*.csv" blob (blocksize=None: files are not split into blocks),
# keeping the source path of each row in a `segment_id` column. Then, per partition:
#   1. convert that path into the integer segment id,
#   2. add a column named `time` with ascending values starting from 0,
# and finally keep only the rows labelled 0 through 999.
ddf = (
    dd.read_csv(
        urlpath="abfs://volcano/raw/140*.csv",
        blocksize=None,
        dtype=data_types,
        include_path_column='segment_id',
        storage_options=storage_options,
    )
    .map_partitions(get_segment_id_from_path, "volcano/raw/")
    .map_partitions(append_time_column)
    .loc[0:999, :]
)

CPU times: user 610 ms, sys: 53.2 ms, total: 664 ms
Wall time: 7.4 s


In [63]:
# Number of partitions in the Dask dataframe
ddf.npartitions

28

In [64]:
# Peek at the first rows: sensor readings plus the derived segment_id and time
ddf.head()

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,segment_id,time
0,-486.0,34.0,-87.0,-516.0,234.0,-785.0,522.0,473.0,238.0,2802.0,1400253000,0
1,-567.0,95.0,-92.0,-591.0,231.0,-774.0,589.0,210.0,252.0,2678.0,1400253000,1
2,-631.0,261.0,-120.0,-620.0,212.0,-787.0,433.0,120.0,276.0,2517.0,1400253000,2
3,-744.0,262.0,-215.0,-550.0,174.0,-890.0,322.0,-240.0,334.0,2323.0,1400253000,3
4,-725.0,318.0,-193.0,-475.0,131.0,-806.0,267.0,-14.0,365.0,2089.0,1400253000,4


In [65]:
# Keep the rows labelled 0 through 999 and only the id, time and two sensor columns
selected_columns = ['segment_id', 'time', 'sensor_1', 'sensor_4']
ddf = ddf.loc[0:999, selected_columns]

In [66]:
# Peek at the first rows after the column selection
ddf.head()

Unnamed: 0,segment_id,time,sensor_1,sensor_4
0,1400253000,0,-486.0,-516.0
1,1400253000,1,-567.0,-591.0
2,1400253000,2,-631.0,-620.0
3,1400253000,3,-744.0,-550.0
4,1400253000,4,-725.0,-475.0


In [67]:
# Peek at the last rows after the column selection
ddf.tail()

Unnamed: 0,segment_id,time,sensor_1,sensor_4
995,1409167039,995,525.0,-47.0
996,1409167039,996,477.0,26.0
997,1409167039,997,278.0,107.0
998,1409167039,998,-40.0,191.0
999,1409167039,999,-193.0,245.0


### Generate Features for individual partitions in parallel using Dask

In [68]:
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters

def custom_extract_features(df, column_id, column_sort, default_fc_parameters):
    """
    Generate features with `extract_features` of `tsfresh` and return them as
    a flat frame with the ids in a regular column.

    Parameters
    ----------
    df : pandas.DataFrame
        Time-series data containing the `column_id` and `column_sort` columns.
    column_id : str
        Name of the column identifying each time series. The id column of the
        returned frame carries the same name.
    column_sort : str
        Name of the column by which the values of a series are ordered.
    default_fc_parameters
        Feature-calculator settings forwarded unchanged to `extract_features`.

    Returns
    -------
    pandas.DataFrame
        One row per id: the id column (named `column_id`, dtype `category`)
        followed by the extracted feature columns.

    Notes
    -----
    `n_jobs=0` disables the multiprocessing functionality of `tsfresh`.
    """
    feature_df = extract_features(df,
                                  column_id=column_id,
                                  column_sort=column_sort,
                                  n_jobs=0,
                                  default_fc_parameters=default_fc_parameters,
                                  disable_progressbar=True)

    # Name the index after the id column that was actually used (instead of a
    # hard-coded "segment_id") and turn it into an ordinary column.
    feature_df = feature_df.rename_axis(column_id).reset_index(drop=False)
    feature_df[column_id] = feature_df[column_id].astype('category')
    return feature_df

In [69]:
# Run the tsfresh extraction on each partition with the minimal feature set
extraction_kwargs = dict(
    column_id='segment_id',
    column_sort='time',
    default_fc_parameters=MinimalFCParameters(),
)
ddf_features = ddf.map_partitions(custom_extract_features, **extraction_kwargs)

In [70]:
# Number of partitions in the feature dataframe
ddf_features.npartitions

28

In [71]:
def get_segment_ids(df):
    """Return how many rows of `df` carry each `segment_id` value."""
    id_column = df["segment_id"]
    return id_column.value_counts()

In [72]:
# Count the rows per segment_id within every partition and collect the result
ddf_features.map_partitions(get_segment_ids).compute()

1400253000    1
1400727315    1
1400929225    1
1402556914    1
1402674973    1
1402914692    1
1403005697    1
1403222059    1
1403244730    1
1403440092    1
1403947680    1
1404122310    1
1404179874    1
1404322654    1
1404502479    1
1405189645    1
1405443107    1
1406234149    1
1406456924    1
1406626451    1
1406938061    1
1407084157    1
1407094442    1
1407261706    1
1408285202    1
1408645616    1
1408663387    1
1409167039    1
Name: segment_id, dtype: int64

In [73]:
# Distinct segment ids present in the first partition
ddf_features.partitions[0].segment_id.unique().compute()

0    1400253000
Name: segment_id, dtype: category
Categories (1, int64): [1400253000]

### Write the features back to storage (Azure Blob Storage via `abfs://`)

In [77]:
# Write the features as a Parquet dataset, one sub-directory per segment_id,
# without the index and without appending to an existing dataset
parquet_write_options = dict(
    path="abfs://volcano/features",
    engine="pyarrow",
    partition_on="segment_id",
    write_index=False,
    append=False,
    storage_options=storage_options,
)
ddf_features.to_parquet(**parquet_write_options)

In [83]:
# Read the written Parquet files back by globbing the per-segment directories.
# NOTE(review): the displayed result has no `segment_id` column — presumably
# the partition value is not recovered when the leaf files are globbed like
# this; confirm whether the dataset root should be read instead.
ddf_features_from_disk = dd.read_parquet(
    path="abfs://volcano/features/*/*.parquet",
    storage_options=storage_options,
)

In [87]:
# Materialise the features that were read back and display them
ddf_features_from_disk.compute()

Unnamed: 0,sensor_1__sum_values,sensor_1__median,sensor_1__mean,sensor_1__length,sensor_1__standard_deviation,sensor_1__variance,sensor_1__maximum,sensor_1__minimum,sensor_4__sum_values,sensor_4__median,sensor_4__mean,sensor_4__length,sensor_4__standard_deviation,sensor_4__variance,sensor_4__maximum,sensor_4__minimum
0,-5286.0,19.0,-5.286,1000.0,370.556244,137311.9,1117.0,-1364.0,-16044.0,-38.5,-16.044001,1000.0,364.98819,133216.4,1109.0,-929.0
0,354441.0,976.5,354.44101,1000.0,11997.532227,143940800.0,32767.0,-32767.0,-37289.0,406.5,-37.289001,1000.0,12828.428711,164568600.0,32767.0,-32767.0
0,-36673.0,-53.0,-36.673,1000.0,481.30838,231657.8,1068.0,-1446.0,12212.0,-65.0,12.212,1000.0,442.950134,196204.8,1306.0,-896.0
0,-28392.0,-19.5,-28.392,1000.0,559.073425,312563.1,1482.0,-1305.0,44935.0,15.0,44.935001,1000.0,635.278503,403578.8,1933.0,-1736.0
0,4700.0,0.5,4.7,1000.0,225.85939,51012.46,531.0,-553.0,-848.0,-15.0,-0.848,1000.0,247.436676,61224.91,692.0,-698.0
0,7582.0,15.0,7.582,1000.0,249.712463,62356.32,712.0,-708.0,-11415.0,0.0,-11.415,1000.0,222.817245,49647.53,747.0,-588.0
0,4069.0,-36.0,4.069,1000.0,697.855347,487002.1,2109.0,-2315.0,-3377.0,-4.0,-3.377,1000.0,281.174011,79058.83,858.0,-903.0
0,-5334.0,-6.5,-5.334,1000.0,496.390961,246404.0,1365.0,-1124.0,12199.0,-4.0,12.199,1000.0,528.914307,279750.3,1397.0,-1809.0
0,9736.0,13.0,9.736,1000.0,152.40062,23225.95,359.0,-584.0,-8292.0,-3.0,-8.292,1000.0,112.092766,12564.79,362.0,-284.0
0,7318.0,7.0,7.318,1000.0,219.485367,48173.83,640.0,-484.0,7769.0,2.5,7.769,1000.0,199.156036,39663.13,508.0,-628.0
