In [26]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import numpy as np
import pandas as pd
import os, sys
import requests

sys.path.insert(0, '..')

from src.data_container import *
from src.transformers.transformer import TimeSeriesTransformer, MeanSeriesTransformer, TimeSeriesWindowTransformer

In [2]:
# Download the FordA archive and parse its CSV into rows of floats.
FORDA_URL = "http://timeseriesclassification.com/Downloads/FordA.zip"
FORDA_CSV = 'FordA/FordA.csv'
# The first 505 lines of the CSV are skipped: the series start after this offset.
HEADER_LINES = 505

# Context managers close the HTTP response, the archive and the member file
# even if the download or the parsing fails part-way through.
with urlopen(FORDA_URL) as response:
    payload = response.read()
with ZipFile(BytesIO(payload)) as archive, archive.open(FORDA_CSV) as csv_file:
    raw_rows = [raw.decode('utf-8') for raw in csv_file.readlines()]

# Each remaining line is one time series in comma-separated format.
# Blank lines are skipped so that float('') cannot abort the parse.
lines = [
    [float(value) for value in row.split(',')]
    for row in raw_rows[HEADER_LINES:]
    if row.strip()
]

In [3]:
# Display the first parsed row; at this point it is a plain list of floats.
lines[0]

[1.1871, 0.4096, -0.43154, -1.231, -1.9055, -2.3824, -2.588, -2.5018, -2.1353, -1.574, -0.91166, -0.22714, 0.39089, 0.88353, 1.238, 1.4727, 1.5854, 1.5933, 1.5315, 1.4036, 1.2306, 1.0142, 0.74192, 0.43362, 0.10467, -0.2137, -0.52068, -0.79469, -1.0135, -1.1783, -1.2761, -1.3088, -1.2806, -1.1731, -0.99626, -0.76029, -0.47029, -0.14312, 0.18773, 0.50561, 0.79928, 1.0324, 1.1799, 1.2534, 1.2749, 1.2402, 1.1735, 1.0902, 0.95442, 0.75877, 0.48319, 0.11091, -0.35159, -0.84831, -1.3091, -1.6437, -1.7931, -1.7212, -1.4338, -0.99613, -0.48343, 0.029891, 0.47766, 0.82164, 1.0528, 1.2022, 1.2865, 1.3139, 1.2876, 1.1994, 1.0033, 0.68395, 0.24401, -0.28083, -0.83819, -1.3471, -1.7308, -1.9242, -1.8969, -1.666, -1.2413, -0.67011, -0.023588, 0.65044, 1.2781, 1.7832, 2.0965, 2.1715, 1.9864, 1.536, 0.86777, 0.082829, -0.70826, -1.3856, -1.8466, -2.012, -1.8724, -1.4558, -0.84879, -0.14262, 0.54696, 1.1013, 1.4682, 1.6087, 1.5331, 1.2713, 0.88522, 0.42504, -0.041895, -0.47022, -0.80565, -1.0163, -1.088

In [4]:
# Convert every parsed row (a list of floats) into a pd.Series.
lines = list(map(pd.Series, lines))

In [5]:
# Display the first element again; it is now a pd.Series.
lines[0]

0      1.187100
1      0.409600
2     -0.431540
3     -1.231000
4     -1.905500
5     -2.382400
6     -2.588000
7     -2.501800
8     -2.135300
9     -1.574000
10    -0.911660
11    -0.227140
12     0.390890
13     0.883530
14     1.238000
15     1.472700
16     1.585400
17     1.593300
18     1.531500
19     1.403600
20     1.230600
21     1.014200
22     0.741920
23     0.433620
24     0.104670
25    -0.213700
26    -0.520680
27    -0.794690
28    -1.013500
29    -1.178300
         ...   
471   -1.098000
472   -1.085200
473   -1.105100
474   -1.140800
475   -1.161200
476   -1.106800
477   -0.932220
478   -0.632770
479   -0.263120
480    0.123200
481    0.460400
482    0.709680
483    0.857890
484    0.935240
485    0.967530
486    0.991780
487    1.023700
488    1.042400
489    1.041700
490    1.018700
491    0.938780
492    0.796070
493    0.585600
494    0.310280
495   -0.016134
496   -0.395550
497   -0.816890
498   -1.229000
499   -1.591300
500    1.000000
Length: 501, dtype: float64

Create a MultiSeries: a series whose elements are pd.Series objects.

In [9]:
# Wrap the list of pd.Series in a MultiSeries container
# (presumably provided by the star import from src.data_container — confirm).
X = MultiSeries(lines)

In [13]:
# Preview the first entries of X; each entry is itself a pd.Series.
X.head()

0    0      1.187100
1      0.409600
2     -0.43154...
1    0      0.094261
1      0.310310
2      0.53060...
2    0     -1.157000
1     -1.592600
2     -1.50960...
3    0      0.356960
1      0.300850
2      0.24314...
4    0      0.307980
1      0.370350
2      0.26015...
dtype: object
data_type: <class 'pandas.core.series.Series'>

X is now a Series of pd.Series objects: every element of this Series is itself a pd.Series.
Let's try some transformers on it.
The first transformer is a window transformer. It calculates the rolling mean with a given window size (or the default one) and returns a new series.

In [19]:
# Rolling-mean transformer with a window of 5 points (per the description
# above this cell); `windows_size` is the keyword the constructor is called with.
tr = TimeSeriesWindowTransformer(windows_size=5)
# fit() is called without data for this transformer.
tr.fit()
# Apply the transformer to the whole MultiSeries.
transformed_series = tr.transform(X)

In [21]:
# Preview the first entries of the transformed container.
transformed_series.head()

0    0           NaN
1           NaN
2           Na...
1    0           NaN
1           NaN
2           Na...
2    0           NaN
1           NaN
2           Na...
3    0           NaN
1           NaN
2           Na...
4    0           NaN
1           NaN
2           Na...
dtype: object
data_type: <class 'pandas.core.series.Series'>

Of course, with `windows_size=5` the first 4 elements of each series are NaN.

In [23]:
# First 10 values of the first transformed series.
transformed_series[0].head(10)

0         NaN
1         NaN
2         NaN
3         NaN
4   -0.394268
5   -1.108168
6   -1.707688
7   -2.121740
8   -2.302600
9   -2.236300
dtype: float64

Let's try another transformer, one that extracts the mean and std from each series and transforms the series into a DataFrame.

In [28]:
# Transformer described above this cell as extracting mean and std from each series.
# NOTE: `tr` and `transformed_series` are rebound here, replacing the
# window-transformer objects from the earlier cell.
tr = TimeSeriesTransformer()
# fit() is called without data for this transformer.
tr.fit()
transformed_series = tr.transform(X)

In [30]:
# Check which container type the transformer returned.
type(transformed_series)

src.data_container.MultiDataFrame

And the last example is a transformer with parameters (note that `fit` is given `X` here).

In [31]:
# NOTE: `tr` and `transformed_series` are rebound again in this cell.
tr = MeanSeriesTransformer()
# Unlike the earlier transformers, fit() receives the data here.
tr.fit(X)
transformed_series = tr.transform(X)

In [33]:
# First 10 entries of the result.
transformed_series.head(10)

0   -0.002049
1   -0.002051
2    0.001942
3    0.001943
4    0.001941
5   -0.002050
6   -0.002051
7    0.001943
8    0.001942
9    0.001943
dtype: float64
data_type: <class 'numpy.float64'>