In [1]:
import sklearn
import numpy as np
import pandas as pd
import datetime
from pyts.approximation import SymbolicFourierApproximation
from datetime import datetime

# Loading data and adding date column

In [2]:
def load_clean_csv(path):
    """Read one cleaned instrument CSV and add ``timestamp`` and ``date`` columns.

    The file is expected to contain an ``'Unnamed: 0'`` column (dropped here)
    and a ``'datetime'`` column of strings formatted as ``%Y-%m-%d %H:%M:%S``.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents without ``'Unnamed: 0'``, plus a parsed
        ``'timestamp'`` column and a ``'date'`` column holding the
        ``datetime.date`` part of each timestamp.
    """
    frame = pd.read_csv(path).drop(columns=['Unnamed: 0'])
    frame['timestamp'] = pd.to_datetime(frame['datetime'], format='%Y-%m-%d %H:%M:%S')
    # Calendar day of every measurement; used later as the grouping key.
    frame['date'] = frame['timestamp'].dt.date
    return frame


# One frame per instrument, all prepared in exactly the same way.
epam = load_clean_csv('epam_clean.csv')
swepam = load_clean_csv('swepam_clean.csv')
mag = load_clean_csv('mag_clean.csv')
sis = load_clean_csv('sis_clean.csv')

# Aggregating by date using SFA

### Putting all column values in one list per day

In [3]:
# Example before transformation: the first two raw MAG rows (one row per timestamp)
mag.head(2)

Unnamed: 0,Bx,By,Bz,Bt,Lat.,Long.,datetime,timestamp,date
0,-6.2,-3.1,-0.1,6.9,-0.6,206.5,2001-08-07 00:00:00,2001-08-07 00:00:00,2001-08-07
1,-6.3,-3.3,-0.3,7.1,-2.7,207.8,2001-08-07 00:01:00,2001-08-07 00:01:00,2001-08-07


In [4]:
# MAG columns to keep; 'date' is last because it only serves as the grouping key.
mag_wanted_columns = ['Bx', 'By', 'Bz', 'Bt', 'Lat.', 'Long.', 'date']
mag_agg_grouped = mag[mag_wanted_columns].groupby(['date'])
# Collapse every day's measurements into one list per column.
mag_listed = mag_agg_grouped[mag_wanted_columns[:-1]].agg(
    lambda day_values: list(day_values.to_numpy())
)
# Swap the index of datetime.date objects for a DatetimeIndex.
mag_listed.index = pd.DatetimeIndex(mag_listed.index)

In [5]:
# Example after transformation: one row per day, each cell holding that day's list of values
mag_listed.head(2)

Unnamed: 0_level_0,Bx,By,Bz,Bt,Lat.,Long.
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-08-07,"[-6.2, -6.3, -6.2, -6.1, -6.1, -6.1, -6.5, -6....","[-3.1, -3.3, -3.1, -3.4, -3.5, -3.5, -3.0, -3....","[-0.1, -0.3, -0.6, -0.6, -0.4, 0.0, -0.3, 0.0,...","[6.9, 7.1, 6.9, 7.0, 7.0, 7.0, 7.2, 7.2, 7.0, ...","[-0.6, -2.7, -4.7, -4.8, -3.1, 0.2, -2.3, -0.4...","[206.5, 207.8, 206.5, 209.2, 210.0, 209.8, 204..."
2001-08-08,"[-5.8, -5.3, -5.1, -5.4, -5.1, -5.6, -6.6, -6....","[1.5, 2.3, 2.6, 2.7, 3.0, 3.2, 1.5, 2.5, 3.3, ...","[-2.6, -2.8, -2.7, -1.8, -1.1, 0.6, -0.2, -0.9...","[6.5, 6.4, 6.3, 6.3, 6.1, 6.5, 6.7, 6.7, 6.5, ...","[-23.6, -26.2, -25.1, -16.5, -10.2, 5.1, -1.5,...","[165.8, 156.9, 152.8, 153.6, 149.5, 149.8, 167..."


In [6]:
# SWEPAM columns to keep; 'date' is last because it only serves as the grouping key.
swepam_wanted_columns = ['proton_density', 'bulk_speed', 'ion_temperature', 'date']
swepam_agg_grouped = swepam[swepam_wanted_columns].groupby(['date'])
# Collapse every day's measurements into one list per column.
swepam_listed = swepam_agg_grouped[swepam_wanted_columns[:-1]].agg(
    lambda day_values: list(day_values.to_numpy())
)
# Swap the index of datetime.date objects for a DatetimeIndex.
swepam_listed.index = pd.DatetimeIndex(swepam_listed.index)

In [7]:
# EPAM columns to keep; 'date' is last because it only serves as the grouping key.
epam_wanted_columns = [
    '38-53', '175-315', '47-65', '112-187', '310-580',
    '761-1220', '060-1910', 'anis_ratio', 'date',
]
epam_agg_grouped = epam[epam_wanted_columns].groupby(['date'])
# Collapse every day's measurements into one list per column.
epam_listed = epam_agg_grouped[epam_wanted_columns[:-1]].agg(
    lambda day_values: list(day_values.to_numpy())
)
# Swap the index of datetime.date objects for a DatetimeIndex.
epam_listed.index = pd.DatetimeIndex(epam_listed.index)

In [8]:
# SIS columns to keep; 'date' is last because it only serves as the grouping key.
sis_wanted_columns = ['> 10 MeV', '> 30 MeV', 'date']
sis_agg_grouped = sis[sis_wanted_columns].groupby(['date'])
# Collapse every day's measurements into one list per column.
sis_listed = sis_agg_grouped[sis_wanted_columns[:-1]].agg(
    lambda day_values: list(day_values.to_numpy())
)
# Swap the index of datetime.date objects for a DatetimeIndex.
sis_listed.index = pd.DatetimeIndex(sis_listed.index)

# SFA

SFA requires that all time series being transformed have the same number of timestamps.

SFA is applied to two separate dataframes because (sis, epam) are sampled every 5 minutes while (mag, swepam) are sampled every minute, so the two groups have a different number of timestamps per day.

### Adjust length of daily timeseries

If the time series happen to vary slightly in length due to some earlier error, the function `slick_array` truncates all of them to the length of the shortest one.

In [9]:
def slick_array(array):
    """Trim every sequence in ``array`` to the length of the shortest one.

    Parameters
    ----------
    array : iterable of sliceable sequences
        For example one dataframe row whose cells are per-day lists.

    Returns
    -------
    numpy.ndarray
        Array built from the truncated sequences, one row per input sequence.

    Raises
    ------
    ValueError
        If ``array`` contains no sequences (``min`` of an empty collection).
    """
    sequences = list(array)
    shortest = min(len(seq) for seq in sequences)
    return np.array([seq[:shortest] for seq in sequences])

### Making an object required for transformation
Each cell of the dataframe contains a list that represents one daily time series. Every list in a row needs to be converted to a `numpy.array`, and all the arrays of that row need to be combined into a single `numpy.array`. For some reason I couldn't find a simpler way to do this :)

In [10]:
def list_to_numpy(listici_arg):
    """Stack the sequences of one row as the columns of a 2-D array.

    The sequences are first cut to a common length by ``slick_array``; the
    resulting ``(n_sequences, length)`` array is then transposed so that each
    input sequence becomes one column.

    Parameters
    ----------
    listici_arg : iterable of sequences
        Passed unchanged to ``slick_array``.

    Returns
    -------
    numpy.ndarray
        C-contiguous array of shape ``(length, n_sequences)``.

    Notes
    -----
    NOTE(review): with this orientation every row is one timestamp and every
    column one input series, so a transformer that treats rows as samples sees
    each timestamp as a sample -- confirm this is the intended layout.
    """
    trimmed = slick_array(listici_arg)
    # A single transpose replaces the former column-by-column np.concatenate
    # loop, which re-allocated the whole result once per column.
    return np.ascontiguousarray(trimmed.T)

### SFA transformation

In [11]:
def transform(array, num_coefs=3, num_bins=10):
    """Encode one row of daily series with Symbolic Fourier Approximation.

    Parameters
    ----------
    array : iterable of sequences
        Converted to a 2-D matrix by ``list_to_numpy``.
    num_coefs : int, default 3
        Requested ``n_coefs``; capped at the number of matrix columns.
    num_bins : int, default 10
        ``n_bins`` handed to ``SymbolicFourierApproximation``.

    Returns
    -------
    list or numpy.ndarray
        An empty list when the matrix has fewer rows than ``num_bins``;
        otherwise the flattened output of ``fit_transform``.
        NOTE(review): the two branches return different types (list vs.
        ndarray) -- callers must accept both.
    """
    matrix = list_to_numpy(array)
    if matrix.shape[0] < num_bins:
        # Fewer rows than bins: skip the transformation for this row.
        return []
    sfa = SymbolicFourierApproximation(
        n_coefs=min(num_coefs, matrix.shape[1]),
        n_bins=num_bins,
    )
    return sfa.fit_transform(matrix).flatten()

In [12]:
# "Manji" (smaller) frame: the 5-minute instruments, SIS and EPAM, joined on common days.
resManji = pd.concat(
    [sis_listed, epam_listed],
    join="inner",
    axis=1,
)
# "Veci" (larger) frame: the 1-minute instruments, MAG and SWEPAM, joined on common days.
resVeci = pd.concat(
    [mag_listed, swepam_listed],
    join="inner",
    axis=1,
)
# Example before transformation
resManji.head(1)

Unnamed: 0_level_0,> 10 MeV,> 30 MeV,38-53,175-315,47-65,112-187,310-580,761-1220,060-1910,anis_ratio
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2001-08-07,"[0.831, 0.82, 0.821, 0.805, 0.809, 0.801, 0.80...","[0.587, 0.574, 0.576, 0.571, 0.574, 0.561, 0.5...","[694.0, 704.0, 797.0, 758.0, 650.0, 712.0, 724...","[11.6, 14.7, 13.4, 16.0, 16.2, 13.1, 14.9, 18....","[1130.0, 1210.0, 1560.0, 1490.0, 1360.0, 1110....","[123.0, 120.0, 131.0, 139.0, 133.0, 117.0, 126...","[6.65, 7.18, 7.67, 8.0, 7.18, 6.49, 6.26, 9.33...","[0.633, 0.946, 0.889, 0.666, 0.903, 0.925, 0.6...","[0.193, 0.252, 0.321, 0.234, 0.19, 0.257, 0.27...","[0.9, 0.68, 1.38, 0.89, 0.43, 0.83, 0.94, 1.02..."


In [13]:
# Run the SFA encoding row by row (one row = one day) on both frames.
resManji_transformed = resManji.apply(transform, axis="columns")
resVeci_transformed = resVeci.apply(transform, axis="columns")
# Example after transformation
resManji_transformed.head()

date
2001-08-07    [i, a, b, j, a, a, j, a, a, j, a, a, j, a, a, ...
2001-08-08    [h, c, c, h, b, d, j, d, a, j, b, b, j, e, a, ...
2001-08-09    [d, b, g, c, d, h, b, a, j, a, a, j, e, b, f, ...
2001-08-10    [b, j, c, a, j, d, b, j, d, b, j, d, b, j, d, ...
2001-08-11    [a, j, j, a, j, j, a, j, j, a, j, j, a, j, j, ...
dtype: object

### Joining two dfs
Join the two transformed dataframes on their common dates and concatenate each day's two symbol lists into one.

In [14]:
def flatten_row(row):
    """Concatenate the two sequences stored under keys ``0`` and ``1`` of ``row``.

    Parameters
    ----------
    row : indexable
        Anything supporting ``row[0]`` and ``row[1]``, each an array-like.

    Returns
    -------
    numpy.ndarray
        ``row[0]`` followed by ``row[1]`` in a single 1-D array.
    """
    first_part, second_part = row[0], row[1]
    return np.concatenate([first_part, second_part])

In [15]:
# Keep only the days present in both transformed series, then merge the two
# symbol sequences of each day into one.
res = (
    pd.concat(
        [resManji_transformed, resVeci_transformed],
        join="inner",
        axis=1,
    )
    .apply(flatten_row, axis=1)
)
res.head(3)

date
2001-08-07    [i, a, b, j, a, a, j, a, a, j, a, a, j, a, a, ...
2001-08-08    [h, c, c, h, b, d, j, d, a, j, b, b, j, e, a, ...
2001-08-09    [d, b, g, c, d, h, b, a, j, a, a, j, e, b, f, ...
dtype: object

# Adding class label

### Loading flood dates

In [16]:
# First source: CSV indexed by date; its "flood" column is used below as a
# row mask (presumably boolean -- verify against the file).
floods = pd.read_csv("floods.csv", index_col="date").drop(["Unnamed: 0"], axis=1)
floods.index = pd.DatetimeIndex(floods.index)

# Second source: flood phenomena with start/end dates. Keep rows that have at
# least one of the two dates.
floods_new = pd.read_csv("floodphenomena.csv")
has_any_date = floods_new["StartDate"].notna() | floods_new["EndDate"].notna()
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (avoids SettingWithCopyWarning).
floods_new = floods_new[has_any_date].copy()
floods_new["StartDateTime"] = pd.to_datetime(floods_new["StartDate"], infer_datetime_format=True)
floods_new["StartDate"] = floods_new["StartDateTime"].dt.date
floods_new["EndDateTime"] = pd.to_datetime(floods_new["EndDate"], infer_datetime_format=True)
floods_new["EndDate"] = floods_new["EndDateTime"].dt.date
# `datetime` here is the class brought in by `from datetime import datetime`.
floods_new = floods_new[floods_new["StartDate"] >= datetime.strptime("2001-01-01", "%Y-%m-%d").date()]

# The aggregated data is indexed by calendar day (midnight timestamps), so the
# start timestamps are normalised to midnight; a start carrying a time of day
# could otherwise never match that index.
# NOTE(review): only the start day of each phenomenon is collected; EndDate is
# computed but not used -- confirm that is intended.
flood_dates = pd.concat(
    [
        floods_new["StartDateTime"].dt.normalize(),
        floods[floods["flood"]].index.to_series(),
    ],
    axis=0,
)

### Adding class according to flood dates
True  -> date from resulting aggregation is in flood dates

False -> otherwise

Also, join lists into one string

In [17]:
# One row per day: the symbols joined into a single string, plus the flood label.
df = pd.DataFrame(
    {
        "data": res.map("".join),
        "class": res.index.isin(flood_dates),
    },
    index=res.index,
)
df.head(3)

Unnamed: 0_level_0,data,class
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2001-08-07,iabjaajaajaajaaibbjabjaajaajaajaajaaiabibbibbi...,False
2001-08-08,hcchbdjdajbbjeagbdfdehdcfbgibbifbgddigbhbchfcd...,False
2001-08-09,dbgcdhbajaajebffafdahebfgddfbeedfcbhbcibajdahe...,False


In [18]:
# Save the labelled SFA strings (index = date, columns = data, class) to disk.
df.to_csv("sfa.csv")