In [1]:
import pandas as pd
import pickle
import os
import numpy as np

### Transformations are applied in this order
- Differencing
- Train-Test split (reminder)
- Standardize

### Inverse Transformations should be applied in the reverse order
- Standardize
- Train-Test split (reminder)
- Differencing

In [2]:
# Directory layout, resolved from wherever the kernel was started:
#   WORKING_DIR -> current working directory
#   BASE_DIR    -> its parent directory
#   DATA_DIR    -> <BASE_DIR>/data
#   CACHE       -> <BASE_DIR>/data/cache (the CSV files read below live here)
WORKING_DIR = os.getcwd()
BASE_DIR = os.path.split(WORKING_DIR)[0]
DATA_DIR = os.path.join(BASE_DIR, 'data')
CACHE = os.path.join(BASE_DIR, 'data', 'cache')

In [3]:
os.listdir(CACHE)

['speeds_train_out.csv',
 'positions_renamed.csv',
 'speeds_out.csv',
 'speeds.csv',
 'positions_train_out.csv',
 'positions.csv',
 'speeds_test_out.csv',
 'speeds_renamed.csv',
 'positions_test_out.csv']

#### Workflow
1. Load train and test datasets
2. Inverse standardize the four datasets
3. Concatenate the train and test datasets (separately for positions and for speeds)
4. Inverse Difference

In [4]:
# Train/test `*_out.csv` files for positions: these are the frames that get
# de-standardized and then inverse-differenced further down.
pos_train_df, pos_test_df = (
    pd.read_csv(f'{CACHE}/positions_train_out.csv'),
    pd.read_csv(f'{CACHE}/positions_test_out.csv'),
)

# Same pair of files for speeds.
speeds_train_df, speeds_test_df = (
    pd.read_csv(f'{CACHE}/speeds_train_out.csv'),
    pd.read_csv(f'{CACHE}/speeds_test_out.csv'),
)

# `*_renamed.csv` files: used later as the reference series when inverting
# the differencing and in the final sanity assertions.
pos_orig_df, speed_orig_df = (
    pd.read_csv(f'{CACHE}/positions_renamed.csv'),
    pd.read_csv(f'{CACHE}/speeds_renamed.csv'),
)

## Inverse Transformations for position

In [5]:
pos_train_df.head(n=2)

Unnamed: 0,veh_1_pos,veh_2_pos,veh_3_pos,veh_4_pos,veh_5_pos,veh_6_pos,veh_7_pos,veh_8_pos,veh_9_pos,veh_10_pos,veh_11_pos,veh_12_pos,veh_13_pos,veh_14_pos,veh_15_pos,veh_16_pos,veh_17_pos
0,0.778512,1.49977,1.340113,-0.040839,0.136562,0.234904,1.395683,-1.143272,1.026049,0.930249,-0.17094,1.736622,-6.295961,-1.485665,1.524073,-1.572532,-12.96663
1,-0.111419,0.84943,-0.076034,1.269443,0.507654,0.936787,-1.149528,0.906347,0.237767,-1.616866,-0.48824,0.221944,-7.26459,-1.460656,-0.333336,0.6941,-9.556867


In [6]:
# Load the statistics needed to undo the standardization. The code below reads
# the keys 'position_mean', 'position_std', 'speed_mean' and 'speed_std'.
# NOTE: the path is relative to the current working directory, not to CACHE.
# NOTE: pickle.load can execute arbitrary code -- only load a file you trust.
with open('inverse_transformations.pkl', mode='rb') as pkl_file:
    inv_tf = pickle.load(pkl_file)

In [7]:
inv_tf

{'position_mean': veh_1_pos     12.372568
 veh_2_pos      7.246528
 veh_3_pos     10.288456
 veh_4_pos     13.325584
 veh_5_pos     11.118480
 veh_6_pos      2.762984
 veh_7_pos     11.766160
 veh_8_pos      5.059504
 veh_9_pos      7.239512
 veh_10_pos     3.859392
 veh_11_pos    10.954648
 veh_12_pos    11.827944
 veh_13_pos     9.389896
 veh_14_pos     6.754048
 veh_15_pos     4.735624
 veh_16_pos     7.383584
 veh_17_pos     7.154192
 dtype: float64,
 'position_std': veh_1_pos     0.382052
 veh_2_pos     0.369038
 veh_3_pos     0.374255
 veh_4_pos     0.381597
 veh_5_pos     0.377265
 veh_6_pos     0.370432
 veh_7_pos     0.396824
 veh_8_pos     0.375680
 veh_9_pos     0.380575
 veh_10_pos    0.376897
 veh_11_pos    0.378191
 veh_12_pos    0.369715
 veh_13_pos    0.805262
 veh_14_pos    0.399853
 veh_15_pos    0.376869
 veh_16_pos    0.383829
 veh_17_pos    0.439913
 dtype: float64,
 'speed_mean': veh_1_speed     0.000168
 veh_2_speed     0.000624
 veh_3_speed     0.000208
 veh_4_s

In [8]:
inv_tf['position_mean'] # pd series

veh_1_pos     12.372568
veh_2_pos      7.246528
veh_3_pos     10.288456
veh_4_pos     13.325584
veh_5_pos     11.118480
veh_6_pos      2.762984
veh_7_pos     11.766160
veh_8_pos      5.059504
veh_9_pos      7.239512
veh_10_pos     3.859392
veh_11_pos    10.954648
veh_12_pos    11.827944
veh_13_pos     9.389896
veh_14_pos     6.754048
veh_15_pos     4.735624
veh_16_pos     7.383584
veh_17_pos     7.154192
dtype: float64

In [9]:
# Undo the standardization of the position train split: x * std + mean.
# Both steps are done in a single expression so the cell is idempotent; the
# previous separate `a = a + mean` cell added the mean again on every re-run.
# The Series in inv_tf align with the DataFrame columns by label.
a = pos_train_df * inv_tf['position_std'] + inv_tf['position_mean']

In [11]:
a

Unnamed: 0,veh_1_pos,veh_2_pos,veh_3_pos,veh_4_pos,veh_5_pos,veh_6_pos,veh_7_pos,veh_8_pos,veh_9_pos,veh_10_pos,veh_11_pos,veh_12_pos,veh_13_pos,veh_14_pos,veh_15_pos,veh_16_pos,veh_17_pos
0,12.67,7.80,10.79,13.31,11.17,2.85,12.32,4.63,7.63,4.21,10.89,12.47,4.32,6.16,5.31,6.78,1.45
1,12.33,7.56,10.26,13.81,11.31,3.11,11.31,5.40,7.33,3.25,10.77,11.91,3.54,6.17,4.61,7.65,2.95
2,11.73,7.21,10.26,13.15,10.97,2.97,12.42,4.76,6.76,3.82,10.74,12.42,3.45,7.23,4.66,6.92,5.52
3,12.86,7.84,9.89,12.98,11.72,2.57,12.24,4.48,6.63,3.34,10.86,12.07,3.24,6.45,4.70,7.51,7.55
4,11.96,7.20,10.38,13.42,11.10,2.68,11.75,5.16,7.69,3.46,11.08,11.47,2.15,7.32,5.06,7.04,7.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,12.78,6.74,9.83,12.79,11.73,2.76,11.36,4.49,7.74,4.28,10.92,12.13,8.90,6.27,4.77,7.97,7.41
1246,12.64,7.61,10.39,12.85,10.98,2.44,11.61,4.79,7.78,3.40,10.58,11.28,9.00,6.73,4.48,7.91,7.12
1247,12.85,7.55,10.19,12.84,11.67,2.60,12.23,4.99,7.85,3.42,10.49,11.75,9.99,6.71,4.85,7.27,7.68
1248,12.99,7.54,10.40,13.83,11.26,2.47,11.43,4.47,7.79,4.33,11.34,11.33,9.51,6.85,4.46,7.63,7.19


In [12]:
# Undo the standardization of the position test split: x * std + mean.
# Both steps are done in a single expression so the cell is idempotent; the
# previous separate `b = b + mean` cell added the mean again on every re-run.
# The Series in inv_tf align with the DataFrame columns by label.
b = pos_test_df * inv_tf['position_std'] + inv_tf['position_mean']

In [14]:
b

Unnamed: 0,veh_1_pos,veh_2_pos,veh_3_pos,veh_4_pos,veh_5_pos,veh_6_pos,veh_7_pos,veh_8_pos,veh_9_pos,veh_10_pos,veh_11_pos,veh_12_pos,veh_13_pos,veh_14_pos,veh_15_pos,veh_16_pos,veh_17_pos
0,12.11,7.67,10.39,13.86,11.04,2.33,11.26,4.59,7.88,3.95,10.73,11.27,9.35,7.13,4.55,7.11,7.56
1,12.30,7.56,10.08,13.34,10.97,3.29,12.03,5.05,7.68,3.28,10.97,11.85,10.01,7.30,4.22,6.95,7.57
2,12.91,7.36,9.75,13.11,10.88,2.95,12.07,4.75,7.21,4.34,11.39,11.54,9.67,7.10,4.79,7.02,6.60
3,12.85,7.17,10.91,13.90,11.77,2.18,11.98,4.94,7.81,3.90,11.38,12.21,9.94,7.31,4.57,7.69,7.52
4,12.48,7.10,10.49,13.83,11.40,3.14,11.60,5.15,7.82,3.56,11.06,12.31,10.06,7.17,5.04,6.91,7.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,12.83,7.22,10.45,12.75,11.48,3.37,11.54,4.88,7.22,3.52,10.43,12.17,9.66,6.62,5.31,7.81,7.15
216,12.27,6.76,9.88,13.66,11.70,2.75,11.91,4.57,6.96,3.99,10.98,11.77,9.34,7.38,5.14,7.36,7.03
217,12.91,7.17,10.79,13.55,11.61,2.61,12.27,4.70,6.60,4.18,11.26,11.53,9.55,6.38,5.37,7.80,7.72
218,12.53,7.03,10.21,13.84,10.98,2.36,12.42,4.70,7.46,4.11,11.59,11.31,8.86,6.78,4.50,7.45,7.10


In [15]:
# Stack the de-standardized train rows (a) on top of the test rows (b) and
# renumber the index 0..n-1 so it lines up positionally with pos_orig_df.
positions_df = pd.concat(
    objs=[a, b],
    sort=False,
    ignore_index=True,
)

In [16]:
# now inverse difference

In [17]:
positions_df

Unnamed: 0,veh_1_pos,veh_2_pos,veh_3_pos,veh_4_pos,veh_5_pos,veh_6_pos,veh_7_pos,veh_8_pos,veh_9_pos,veh_10_pos,veh_11_pos,veh_12_pos,veh_13_pos,veh_14_pos,veh_15_pos,veh_16_pos,veh_17_pos
0,12.67,7.80,10.79,13.31,11.17,2.85,12.32,4.63,7.63,4.21,10.89,12.47,4.32,6.16,5.31,6.78,1.45
1,12.33,7.56,10.26,13.81,11.31,3.11,11.31,5.40,7.33,3.25,10.77,11.91,3.54,6.17,4.61,7.65,2.95
2,11.73,7.21,10.26,13.15,10.97,2.97,12.42,4.76,6.76,3.82,10.74,12.42,3.45,7.23,4.66,6.92,5.52
3,12.86,7.84,9.89,12.98,11.72,2.57,12.24,4.48,6.63,3.34,10.86,12.07,3.24,6.45,4.70,7.51,7.55
4,11.96,7.20,10.38,13.42,11.10,2.68,11.75,5.16,7.69,3.46,11.08,11.47,2.15,7.32,5.06,7.04,7.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,12.83,7.22,10.45,12.75,11.48,3.37,11.54,4.88,7.22,3.52,10.43,12.17,9.66,6.62,5.31,7.81,7.15
1466,12.27,6.76,9.88,13.66,11.70,2.75,11.91,4.57,6.96,3.99,10.98,11.77,9.34,7.38,5.14,7.36,7.03
1467,12.91,7.17,10.79,13.55,11.61,2.61,12.27,4.70,6.60,4.18,11.26,11.53,9.55,6.38,5.37,7.80,7.72
1468,12.53,7.03,10.21,13.84,10.98,2.36,12.42,4.70,7.46,4.11,11.59,11.31,8.86,6.78,4.50,7.45,7.10


In [18]:
pos_orig_df

Unnamed: 0,veh_1_pos,veh_2_pos,veh_3_pos,veh_4_pos,veh_5_pos,veh_6_pos,veh_7_pos,veh_8_pos,veh_9_pos,veh_10_pos,veh_11_pos,veh_12_pos,veh_13_pos,veh_14_pos,veh_15_pos,veh_16_pos,veh_17_pos
0,643.23,362.75,476.04,551.59,441.71,113.18,383.10,167.00,203.39,104.06,200.55,186.78,99.19,55.45,30.70,18.04,5.10
1,655.90,370.55,486.83,564.90,452.88,116.03,395.42,171.63,211.02,108.27,211.44,199.25,103.51,61.61,36.01,24.82,6.55
2,668.23,378.11,497.09,578.71,464.19,119.14,406.73,177.03,218.35,111.52,222.21,211.16,107.05,67.78,40.62,32.47,9.50
3,679.96,385.32,507.35,591.86,475.16,122.11,419.15,181.79,225.11,115.34,232.95,223.58,110.50,75.01,45.28,39.39,15.02
4,692.82,393.16,517.24,604.84,486.88,124.68,431.39,186.27,231.74,118.68,243.81,235.65,113.74,81.46,49.98,46.90,22.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1466,18785.21,10987.21,15560.36,20080.52,16735.93,4163.02,17634.01,7581.01,10819.13,5754.26,16258.77,17530.55,13886.35,9958.51,6975.33,10847.61,10493.85
1467,18797.48,10993.97,15570.24,20094.18,16747.63,4165.77,17645.92,7585.58,10826.09,5758.25,16269.75,17542.32,13895.69,9965.89,6980.47,10854.97,10500.88
1468,18810.39,11001.14,15581.03,20107.73,16759.24,4168.38,17658.19,7590.28,10832.69,5762.43,16281.01,17553.85,13905.24,9972.27,6985.84,10862.77,10508.60
1469,18822.92,11008.17,15591.24,20121.57,16770.22,4170.74,17670.61,7594.98,10840.15,5766.54,16292.60,17565.16,13914.10,9979.05,6990.34,10870.22,10515.70


In [19]:
# invert difference
def invert_difference(orig_data, diff_data, interval):
    """Rebuild values from a lag-`interval` difference.

    For every offset ``k`` in ``0 .. len(orig_data) - interval - 1`` the
    result holds ``diff_data[k] + orig_data[k]``, i.e. the value that sits
    ``interval`` steps after ``orig_data[k]`` when ``diff_data`` is the
    lag-`interval` difference of ``orig_data``.

    Parameters
    ----------
    orig_data : indexable by integer (list, pandas Series, ...)
        Reference values the differences are added to.
    diff_data : indexable by integer
        Differenced values; needs at least ``len(orig_data) - interval`` items.
    interval : int
        Lag that was used when differencing.

    Returns
    -------
    list
        ``len(orig_data) - interval`` reconstructed values (empty if that
        count is not positive).
    """
    restored = []
    for offset in range(len(orig_data) - interval):
        restored.append(diff_data[offset] + orig_data[offset])
    return restored

# # define dataset
# data = pos_orig_df.iloc[:, 0]
# # print(data)
# # difference transform
# transformed = difference(data, 1)
# print(transformed)
# # invert difference
# inverted = invert_difference(data, transformed, 1)
# # print(inverted)
# Invert the lag-1 differencing column by column. Iterating a DataFrame yields
# its column labels, so each label selects the matching column in both the
# reference frame (pos_orig_df) and the de-standardized differences
# (positions_df). The frame is built in one constructor call instead of being
# grown one column at a time, and the unused enumerate() index is gone.
positions_df_func = pd.DataFrame({
    column: invert_difference(pos_orig_df[column], positions_df[column], 1)
    for column in positions_df
})

In [20]:
positions_df_func

Unnamed: 0,veh_1_pos,veh_2_pos,veh_3_pos,veh_4_pos,veh_5_pos,veh_6_pos,veh_7_pos,veh_8_pos,veh_9_pos,veh_10_pos,veh_11_pos,veh_12_pos,veh_13_pos,veh_14_pos,veh_15_pos,veh_16_pos,veh_17_pos
0,655.90,370.55,486.83,564.90,452.88,116.03,395.42,171.63,211.02,108.27,211.44,199.25,103.51,61.61,36.01,24.82,6.55
1,668.23,378.11,497.09,578.71,464.19,119.14,406.73,177.03,218.35,111.52,222.21,211.16,107.05,67.78,40.62,32.47,9.50
2,679.96,385.32,507.35,591.86,475.16,122.11,419.15,181.79,225.11,115.34,232.95,223.58,110.50,75.01,45.28,39.39,15.02
3,692.82,393.16,517.24,604.84,486.88,124.68,431.39,186.27,231.74,118.68,243.81,235.65,113.74,81.46,49.98,46.90,22.57
4,704.78,400.36,527.62,618.26,497.98,127.36,443.14,191.43,239.43,122.14,254.89,247.12,115.89,88.78,55.04,53.94,30.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,18785.21,10987.21,15560.36,20080.52,16735.93,4163.02,17634.01,7581.01,10819.13,5754.26,16258.77,17530.55,13886.35,9958.51,6975.33,10847.61,10493.85
1466,18797.48,10993.97,15570.24,20094.18,16747.63,4165.77,17645.92,7585.58,10826.09,5758.25,16269.75,17542.32,13895.69,9965.89,6980.47,10854.97,10500.88
1467,18810.39,11001.14,15581.03,20107.73,16759.24,4168.38,17658.19,7590.28,10832.69,5762.43,16281.01,17553.85,13905.24,9972.27,6985.84,10862.77,10508.60
1468,18822.92,11008.17,15591.24,20121.57,16770.22,4170.74,17670.61,7594.98,10840.15,5766.54,16292.60,17565.16,13914.10,9979.05,6990.34,10870.22,10515.70


## Inverse Transform for speed

In [21]:
# Undo the standardization of each speed split: x * std + mean.
# The Series in inv_tf align with the DataFrame columns by label.
# (The bare `a` / `b` expressions that used to sit between these statements
# were dropped: only the last expression of a cell is displayed.)
a = speeds_train_df * inv_tf['speed_std'] + inv_tf['speed_mean']
b = speeds_test_df * inv_tf['speed_std'] + inv_tf['speed_mean']

# Stack train rows on top of test rows and renumber the index 0..n-1 so it
# lines up positionally with speed_orig_df.
speeds_df = pd.concat([a, b], ignore_index=True, sort=False)

# now inverse difference
# Iterating a DataFrame yields its column labels; each column is rebuilt with
# invert_difference (lag 1) against the matching column of speed_orig_df.
speeds_df_func = pd.DataFrame({
    column: invert_difference(speed_orig_df[column], speeds_df[column], 1)
    for column in speeds_df
})

speeds_df_func

Unnamed: 0,veh_1_speed,veh_2_speed,veh_3_speed,veh_4_speed,veh_5_speed,veh_6_speed,veh_7_speed,veh_8_speed,veh_9_speed,veh_10_speed,veh_11_speed,veh_12_speed,veh_13_speed,veh_14_speed,veh_15_speed,veh_16_speed,veh_17_speed
0,12.66,7.80,10.79,13.31,11.16,2.85,12.32,4.62,7.63,4.21,10.88,12.47,4.32,6.16,5.32,6.78,1.45
1,12.33,7.56,10.26,13.82,11.31,3.11,11.31,5.40,7.33,3.25,10.77,11.91,3.54,6.17,4.61,7.65,2.94
2,11.73,7.21,10.26,13.15,10.97,2.98,12.42,4.76,6.76,3.82,10.75,12.41,3.44,7.24,4.65,6.92,5.52
3,12.86,7.84,9.88,12.98,11.72,2.57,12.24,4.48,6.63,3.34,10.86,12.07,3.24,6.45,4.71,7.51,7.55
4,11.96,7.20,10.38,13.42,11.10,2.68,11.74,5.16,7.70,3.46,11.08,11.47,2.15,7.32,5.06,7.04,7.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,12.83,7.22,10.45,12.74,11.49,3.37,11.54,4.87,7.22,3.52,10.43,12.17,9.67,6.62,5.31,7.81,7.16
1466,12.27,6.76,9.88,13.66,11.70,2.75,11.91,4.57,6.96,3.99,10.97,11.76,9.33,7.38,5.14,7.36,7.03
1467,12.91,7.17,10.79,13.55,11.60,2.61,12.27,4.70,6.60,4.18,11.26,11.54,9.55,6.38,5.37,7.80,7.72
1468,12.53,7.03,10.21,13.84,10.98,2.35,12.42,4.70,7.46,4.11,11.59,11.31,8.87,6.78,4.50,7.45,7.11


In [22]:
# Sanity check: with a lag-1 difference the first reconstructed row must match
# the second row (index 1) of the reference speeds.
# The values went through floating-point scale/shift/add arithmetic, so compare
# with a tolerance instead of exact `==`; assert_allclose also reports which
# elements differ. It still raises AssertionError on mismatch.
np.testing.assert_allclose(
    np.asarray(speeds_df_func.iloc[0, :], dtype=float),
    np.asarray(speed_orig_df.iloc[1, :], dtype=float),
    err_msg='Wrong inverse transformation',
)

In [23]:
# Sanity check: with a lag-1 difference the first reconstructed row must match
# the second row (index 1) of the reference positions.
# The values went through floating-point scale/shift/add arithmetic, so compare
# with a tolerance instead of exact `==`. It still raises AssertionError on
# mismatch.
# NOTE(review): only the first 16 columns are compared (`:16`), as in the
# original check -- confirm whether leaving out the last column is intentional.
np.testing.assert_allclose(
    np.asarray(positions_df_func.iloc[0, :16], dtype=float),
    np.asarray(pos_orig_df.iloc[1, :16], dtype=float),
    err_msg='Wrong inverse transformation',
)