# SOLO vs LGBM run

https://github.com/kuolinhsu/SOLO

In [13]:
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the dataset with the 'DateTime' column parsed as dates and used as the
# index, then discard the 'Unnamed: 0' column.
df = (
    pd.read_csv('rnn_data_prajwal.csv', parse_dates=['DateTime'], index_col='DateTime')
    .drop(columns='Unnamed: 0')
)

# Show the time span covered by the index.
df.index.min(), df.index.max()

(Timestamp('2013-01-01 01:00:00'), Timestamp('2014-01-01 00:00:00'))

In [4]:
# Columns used as model inputs (X).
Xvar = [
    'Ta', 'Ws', 'Fg', 'VPD',
    'Fn', 'q', 'Ts', 'Sws',
]
# Column used as the target (y).
yvar = 'Fc'

## Train-test splits
1. Test: a 60-day window starting from a randomly picked date
2. Train: the remaining data
    a. Layer 1 and Layer 2: 50:50 random split of the training data
    b. Layer 1 is further divided into m unique training sets (each 1/m of Layer 1), where m = number of unique models.

In [11]:
# Hold-out (test) period: a 60-day window whose first day is drawn at random
# from the per-row calendar dates of the index. The fixed seed makes the draw
# repeatable.
np.random.seed(41)
candidate_days = df.index.date
test_start_day = np.random.choice(candidate_days, 1)[0]
test_end_day = test_start_day + datetime.timedelta(days=60)
for label, day in (('Test start date:', test_start_day), ('Test end date:', test_end_day)):
    print(label, day)

Test start date: 2013-03-21
Test end date: 2013-05-20


In [14]:
# Split the data into the hold-out window and the remaining training rows.
# A boolean mask and its complement are used instead of label slices: the
# slices df.loc[:start] and df.loc[start:end] are both inclusive, so a row
# stamped exactly at midnight of test_start_day (or test_end_day) would land
# in both frames. With the mask every row belongs to exactly one frame, and
# df_test still covers the inclusive range [test_start_day, test_end_day].
window_start = pd.Timestamp(test_start_day)
window_end = pd.Timestamp(test_end_day)
in_test = (df.index >= window_start) & (df.index <= window_end)
df_test = df.loc[in_test].copy()
df_train = df.loc[~in_test].copy()

# Scaling test frame
# Target mean/std are captured before the frame is overwritten with scaled
# values. ddof=0 is used because StandardScaler scales by the population
# standard deviation; pandas' default (ddof=1) would not match the scaling
# applied below.
y_test_mean, y_test_std = df_test[yvar].mean(), df_test[yvar].std(ddof=0)
# NOTE(review): the test frame is standardized with its own statistics rather
# than with the scaler fitted on the training frame, so train and test are not
# on a common scale. Left unchanged because later cells may rely on
# scaler_test / y_test_mean / y_test_std -- confirm this is intended.
scaler_test = StandardScaler()
df_test[df_test.columns] = scaler_test.fit_transform(df_test)

# Scaling train frame
y_train_mean, y_train_std = df_train[yvar].mean(), df_train[yvar].std(ddof=0)
scaler_train = StandardScaler()
df_train[df_train.columns] = scaler_train.fit_transform(df_train)

In [15]:
# Sanity check on the split boundaries: the hold-out slice must begin after
# the last row of the slice up to test_start_day and end before the first row
# of the slice from test_end_day onwards.
before_idx = df.loc[:test_start_day].index
window_idx = df.loc[test_start_day:test_end_day].index
after_idx = df.loc[test_end_day:].index

print(window_idx.min(), '>', before_idx.max())
print(window_idx.max(), '<', after_idx.min())

assert window_idx.min() > before_idx.max()
assert window_idx.max() < after_idx.min()

2013-03-21 02:30:00 > 2013-03-20 21:00:00
2013-05-19 20:30:00 < 2013-05-20 06:00:00


In [18]:
# Input matrices (Xvar columns) and target vectors (yvar column) taken from
# the scaled hold-out and training frames.
X_test = df_test[Xvar]
y_test = df_test[yvar]
X_train = df_train[Xvar]
y_train = df_train[yvar]

### Preparing data for the SOLO run

In [21]:
# Directory the SOLO input files are written to. It can be overridden with the
# SOLO_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the original location remains the default. Joining with '' ensures
# a trailing separator, because later cells build file names with `path + name`.
path = os.path.join(
    os.environ.get(
        'SOLO_DATA_DIR',
        '/Users/pluto/Desktop/bag/tutoring/atbin/imputation/package/SOLO/data/',
    ),
    '',
)

In [22]:
# Preparing data for SOFM: input columns only, from the training frame.
# Files are written without header or index.
df_train[Xvar].to_csv(path + 'sofm_input.csv', header=False, index=False)

# Preparing data for SOLO: input columns followed by the target column,
# from the training frame.
df_train[Xvar + [yvar]].to_csv(path + 'solo_input.csv', header=False, index=False)

# test data
# Preparing data for SEQSOLO: same column layout, taken from the hold-out
# frame. This previously wrote df_train, which made the file an exact duplicate
# of solo_input.csv despite being labelled as test data.
# NOTE(review): assumes SEQSOLO is meant to be run on the hold-out rows -- confirm.
df_test[Xvar + [yvar]].to_csv(path + 'seqsolo_input.csv', header=False, index=False)