In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the data/result paths configured further down are relative and the
# Drive-based data path is commented out, so this mount may no longer be needed -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


 # Dataloader


In [None]:
import torch
import numpy as np
import pandas as pd
def prepare_data_main_model(df, seq_length, output_size, date_range):
    '''
    Build sliding-window (sequence, label) samples for every station.

    df: pandas df containing all the data; its index must have a
        'stationid' level. Every column is used as an input feature and
        the labels are taken from column 0.
    seq_length: number of consecutive rows (days) used as model input
    output_size: number of rows (days) right after the input window to predict
    date_range: length of history to consider; when a station has more rows
        than this, only its last `date_range` rows are used
    Output: (full_data, state_ordered)
        full_data is a list of (train_seq, train_label) tuples where
        train_seq is a (seq_length, n_columns) array and train_label holds
        the following `output_size` values of column 0;
        state_ordered lists the station ids in processing order (stations
        that are too short to produce a window are still listed).
    '''
    full_data = []
    state_ordered = []

    station_ids = df.index.get_level_values('stationid')
    for state in station_ids.unique():
        df_state = df.iloc[station_ids == state]

        # Stations with a longer history than requested are truncated to
        # their most recent `date_range` rows; shorter ones are used in full.
        if len(df_state) > date_range:
            df_state = df_state[-date_range:]

        # Materialise the array once per station instead of once (twice,
        # actually) per window as the previous implementation did.
        values = df_state.to_numpy()
        n_windows = len(values) - seq_length - output_size + 1
        for start in range(n_windows):
            split = start + seq_length
            train_seq = values[start:split]
            train_label = values[split:split + output_size, 0]
            full_data.append((train_seq, train_label))

        state_ordered.append(state)
    return full_data, state_ordered

def splitdata(full_data, ratio, batch_size):
    """Randomly split the samples and wrap both parts in shuffling DataLoaders.

    full_data: sequence of samples (anything torch's random_split accepts)
    ratio: fraction of the samples assigned to the first (training) part;
        the training size is int(ratio * len(full_data)), the rest goes to
        the second part
    batch_size: batch size used by both loaders
    Output: (train_loader, test_loader)
    """
    n_total = len(full_data)
    n_train = int(ratio * n_total)
    parts = torch.utils.data.random_split(full_data, [n_train, n_total - n_train])

    # Both loaders are configured identically (shuffle=True for each of them).
    train_loader, test_loader = (
        torch.utils.data.DataLoader(dataset=part, batch_size=batch_size, shuffle=True)
        for part in parts
    )
    return train_loader, test_loader
# Ground-truth counts that the predictions are merged against later in the notebook.
# `date` is parsed to datetime64 here: the prediction frame carries Timestamp dates,
# and a string-typed `date` column cannot be matched against them in the merge.
gd_number = pd.read_csv('ground_truth.csv', parse_dates=['date'])
# Drop the 'Unnamed: 0' column (presumably the index written by an earlier to_csv call)
# without mutating in place, so re-running the cell stays idempotent.
gd_number = gd_number.drop(columns='Unnamed: 0')

stationid           float64
date         datetime64[ns]
in_bike             float64
out_bike            float64
dtype: object

In [None]:
# NOTE(review): in the cells visible here `df` is first assigned inside the training-loop
# cell further down, so on a fresh kernel this display would raise NameError. The table
# recorded below has the ground-truth columns, so `gd_number` may have been meant -- confirm.
df

Unnamed: 0,stationid,date,in_bike,out_bike
0,2494.0,2017-07-01,37.0,42.0
1,2494.0,2017-07-02,46.0,49.0
2,2494.0,2017-07-03,45.0,31.0
3,2494.0,2017-07-04,30.0,36.0
4,2494.0,2017-07-05,30.0,31.0
...,...,...,...,...
925,3513.0,2017-07-26,8.0,8.0
926,3513.0,2017-07-27,13.0,16.0
927,3513.0,2017-07-28,7.0,12.0
928,3513.0,2017-07-29,6.0,11.0


# Models

In [None]:
import torch
import pandas as pd
from torch import nn
import torch.nn.functional as F
pd.options.mode.chained_assignment = None

## LSTM

In [None]:


class LSTM(nn.Module):
    """LSTM encoder followed by a small fully connected regression head.

    The input sequence is run through a (possibly stacked) batch-first LSTM;
    the output of the last time step is passed through
    Linear(hidden, 1000) -> ReLU -> Dropout -> Linear(1000, output_size).

    Args:
        input_size: number of features per time step.
        hidden_layer_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of values predicted per sequence.
        dropout_rate: dropout probability used in the head and, when
            num_layers > 1, between the LSTM layers.
    """

    def __init__(self, input_size, hidden_layer_size, num_layers, output_size, dropout_rate):
        super().__init__()

        self.input_size = input_size
        self.hidden_layer_size = hidden_layer_size
        self.num_layers = num_layers
        self.output_size = output_size

        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero rate does nothing except emit a UserWarning.
        lstm_dropout = dropout_rate if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(self.input_size, hidden_layer_size, num_layers,
                            batch_first=True, dropout=lstm_dropout)

        self.linear = nn.Linear(hidden_layer_size, 1000)
        self.dropout = nn.Dropout(dropout_rate)
        self.linear2 = nn.Linear(1000, output_size)

    def forward(self, input_seq):
        """Return predictions of shape (batch, output_size).

        input_seq is expected as (batch, seq_len, input_size) because the
        LSTM is created with batch_first=True. A fresh all-zero hidden/cell
        state is used on every call; the final state is kept in
        `self.hidden_cell`.
        """
        # Create the initial state on the same device/dtype as the input
        # instead of relying on a module-level `device` variable, which made
        # the class unusable unless that global happened to be defined.
        state_shape = (self.num_layers, input_seq.size(0), self.hidden_layer_size)
        h = (torch.zeros(state_shape, device=input_seq.device, dtype=input_seq.dtype),
             torch.zeros(state_shape, device=input_seq.device, dtype=input_seq.dtype))

        lstm_out, self.hidden_cell = self.lstm(input_seq, h)

        # only return the results for last sequence
        lstm_out = lstm_out[:, -1, :]
        predictions = self.linear(lstm_out)
        predictions = F.relu(predictions)
        predictions = self.dropout(predictions)
        predictions = self.linear2(predictions)

        return predictions

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
from torch import nn
from tqdm import trange
from datetime import date, timedelta

from collections import OrderedDict
from collections import namedtuple
from itertools import product

import torch.nn.functional as F

#data_file = '/content/drive/MyDrive/OR Project/Austin bike/bikedata.csv'
data_file ='bike infor.csv'
# Do not commit trained models or result files to GitHub; there is not enough space.
model_file = 'LSTM Results/'
# Was 'STM Results/' (missing the leading 'L'), which pointed the prediction CSVs at a
# different folder than the weights and the summary.
prediction_file = 'LSTM Results/'
summary_file ='LSTM Results/'
def perdelta(start, end, delta):
    """Yield values from `start` up to (but excluding) `end`, advancing by `delta`."""
    curr = start
    while True:
        if not (curr < end):
            return
        yield curr
        curr += delta
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Hyper-parameter grid: every key maps to the list of values to try, and one
# run is generated for each combination of values (see RunBuilder).
params = OrderedDict([
    ('target', ['in_bike']),  # select from 'in_bike' and 'out_bike'
    ('lr', [0.0001, 0.00025, 0.0005]),
    ('batch_size', [2]),
    ('seq_length', [10, 20, 30, 40, 50]),  # test
    ('output_size', [1]),
    ('num_pred_features', [1]),
    ('day_range', [60, 90, 120, 150, 180, 210]),  # test
    ('input_size', [5]),
    ('hidden_layer_size', [32, 64, 128, 256, 512, 1024]),  # test
    ('num_layers', [1]),  # test
    ('ratio', [0.8]),
    ('num_epochs', [50]),
    ('dropout_rate', [0.25]),
    ('lossfunc', [nn.MSELoss()]),
])


class RunBuilder():
    """Expands a hyper-parameter grid into the list of individual runs."""

    @staticmethod
    def get_runs(params):
        """Return one `Run` namedtuple per combination of the values in `params`.

        `params` maps each hyper-parameter name to the iterable of values to
        try; the namedtuple fields follow the key order of `params` and the
        runs are ordered like itertools.product over the value lists.
        """
        Run = namedtuple('Run', params.keys())
        return [Run(*combo) for combo in product(*params.values())]

# Turn off pandas' chained-assignment warning for the frame manipulation below.
pd.options.mode.chained_assignment = None

# One entry per hyper-parameter combination in `params`.
runs = RunBuilder.get_runs(params)

# Column layout of the per-run summary table: model name, the hyper-parameters
# that identify a run, the two error metrics, and the predicted target.
columns = (
    ['Model']
    + ['lr', 'batch_size', 'seq_length', 'day_range',
       'hidden_layer_size', 'num_layers', 'lossfunc']
    + ['mse_numerical_validation', 'mse_numerical_test']
    + ['Target']
)

df_summary = pd.DataFrame(columns=columns)
# Grid search: for every hyper-parameter combination train an LSTM with early
# stopping, predict each test date for each station with the best checkpoint,
# and append one row to the summary table.
for run in runs:
    df = pd.read_csv(data_file)

    # Keep only the selected target; the other count column is dropped so it
    # does not end up among the input features.
    if run.target == 'in_bike':
        results_file = 'in_bike'
        df = df.drop('out_bike', axis=1)
    elif run.target == 'out_bike':
        results_file = 'out_bike'
        df = df.drop('in_bike', axis=1)

    # All artefacts of a run share this hyper-parameter tag.
    # NOTE(review): the tag does not include lr, so runs that differ only in
    # learning rate overwrite each other's weights/prediction files -- confirm intended.
    run_tag = (run.target + '_' + str(run.day_range) + '_' + str(run.hidden_layer_size)
               + '_' + str(run.seq_length) + '_' + str(run.num_layers))
    weights_path = model_file + run_tag + '_LSTM_weights.pt'
    predictions_path = prediction_file + run_tag + '.csv'

    df['date'] = pd.to_datetime(df['date'])

    # Test rows: from seq_length days before 2017-06-30 (history needed as
    # input for the first predictions) through 2017-07-30.
    test_start = pd.to_datetime('2017-06-30') - timedelta(run.seq_length)
    df_test = df[(df['date'] >= test_start) & (df['date'] <= pd.to_datetime('2017-07-30'))]
    df_test = df_test.set_index(['stationid', 'date'])
    df_test1 = df_test.copy()

    df_train = df[df['date'] <= pd.to_datetime('2017-06-30')]
    df_train = df_train.set_index(['stationid', 'date'])

    # Move the target to column 0, which is where prepare_data_main_model
    # takes the labels from.
    first_col = df_train.pop(run.target)
    df_train.insert(0, run.target, first_col)
    first_col_2 = df_test.pop(run.target)
    df_test.insert(0, run.target, first_col_2)

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(df_train.iloc[:, 1:])
    train_features_normalized = scaler.transform(df_train.iloc[:, 1:])
    test_feature_normalized = scaler.transform(df_test.iloc[:, 1:])

    # The target scaler is fitted on the training rows only and reused to
    # de-normalise predictions further down.
    scaler_target = MinMaxScaler(feature_range=(0, 1))
    scaler_target.fit(np.asarray(df_train.iloc[:, 0]).reshape(-1, 1))
    train_target_normalized = scaler_target.transform(np.asarray(df_train.iloc[:, 0]).reshape(-1, 1))
    test_target_normalized = scaler_target.transform(np.asarray(df_test.iloc[:, 0]).reshape(-1, 1))

    # NOTE(review): the scaled feature arrays computed above are never written
    # back -- the two self-assignments below leave the feature columns as they
    # were, so only the target is normalised. Confirm whether that is intentional.
    df_train.iloc[:, 1:] = df_train.iloc[:, 1:]
    df_train.iloc[:, 0] = train_target_normalized
    df_test.iloc[:, 1:] = df_test.iloc[:, 1:]
    df_test.iloc[:, 0] = test_target_normalized

    # ---------------------------------------------------------------- Training
    full_data_main, state_ordered = prepare_data_main_model(df_train, run.seq_length,
                                                            run.output_size, run.day_range)

    model_main = LSTM(run.input_size, run.hidden_layer_size, run.num_layers, run.output_size,
                      run.dropout_rate).to(device)

    train_loader_main, test_loader_main = splitdata(full_data_main, run.ratio, run.batch_size)

    loss_function = run.lossfunc

    optimizer_main = torch.optim.Adam(model_main.parameters(), lr=run.lr)

    track_loss_train = []
    track_loss_test = []
    best_loss = 100000
    # Validation MSE in original (de-normalised) units at the best epoch.
    best_mse_numerical = None
    # Epochs since the last improvement. Initialised here so the counter exists
    # even when the very first epoch fails to beat `best_loss` (e.g. NaN loss).
    es = 0

    for epoch in trange(run.num_epochs):

        model_main.train()
        epoch_loss_train = 0

        for seq, labels in train_loader_main:
            seq, labels = seq.to(device), labels.to(device)
            optimizer_main.zero_grad()
            seq = seq.reshape(-1, run.seq_length, run.input_size)

            # The model builds its own zero initial state on every forward call.
            y_pred = model_main(seq.float())

            single_loss = loss_function(y_pred, labels.float())
            single_loss.backward()
            optimizer_main.step()

            epoch_loss_train += single_loss.item()

        track_loss_train.append(epoch_loss_train)

        # Validation must run in eval mode; otherwise dropout stays active and
        # the loss that drives early stopping/checkpointing is noisy.
        model_main.eval()
        epoch_loss_test = 0
        validation_predictions = []
        validation_labels = []
        with torch.no_grad():
            for seq, labels in test_loader_main:
                seq, labels = seq.to(device), labels.to(device)
                seq = seq.reshape(-1, run.seq_length, run.input_size)
                y_pred = model_main(seq.float())

                single_loss = loss_function(y_pred, labels.float())
                epoch_loss_test += single_loss.item()

                validation_predictions.extend(y_pred.cpu().numpy())
                validation_labels.extend(labels.cpu().numpy())
        track_loss_test.append(epoch_loss_test)

        # Back to original units so the validation error is comparable with
        # the test error computed against the ground truth.
        validation_predictions = np.concatenate(validation_predictions, axis=0)
        validation_labels = np.concatenate(validation_labels, axis=0)
        validation_predictions = scaler_target.inverse_transform(validation_predictions.reshape(-1, 1))
        validation_labels = scaler_target.inverse_transform(validation_labels.reshape(-1, 1))

        if epoch_loss_test < best_loss:
            best_loss = epoch_loss_test
            best_mse_numerical = float(np.mean((validation_predictions - validation_labels) ** 2))
            print('Train Loss: ', epoch_loss_train)
            print('Test Loss: ', epoch_loss_test)
            es = 0
            torch.save(model_main.state_dict(), weights_path)
        else:
            es += 1
            print("Counter {} of 5".format(es))
            print('Train Loss: ', epoch_loss_train)
            print('Test Loss: ', epoch_loss_test)

        if es > 4:
            print("Early stopping with best_loss: ", best_loss, "and test_loss for this epoch: ",
                  epoch_loss_test,
                  "...")

            break

    # -------------------------------------------------------------- Prediction
    test_dates = df_test.index.get_level_values('date')
    # Dates to predict (despite the variable name these are individual dates).
    test_weeks = test_dates[test_dates >= pd.to_datetime('2017-07-01')].unique()
    test_states = df_test.index.get_level_values('stationid').unique()

    # Reload the best checkpoint of this run into a fresh model.
    m_state_dict_main = torch.load(weights_path)

    model_main = LSTM(run.input_size, run.hidden_layer_size, run.num_layers,
                      run.output_size, run.dropout_rate).to(device)
    model_main.load_state_dict(m_state_dict_main)
    # A freshly constructed module is in training mode; without eval() the
    # dropout layer would randomly perturb every test prediction.
    model_main.eval()

    # Rows are collected in a list and turned into a frame once, instead of
    # concatenating (and copying) the whole frame for every single prediction.
    prediction_rows = []
    station_level = df_test.index.get_level_values('stationid')
    with torch.no_grad():
        for stationid in test_states:
            station_frame = df_test[station_level == stationid]
            station_dates = station_frame.index.get_level_values('date')

            for week in test_weeks:
                # Input window: the last seq_length rows of this station up to
                # and including the date being predicted.
                seq = station_frame[station_dates <= week][-run.seq_length:].to_numpy()
                seq = torch.tensor(seq).reshape(-1, run.seq_length, run.input_size).to(device)

                prediction = model_main(seq.float())

                prediction = scaler_target.inverse_transform(prediction.cpu().detach().numpy().reshape(-1, 1))
                prediction = np.rint(prediction)
                dic = {
                    'stationid': stationid,
                    'date': week,
                    'Prediction_1d': prediction[0].item(),
                }
                prediction_rows.append(dic)

    df_output = pd.DataFrame(prediction_rows, columns=['stationid', 'date', 'Prediction_1d'])
    df_output_n = df_output.copy()

    df_output.to_csv(predictions_path)

    # ----------------------------------------------------------------- Summary
    # Join predictions with the ground truth on their shared columns.
    result_numerical = df_output_n.merge(gd_number)
    # Test MSE in original units; NaN when the merge matched no rows.
    mse_numerical_test = float(
        ((result_numerical['Prediction_1d'] - result_numerical[run.target]) ** 2).mean())

    dic_lstm = {
        'Model': 'LSTM',
        'lr': run.lr,
        'batch_size': run.batch_size,
        'seq_length': run.seq_length,
        'day_range': run.day_range,
        'hidden_layer_size': run.hidden_layer_size,
        'num_layers': run.num_layers,
        'lossfunc': run.lossfunc,
        'mse_numerical_validation': best_mse_numerical,
        'mse_numerical_test': mse_numerical_test,
        'Target': run.target
    }

    df_summary = pd.concat([df_summary, pd.DataFrame([dic_lstm])], ignore_index=True)

    # Rewritten after every run so partial results survive an interruption.
    df_summary.to_csv(summary_file + 'summarytest1.csv')

  2%|▏         | 1/50 [00:02<01:51,  2.27s/it]

Train Loss:  3.0874663121103367
Test Loss:  0.6487757370559848


  2%|▏         | 1/50 [00:04<03:26,  4.22s/it]


KeyboardInterrupt: ignored

In [None]:
# Predictions merged with the ground-truth counts; assigned by the training-loop cell above.
result_numerical

Unnamed: 0,stationid,date,Prediction_1d,in_bike,out_bike


In [None]:
# Copy of the prediction table (stationid, date, Prediction_1d) built by the training-loop cell above.
df_output_n

Unnamed: 0,stationid,date,Prediction_1d
0,2494.0,2017-07-01,16.0
1,2494.0,2017-07-02,17.0
2,2494.0,2017-07-03,16.0
3,2494.0,2017-07-04,19.0
4,2494.0,2017-07-05,21.0
...,...,...,...
925,3513.0,2017-07-26,11.0
926,3513.0,2017-07-27,12.0
927,3513.0,2017-07-28,12.0
928,3513.0,2017-07-29,10.0
