In [8]:
import logging
import os
import pickle
import random
import re
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import OneHotEncoder

from Modules.Loader_wrangler import *

In [2]:
# Root logger: emit INFO and above as "LEVEL: message".
# force=True replaces handlers that are already installed, so re-running this cell takes effect.
logging.basicConfig(
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
    force=True,
)

In [None]:
# Build the 2017 frame with `loader`, which is not defined in this notebook
# (presumably provided by the star import of Modules.Loader_wrangler — confirm).
# NOTE(review): the next cell rebinds `play` from merged_df2017.pkl on disk, so this cell
# presumably only needs to run when that pickle has to be (re)built — confirm.
play = loader(output_file_name="merged_df2017.pkl", chunksize=100000, sample_size=100000, survey_year=2017)

In [3]:
# Load the merged 2017 frame from disk.
# NOTE(review): absolute, machine-specific path — this cell fails on any other machine.
# Only unpickle files you produced yourself; unpickling can execute arbitrary code.
play = pd.read_pickle("/home/trapfishscott/Cambridge24.25/D200_ML_econ/ProblemSets/Project/data/merged_df2017.pkl")

### Selecting only the relevant variables and arranging them as a time series

In [None]:
# Columns describing when the travel took place
temporal_vars = [
    "TWSMonth",
    "TravelYear",
    "TravelWeekDay_B01ID",
]

# Columns that are expected to stay constant for one individual
individual_vars = [
    "PSUGOR_B02ID",
    "IndIncome2002_B02ID",
    "HHoldNumChildren",
    "DVLALengthBand_B01ID",
]

# Per-trip quantities the model should predict; the categorical subset is listed separately
outcome_vars = [
    "TripStart",
    "TripEnd",
    "TripDisExSW",
    "TripPurpose_B01ID",
]
categorical_outcome_vars = ["TripPurpose_B01ID"]

# Keys used to identify and order trips
extra_vars = ["IndividualID_x", "JourSeq"]

# Made up of individual and temporal (derived so the lists cannot drift apart)
features = individual_vars + temporal_vars


In [5]:
# Keep only the identifier, individual, temporal and outcome columns (in that order)
ts_df = play.loc[:, extra_vars + individual_vars + temporal_vars + outcome_vars]

In [6]:
# Order the trips per individual: by travel-week day first, then by journey sequence number
ts_df = ts_df.sort_values(
    by=[
        "IndividualID_x",
        "TravelWeekDay_B01ID",
        "JourSeq",
    ]
)

In [None]:
# Display the sorted long-format frame as a visual check
ts_df

### Data Manipulation pipeline

1. One-hot encode categorical features + any small cleaning steps
2. Add days of the week with no car travel
3. Make data frame into wide format
4. Convert to tensor


In [None]:
### Small cleaning steps and one-hot encoding (pipeline step 1) — TODO: this cell contains no code yet

In [None]:
def impute_missing_travel_week_for_i(i_df, i_id, full_week_encoding):
    """Add one placeholder row for every travel-week day without a recorded trip.

    Relies on the notebook-level lists ``outcome_vars``, ``individual_vars`` and
    ``temporal_vars``.

    Parameters
    ----------
    i_df : pandas.DataFrame
        Trip rows of a single individual. Must contain "TravelWeekDay_B01ID" and the
        columns used for the final sort ("TravelYear", "TWSMonth", "JourSeq",
        "TripStart", "TripEnd").
    i_id
        Identifier written to "IndividualID_x" of the placeholder rows and shown in
        the diagnostic messages.
    full_week_encoding : iterable
        Every valid "TravelWeekDay_B01ID" code (the caller passes 1..7).

    Returns
    -------
    pandas.DataFrame or None
        ``i_df`` plus the placeholder rows, sorted chronologically. ``None`` when a
        column that should be constant for the individual (any individual variable,
        or any temporal variable other than the week day) does not hold exactly one
        distinct value; a diagnostic is printed in that case.
    """
    observed_days = set(i_df["TravelWeekDay_B01ID"].to_list())

    # Week days on which the individual recorded no trip (sorted for a deterministic order)
    missing_days = sorted(set(full_week_encoding) - observed_days)

    # Placeholder rows: one per missing day, each counted as journey number 1
    imputed_travel_df = pd.DataFrame({
        "TravelWeekDay_B01ID": missing_days,
        "IndividualID_x": [i_id] * len(missing_days),
        "JourSeq": [1] * len(missing_days),
    })

    # Columns that must hold a single value per individual and are copied to the placeholders
    constant_cols = set(individual_vars) | (set(temporal_vars) - {"TravelWeekDay_B01ID"})

    for col in i_df.columns:

        # On days without travel every outcome is 0
        if col in outcome_vars:
            imputed_travel_df[col] = [0] * len(missing_days)

        if col in constant_cols:
            unique_vals = i_df[col].unique()

            # More (or fewer) than one distinct value signals inconsistent data: skip this individual
            if len(unique_vals) != 1:
                # Report the individual's id; the original interpolated an unrelated name `i`
                print(f"{col} is erroneous for {i_id}")
                print(f"Unique vals: {unique_vals}")
                print("Continuing to next individual")
                return None

            imputed_travel_df[col] = unique_vals[0]

    # Only concatenate when there is something to add; concatenating an empty frame is
    # deprecated in recent pandas and can change column dtypes.
    full_df = pd.concat([i_df, imputed_travel_df]) if missing_days else i_df

    return full_df.sort_values(["TravelYear", "TWSMonth", "TravelWeekDay_B01ID", "JourSeq", "TripStart", "TripEnd"])

In [234]:
def transform_to_wide_for_i(i_df, max_journey_seq, seq_length = 7, outcome_vars=outcome_vars):
    """Reshape one individual's long trip table into one row per travel-week day.

    Relies on the notebook-level lists ``categorical_outcome_vars``, ``temporal_vars``,
    ``individual_vars`` and ``extra_vars``.

    Parameters
    ----------
    i_df : pandas.DataFrame
        Rows of one individual with at most one row per
        ("TravelWeekDay_B01ID", "JourSeq") pair (``DataFrame.pivot`` raises otherwise).
        The frame is not modified.
    max_journey_seq : int
        Journeys with "JourSeq" above this value are discarded; every outcome gets
        exactly this many columns, named ``<outcome>_<k>`` for k = 1..max_journey_seq.
    seq_length : int, default 7
        Number of all-zero padding rows placed in front of the real days.
    outcome_vars : list of str
        Outcome columns to spread over the journey slots.

    Returns
    -------
    df_wide : pandas.DataFrame
        ``seq_length`` padding rows followed by one row per day. The padding rows have 0
        in every outcome column and in "TravelWeekDay_B01ID" and copy the remaining
        columns from the first real day.
    targets_cont : pandas.DataFrame
        Non-categorical outcome columns of the real days only (padding removed).
    targets_cat : pandas.DataFrame
        Categorical outcome columns of the real days only (padding removed).
    """
    journey_slots = range(1, max_journey_seq + 1)
    expected_cols = [f"{col}_{slot}" for col in outcome_vars for slot in journey_slots]
    # Only categorical outcomes that are actually requested, so a custom `outcome_vars`
    # without them no longer raises KeyError when the targets are split below.
    expected_categorical = [
        f"{col}_{slot}"
        for col in categorical_outcome_vars
        if col in outcome_vars
        for slot in journey_slots
    ]

    # Boolean indexing returns a new frame, so the caller's data is never touched
    df = i_df[i_df["JourSeq"] <= max_journey_seq]

    df_wide = df.pivot(index="TravelWeekDay_B01ID",
                       columns="JourSeq",
                       values=outcome_vars)

    # Flatten the (outcome, JourSeq) column index into "<outcome>_<k>"
    df_wide.columns = [f"{name}_{int(seq)}" for name, seq in df_wide.columns]

    # Journey slots the individual never used still need a column
    for col in expected_cols:
        if col not in df_wide.columns:
            df_wide[col] = 0

    # Consistent column order; slots unused on a given day become 0
    df_wide = df_wide[expected_cols].fillna(0).reset_index()

    # One row per day holding the non-outcome columns (first occurrence wins)
    per_day_info = (
        df.drop(columns=outcome_vars + ["IndividualID_x", "JourSeq"])
          .drop_duplicates(subset=["TravelWeekDay_B01ID"])
    )
    df_wide = df_wide.merge(per_day_info, on="TravelWeekDay_B01ID", how="left")

    # Padding row: the first real day with all outcomes and the week day set to 0
    top_row = df_wide.head(1).copy()
    for col in expected_cols:
        top_row[col] = 0
    top_row["TravelWeekDay_B01ID"] = 0

    repeated_rows = pd.concat([top_row] * seq_length, ignore_index=True)
    df_wide = pd.concat([repeated_rows, df_wide], ignore_index=True)

    # Targets are the outcome columns of the real days (everything after the padding)
    targets_only = df_wide.drop(columns=temporal_vars + individual_vars + extra_vars, errors="ignore")
    targets_only = targets_only.iloc[seq_length:, :]

    targets_cont = targets_only.drop(columns=expected_categorical)
    targets_cat = targets_only[expected_categorical]

    return df_wide, targets_cont, targets_cat

In [233]:
def prepare_data_for_LSTM(long_df, impute_missing_travel_weeks=True, transform_to_wide=False, transform_to_tensor=False, debug=False, max_journey_seq=10):
    """Turn the long trip table into per-individual weekly sequences.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format trips with an "IndividualID_x" column; it is copied, not modified.
    impute_missing_travel_weeks : bool, default True
        When False the copied frame is returned unchanged and all other steps are skipped.
    transform_to_wide : bool, default False
        Reshape every individual with ``transform_to_wide_for_i``.
    transform_to_tensor : bool, default False
        Return stacked torch tensors instead of a frame. Requires ``transform_to_wide=True``.
    debug : bool, default False
        Display the intermediate frames of one randomly chosen individual and return None.
    max_journey_seq : int, default 10
        Passed on to ``transform_to_wide_for_i``.

    Returns
    -------
    None
        In debug mode.
    pandas.DataFrame
        The copied input (no imputation) or the concatenated per-individual frames.
    tuple of torch.Tensor
        ``(inputs, continuous_targets, categorical_targets)`` when ``transform_to_tensor``
        is set; individuals are stacked along dimension 0.

    Raises
    ------
    ValueError
        If ``transform_to_tensor`` is requested without ``transform_to_wide``, or if no
        individual survives the consistency checks.

    Notes
    -----
    Individuals for which ``impute_missing_travel_week_for_i`` returns None are skipped.
    Rows whose "IndividualID_x" is missing are ignored by the grouping.
    """
    df = long_df.copy()
    individual_ids = df["IndividualID_x"].unique()       # All unique individual id's
    full_week_encoding = list(range(1, 8))

    if debug:
        # randrange excludes the upper bound; randint(0, n) could return n and overflow the index
        debug_id = individual_ids[random.randrange(len(individual_ids))]

        debug_df = df[df["IndividualID_x"] == debug_id]
        display(debug_df)

        debug_df = impute_missing_travel_week_for_i(debug_df, i_id=debug_id, full_week_encoding=full_week_encoding)
        if debug_df is None:
            # The imputation already printed why this individual is unusable
            return
        display(debug_df)

        debug_df, debug_targets_cont, debug_targets_cat = transform_to_wide_for_i(debug_df, max_journey_seq=max_journey_seq)
        display(debug_df)
        display(debug_targets_cont)
        display(debug_targets_cat)

        return

    if not impute_missing_travel_weeks:
        return df

    if transform_to_tensor and not transform_to_wide:
        # Tensors are only built from the wide layout; fail early instead of stacking an empty list
        raise ValueError("transform_to_tensor=True requires transform_to_wide=True")

    df_chunks = []
    individual_tensors = []
    target_cont_tensors = []
    target_cat_tensors = []

    # One pass over the data; sort=False keeps individuals in order of first appearance
    grouped = df.groupby("IndividualID_x", sort=False)
    n_individuals = grouped.ngroups

    for i, (individual_id, i_df) in enumerate(grouped):

        full_df = impute_missing_travel_week_for_i(i_df, i_id=individual_id, full_week_encoding=full_week_encoding)

        if full_df is not None:
            if not transform_to_wide:
                df_chunks.append(full_df)
            else:
                full_df, targets_cont, targets_cat = transform_to_wide_for_i(full_df, max_journey_seq=max_journey_seq)

                if transform_to_tensor:
                    # Insert a singleton axis after the row axis: (rows, 1, columns)
                    full_arr = np.expand_dims(full_df.to_numpy(), axis=1)

                    individual_tensors.append(torch.tensor(full_arr))
                    target_cont_tensors.append(torch.tensor(targets_cont.to_numpy()))
                    target_cat_tensors.append(torch.tensor(targets_cat.to_numpy()))
                else:
                    df_chunks.append(full_df)

        # "\r" rewrites the same line so the progress message does not scroll
        sys.stdout.write(f"\rIndividual {i+1} out of {n_individuals} Complete!    ")
        sys.stdout.flush()

    if not (individual_tensors or df_chunks):
        raise ValueError("No individual passed the consistency checks; nothing to return")

    if transform_to_tensor:
        return (
            torch.stack(individual_tensors, dim=0),
            torch.stack(target_cont_tensors, dim=0),
            torch.stack(target_cat_tensors, dim=0),
        )

    return pd.concat(df_chunks)


In [235]:
# debug=True displays the intermediate frames for one randomly chosen individual and
# returns None, so `df` is None after this cell.
df = prepare_data_for_LSTM(long_df=ts_df, debug=True)


Unnamed: 0,IndividualID_x,JourSeq,PSUGOR_B02ID,IndIncome2002_B02ID,HHoldNumChildren,VehMakeModel_B02ID,TWSMonth,TravelYear,TravelWeekDay_B01ID,TripStart,TripEnd,TripDisExSW,TripPurpose_B01ID
14369,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,1.0,506.0,535.0,11.0,1.0
27577,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,1.0,1050.0,1075.0,11.3,1.0
26992,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,2.0,498.0,525.0,11.1,1.0
31900,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,2.0,1026.0,1055.0,11.3,7.0
26993,2017006000.0,3.0,4.0,1.0,0.0,-10.0,6.0,2017.0,2.0,1086.0,1094.0,1.3,7.0
18726,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,3.0,499.0,524.0,11.1,1.0
17306,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,3.0,1030.0,1050.0,9.0,6.0
36551,2017006000.0,3.0,4.0,1.0,0.0,-10.0,6.0,2017.0,3.0,1080.0,1091.0,3.0,6.0
36946,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,4.0,503.0,550.0,29.0,2.0
18763,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,4.0,805.0,860.0,20.8,2.0


Unnamed: 0,IndividualID_x,JourSeq,PSUGOR_B02ID,IndIncome2002_B02ID,HHoldNumChildren,VehMakeModel_B02ID,TWSMonth,TravelYear,TravelWeekDay_B01ID,TripStart,TripEnd,TripDisExSW,TripPurpose_B01ID
14369,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,1.0,506.0,535.0,11.0,1.0
27577,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,1.0,1050.0,1075.0,11.3,1.0
26992,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,2.0,498.0,525.0,11.1,1.0
31900,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,2.0,1026.0,1055.0,11.3,7.0
26993,2017006000.0,3.0,4.0,1.0,0.0,-10.0,6.0,2017.0,2.0,1086.0,1094.0,1.3,7.0
18726,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,3.0,499.0,524.0,11.1,1.0
17306,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,3.0,1030.0,1050.0,9.0,6.0
36551,2017006000.0,3.0,4.0,1.0,0.0,-10.0,6.0,2017.0,3.0,1080.0,1091.0,3.0,6.0
36946,2017006000.0,1.0,4.0,1.0,0.0,-10.0,6.0,2017.0,4.0,503.0,550.0,29.0,2.0
18763,2017006000.0,2.0,4.0,1.0,0.0,-10.0,6.0,2017.0,4.0,805.0,860.0,20.8,2.0


Unnamed: 0,TravelWeekDay_B01ID,TripStart_1,TripStart_2,TripStart_3,TripStart_4,TripStart_5,TripStart_6,TripStart_7,TripStart_8,TripStart_9,...,TripPurpose_B01ID_7,TripPurpose_B01ID_8,TripPurpose_B01ID_9,TripPurpose_B01ID_10,PSUGOR_B02ID,IndIncome2002_B02ID,HHoldNumChildren,VehMakeModel_B02ID,TWSMonth,TravelYear
0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
1,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
2,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
3,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
4,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
5,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
6,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
7,1.0,506.0,1050.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
8,2.0,498.0,1026.0,1086.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0
9,3.0,499.0,1030.0,1080.0,0.0,0,0,0,0,0,...,0,0,0,0,4.0,1.0,0.0,-10.0,6.0,2017.0


Unnamed: 0,TripStart_1,TripStart_2,TripStart_3,TripStart_4,TripStart_5,TripStart_6,TripStart_7,TripStart_8,TripStart_9,TripStart_10,...,TripDisExSW_1,TripDisExSW_2,TripDisExSW_3,TripDisExSW_4,TripDisExSW_5,TripDisExSW_6,TripDisExSW_7,TripDisExSW_8,TripDisExSW_9,TripDisExSW_10
7,506.0,1050.0,0.0,0.0,0,0,0,0,0,0,...,11.0,11.3,0.0,0.0,0,0,0,0,0,0
8,498.0,1026.0,1086.0,0.0,0,0,0,0,0,0,...,11.1,11.3,1.3,0.0,0,0,0,0,0,0
9,499.0,1030.0,1080.0,0.0,0,0,0,0,0,0,...,11.1,9.0,3.0,0.0,0,0,0,0,0,0
10,503.0,805.0,1025.0,1051.0,0,0,0,0,0,0,...,29.0,20.8,8.7,2.0,0,0,0,0,0,0
11,503.0,524.0,599.0,605.0,0,0,0,0,0,0,...,1.7,1.1,1.2,1.2,0,0,0,0,0,0
12,0.0,0.0,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
13,0.0,0.0,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0


Unnamed: 0,TripPurpose_B01ID_1,TripPurpose_B01ID_2,TripPurpose_B01ID_3,TripPurpose_B01ID_4,TripPurpose_B01ID_5,TripPurpose_B01ID_6,TripPurpose_B01ID_7,TripPurpose_B01ID_8,TripPurpose_B01ID_9,TripPurpose_B01ID_10
7,1.0,1.0,0.0,0.0,0,0,0,0,0,0
8,1.0,7.0,7.0,0.0,0,0,0,0,0,0
9,1.0,6.0,6.0,0.0,0,0,0,0,0,0
10,2.0,2.0,5.0,5.0,0,0,0,0,0,0
11,7.0,6.0,6.0,6.0,0,0,0,0,0,0
12,0.0,0.0,0.0,0.0,0,0,0,0,0,0
13,0.0,0.0,0.0,0.0,0,0,0,0,0,0


In [236]:
# Full run: impute missing days, reshape to wide and stack everything into tensors
X, y_cont, y_cat = prepare_data_for_LSTM(
    long_df=ts_df,
    transform_to_wide=True,
    transform_to_tensor=True,
)

Individual 5949 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Continuing to next individual
Individual 5950 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Continuing to next individual
Individual 5998 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Continuing to next individual
Individual 6057 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Continuing to next individual
Individual 6058 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Continuing to next individual
Individual 6086 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Continuing to next individual
Individual 6164 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Continuing to next individual
Individual 6165 out of 6838 Complete!    TravelYear is erroneous for 5
Unique vals: [2018. 2017.]
Contin

In [237]:
# X is (individuals, rows, 1, columns): the rows are the seq_length zero-padding rows plus
# one row per travel-week day, and the singleton axis comes from np.expand_dims(axis=1).
# The targets are (individuals, days, columns); the padding rows are sliced off before
# the targets are built.
print(f"Input shape: {X.shape}")
print(f"Cont Output shape: {y_cont.shape}")
print(f"Cat Output shape: {y_cat.shape}")

Input shape: torch.Size([6775, 14, 1, 47])
Cont Output shape: torch.Size([6775, 7, 30])
Cat Output shape: torch.Size([6775, 7, 10])


In [238]:
# Save tensors
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
TENSOR_PATH = "/home/trapfishscott/Cambridge24.25/D200_ML_econ/ProblemSets/Project/tensors/tensors.pkl"

# Create the target directory first so the dump does not fail when it is missing
os.makedirs(os.path.dirname(TENSOR_PATH), exist_ok=True)
with open(TENSOR_PATH, "wb") as f:
    pickle.dump((X, y_cont, y_cat), f)

### Creating the RNN

In [242]:
# Model dimensions, read off the prepared tensors
INPUT_SIZE = X.size(3)                # columns per time step (last axis of X)
HIDDEN_SIZE = 3                       # width of the recurrent hidden state
NUM_LAYERS = 1                        # number of stacked recurrent layers
OUTPUT_SIZE_CONT = y_cont.size(2)     # continuous target columns per day
OUTPUT_SIZE_CAT = y_cat.size(2)       # categorical target columns per day

In [309]:
class RNNmodel(nn.Module):
    """Recurrent network with one continuous and one categorical output head.

    Every constructor argument is optional. An omitted argument falls back to the
    notebook-level constant of the same meaning (INPUT_SIZE, HIDDEN_SIZE,
    OUTPUT_SIZE_CONT, OUTPUT_SIZE_CAT, NUM_LAYERS), so ``RNNmodel()`` keeps working.
    """

    def __init__(self, input_size=None, hidden_size=None, output_size_cont=None, output_size_cat=None, num_layers=None):
        super().__init__()

        input_size = INPUT_SIZE if input_size is None else input_size
        hidden_size = HIDDEN_SIZE if hidden_size is None else hidden_size
        output_size_cont = OUTPUT_SIZE_CONT if output_size_cont is None else output_size_cont
        output_size_cat = OUTPUT_SIZE_CAT if output_size_cat is None else output_size_cat
        num_layers = NUM_LAYERS if num_layers is None else num_layers

        # Recurrent layer; default layout, i.e. input of shape (seq_len, batch, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers)

        # Output heads, both fed with the final hidden state
        self.output_cont = nn.Linear(hidden_size, output_size_cont)
        self.output_cat = nn.Linear(hidden_size, output_size_cat)

    def forward(self, X):
        """Run the sequence and map the final hidden state to both heads.

        Parameters
        ----------
        X : torch.Tensor
            Float tensor of shape (seq_len, batch, input_size).

        Returns
        -------
        tuple of torch.Tensor
            ``(y_cont_hat, y_cat_hat)``, both 1-D, computed from the last layer's final
            hidden state of the FIRST batch element only. The tensors stay attached to
            the autograd graph so a loss computed on them can be back-propagated.
        """
        out, hh = self.rnn(X)   # hh: (num_layers, batch, hidden_size)

        y_cont_hat_vector = self.output_cont(hh)
        y_cat_hat_vector = self.output_cat(hh)

        # Last layer, first batch element. Not detached: detaching here would cut the
        # graph and make training impossible.
        y_cont_hat = y_cont_hat_vector[-1, 0, :]
        y_cat_hat = y_cat_hat_vector[-1, 0, :]

        return y_cont_hat, y_cat_hat


In [318]:
# Taking one test draw: push the first individual through an untrained model

rnn_model = RNNmodel()

# Drop the singleton axis of X and re-insert it as the batch axis: (seq_len, batch=1, features)
X0 = X[0,:,0,:].unsqueeze(1).to(torch.float32)
print(f"X0 shape: {X0.shape}")
print("")

# Call the module (not .forward) so nn.Module hooks run; no_grad because this is inspection only
with torch.no_grad():
    y_cont_hat, y_cat_hat = rnn_model(X0)

# Targets of the first individual on the first day
y_cat_true = y_cat[0,0,:]
y_cont_true = y_cont[0,0,:]

print(f"Categorical outputs:  {y_cat_hat}")
print(f"Ground truth categorical: {y_cat_true}")
print("")
print(f"Continuous outputs:  {y_cont_hat}")
print(f"Ground truth continuous: {y_cont_true}")

loss_cat = nn.CrossEntropyLoss()  #(y_hat, y)
loss_cont = nn.MSELoss()

# NOTE(review): y_cat_true is a float tensor with the same shape as the logits, so
# CrossEntropyLoss interprets it as class probabilities, not as one class index per
# journey slot — confirm that this is the intended loss.
print(f"Categorical loss: {loss_cat(y_cat_hat, y_cat_true)}")
print(f"Continuous loss: {loss_cont(y_cont_hat, y_cont_true)}")


X1 shape: torch.Size([14, 1, 47])

Categorical outputs:  tensor([-0.1230, -0.2642, -0.6163, -0.5335, -1.0169, -0.7919, -0.4985, -0.4965,
         0.0241,  0.4465])
Ground truth categorical: tensor([6., 6., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)

Continous outputs:  tensor([-0.0357, -1.0857,  0.1359, -0.0971,  0.1486, -0.7620, -0.1190, -0.6366,
         0.1821, -0.1810,  0.8555, -0.9568, -0.6962, -0.3217,  0.3392,  0.7974,
        -0.4109, -1.2556,  0.7726,  0.2620,  0.0318,  0.9279, -0.4495, -0.1383,
        -0.1678, -0.1445, -0.1017,  0.0550, -1.1105, -0.3945])
Continous categorical: tensor([600., 690.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 620., 710.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   4.,   4.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.], dtype=torch.float64)
Categorical loss: 26.335693359375
Continous loss: 57549.11202918984


In [252]:
# Loss functions for the two output heads: cross-entropy for the categorical head and
# mean squared error for the continuous head.
# NOTE(review): the same two objects are also created at the end of the test-draw cell
# above; one of the definitions is redundant.
loss_cat = nn.CrossEntropyLoss()  #(y_hat, y)
loss_cont = nn.MSELoss()

In [None]:
### LSTM for single batch size

# Looping over batch dimension (axis 1) and printing the shape of each slice.
# NOTE(review): `travel_tensor` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably one of the tensors built above
# (e.g. X) is meant; confirm and rename.

for batch in range(travel_tensor.shape[1]):
    print(travel_tensor[:,batch,:].shape)