In [2]:
import os
import pandas as pd
import numpy as np
import re
import logging
from Modules.Loader_wrangler import *
import random
import torch
import torch.nn as nn

In [3]:
# Configure root logging at INFO level with a "LEVEL: message" line format.
# force=True removes any handlers already attached to the root logger, so
# re-running this cell in a live kernel actually takes effect.
logging.basicConfig(level=logging.INFO, force=True, format='%(levelname)s: %(message)s')

In [4]:
# Build the merged 2017 survey frame with the project loader.
# NOTE(review): `loader` comes from Modules.Loader_wrangler and is not visible
# here. The recorded run of this cell was interrupted, and the following cell
# reads a saved merged_df2017.pkl instead; presumably this call produces that
# file. Confirm before relying on it.
play = loader(output_file_name="merged_df2017.pkl", chunksize=100000, sample_size=100000, survey_year=2017)

KeyboardInterrupt: 

In [5]:
# Directory holding the pickled survey extract. The default is the original
# machine-specific location; set PROJECT_DATA_DIR to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    "PROJECT_DATA_DIR",
    "/home/trapfishscott/Cambridge24.25/D200_ML_econ/ProblemSets/Project/data",
)

# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
play = pd.read_pickle(os.path.join(DATA_DIR, "merged_df2017.pkl"))

### Selecting the relevant variables and arranging them as a time series

In [6]:
# Column groups used to subset the merged survey frame.
# NOTE(review): group meanings are inferred from the variable names; confirm
# against the survey codebook.

# When the travel took place.
temporal_vars = [
    "TWSMonth",
    "TravelYear",
    "TravelWeekDay_B01ID",
]

# Attributes of the individual / household.
individual_vars = [
    "PSUGOR_B02ID",
    "IndIncome2002_B02ID",
    "HHoldNumChildren",
    "VehMakeModel_B02ID",
]

# Trip-level quantities the model is meant to predict.
outcome_vars = [
    "TripStart",
    "TripEnd",
    "TripDisExSW",
    "TripPurpose_B01ID",
]

# Identifier and within-individual ordering columns.
extra_vars = [
    "IndividualID_x",
    "JourSeq",
]

In [7]:
# Keep only the identifier, individual-level, temporal and outcome columns.
ts_df = play[extra_vars + individual_vars + temporal_vars + outcome_vars]

In [8]:
# Order rows by individual, then week-day code, then JourSeq, so that each
# individual's records are contiguous.
ts_df = ts_df.sort_values(["IndividualID_x", "TravelWeekDay_B01ID", "JourSeq"])

In [9]:
# Number of recorded rows per individual.
# value_counts does this in a single pass instead of re-filtering the whole
# frame once per individual. Rows with a missing IndividualID_x are not counted.
weekly_travel = ts_df["IndividualID_x"].value_counts(sort=False).tolist()

max_weekly_travel = max(weekly_travel)
mean_weekly_travel = sum(weekly_travel) / len(weekly_travel)

print(f"Most weekly travel ~ {max_weekly_travel}")
print(f"Average weekly travel ~ {mean_weekly_travel}")

# Reused by impute_missing_travel_weeks as the padding length / outlier cut-off
# when masking is enabled.
percentile_97 = np.percentile(weekly_travel, 97)

print(f"97th percentile of weekly travel ~ {percentile_97}")



Most weekly travel ~ 67
Average weekly travel ~ 14.4106463878327
97th percentile of weekly travel ~ 34.0


In [10]:
def impute_missing_travel_weeks(df: pd.DataFrame, use_masking: bool = False) -> pd.DataFrame:
    """Add zero-filled placeholder rows for week-day codes an individual has no rows for.

    For every unique ``IndividualID_x`` the function finds which codes in
    ``0..8`` are absent from that individual's ``TravelWeekDay_B01ID`` values
    and appends one row per absent code with ``JourSeq = 0`` and every outcome
    column set to 0. Individual-level columns and the non-week-day temporal
    columns are copied onto the placeholder rows.

    Individuals for whom any of those copied columns takes more than one
    value are reported with ``print`` and left out of the result.

    Reads the notebook-level names ``outcome_vars``, ``individual_vars``,
    ``temporal_vars`` and (only when ``use_masking`` is true) ``percentile_97``.

    Args:
        df: Frame with ``IndividualID_x``, ``TravelWeekDay_B01ID`` and the
            columns named in the lists above. It is copied, not modified.
        use_masking: When true, individuals with fewer rows than
            ``percentile_97`` are padded with rows coded
            ``TravelWeekDay_B01ID = 8`` up to ``int(percentile_97)`` rows, and
            individuals with more rows than ``percentile_97`` are dropped.

    Returns:
        A frame holding ``IndividualID_x``, ``JourSeq``, the outcome columns
        and the temporal columns, sorted by individual, year, month, week-day
        code, ``JourSeq``, ``TripStart`` and ``TripEnd``, with a fresh integer
        index and every remaining NaN replaced by 0.

    NOTE(review): if every individual is skipped, ``pd.concat`` receives an
    empty list, which presumably raises; confirm whether that case can occur.
    """
    df = df.copy()
    df_chunks = []
    # NOTE(review): codes 0 and 8 are treated as ordinary "missing" days here,
    # while 8 is also the code given to mask rows below. Confirm that the real
    # week-day coding is meant to be covered by 0..8.
    full_week_encoding = list(range(0,9))
    individual_ids = df["IndividualID_x"].unique()

    for i in individual_ids:
        break_flag = False
        i_df = df[df["IndividualID_x"] == i]
        included_travel_day = i_df["TravelWeekDay_B01ID"].to_list()
        # Week-day codes for which this individual has no row at all.
        travel_day_no_drive = list(set(full_week_encoding) - set(included_travel_day))

        # Constant per-individual values, reused for the mask rows further down.
        idle_row = {}

        # One placeholder row per absent week-day code.
        imputed_travel_df = pd.DataFrame({
            "TravelWeekDay_B01ID": travel_day_no_drive,
            "IndividualID_x": [i]*len(travel_day_no_drive),
            "JourSeq": [0]*len(travel_day_no_drive)
        })

        for col in i_df.columns:
            # Outcomes on a placeholder day are recorded as 0.
            if col in outcome_vars:
                imputed_travel_df[col] = [0]*len(travel_day_no_drive)
            # Individual-level columns must hold a single value per individual;
            # otherwise the individual is reported and skipped.
            if col in individual_vars:
                if len(i_df[col].unique()) != 1:
                    print(f"{col} is erroneous for {i}")
                    print(f"Unique vals: {i_df[col].unique()}")
                    break_flag = True
                    break
                else:
                    imputed_travel_df[col] = i_df[col].unique()[0]
                    idle_row[col] = i_df[col].unique()[0]

            # Same single-value requirement for the temporal columns other than
            # the week-day code itself.
            if col != "TravelWeekDay_B01ID" and col in temporal_vars:
                if len(i_df[col].unique()) != 1:
                    print(f"{col} is erroneous for {i}")
                    print(f"Unique vals: {i_df[col].unique()}")
                    break_flag = True
                    break
                else:
                    imputed_travel_df[col] = i_df[col].unique()[0]
                    idle_row[col] = i_df[col].unique()[0]

        if break_flag:
            print("Continuing to next individual")
            continue

        full_df = pd.concat([i_df, imputed_travel_df])

        if use_masking:
            # Pad short individuals with mask rows (week-day code 8). The mask
            # rows carry no outcome columns, so those become NaN on concat and
            # are turned into 0 by the fillna at the end.
            if len(full_df) < percentile_97:
                rows_to_mask = int(percentile_97 - len(full_df))

                new_rows = pd.DataFrame([{
                    "TravelWeekDay_B01ID": 8,
                    "IndividualID_x": i,
                    "JourSeq": 0,
                    **idle_row
                    }]* rows_to_mask)
                
                full_df = pd.concat([full_df, new_rows], ignore_index=True)

            # Individuals longer than the cut-off are dropped rather than truncated.
            if len(full_df) > percentile_97:
                print(f"Outlier individual with travel ~ {len(full_df)}")
                print(f"Continuing...")
                continue

        df_chunks.append(full_df)

    df_to_return = pd.concat(df_chunks)

    # Sort chronologically within each individual, then keep only the id,
    # sequence, outcome and temporal columns (individual-level columns are
    # dropped here).
    df_to_return = df_to_return.sort_values(["IndividualID_x", "TravelYear", "TWSMonth", "TravelWeekDay_B01ID", "JourSeq", "TripStart", "TripEnd"] )[["IndividualID_x", "JourSeq"] + outcome_vars + temporal_vars]

    df_to_return.reset_index(inplace=True, drop=True)

    df_to_return.fillna(0, inplace=True)

    return df_to_return

In [11]:
df = impute_missing_travel_weeks(ts_df)

# Individuals reported below have rows in two different TravelYear values, so
# they fail the single-value check and are skipped. Probably not a huge issue,
# but might fix later.

TravelYear is erroneous for 2017014397.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014398.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014552.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014714.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014715.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014773.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014964.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014965.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017015043.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017015044.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is errone

In [15]:
# Convert to LSTM

def convert_to_tensor(df, seq_length, cols_to_drop, debug=True):
    """Cut a row-stacked frame into fixed-length sequences and return a float tensor.

    After dropping ``cols_to_drop``, consecutive runs of ``seq_length`` rows
    become one sequence each. Trailing rows that do not fill a complete
    sequence are discarded.

    Args:
        df: Frame whose remaining columns are all numeric. It is not modified.
        seq_length: Number of rows per sequence; must be at least 1.
        cols_to_drop: Column names to remove first. Names that are not present
            are ignored.
        debug: When exactly ``True``, one randomly chosen sequence is compared
            against the matching rows of the frame.

    Returns:
        ``torch.float32`` tensor of shape ``(seq_length, n_sequences, n_features)``.
        ``n_sequences`` is 0 when the frame has fewer than ``seq_length`` rows.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
        AssertionError: If the debug comparison finds a mismatch.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    # drop() already returns a new frame, so the caller's frame is left untouched.
    features_df = df.drop(columns=cols_to_drop, errors="ignore")
    df_array = features_df.to_numpy()

    # Drop trailing observations so the row count is a multiple of seq_length.
    rows_to_drop = df_array.shape[0] % seq_length
    if rows_to_drop > 0:
        df_array = df_array[:-rows_to_drop, :]

    print(df_array.shape)

    n = df_array.shape[0] // seq_length
    input_features = df_array.shape[1]

    # (n, seq_length, features) -> (seq_length, n, features), the sequence-first
    # layout that torch recurrent layers expect by default.
    df_array = df_array.reshape((n, seq_length, input_features)).transpose(1, 0, 2)

    print(f"Reshaped (seq_length, n_batches, input_features): {df_array.shape}")

    # Spot-check that one sequence still matches its source rows. The index is
    # drawn over the sequence axis (0 .. n-1); drawing it from the seq_length
    # axis with an inclusive bound could run off the end of the array.
    if debug is True and n > 0:
        test_value = random.randrange(n)
        first_row = test_value * seq_length

        assert np.array_equal(
            df_array[:, test_value, :],
            features_df.iloc[first_row:first_row + seq_length, :].to_numpy()
        ), "Mismatch between reshaped array and original df"

    return torch.tensor(df_array, dtype=torch.float32)


In [17]:
# Cut the imputed frame into sequences of 14 rows; "NumTrips" is dropped only if present.
# NOTE(review): convert_to_tensor slices the stacked rows in fixed windows, so
# a window is not guaranteed to line up with one individual. Confirm this is
# intended, and why 14 was chosen.
travel_tensor = convert_to_tensor(df=df, seq_length=14, cols_to_drop=["JourSeq", "NumTrips"])

(126602, 8)
Reshaped (seq_length, n_batches, input_features): (14, 9043, 8)


### Creating the RNN

In [63]:
# Model hyper-parameters.
INPUT_SIZE = travel_tensor.shape[2]  # size of the last (feature) axis of travel_tensor
HIDDEN_SIZE = 3  # width of the recurrent hidden state
NUM_LAYERS = 1  # NOTE(review): RNNmodel does not read this constant
OUTPUT_SIZE = len(outcome_vars)  # one output per outcome variable

# Display the outcome variables for reference.
outcome_vars


['TripStart', 'TripEnd', 'TripDisExSW', 'TripPurpose_B01ID']

In [61]:
class RNNmodel(nn.Module):
    """Recurrent layer followed by a linear read-out of the final hidden state.

    Args:
        input_size: Number of features per time step. Defaults to the
            notebook-level ``INPUT_SIZE``.
        hidden_size: Width of the hidden state. Defaults to ``HIDDEN_SIZE``.
        output_size: Number of predicted values. Defaults to ``OUTPUT_SIZE``.
        num_layers: Number of stacked recurrent layers.
        verbose: When true, ``forward`` prints tensor shapes and the raw
            prediction, as the original exploratory version did.
    """

    def __init__(self, input_size=None, hidden_size=None, output_size=None,
                 num_layers=1, verbose=True):
        super().__init__()

        # Fall back to the notebook constants so a bare RNNmodel() keeps working.
        input_size = INPUT_SIZE if input_size is None else input_size
        hidden_size = HIDDEN_SIZE if hidden_size is None else hidden_size
        output_size = OUTPUT_SIZE if output_size is None else output_size

        self.verbose = verbose

        # Recurrent layer; expects input shaped (seq_length, batch, input_size).
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers)

        # Output layer applied to the final hidden state.
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, X, detach=True):
        """Run the sequence and return one tensor per output, keyed from 1.

        Args:
            X: Input of shape ``(seq_length, batch, input_size)``.
            detach: When true (the default, matching the original behaviour)
                the returned tensors are cut from the autograd graph, which is
                convenient for inspection. Pass ``detach=False`` when the
                result feeds a loss, otherwise no gradient reaches the
                parameters.

        Returns:
            Dict mapping ``1..output_size`` to tensors of shape
            ``(num_layers, batch)``.
        """
        out, hh = self.rnn(X)
        y_hat_vector = self.output(hh)

        if self.verbose:
            print(f"out shape: {out.shape}")
            print(f"hh shape: {hh.shape}")
            print(f"y_hat shape: {y_hat_vector.shape}")
            print(y_hat_vector)

        y_hat = {}
        for index in range(y_hat_vector.shape[2]):
            prediction = y_hat_vector[:, :, index]
            y_hat[index + 1] = prediction.detach() if detach else prediction

        return y_hat


In [62]:
# Forward pass on a single sequence as a smoke test.

rnn_model = RNNmodel()

# Re-insert the batch axis so X1 is (seq_length, 1, input_features).
X1 = travel_tensor[:, 0, :].unsqueeze(1)
print(f"X1 shape: {X1.shape}")

# Call the module itself rather than .forward(), so that any registered hooks run.
y_hat = rnn_model(X1)

y_hat


X1 shape: torch.Size([14, 1, 8])
out shape: torch.Size([14, 1, 3])
hh shape: torch.Size([1, 1, 3])
y_hat shape: torch.Size([1, 1, 4])
tensor([[[ 0.5506, -0.3124,  0.2871, -0.1610]]], grad_fn=<ViewBackward0>)


{1: tensor([[0.5506]]),
 2: tensor([[-0.3124]]),
 3: tensor([[0.2871]]),
 4: tensor([[-0.1610]])}

In [67]:
# Cross-entropy criterion, to be called as loss_cat(y_hat, y).
# NOTE(review): presumably meant for the categorical outcome only
# (TripPurpose_B01ID); confirm which outcomes it will be applied to.
loss_cat = nn.CrossEntropyLoss()

In [None]:
# Single-sequence (batch size 1) pass; work in progress.
# NOTE(review): the model defined in this notebook is an nn.RNN, not an LSTM.

# Loop over the batch dimension (axis 1). For now this only prints the shape
# of each per-sequence slice.

for batch in range(travel_tensor.shape[1]):
    print(travel_tensor[:,batch,:].shape)