In [1]:
import os
import pandas as pd
import numpy as np
import re
import logging
from Modules.Loader_wrangler import *
import random
import torch
import torch.nn as nn

In [2]:
# Emit INFO-and-above records formatted as "LEVEL: message".
# force=True replaces handlers left on the root logger by an earlier run of this cell.
logging.basicConfig(
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
    force=True,
)

In [4]:
# Build the merged 2017 frame with the project loader
# (presumably provided by the star import from Modules.Loader_wrangler — confirm).
play = loader(
    survey_year=2017,
    sample_size=100000,
    chunksize=100000,
    output_file_name="merged_df2017.pkl",
)

KeyboardInterrupt: 

In [3]:
# Load the frame stored in merged_df2017.pkl instead of rebuilding it with loader().
# NOTE(review): absolute, machine-specific path; only unpickle files from a trusted source.
play = pd.read_pickle("/home/trapfishscott/Cambridge24.25/D200_ML_econ/ProblemSets/Project/data/merged_df2017.pkl")

### Selecting the relevant variables and arranging them as a time series

In [4]:
# Column groups referenced by the data-manipulation functions below.
temporal_vars = [
    "TWSMonth",
    "TravelYear",
    "TravelWeekDay_B01ID",
]
individual_vars = [
    "PSUGOR_B02ID",
    "IndIncome2002_B02ID",
    "HHoldNumChildren",
    "VehMakeModel_B02ID",
]

outcome_vars = [
    "TripStart",
    "TripEnd",
    "TripDisExSW",
    "TripPurpose_B01ID",
]
extra_vars = [
    "IndividualID_x",
    "JourSeq",
]

In [5]:
# Keep only the ID/sequence, individual, temporal and outcome columns, in that order.
ts_df = play[extra_vars + individual_vars + temporal_vars + outcome_vars]

In [6]:
# Order rows by individual, then travel-week day, then journey sequence number.
ts_df = ts_df.sort_values(["IndividualID_x", "TravelWeekDay_B01ID", "JourSeq"])

In [7]:
# Display the sorted long-format frame (one row per recorded trip).
ts_df

Unnamed: 0,IndividualID_x,JourSeq,PSUGOR_B02ID,IndIncome2002_B02ID,HHoldNumChildren,VehMakeModel_B02ID,TWSMonth,TravelYear,TravelWeekDay_B01ID,TripStart,TripEnd,TripDisExSW,TripPurpose_B01ID
15434,2.017000e+09,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,1.0,600.0,620.0,4.0,6.0
16026,2.017000e+09,2.0,5.0,1.0,0.0,-10.0,1.0,2017.0,1.0,690.0,710.0,4.0,6.0
15433,2.017000e+09,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,5.0,540.0,580.0,6.0,13.0
12891,2.017000e+09,2.0,5.0,1.0,0.0,-10.0,1.0,2017.0,5.0,960.0,995.0,6.0,13.0
16027,2.017000e+09,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,1.0,480.0,560.0,22.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19029,2.017017e+09,1.0,7.0,1.0,0.0,-10.0,1.0,2018.0,4.0,570.0,840.0,100.0,15.0
25809,2.017017e+09,1.0,7.0,1.0,0.0,-10.0,1.0,2018.0,5.0,570.0,585.0,2.0,5.0
37414,2.017017e+09,2.0,7.0,1.0,0.0,-10.0,1.0,2018.0,5.0,645.0,660.0,2.0,5.0
43207,2.017017e+09,1.0,7.0,1.0,0.0,-10.0,1.0,2018.0,6.0,660.0,680.0,2.0,13.0


In [8]:
# Number of recorded trips per individual over their travel week.
# A single grouped count replaces the per-individual boolean filter, which rescanned the
# whole frame once for every ID. sort=False keeps IDs in first-appearance order, i.e. the
# same order .unique() produced. Rows with a missing ID are not counted.
weekly_travel = ts_df.groupby("IndividualID_x", sort=False).size().tolist()

max_weekly_travel = max(weekly_travel)
mean_weekly_travel = sum(weekly_travel) / len(weekly_travel)

print(f"Most weekly travel ~ {max_weekly_travel}")
print(f"Average weekly travel ~ {mean_weekly_travel}")

# Upper-tail reference point for the number of trips per week.
percentile_97 = np.percentile(weekly_travel, 97)

print(f"97th percentile of weekly travel ~ {percentile_97}")



Most weekly travel ~ 67
Average weekly travel ~ 14.4106463878327
97th percentile of weekly travel ~ 34.0


### Data Manipulation pipeline

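* For each individual: add zero-valued placeholder rows for the travel-week days with no recorded trip, then (optionally) pivot the week into one wide row per day.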

In [None]:
def impute_missing_travel_week_for_i(i_df, i_id, full_week_encoding):
    """Add a zero-trip placeholder row for every travel-week day without a recorded trip.

    Relies on the module-level lists ``outcome_vars``, ``individual_vars`` and
    ``temporal_vars``.

    Parameters
    ----------
    i_df : pandas.DataFrame
        Long-format trips of a single individual. Must contain
        "TravelWeekDay_B01ID" and the columns used for the final sort
        ("TravelYear", "TWSMonth", "JourSeq", "TripStart", "TripEnd").
    i_id :
        Identifier written into "IndividualID_x" of the placeholder rows and
        used in the diagnostic messages.
    full_week_encoding : iterable
        All day codes that make up a complete travel week.

    Returns
    -------
    pandas.DataFrame or None
        ``i_df`` plus one placeholder row per missing day, sorted
        chronologically; ``None`` when an individual-level or temporal column
        (other than the weekday) takes more than one value for this individual.
    """
    # Days of the week for which no trip was recorded.
    recorded_days = i_df["TravelWeekDay_B01ID"].to_list()
    missing_days = sorted(set(full_week_encoding) - set(recorded_days))
    n_missing = len(missing_days)

    # Values that are the same on every placeholder row.
    imputed_travel_df = pd.DataFrame({
        "TravelWeekDay_B01ID": missing_days,
        "IndividualID_x": [i_id] * n_missing,
        "JourSeq": [1] * n_missing,
    })

    for col in i_df.columns:

        # Days without travel carry 0 in every outcome column.
        if col in outcome_vars:
            imputed_travel_df[col] = [0] * n_missing

        # Individual-level and temporal columns (the weekday excepted, since it
        # varies by row) are expected to be constant and are copied across.
        must_be_constant = col in individual_vars or (
            col != "TravelWeekDay_B01ID" and col in temporal_vars
        )
        if must_be_constant:
            unique_vals = i_df[col].unique()
            if len(unique_vals) != 1:
                # Report with the function's own i_id argument; the previous code
                # referenced an undefined/global name `i` here.
                print(f"{col} is erroneous for {i_id}")
                print(f"Unique vals: {unique_vals}")
                print("Continuing to next individual")
                return None
            imputed_travel_df[col] = unique_vals[0]

    # Combine recorded and placeholder rows, then restore chronological order.
    full_df = pd.concat([i_df, imputed_travel_df])

    return full_df.sort_values(
        ["TravelYear", "TWSMonth", "TravelWeekDay_B01ID", "JourSeq", "TripStart", "TripEnd"]
    )

In [135]:
def transform_to_wide_for_i(i_df, max_journey_seq, seq_length = 7, outcome_vars=outcome_vars):
    """Pivot one individual's long trip rows into one wide row per travel-week day.

    Parameters
    ----------
    i_df : pandas.DataFrame
        Long-format rows of a single individual containing "TravelWeekDay_B01ID",
        "JourSeq", "IndividualID_x" and every column in ``outcome_vars``.
    max_journey_seq : int
        Rows with a larger "JourSeq" are discarded; every outcome variable gets
        exactly this many columns, named ``<outcome>_<k>`` for k = 1..max_journey_seq.
    seq_length : int, default 7
        Number of all-zero padding rows (weekday code 0) placed before the data.
    outcome_vars : list of str
        Outcome columns to spread across the journey sequence.

    Returns
    -------
    pandas.DataFrame
        ``seq_length`` padding rows followed by one row per weekday, holding the
        weekday, the wide outcome columns (missing entries filled with 0) and
        the remaining per-day columns of ``i_df``.
    """
    df = i_df.copy()

    expected_cols = [f"{col}_{i}" for col in outcome_vars for i in range(1, max_journey_seq+1)]

    # Journeys beyond the cap are dropped.
    df = df[df["JourSeq"] <= max_journey_seq]

    df_wide = df.pivot(index="TravelWeekDay_B01ID",
                       columns="JourSeq",
                       values=outcome_vars)

    # Flatten the (outcome, JourSeq) column MultiIndex into "<outcome>_<k>".
    df_wide.columns = [f"{col[0]}_{int(col[1])}" for col in df_wide.columns]

    # Journey slots this individual never reached still need a column.
    for col in expected_cols:
        if col not in df_wide.columns:
            df_wide[col] = 0

    # Consistent column order, no gaps, weekday back as a regular column.
    df_wide = df_wide[expected_cols].fillna(0).reset_index()

    # One row per weekday with the non-outcome columns. Chained, non-inplace calls
    # avoid mutating a filtered slice in place.
    day_level = (
        df.drop(columns=outcome_vars + ["IndividualID_x", "JourSeq"])
          .drop_duplicates(subset=["TravelWeekDay_B01ID"])
    )

    df_wide = df_wide.merge(day_level, on="TravelWeekDay_B01ID", how="left")

    # Padding row: copies the non-outcome values of the first day, zeroes the
    # outcomes and uses weekday code 0.
    top_row = df_wide.head(1).copy()

    for col in expected_cols:
        top_row[col] = 0
    # Set once, outside the loop, so it is applied even when there are no outcome columns.
    top_row["TravelWeekDay_B01ID"] = 0

    repeated_rows = pd.concat([top_row] * seq_length, ignore_index=True)

    return pd.concat([repeated_rows, df_wide], ignore_index=True)

In [141]:
def prepare_data_for_LSTM(long_df, impute_missing_travel_weeks=True, transform_to_wide=False, transform_to_tensor=False, debug=False, max_individuals=1, max_journey_seq=10):
    """Run the per-individual data-preparation pipeline on a long trip frame.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format trips with an "IndividualID_x" column.
    impute_missing_travel_weeks : bool, default True
        When False the function returns a copy of ``long_df`` unchanged.
    transform_to_wide : bool, default False
        Pivot each individual's imputed week with ``transform_to_wide_for_i``.
    transform_to_tensor : bool, default False
        Only consulted together with ``transform_to_wide``; not implemented.
    debug : bool, default False
        Display the intermediate frames for one randomly chosen individual and
        return ``None``.
    max_individuals : int or None, default 1
        Number of individuals (in first-appearance order) to process. The
        default keeps the previous behaviour of handling only the first
        individual; pass ``None`` to process everyone.
    max_journey_seq : int, default 10
        Forwarded to ``transform_to_wide_for_i``.

    Returns
    -------
    pandas.DataFrame or None
        Concatenation of the per-individual frames, a copy of ``long_df`` when
        imputation is disabled, or ``None`` in debug mode.

    Raises
    ------
    NotImplementedError
        If both ``transform_to_wide`` and ``transform_to_tensor`` are set.
    ValueError
        If no individual produced a frame (e.g. all were skipped as erroneous).
    """
    df = long_df.copy()
    individual_ids = df["IndividualID_x"].unique()       # All unique individual id's to loop over

    full_week_encoding = list(range(1, 8))

    if debug:
        # randrange excludes its upper bound; randint(0, len(ids)) could return
        # len(ids) and index one past the end of the array.
        random_index = random.randrange(len(individual_ids))
        debug_id = individual_ids[random_index]

        debug_df = df[df["IndividualID_x"] == debug_id]
        display(debug_df)

        debug_df = impute_missing_travel_week_for_i(debug_df, i_id=debug_id, full_week_encoding=full_week_encoding)
        display(debug_df)

        # Imputation returns None for individuals it rejects; nothing to pivot then.
        if debug_df is not None:
            debug_df = transform_to_wide_for_i(debug_df, max_journey_seq=max_journey_seq)
            display(debug_df)

        return

    if not impute_missing_travel_weeks:
        return df

    if transform_to_wide and transform_to_tensor:
        # This combination used to skip every individual silently and then fail
        # inside pd.concat with an unrelated-looking message.
        raise NotImplementedError("transform_to_tensor is not implemented yet")

    if max_individuals is None:
        selected_ids = individual_ids
    else:
        selected_ids = individual_ids[:max_individuals]

    df_chunks = []

    for individual_id in selected_ids:
        i_df = df[df["IndividualID_x"] == individual_id]

        full_df = impute_missing_travel_week_for_i(i_df, i_id=individual_id, full_week_encoding=full_week_encoding)

        # Individuals with inconsistent individual/temporal values are skipped.
        if full_df is None:
            continue

        if transform_to_wide:
            full_df = transform_to_wide_for_i(full_df, max_journey_seq=max_journey_seq)
            display(full_df)

        df_chunks.append(full_df)

    if not df_chunks:
        raise ValueError("No individual produced any rows; nothing to concatenate")

    return pd.concat(df_chunks)


In [140]:
# Default options: impute zero-trip rows for missing travel-week days and keep the long format.
# NOTE: prepare_data_for_LSTM processes only the first individual by default.
df = prepare_data_for_LSTM(long_df=ts_df)


In [108]:
# Display the prepared frame.
df

Unnamed: 0,IndividualID_x,JourSeq,PSUGOR_B02ID,IndIncome2002_B02ID,HHoldNumChildren,VehMakeModel_B02ID,TWSMonth,TravelYear,TravelWeekDay_B01ID,TripStart,TripEnd,TripDisExSW,TripPurpose_B01ID
15434,2017000000.0,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,1.0,600.0,620.0,4.0,6.0
16026,2017000000.0,2.0,5.0,1.0,0.0,-10.0,1.0,2017.0,1.0,690.0,710.0,4.0,6.0
0,2017000000.0,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,2.0,0.0,0.0,0.0,0.0
1,2017000000.0,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,3.0,0.0,0.0,0.0,0.0
2,2017000000.0,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,4.0,0.0,0.0,0.0,0.0
15433,2017000000.0,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,5.0,540.0,580.0,6.0,13.0
12891,2017000000.0,2.0,5.0,1.0,0.0,-10.0,1.0,2017.0,5.0,960.0,995.0,6.0,13.0
3,2017000000.0,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,6.0,0.0,0.0,0.0,0.0
4,2017000000.0,1.0,5.0,1.0,0.0,-10.0,1.0,2017.0,7.0,0.0,0.0,0.0,0.0


In [142]:
# Same pipeline, additionally pivoted to one wide row per travel-week day.
df = prepare_data_for_LSTM(long_df=ts_df, transform_to_wide=True)

Unnamed: 0,TravelWeekDay_B01ID,TripStart_1,TripStart_2,TripStart_3,TripStart_4,TripStart_5,TripStart_6,TripStart_7,TripStart_8,TripStart_9,...,TripPurpose_B01ID_7,TripPurpose_B01ID_8,TripPurpose_B01ID_9,TripPurpose_B01ID_10,PSUGOR_B02ID,IndIncome2002_B02ID,HHoldNumChildren,VehMakeModel_B02ID,TWSMonth,TravelYear
0,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
1,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
2,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
3,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
4,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
5,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
6,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
7,1.0,600.0,690.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
8,2.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0
9,3.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,5.0,1.0,0.0,-10.0,1.0,2017.0


In [None]:
def impute_missing_travel_weeks(df, transform_to_wide = True):
    """Impute zero-trip rows for every individual and optionally pivot to wide format.

    Relies on the module-level lists ``outcome_vars``, ``individual_vars`` and
    ``temporal_vars``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format trips with "IndividualID_x", "TravelWeekDay_B01ID" and the
        outcome / individual / temporal columns.
    transform_to_wide : bool, default True
        When True each individual's week is passed through
        ``transform_to_wide_func`` before being collected.

    Returns
    -------
    pandas.DataFrame
        All processed individuals stacked together with missing values filled
        with 0. In long format the rows are sorted chronologically per
        individual and restricted to the ID, sequence, outcome and temporal
        columns. Individuals whose individual-level or temporal columns are not
        constant are reported and left out.
    """
    df = df.copy()
    df_chunks = []
    full_week_encoding = list(range(1,8))   # Using 0 to indicate start of Sequence, 8 to indicate end of sequence
    individual_ids = df["IndividualID_x"].unique()       # All unique individual id's to loop over

    for i in individual_ids:
        i_df = df[df["IndividualID_x"] == i]

        # Days of the week for which no trip was recorded.
        recorded_days = i_df["TravelWeekDay_B01ID"].to_list()
        missing_days = sorted(set(full_week_encoding) - set(recorded_days))
        n_missing = len(missing_days)

        # Values that are the same on every placeholder row.
        imputed_travel_df = pd.DataFrame({
            "TravelWeekDay_B01ID": missing_days,
            "IndividualID_x": [i] * n_missing,
            "JourSeq": [1] * n_missing,
        })

        is_erroneous = False

        for col in i_df.columns:

            # Days without travel carry 0 in every outcome column.
            if col in outcome_vars:
                imputed_travel_df[col] = [0] * n_missing

            # Individual-level and temporal columns (the weekday excepted) must
            # be constant for one individual; they are copied onto the new rows.
            must_be_constant = col in individual_vars or (
                col != "TravelWeekDay_B01ID" and col in temporal_vars
            )
            if must_be_constant:
                unique_vals = i_df[col].unique()
                if len(unique_vals) != 1:
                    print(f"{col} is erroneous for {i}")
                    print(f"Unique vals: {unique_vals}")
                    is_erroneous = True
                    break
                imputed_travel_df[col] = unique_vals[0]

        if is_erroneous:
            print("Continuing to next individual")
            continue

        # Recorded trips plus the placeholder rows.
        full_df = pd.concat([i_df, imputed_travel_df])

        if transform_to_wide:
            # NOTE(review): transform_to_wide_func is not defined in any visible
            # cell; transform_to_wide_for_i may be the intended replacement — confirm.
            full_df = transform_to_wide_func(full_df, 10)

            # Re-attach the per-individual columns.
            # NOTE(review): "TravelWeekDay_B01ID" is in temporal_vars, so this also
            # overwrites the weekday with its first value — confirm that is intended.
            for col in i_df.columns:
                if col in individual_vars or col in temporal_vars:
                    full_df[col] = i_df[col].unique()[0]

            full_df["IndividualID_x"] = [i] * len(full_df)

        df_chunks.append(full_df)

    df_to_return = pd.concat(df_chunks)

    if not transform_to_wide:
        sort_keys = ["IndividualID_x", "TravelYear", "TWSMonth", "TravelWeekDay_B01ID", "JourSeq", "TripStart", "TripEnd"]
        kept_columns = ["IndividualID_x", "JourSeq"] + outcome_vars + temporal_vars
        df_to_return = df_to_return.sort_values(sort_keys)[kept_columns].reset_index(drop=True)

    return df_to_return.fillna(0)

In [83]:
df = impute_missing_travel_weeks(ts_df)

# Individuals whose travel week spans two calendar years have two TravelYear values, so they are reported as erroneous and skipped. Probably not a huge issue, but might fix later.

TravelYear is erroneous for 2017014397.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014398.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014552.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014714.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014715.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014773.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014964.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017014965.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017015043.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is erroneous for 2017015044.0
Unique vals: [2018. 2017.]
Continuing to next individual
TravelYear is errone

In [85]:
# Persist the prepared frame as CSV (the index is written as the first column).
# NOTE(review): absolute, machine-specific path.
df.to_csv("/home/trapfishscott/Cambridge24.25/D200_ML_econ/ProblemSets/Project/data/play_wide.csv")

In [44]:
# Scratch pivot: one row per weekday, one column per (outcome, JourSeq) pair.
# NOTE(review): pivot_trial is not defined in any visible cell.
trial_outcomes = ["TripStart", "TripEnd", "TripDisExSW", "TripPurpose_B01ID"]
pivot_trial_2 = pivot_trial.pivot(
    index="TravelWeekDay_B01ID",
    columns="JourSeq",
    values=trial_outcomes,
).reset_index()

# Constant helper column, one zero per pivoted row.
pivot_trial_2["Hello"] = [0] * len(pivot_trial_2)

In [46]:
# Display with missing values shown as 0; the result is not assigned, so pivot_trial_2 is unchanged.
pivot_trial_2.fillna(0)

Unnamed: 0_level_0,TravelWeekDay_B01ID,TripStart,TripStart,TripEnd,TripEnd,TripDisExSW,TripDisExSW,TripPurpose_B01ID,TripPurpose_B01ID,Hello
JourSeq,Unnamed: 1_level_1,1.0,2.0,1.0,2.0,1.0,2.0,1.0,2.0,Unnamed: 10_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,600.0,690.0,620.0,710.0,4.0,4.0,6.0,6.0,0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,5.0,540.0,960.0,580.0,995.0,6.0,6.0,13.0,13.0,0
6,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [15]:
# Convert to LSTM

def convert_to_tensor(df, seq_length, cols_to_drop, debug=True):
    """Reshape a row-per-time-step frame into a sequence-first float32 tensor.

    Consecutive blocks of ``seq_length`` rows form one sequence, purely by row
    position. Trailing rows that do not fill a complete sequence are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame, one row per time step.
    seq_length : int
        Number of rows per sequence; must be positive.
    cols_to_drop : list of str
        Columns removed before conversion; names absent from ``df`` are ignored.
    debug : bool, default True
        Verify that one randomly chosen sequence of the reshaped array matches
        the corresponding rows of the frame.

    Returns
    -------
    torch.Tensor
        Tensor of dtype float32 and shape (seq_length, n_sequences, n_features).

    Raises
    ------
    ValueError
        If ``seq_length`` is not positive.
    AssertionError
        If the debug comparison finds a mismatch.
    """
    if seq_length <= 0:
        raise ValueError("seq_length must be a positive integer")

    features_df = df.drop(columns=cols_to_drop, errors="ignore")

    df_array = features_df.to_numpy()

    # Drop observations to make it fit sequence length
    rows_to_drop = df_array.shape[0] % seq_length

    if rows_to_drop > 0:
        df_array = df_array[:-rows_to_drop, :]

    print(df_array.shape)

    n = df_array.shape[0] // seq_length
    input_features = df_array.shape[1]

    # (n, seq_length, features) -> (seq_length, n, features)
    df_array = df_array.reshape((n, seq_length, input_features)).transpose(1, 0, 2)

    print(f"Reshaped (seq_length, n_batches, input_features): {df_array.shape}")

    # Ensuring the array transformation matches the df
    if debug and n > 0:
        # The index addresses the sequence axis (axis 1), so it must be drawn from
        # range(n). It was previously drawn from 0..seq_length inclusive, which
        # could raise IndexError.
        test_value = random.randrange(n)
        start = test_value * seq_length

        assert np.array_equal(
            df_array[:, test_value, :],
            features_df.iloc[start:start + seq_length, :].to_numpy()
        ), "Mismatch between reshaped array and original df"

    return torch.tensor(df_array, dtype=torch.float32)


In [17]:
# Group the rows into sequences of 14 and convert to a tensor.
# Names in cols_to_drop that are not columns of df are ignored by convert_to_tensor.
travel_tensor = convert_to_tensor(df=df, seq_length=14, cols_to_drop=["JourSeq", "NumTrips"])

(126602, 8)
Reshaped (seq_length, n_batches, input_features): (14, 9043, 8)


### Creating the RNN

In [63]:
# Defining parameters
INPUT_SIZE = travel_tensor.shape[2]  # size of the last tensor axis (features per time step)
HIDDEN_SIZE = 3  # width of the RNN hidden state
NUM_LAYERS = 1  # intended number of recurrent layers
OUTPUT_SIZE = len(outcome_vars)  # one model output per outcome variable

outcome_vars


['TripStart', 'TripEnd', 'TripDisExSW', 'TripPurpose_B01ID']

In [68]:
class RNNmodel(nn.Module):
    """Plain RNN followed by a linear read-out of the final hidden state.

    Parameters
    ----------
    input_size, hidden_size, num_layers, output_size : int, optional
        Layer sizes. Each falls back to the module-level constant
        (INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, OUTPUT_SIZE) when omitted, so
        ``RNNmodel()`` keeps working as before.
    verbose : bool, default True
        Print intermediate tensor shapes and the raw output on every forward pass.
    """

    def __init__(self, input_size=None, hidden_size=None, num_layers=None, output_size=None, verbose=True):
        super().__init__()

        input_size = INPUT_SIZE if input_size is None else input_size
        hidden_size = HIDDEN_SIZE if hidden_size is None else hidden_size
        num_layers = NUM_LAYERS if num_layers is None else num_layers
        output_size = OUTPUT_SIZE if output_size is None else output_size

        self.verbose = verbose

        # Define RNN layer (sequence-first input: (seq_length, batch, input_size));
        # num_layers is now actually passed on instead of being ignored.
        self.rnn = nn.RNN(input_size, hidden_size, num_layers)

        # Output layer
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """Return a dict mapping 1..output_size to detached (1, batch) predictions.

        NOTE(review): the values are detached from the autograd graph, so they
        cannot be used directly to compute a loss for training.
        """
        out, hh = self.rnn(X)

        if self.verbose:
            print(f"out shape: {out.shape}")
            print(f"hh shape: {hh.shape}")

        # Read out from the last layer's final hidden state; hh[-1:] keeps the
        # leading axis, so the result is (1, batch, output_size) for any depth.
        y_hat_vector = self.output(hh[-1:])

        if self.verbose:
            print(f"y_hat shape: {y_hat_vector.shape}")
            print(y_hat_vector)

        y_hat = {}

        for index in range(y_hat_vector.shape[2]):
            y_hat[index+1] = y_hat_vector[:,:,index].detach()

        return y_hat


In [62]:
# Taking one test draw

rnn_model = RNNmodel()

# Take the first sequence and restore the batch axis that integer indexing removed.
X1 = travel_tensor[:,0,:].unsqueeze(1)
print(f"X1 shape: {X1.shape}")

# Call the module itself rather than .forward() so that registered hooks are run.
y_hat = rnn_model(X1)

y_hat


X1 shape: torch.Size([14, 1, 8])
out shape: torch.Size([14, 1, 3])
hh shape: torch.Size([1, 1, 3])
y_hat shape: torch.Size([1, 1, 4])
tensor([[[ 0.5506, -0.3124,  0.2871, -0.1610]]], grad_fn=<ViewBackward0>)


{1: tensor([[0.5506]]),
 2: tensor([[-0.3124]]),
 3: tensor([[0.2871]]),
 4: tensor([[-0.1610]])}

In [67]:
# Cross-entropy criterion; called as loss_cat(y_hat, y).
loss_cat = nn.CrossEntropyLoss()  #(y_hat, y)

In [None]:
### LSTM for single batch size

# Walk along the batch axis (dim 1) and show the shape of each per-sequence slice
n_batches = travel_tensor.shape[1]

for batch in range(n_batches):
    single_sequence = travel_tensor[:, batch, :]
    print(single_sequence.shape)