In [113]:
import os
import pandas as pd
import numpy as np
import re
import logging
from Modules.Loader_wrangler import *

In [2]:
# Root-logger setup for the notebook: emit INFO and above as "LEVEL: message".
# force=True discards handlers already attached to the root logger, so
# re-running this cell does not stack duplicate handlers.
logging.basicConfig(
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
    force=True,
)

In [3]:
# Build the working frame for the 2017 survey year.
# `loader` arrives via the star import from Modules.Loader_wrangler; judging by
# the cell output it reads the data in chunks, prints a trip-type mapping and
# saves the merged chunks to a pickle — TODO confirm against the module.
play = loader(
    output_file_name="merged_df2017.pkl",
    chunksize=100000,
    sample_size=100000,
    survey_year=2017,
)

SurveyYear = 2017 not found in chunk 54. ContinuingTrip type mapping
('Other', 'Home'): 0
('Home', 'Other'): 1
('Work', 'Home'): 2
('Home', 'Work'): 3
('Other', 'Other'): 4
('Other', 'Work'): 5
('Work', 'Other'): 6

Merged chunks saved to pickle!


### Selecting the relevant variables and building the time series

In [91]:
# Quick look at the trip-purpose column; the displayed dtype is float64.
play["TripPurpose_B01ID"]

0        23.0
1        23.0
2        23.0
3         1.0
4         1.0
         ... 
98585    21.0
98586    21.0
98587     9.0
98588     9.0
98589     9.0
Name: TripPurpose_B01ID, Length: 98540, dtype: float64

In [92]:
# Column groups used to assemble the per-individual time series.

# Timing columns. In impute_missing_travel_weeks every one of these except
# TravelWeekDay_B01ID must hold a single value per individual.
temporal_vars = [
    "TWSMonth",
    "TravelYear",
    "TravelWeekDay_B01ID",
]

# Columns that must hold a single value per individual; copied onto imputed rows.
individual_vars = [
    "PSUGOR_B02ID",
    "IndIncome2002_B02ID",
    "HHoldNumChildren",
    "VehMakeModel_B02ID",
]

# Trip-level columns; set to 0 on imputed (no-travel) rows.
outcome_vars = [
    "TripStart",
    "TripEnd",
    "NumTrips",
    "TripDisExSW",
    "TripPurpose_B01ID",
]

# Identifier / ordering columns.
extra_vars = [
    "IndividualID_x",
    "JourSeq",
]

In [93]:
# Keep only the modelling columns, ordered as:
# identifiers, per-individual attributes, timing, trip outcomes.
ts_df = play[
    extra_vars
    + individual_vars
    + temporal_vars
    + outcome_vars
]

In [94]:
# Order rows by individual, then travel day, then journey sequence number.
ts_df = ts_df.sort_values(
    by=["IndividualID_x", "TravelWeekDay_B01ID", "JourSeq"],
)

In [119]:
# Number of rows recorded for each individual, in order of first appearance.
# A single groupby pass replaces the earlier per-ID boolean-mask loop, which
# rescanned the whole frame once per individual.
# Rows with a missing IndividualID_x are left out (groupby drops NaN keys).
weekly_travel = ts_df.groupby("IndividualID_x", sort=False).size().tolist()

max_weekly_travel = max(weekly_travel)
mean_weekly_travel = sum(weekly_travel) / len(weekly_travel)

print(f"Most weekly travel ~ {max_weekly_travel}")
print(f"Average weekly travel ~ {mean_weekly_travel}")

# Cut-off used further down (impute_missing_travel_weeks) to drop individuals
# with unusually many rows.
percentile_97 = np.percentile(weekly_travel, 97)

print(f"97th percentile of weekly travel ~ {percentile_97}")



Most weekly travel ~ 67
Average weekly travel ~ 14.4106463878327
97th percentile of weekly travel ~ 34.0


In [132]:
def impute_missing_travel_weeks(df, max_rows=None, outcome_cols=None,
                                individual_cols=None, temporal_cols=None):
    """Add zero-valued "idle" rows for travel-day codes an individual has no rows for.

    For each individual (``IndividualID_x``) found in ``df``:

    1. Every travel-day code in ``0..8`` that is absent from the individual's
       ``TravelWeekDay_B01ID`` values gets one new row with ``JourSeq = 0``,
       all outcome columns set to 0, and the individual's constant columns
       copied over.
    2. Constant columns are the individual columns plus the temporal columns
       other than ``TravelWeekDay_B01ID``. If any of them holds more than one
       distinct value for the individual, the problem is printed and the
       individual is skipped.
    3. If the individual then has fewer than ``max_rows`` rows, ONE extra row
       is appended (day code 8, ``JourSeq = 0``, constant columns filled,
       outcome columns left missing).
    4. If the individual has more than ``max_rows`` rows, it is printed as an
       outlier and dropped.

    Rows are taken from the ``df`` argument. ``df`` itself is not modified.
    Rows whose ``IndividualID_x`` is missing are ignored.

    Args:
        df: Frame with at least ``IndividualID_x``, ``TravelWeekDay_B01ID``
            and ``JourSeq`` columns.
        max_rows: Row-count threshold per individual. Defaults to the
            notebook-level ``percentile_97``.
        outcome_cols: Columns zero-filled on imputed rows. Defaults to the
            notebook-level ``outcome_vars``.
        individual_cols: Columns required to be constant per individual.
            Defaults to the notebook-level ``individual_vars``.
        temporal_cols: Timing columns; all but ``TravelWeekDay_B01ID`` are
            required to be constant per individual. Defaults to the
            notebook-level ``temporal_vars``.

    Returns:
        A DataFrame with the kept individuals' original and imputed rows
        concatenated in order of first appearance. The index is not reset, so
        labels may repeat. If no individual is kept, an empty frame with the
        columns of ``df`` is returned.
    """
    # Resolve notebook-level defaults lazily so explicit arguments never need them.
    if max_rows is None:
        max_rows = percentile_97
    if outcome_cols is None:
        outcome_cols = outcome_vars
    if individual_cols is None:
        individual_cols = individual_vars
    if temporal_cols is None:
        temporal_cols = temporal_vars

    full_week_encoding = list(range(0, 9))
    df_chunks = []

    # One pass over the frame; groups come out in order of first appearance and
    # keep the row order of `df`.
    for i, i_df in df.groupby("IndividualID_x", sort=False):
        present_days = set(i_df["TravelWeekDay_B01ID"].to_list())
        travel_day_no_drive = [day for day in full_week_encoding if day not in present_days]
        n_missing = len(travel_day_no_drive)

        imputed_travel_df = pd.DataFrame({
            "TravelWeekDay_B01ID": travel_day_no_drive,
            "IndividualID_x": [i] * n_missing,
            "JourSeq": [0] * n_missing,
        })

        idle_row = {}  # constant values reused for the optional padding row
        break_flag = False

        for col in i_df.columns:
            if col in outcome_cols:
                imputed_travel_df[col] = [0] * n_missing

            must_be_constant = col in individual_cols or (
                col != "TravelWeekDay_B01ID" and col in temporal_cols
            )
            if not must_be_constant:
                continue

            unique_vals = i_df[col].unique()
            if len(unique_vals) != 1:
                print(f"{col} is erroneous for {i}")
                print(f"Unique vals: {unique_vals}")
                break_flag = True
                break

            imputed_travel_df[col] = unique_vals[0]
            idle_row[col] = unique_vals[0]

        if break_flag:
            print("Continuing to next individual")
            continue

        full_df = pd.concat([i_df, imputed_travel_df])

        if len(full_df) < max_rows:
            # NOTE(review): exactly one extra row is appended here, with the
            # outcome columns left missing. The shortfall
            # (max_rows - len(full_df)) used to be computed but was never
            # used; if the intent is to pad every individual up to max_rows
            # rows, this does not do that — confirm.
            new_row = {"TravelWeekDay_B01ID": 8, "IndividualID_x": i, "JourSeq": 0}
            new_row.update(idle_row)
            full_df = pd.concat([full_df, pd.DataFrame([new_row])])

        if len(full_df) > max_rows:
            print(f"Outlier individual with travel ~ {len(full_df)}")
            print(f"Continuing...")
            continue

        df_chunks.append(full_df)

    if not df_chunks:
        # pd.concat([]) raises; hand back an empty frame with the input's columns.
        return df.iloc[0:0].copy()

    return pd.concat(df_chunks)

In [133]:
# Build the imputed frame: one block of rows per kept individual, with idle
# rows added for travel-day codes that have no recorded rows.
df = impute_missing_travel_weeks(ts_df)

# NOTE(review): the original remark here was "When we are moving from year to
# year" — presumably about individuals whose travel week spans a year boundary
# (TravelYear would then not be unique and the individual would be skipped);
# confirm. Probably not a huge issue but might fix later.

Outlier individual with travel ~ 38
Continuing...
Outlier individual with travel ~ 40
Continuing...
Outlier individual with travel ~ 36
Continuing...
Outlier individual with travel ~ 39
Continuing...
Outlier individual with travel ~ 35
Continuing...
Outlier individual with travel ~ 40
Continuing...
Outlier individual with travel ~ 50
Continuing...
Outlier individual with travel ~ 40
Continuing...
Outlier individual with travel ~ 52
Continuing...
Outlier individual with travel ~ 43
Continuing...
Outlier individual with travel ~ 38
Continuing...
Outlier individual with travel ~ 38
Continuing...
Outlier individual with travel ~ 45
Continuing...
Outlier individual with travel ~ 38
Continuing...
Outlier individual with travel ~ 44
Continuing...
Outlier individual with travel ~ 38
Continuing...
Outlier individual with travel ~ 47
Continuing...
Outlier individual with travel ~ 39
Continuing...
Outlier individual with travel ~ 36
Continuing...
Outlier individual with travel ~ 35
Continuing...


In [142]:
# Distinct individual IDs present after imputation.
df_i = df["IndividualID_x"].unique()

# Order each individual's rows by travel day, journey sequence and trip
# start/end, then keep the identifier, outcome and timing columns.
test = (
    df
    .sort_values(by=["IndividualID_x", "TravelWeekDay_B01ID", "JourSeq", "TripStart", "TripEnd"])
    [["IndividualID_x", "JourSeq"] + outcome_vars + temporal_vars]
)

In [143]:
# Replace the repeated index labels left by the concat/sort with a clean 0..n-1 range.
test = test.reset_index(drop=True)

In [144]:
# Preview the assembled frame (pandas truncates the middle rows in the display).
test

Unnamed: 0,IndividualID_x,JourSeq,TripStart,TripEnd,NumTrips,TripDisExSW,TripPurpose_B01ID,TWSMonth,TravelYear,TravelWeekDay_B01ID
0,2.017000e+09,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017.0,0.0
1,2.017000e+09,1.0,600.0,620.0,2.0,4.0,6.0,1.0,2017.0,1.0
2,2.017000e+09,2.0,690.0,710.0,2.0,4.0,6.0,1.0,2017.0,1.0
3,2.017000e+09,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017.0,2.0
4,2.017000e+09,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017.0,3.0
...,...,...,...,...,...,...,...,...,...,...
122200,2.017017e+09,1.0,660.0,680.0,2.0,2.0,13.0,1.0,2018.0,6.0
122201,2.017017e+09,2.0,840.0,860.0,2.0,2.0,13.0,1.0,2018.0,6.0
122202,2.017017e+09,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2018.0,7.0
122203,2.017017e+09,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2018.0,8.0


In [138]:
# Export the assembled frame as CSV without the index column.
# NOTE(review): hard-coded, machine-specific absolute path — consider building
# it from a configurable data directory.
test.to_csv("/home/trapfishscott/Cambridge24.25/D200_ML_econ/ProblemSets/Project/data/play.csv", index=False)

In [106]:
# Persist the assembled frame as a pickle.
# NOTE(review): this cell's execution count (106) is lower than that of the
# cell that builds `test` (142), so the pickle on disk may predate the current
# `test` — re-run this cell before relying on the file.
# NOTE(review): hard-coded, machine-specific absolute path.
test.to_pickle("/home/trapfishscott/Cambridge24.25/D200_ML_econ/ProblemSets/Project/data/LSTM_ready_df.pkl")