### **Data preprocessing - Random Forest model** <br>
This notebook pre-processes the data that is used to train, validate and test the Random Forest model. <br>

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from scipy.stats import pointbiserialr
from itertools import combinations
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

In [4]:
# read the 2018-2019 dataset; the first CSV column is used as the row index
data_2018_2019_final = pd.read_csv(
    './data/data_final_2018_2019.csv',
    index_col=0,
)

# preview the first two records
data_2018_2019_final.head(2)

Unnamed: 0,opid,hhpers,hhsam,hhlft1,hhlft2,hhlft3,hhlft4,wopc,wogem,sted,...,pt_dist,walk_dur,walk_dist,car_changes,pt_changes,bike_changes,walk_changes,season,vertprov,actduur
0,56026580953,3,6,0,0,2,1,1191,437,3,...,,5766.0,7375.85,0.0,,0.0,0.0,3.0,8.0,365.0
1,56026587525,1,1,0,0,0,1,1441,439,2,...,,5859.0,7600.31,0.0,,0.0,0.0,3.0,8.0,600.0


In [5]:
# list every column that is available in the loaded dataset
data_2018_2019_final.columns

Index(['opid', 'hhpers', 'hhsam', 'hhlft1', 'hhlft2', 'hhlft3', 'hhlft4',
       'wopc', 'wogem', 'sted', 'gemgr', 'prov', 'geslacht', 'leeftijd',
       'herkomst', 'betwerk', 'onbbez', 'opleiding', 'hhgestinkg',
       'oprijbewijsau', 'hhauto', 'brandstofpa1', 'brandstofepa1',
       'brandstofpa2', 'brandstofepa2', 'hhefiets', 'ovstkaart', 'weekdag',
       'feestdag', 'verplid', 'doel', 'kmotiefv', 'vertpc', 'aankpc', 'khvm',
       'verttijd', 'aanktijd', 'choice_dur', 'choice_dist', 'bike_dur',
       'bike_dist', 'car_dur', 'car_dist', 'pt_dur', 'pt_dist', 'walk_dur',
       'walk_dist', 'car_changes', 'pt_changes', 'bike_changes',
       'walk_changes', 'season', 'vertprov', 'actduur'],
      dtype='object')

In [6]:
# add features

# parking costs: average charge per PC4 postcode, attached to each trip through its destination postcode (aankpc)
parking_costs = pd.read_csv('./data/pc4_parking_avg_charge.csv')
parking_costs = parking_costs.rename(columns={'PC4': 'aankpc'})
# inner join: trips whose destination postcode is absent from the parking table are dropped
data_2018_2019_final = data_2018_2019_final.merge(parking_costs, how='inner', on='aankpc')

# trips with purpose 'going home' are assumed to have no associated costs
going_home = data_2018_2019_final['doel'] == 1
data_2018_2019_final.loc[going_home, 'AVG_CHARGE'] = 0

# peak-hour (1: trip takes place during peak hours,0: trip does not take place during peak hours)
# the departure time is compared as text against zero-padded 'HH:MM' bounds
departure = data_2018_2019_final['verttijd']
morning_peak = (departure >= '06:30') & (departure < '09:00')
evening_peak = (departure >= '16:00') & (departure < '18:30')
data_2018_2019_final['peak_hour'] = (morning_peak | evening_peak).astype(float)

# urbanity level of the origin and destination postcodes
# the same lookup table is joined twice: once on the origin postcode (sted_o) and once on the
# destination postcode (sted_d); both joins are inner joins, so unmatched trips are dropped
sted_data = pd.read_csv('./data/sted_data.csv', sep=',')
sted_data = sted_data.rename(columns={'PC4': 'vertpc', 'sted': 'sted_o'})
data_2018_2019_final = data_2018_2019_final.merge(sted_data, how='inner', on='vertpc')

sted_data = sted_data.rename(columns={'vertpc': 'aankpc', 'sted_o': 'sted_d'})
data_2018_2019_final = data_2018_2019_final.merge(sted_data, how='inner', on='aankpc')

In [7]:
# inspect the NaN values of the dataset: number of missing entries per column
data_2018_2019_final.isna().sum()

opid                 0
hhpers               0
hhsam                0
hhlft1               0
hhlft2               0
hhlft3               0
hhlft4               0
wopc                 0
wogem                0
sted                 0
gemgr                0
prov                 0
geslacht             0
leeftijd             0
herkomst             0
betwerk              0
onbbez               0
opleiding            0
hhgestinkg           0
oprijbewijsau        0
hhauto               0
brandstofpa1         0
brandstofepa1        0
brandstofpa2         0
brandstofepa2        0
hhefiets             0
ovstkaart            0
weekdag              0
feestdag             0
verplid              0
doel                 0
kmotiefv             0
vertpc               0
aankpc               0
khvm                 0
verttijd             0
aanktijd             0
choice_dur           0
choice_dist          0
bike_dur          5328
bike_dist         5328
car_dur          12375
car_dist         12375
pt_dur     

In [8]:
# Fill in the missing values for the activity duration (actduur) by using the median value derived
# from trips with the same purpose (doel).
#
# groupby/transform broadcasts each purpose's median back onto the rows of that purpose, so every
# purpose code present in the data is handled without listing the codes one by one.
# Durations that are already present are left untouched, and a purpose without any observed
# duration keeps its NaN values (there is no median to fill in).
median_actduur_by_purpose = data_2018_2019_final.groupby('doel')['actduur'].transform('median')
data_2018_2019_final['actduur'] = data_2018_2019_final['actduur'].fillna(median_actduur_by_purpose)

In [9]:
# merge mode categories 3 (train) and 4 (bus/tram/metro) under 'transit' category (label 3)
# codes 4, 5 and 6 each move down by one: 4 joins 3, and 5 and 6 become 4 and 5
mode = data_2018_2019_final['khvm']
data_2018_2019_final['khvm'] = mode.mask(mode.isin([4, 5, 6]), mode - 1)

In [10]:
# imbalance ratio between the car (majority) class and the transit (minority) class
# (number of samples with mode label 1 divided by the number of samples with mode label 3)
mode_counts = data_2018_2019_final.khvm.value_counts()
round(mode_counts[1] / mode_counts[3], 2)

5.06

In [11]:
# encode the categorical features
# NOTE: every recoding below overwrites the column in place and relies on the order of the
# statements (later statements match on values written by earlier ones), so this cell must be
# run exactly once on freshly loaded data.

# opleiding (education): code 4 becomes 1, every other code becomes 0
data_2018_2019_final.loc[data_2018_2019_final.opleiding != 4,'opleiding'] = 0
data_2018_2019_final.loc[data_2018_2019_final.opleiding != 0, 'opleiding'] = 1

# age: three bands -> 1 (17 or younger), 2 (18 to 64), 3 (65 or older)
# the bands are written in ascending order, so the labels 1 and 2 are never re-matched by a later rule
data_2018_2019_final.loc[data_2018_2019_final.leeftijd <= 17,'leeftijd'] = 1
data_2018_2019_final.loc[((data_2018_2019_final.leeftijd >= 18) & (data_2018_2019_final.leeftijd <= 64)),'leeftijd'] = 2
data_2018_2019_final.loc[data_2018_2019_final.leeftijd >= 65, 'leeftijd'] = 3

# income: codes below 5 -> 1, code 11 -> 4, all remaining codes -> 2
# label 4 is only a placeholder for 'unknown'; it is replaced later in the notebook when the
# missing values are imputed
data_2018_2019_final.loc[(data_2018_2019_final.hhgestinkg < 5),'hhgestinkg'] = 1 # low income
data_2018_2019_final.loc[(data_2018_2019_final.hhgestinkg == 11),'hhgestinkg'] = 4 # unknown income values
data_2018_2019_final.loc[((data_2018_2019_final.hhgestinkg != 1) & (data_2018_2019_final.hhgestinkg != 4)),'hhgestinkg'] = 2 # high income

# departure period of the day (period_o), derived from the departure time (verttijd)
#
# verttijd is compared as text, so the bounds are written as zero-padded 'HH:MM' strings, in the
# same way as the peak-hour bounds used earlier in this notebook.
# The evening rule used to be `(verttijd >= '18') & (verttijd < '4')`: the second condition only
# held because of lexicographic string ordering, and trips departing between midnight and 04:00
# ended up in the morning class. The evening period now explicitly wraps around midnight.
departure_time = data_2018_2019_final.verttijd
is_morning = (departure_time >= '04:00') & (departure_time < '12:00')
is_afternoon = (departure_time >= '12:00') & (departure_time < '18:00')
is_evening = (departure_time >= '18:00') | (departure_time < '04:00')

data_2018_2019_final.loc[is_morning, 'period_o'] = 1 # morning
data_2018_2019_final.loc[is_afternoon, 'period_o'] = 2 # afternoon
data_2018_2019_final.loc[is_evening, 'period_o'] = 3 # evening (18:00 up to 04:00)

# NOTE: the recodings below overwrite the columns in place and depend on the order of the
# statements, so they must be run exactly once on data that has not been recoded yet.

# weekdag (0: weekend and 1: weekday)
# codes 1 and 7 are treated as the weekend; every other code becomes 1

data_2018_2019_final.loc[((data_2018_2019_final.weekdag == 1) | (data_2018_2019_final.weekdag == 7)), 'weekdag'] = 0
data_2018_2019_final.loc[(data_2018_2019_final.weekdag != 0), 'weekdag'] = 1

# hhauto (0: no car, 1: car, category 10: is unknown)
# every code other than 0 and 10 collapses to 1; code 10 is kept and imputed later in the notebook
data_2018_2019_final.loc[((data_2018_2019_final.hhauto != 0) & \
                 (data_2018_2019_final.hhauto != 10)),'hhauto'] = 1

# hhpers, (1: single-person household, 0: non single-person household)
data_2018_2019_final.loc[(data_2018_2019_final.hhpers != 1),'hhpers'] = 0

# betwerk (category 4 is unknown, 0: no paid-work, 1: paid-work)
# code 5 is folded into 0 first; every remaining non-zero code then becomes 1
# NOTE(review): code 4 (unknown, according to the comment above) is neither 0 nor 5 and therefore
# ends up as 1 (paid-work) - confirm that this is intended
data_2018_2019_final.loc[data_2018_2019_final['betwerk'] == 5, 'betwerk'] = 0
data_2018_2019_final.loc[((data_2018_2019_final['betwerk'] != 0)), 'betwerk'] = 1

# hhsam [categories 3, 4, 6, 7 correspond to 1 (children), while the remaining categories correspond to 0 (no children)].
data_2018_2019_final.loc[((data_2018_2019_final.hhsam != 3) & (data_2018_2019_final.hhsam != 4) &  (data_2018_2019_final.hhsam != 6) &  (data_2018_2019_final.hhsam != 7)),'hhsam'] = 0 
data_2018_2019_final.loc[data_2018_2019_final.hhsam != 0, 'hhsam'] = 1

# gemgr: codes 1-4 -> 1, codes 5-6 -> 2, all remaining codes -> 3
# the first rule already turns the original code 2 into 1, so the label 2 written by the second
# rule cannot be confused with an original value
data_2018_2019_final.loc[((data_2018_2019_final.gemgr == 1)|(data_2018_2019_final.gemgr == 2)|(data_2018_2019_final.gemgr == 3)|\
                 (data_2018_2019_final.gemgr == 4)),'gemgr'] = 1
data_2018_2019_final.loc[((data_2018_2019_final.gemgr == 5)|(data_2018_2019_final.gemgr == 6)),'gemgr'] = 2
data_2018_2019_final.loc[((data_2018_2019_final.gemgr != 1) & (data_2018_2019_final.gemgr != 2)) ,'gemgr'] = 3

# kmotiefv (1: commute, 2: business, 3: other)
data_2018_2019_final.loc[((data_2018_2019_final.kmotiefv != 1) & (data_2018_2019_final.kmotiefv != 2)), 'kmotiefv'] = 3

# ovstkaart (category 4: unknown, unknown category  is replaced with 0, since according to survey explanations respondent 
# was either younger than 15 or older than 40 years old)
# code 3 is left untouched here; it is imputed later in the notebook

data_2018_2019_final.loc[data_2018_2019_final.ovstkaart == 4, 'ovstkaart'] = 0

In [12]:
# remove respondents who have chosen car as their means of transport but do not possess a driving license
drives_without_license = (data_2018_2019_final['khvm'] == 1) & (data_2018_2019_final['oprijbewijsau'] == 0)
index_remove = data_2018_2019_final.index[drives_without_license]

data_2018_2019_final = data_2018_2019_final.drop(index=index_remove)

# drop the unused columns
unused_columns = [
    'onbbez', 'hhlft1', 'hhlft2', 'hhlft3', 'hhlft4',
    'brandstofpa1', 'brandstofepa1', 'brandstofpa2', 'brandstofepa2',
    'opid', 'wopc', 'wogem', 'sted', 'prov', 'verplid',
    'vertpc', 'aankpc', 'choice_dur', 'choice_dist',
    'verttijd', 'aanktijd', 'doel',
]
data_2018_2019_final = data_2018_2019_final.drop(columns=unused_columns)

In [13]:
y = data_2018_2019_final.khvm # target variable
X = data_2018_2019_final.loc[:, data_2018_2019_final.columns != 'khvm'] # explanatory features

# split the dataset into the train and test sets (80% / 20%, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, stratify = y, random_state = 42)

# keep an extra 10% of the training portion for validation purposes (hyperparameters tuning)
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, stratify = y_train,\
                                                              test_size=0.1, random_state = 42)

# Work on independent copies of every split: the following cells modify the *_RF frames in place,
# and without a copy those edits would also alter the frames returned by train_test_split.

# train set 
X_train_RF = X_train_final.copy()
y_train_RF = y_train_final.copy()

# test set
X_test_RF = X_test.copy()
y_test_RF = y_test.copy()

# validation set (previously plain aliases of X_val / y_val, unlike the train and test sets)
y_val_RF = y_val.copy()
X_val_RF = X_val.copy()

In [14]:
# missing values

# share of NaN entries in every training-set column, expressed as a percentage
percent_missing = X_train_RF.isna().sum().mul(100).div(len(X_train_RF))
missing_value_X_train_RF = pd.DataFrame({
    'column_name': X_train_RF.columns,
    'percent_missing': percent_missing,
})

# percentage of missing values in each column (only the columns that have any are shown)
has_missing = missing_value_X_train_RF['percent_missing'].ne(0)
missing_value_X_train_RF[has_missing]

Unnamed: 0,column_name,percent_missing
bike_dur,bike_dur,2.812608
bike_dist,bike_dist,2.812608
car_dur,car_dur,6.44159
car_dist,car_dist,6.44159
pt_dur,pt_dur,15.038437
pt_dist,pt_dist,15.038437
walk_dur,walk_dur,29.840169
walk_dist,walk_dist,29.840169
car_changes,car_changes,6.44159
pt_changes,pt_changes,15.038437


In [15]:
# fill in the missing values in the training set using the features' median values
# use the same values to impute the missing values in the test set and the validation set

# columns in which 'unknown' is stored as a placeholder code rather than as NaN
missing_values_ODiN_RF = ['herkomst','ovstkaart','hhauto','hhgestinkg']
# columns that contain at least one NaN anywhere in the full dataset
nan_columns = data_2018_2019_final.isna().any().to_numpy()
missing_values_OTP_RF = data_2018_2019_final.columns[nan_columns].to_list()

# training-set median (rounded to 2 decimals) of every column that needs imputation
# NOTE(review): for the placeholder-coded columns the median is taken while the placeholder codes
# are still present in the column - confirm that this is intended
dict_median_RF = {
    column: round(X_train_RF[column].median(), 2)
    for column in missing_values_OTP_RF + missing_values_ODiN_RF
}

# replace the 'unknown' placeholder codes by the stored training median of the column
unknown_codes = {'herkomst': 4, 'ovstkaart': 3, 'hhauto': 10, 'hhgestinkg': 4}
for column, code in unknown_codes.items():
    X_train_RF.loc[X_train_RF[column] == code, column] = dict_median_RF[column]

# replace the NaN entries by the stored training median of the column
for column in missing_values_OTP_RF:
    X_train_RF.loc[X_train_RF[column].isna(), column] = dict_median_RF[column]

In [16]:
# add the transit costs
transit_rate = (0.147 + 0.166) / 2  # eur/km average value from Rotterdam and the Hague areas
transit_base = 0.96  # euros

# base fare plus a per-kilometre rate (pt_dist is divided by 1000 - presumably metres to kilometres)
pt_distance_km = X_train_RF['pt_dist'] / 1000
X_train_RF['transit_cost'] = transit_base + transit_rate * pt_distance_km

# no transit cost for ovstkaart 1 on weekdays (weekdag 1) and for ovstkaart 2 on weekends (weekdag 0)
free_on_weekdays = (X_train_RF['ovstkaart'] == 1) & (X_train_RF['weekdag'] == 1)
free_on_weekends = (X_train_RF['ovstkaart'] == 2) & (X_train_RF['weekdag'] == 0)
X_train_RF.loc[free_on_weekdays | free_on_weekends, 'transit_cost'] = 0

# once the cost is known, the two ovstkaart types are merged into a single category (label 1)
X_train_RF['ovstkaart'] = X_train_RF['ovstkaart'].mask(X_train_RF['ovstkaart'] == 2, 1)

# add the car costs
car_cost_km = 0.34 # euros/km
# distance-based driving cost plus a parking fee: activity duration (actduur / 60) times the average charge
driving_cost = car_cost_km * (X_train_RF['car_dist'] / 1000)
parking_fee = (X_train_RF['actduur'] / 60) * X_train_RF['AVG_CHARGE']
X_train_RF['car_cost'] = driving_cost + parking_fee

# the inputs of the parking fee are not used as features themselves
X_train_RF = X_train_RF.drop(columns=['actduur', 'AVG_CHARGE'])

# perform one-hot encoding for the features having no inherent order
X_train_RF_final = pd.get_dummies(
    data=X_train_RF,
    columns=['herkomst', 'kmotiefv', 'period_o', 'season', 'vertprov'],
    drop_first=True,
)

# keep only the part of each column name before the first '.' (e.g. 'season_2.0' -> 'season_2')
X_train_RF_final = X_train_RF_final.rename(columns=lambda col: col.split('.')[0])

In [17]:
# fill in the missing values in the validation set
#
# The imputation values are the training-set medians stored in dict_median_RF, i.e. exactly the
# values that were used for the training set. They used to be recomputed from X_train_RF at this
# point, but by now that frame has already been imputed and its ovstkaart categories have been
# merged, so the recomputed medians were not guaranteed to equal the ones applied to the train set.
for column, unknown_code in [('herkomst', 4), ('ovstkaart', 3), ('hhauto', 10), ('hhgestinkg', 4)]:
    X_val_RF.loc[X_val_RF[column] == unknown_code, column] = dict_median_RF[column]

for value in missing_values_OTP_RF:
    X_val_RF.loc[pd.isna(X_val_RF[value]), value] = dict_median_RF[value]

# transit cost: base fare plus a per-kilometre rate
X_val_RF['transit_cost'] = transit_base + transit_rate * (X_val_RF['pt_dist']/1000)

# no transit cost for ovstkaart 1 on weekdays (weekdag 1) and for ovstkaart 2 on weekends (weekdag 0)
X_val_RF.loc[((X_val_RF.ovstkaart == 1) & (X_val_RF.weekdag == 1)), "transit_cost"] = 0
X_val_RF.loc[(X_val_RF.ovstkaart == 2) & (X_val_RF.weekdag == 0), "transit_cost"] = 0

# once the cost is known, the two ovstkaart types are merged into a single category (label 1)
X_val_RF.loc[X_val_RF.ovstkaart == 2,'ovstkaart'] = 1

# car cost: distance-based driving cost plus a parking fee (activity duration times average charge)
# AVG_CHARGE is read from X_val_RF itself, i.e. from the frame that was imputed above, instead of
# from the separate X_val frame
X_val_RF['car_cost'] = (car_cost_km * (X_val_RF['car_dist']/1000))\
                                   + ((X_val_RF['actduur']/60) * X_val_RF['AVG_CHARGE'])

X_val_RF = X_val_RF.drop(columns = ['actduur','AVG_CHARGE'])

# perform one-hot encoding for the features having no inherent order
# NOTE(review): the dummies are created independently of the training set; presumably every
# category occurs in this split as well, otherwise the columns will not line up - verify
X_val_RF_final = pd.get_dummies(data = X_val_RF, columns = ['herkomst',\
     'kmotiefv','period_o','season','vertprov'], drop_first = True)

# keep only the part of each column name before the first '.'
X_val_RF_final.columns = [col.split('.')[0] for col in X_val_RF_final.columns]

In [18]:
# fill in the missing values in the test set
#
# The imputation values are the training-set medians stored in dict_median_RF, i.e. exactly the
# values that were used for the training set. They used to be recomputed from X_train_RF at this
# point, but by now that frame has already been imputed and its ovstkaart categories have been
# merged, so the recomputed medians were not guaranteed to equal the ones applied to the train set.
for column, unknown_code in [('herkomst', 4), ('ovstkaart', 3), ('hhauto', 10), ('hhgestinkg', 4)]:
    X_test_RF.loc[X_test_RF[column] == unknown_code, column] = dict_median_RF[column]

for value in missing_values_OTP_RF:
    X_test_RF.loc[pd.isna(X_test_RF[value]), value] = dict_median_RF[value]

# transit cost: base fare plus a per-kilometre rate
X_test_RF['transit_cost'] = transit_base + transit_rate * (X_test_RF['pt_dist']/1000)

# no transit cost for ovstkaart 1 on weekdays (weekdag 1) and for ovstkaart 2 on weekends (weekdag 0)
X_test_RF.loc[((X_test_RF.ovstkaart == 1) & (X_test_RF.weekdag == 1)), "transit_cost"] = 0
X_test_RF.loc[(X_test_RF.ovstkaart == 2) & (X_test_RF.weekdag == 0), "transit_cost"] = 0

# once the cost is known, the two ovstkaart types are merged into a single category (label 1)
X_test_RF.loc[X_test_RF.ovstkaart == 2,'ovstkaart'] = 1

# car cost: distance-based driving cost plus a parking fee (activity duration times average charge)
# AVG_CHARGE is read from X_test_RF, the imputed copy, rather than from the raw X_test frame, so
# that a missing charge cannot turn the car cost into NaN
X_test_RF['car_cost'] = (car_cost_km * (X_test_RF['car_dist']/1000))\
                                   + ((X_test_RF['actduur']/60) * X_test_RF['AVG_CHARGE'])

X_test_RF = X_test_RF.drop(columns = ['actduur','AVG_CHARGE'])

# perform one-hot encoding for the features having no inherent order
# NOTE(review): the dummies are created independently of the training set; presumably every
# category occurs in this split as well, otherwise the columns will not line up - verify
X_test_RF_final = pd.get_dummies(data = X_test_RF, columns = ['herkomst',\
     'kmotiefv','period_o','season','vertprov'], drop_first = True)

# keep only the part of each column name before the first '.'
X_test_RF_final.columns = [col.split('.')[0] for col in X_test_RF_final.columns]

In [19]:
# check the correlation between the variables

# one-hot encode the remaining multi-level features (first level dropped) so that every level
# becomes a separate 0/1 column for the correlation checks
X_train_RF_final_corr = pd.get_dummies(
    data=X_train_RF_final,
    columns=['gemgr', 'leeftijd', 'hhgestinkg', 'sted_o', 'sted_d'],
    drop_first=True,
)

In [20]:
# continuous features: durations, distances, costs and the number of transit changes
continuous_variables = [
    'car_dur', 'car_dist',
    'pt_dur', 'pt_dist',
    'walk_dur', 'walk_dist',
    'bike_dur', 'bike_dist',
    'car_cost', 'transit_cost',
    'pt_changes',
]

# binary features: the 0/1 encoded columns followed by the one-hot columns, which are named
# '<feature>_<level>' for the levels 2 up to the highest level of the feature
binary_variables = [
    'hhpers', 'hhsam', 'geslacht', 'betwerk', 'opleiding', 'oprijbewijsau',
    'hhauto', 'hhefiets', 'ovstkaart', 'feestdag', 'weekdag',
] + [
    f'{feature}_{level}'
    for feature, highest_level in [
        ('herkomst', 3), ('kmotiefv', 3), ('period_o', 3), ('season', 4), ('vertprov', 12),
        ('gemgr', 3), ('leeftijd', 3), ('hhgestinkg', 2), ('sted_o', 5), ('sted_d', 5),
    ]
    for level in range(2, highest_level + 1)
]

In [21]:
def calculate_point_biserial_corr_for_all_pairs(data):
    """Compute the point-biserial correlation for every pair of two-valued columns.

    Only the columns of ``data`` that hold exactly two distinct values are considered;
    all other columns are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose two-valued columns are compared pairwise.

    Returns
    -------
    dict
        Maps each ``(first_column, second_column)`` tuple, in column order, to the
        correlation coefficient of that pair.
    """
    two_valued_columns = list(data.columns[data.nunique().eq(2)])
    return {
        (first, second): pointbiserialr(data[first], data[second])[0]
        for first, second in combinations(two_valued_columns, 2)
    }

# Calculate point-biserial correlation coefficient for all pairs of binary variables
point_biserial_corr_results = calculate_point_biserial_corr_for_all_pairs(X_train_RF_final_corr[binary_variables])

# Print the strongly correlated pairs.
# The absolute value is compared with the threshold, so that a strong negative correlation is
# reported as well as a strong positive one.
for pair, correlation in point_biserial_corr_results.items():
    if abs(correlation) > 0.7:
        print(f"Point-biserial correlation coefficient between {pair[0]} and {pair[1]}: {correlation}")

In [22]:
# explore correlation between continuous and categorical variables

# calculate the point-biserial correlation of every (continuous, binary) pair
# each coefficient is computed once and its absolute value is compared with the threshold, so
# that strong negative correlations are reported as well as strong positive ones
for i in continuous_variables:
    for j in binary_variables:
        correlation_result = stats.pointbiserialr(X_train_RF_final_corr[i], X_train_RF_final_corr[j])
        if abs(correlation_result[0]) > 0.7:
            print(i, j, correlation_result)

In [23]:
# explore correlation between numerical variables
X_train_RF_final_corr = pd.get_dummies(
    data=X_train_RF_final,
    columns=['gemgr', 'leeftijd', 'hhgestinkg', 'sted_o', 'sted_d'],
    drop_first=True,
)

# pairwise correlation of the continuous features, displayed as a colour-coded table
corr = X_train_RF_final.loc[:, continuous_variables].corr()
corr.style.background_gradient(cmap='coolwarm')


Unnamed: 0,car_dur,car_dist,pt_dur,pt_dist,walk_dur,walk_dist,bike_dur,bike_dist,car_cost,transit_cost,pt_changes
car_dur,1.0,0.958854,0.479014,0.660915,0.140491,0.137835,0.684444,0.6815,0.869337,0.618598,0.544449
car_dist,0.958854,1.0,0.47119,0.681345,0.055282,0.053991,0.690583,0.689441,0.895837,0.639519,0.516947
pt_dur,0.479014,0.47119,1.0,0.68786,0.127021,0.126681,0.507474,0.508474,0.40578,0.659717,0.629463
pt_dist,0.660915,0.681345,0.68786,1.0,0.053841,0.053215,0.716149,0.718181,0.611608,0.951619,0.710456
walk_dur,0.140491,0.055282,0.127021,0.053841,1.0,0.999749,0.069456,0.063123,0.048746,0.052742,0.216652
walk_dist,0.137835,0.053991,0.126681,0.053215,0.999749,1.0,0.068182,0.062107,0.046368,0.05227,0.214418
bike_dur,0.684444,0.690583,0.507474,0.716149,0.069456,0.068182,1.0,0.999365,0.628363,0.676727,0.549602
bike_dist,0.6815,0.689441,0.508474,0.718181,0.063123,0.062107,0.999365,1.0,0.625587,0.67912,0.546183
car_cost,0.869337,0.895837,0.40578,0.611608,0.048746,0.046368,0.628363,0.625587,1.0,0.570475,0.462379
transit_cost,0.618598,0.639519,0.659717,0.951619,0.052742,0.05227,0.676727,0.67912,0.570475,1.0,0.675703


In [24]:
# drop correlated features
# due to a significant number of missing values for the walking trips (approximately 30%), we chose to exclude the walking class entirely from our analysis.
# the same columns are removed from the train, validation and test sets
dropped_features = [
    'car_dist', 'pt_dist', 'walk_dist', 'bike_dist',
    'walk_dur', 'car_changes', 'walk_changes', 'bike_changes',
]
X_train_RF_final = X_train_RF_final.drop(columns=dropped_features)
X_val_RF_final = X_val_RF_final.drop(columns=dropped_features)
X_test_RF_final = X_test_RF_final.drop(columns=dropped_features)

In [25]:
# keep samples only from the car, transit and bike classes (mode labels 1, 3 and 4)
# the target is first joined back onto the features on the row index, then the rows are filtered
kept_modes = [1, 3, 4]

data_train_RF = X_train_RF_final.join(y_train_RF)
data_train_RF = data_train_RF[data_train_RF['khvm'].isin(kept_modes)]

data_test_RF = X_test_RF_final.join(y_test_RF)
data_test_RF = data_test_RF[data_test_RF['khvm'].isin(kept_modes)]

data_val_RF = X_val_RF_final.join(y_val_RF)
data_val_RF = data_val_RF[data_val_RF['khvm'].isin(kept_modes)]

In [27]:
# save the datasets: one CSV file per split, with the row index written as the first column
outputs = {
    './data/data_train_RF.csv': data_train_RF,
    './data/data_test_RF.csv': data_test_RF,
    './data/data_val_RF.csv': data_val_RF,
}
for csv_path, split in outputs.items():
    split.to_csv(csv_path)