In [127]:
import pandas as pd

In [128]:
def read_tab(var, base_dir='/content/drive/MyDrive/socio_pred/UKDA-5340-tab/tab', min_year=2018):
  """Load one table from its tab-separated file and keep recent surveys.

  Parameters
  ----------
  var : str
      Table name used as the file-name prefix, e.g. 'trip' reads
      ``<base_dir>/trip_eul_2002-2020.tab``.
  base_dir : str, optional
      Folder that holds the ``*_eul_2002-2020.tab`` files. Defaults to the
      Google Drive folder this notebook has always read from.
  min_year : int, optional
      Rows are kept only when ``SurveyYear`` is strictly greater than this
      value (default 2018).

  Returns
  -------
  pandas.DataFrame
      The table contents; unfiltered when it has no ``SurveyYear`` column.
  """
  # `path` rather than `dir`, which would shadow the builtin of the same name
  path = f'{base_dir}/{var}_eul_2002-2020.tab'
  df = pd.read_csv(path, sep='\t')
  # only some tables carry SurveyYear; the others are returned whole
  if 'SurveyYear' in df.columns:
    df = df[df['SurveyYear'] > min_year]
  return df

In [None]:
#indev = read_tab('indev')
#stage = read_tab('stage')

In [129]:
# PSU table: load it and keep only the three columns used later on
psu_keep = ['PSUID', 'SurveyYear', 'PSUStatsReg_B01ID']
psu = read_tab('psu').loc[:, psu_keep]

In [130]:
# Trip table: the columns listed here are the only ones carried forward
cols_trip = [
    'TripID', 'DayID', 'IndividualID', 'HouseholdID', 'PSUID',
    'SeriesCall_B01ID', 'ShortWalkTrip_B01ID', 'NumStages_B01ID',
    'MainMode_B04ID', 'TripPurpose_B04ID', 'TripStart_B01ID',
    'TripStart_B02ID', 'TripEnd_B01ID', 'TripEnd_B02ID',
    'TripDisIncSW_B01ID', 'TripDisExSW_B01ID', 'TripTotalTime_B01ID',
    'TripTravTime_B01ID', 'TripOrigGOR_B02ID', 'TripDestGOR_B02ID',
]

trip = read_tab('trip').loc[:, cols_trip]
trip.shape

  


(277460, 20)

In [131]:
# Trip filter: keep rows whose ShortWalkTrip_B01ID code is 2, i.e. drop the
# short-walk trips (NOTE(review): code meaning taken from the original
# comment - confirm against the NTS data dictionary)
not_short_walk = trip['ShortWalkTrip_B01ID'].eq(2)
trip = trip.loc[not_short_walk]

In [132]:
# Household table: the columns listed here are the only ones carried forward
cols_houshold = [
    'HouseholdID', 'PSUID', 'TWSDay', 'TWSMonth', 'TWSYear',
    'TWSMonth_B01ID', 'TWSWeekday_B01ID', 'TWEMonth_B01ID',
    'TWEWeekday_B01ID', 'HHIncome2002_B02ID', 'Ten1_B02ID',
    'HHoldGOR_B02ID', 'HHoldStruct_B02ID', 'HRPWorkStat_B02ID',
    'HRPEmpStat_B01ID', 'HRPSEGWorkStat_B01ID', 'HHoldEmploy_B01ID',
    'WalkBus_B01ID', 'WalkRail_B01ID', 'BusRail_B01ID', 'Settlement2011EW_B03ID',
    'HHoldOAClass2011_B03ID', 'HHIncQDS2020Eng_B01ID', 'HHIncQIS2020Eng_B01ID',
]

household = read_tab('household').loc[:, cols_houshold]
household.shape

  


(9839, 24)

In [133]:
# Household filters
# 1) drop the 'DEAD' rows, i.e. rows with a non-positive TWSMonth_B01ID code
household = household[household['TWSMonth_B01ID'].gt(0)]

# 2) keep travel weeks starting in Nov/Dec 2019 or in Jan/Feb 2020
starts_early_2020 = household['TWSMonth_B01ID'].lt(3) & household['TWSYear'].eq(2020)
starts_late_2019 = household['TWSMonth_B01ID'].gt(10) & household['TWSYear'].eq(2019)
household = household[starts_early_2020 | starts_late_2019]

household.shape

(2192, 24)

In [134]:
# Trip filter: keep only the trips that belong to a selected household
in_selected_hh = trip['HouseholdID'].isin(household['HouseholdID'])
trip = trip.loc[in_selected_hh]
trip.shape

(66252, 20)

In [135]:
# Individual table: the columns listed here are the only ones carried forward
cols_ind = [
    'IndividualID', 'HouseholdID', 'PSUID', 'VehicleID', 'Age_B04ID',
    'Sex_B01ID', 'MarStat_B01ID', 'EthGroupTS_B02ID', 'EdAttn3_B01ID',
    'CarAccess_B01ID', 'DrivLic_B02ID', 'IndIncome2002_B02ID',
    'WkPlace_B01ID', 'EcoStat_B02ID', 'SC_B01ID', 'OftHome_B01ID', 'BusOut_B01ID',
    'Educ_B01ID', 'WkMuch_B01ID', 'OwnPhone_B01ID',
]

individual = read_tab('individual').loc[:, cols_ind]
individual.shape

  


(22767, 20)

In [136]:
# Individual filter: keep only members of a selected household
ind_in_selected_hh = individual['HouseholdID'].isin(household['HouseholdID'])
individual = individual.loc[ind_in_selected_hh]
individual.shape

(5240, 20)

In [137]:
# Vehicle table, restricted to the selected households
cols_veh = [
    'VehicleID', 'HouseholdID', 'PSUID', 'IndividualID', 'VehNo',
    'VehAvail_B01ID', 'VehComMile_B01ID', 'VehBusMile_B01ID',
    'VehPriMile_B01ID',
]
vehicle = read_tab('vehicle').loc[:, cols_veh]
vehicle = vehicle.loc[vehicle['HouseholdID'].isin(household['HouseholdID'])]

# Day table, restricted to the selected households
cols_day = [
    'DayID', 'IndividualID', 'HouseholdID', 'PSUID', 'TravelWeekDay_B01ID',
    'TravelWeekDay_B02ID', 'TravelWeekDay_B03ID', 'TravelDayType_B01ID',
]
day = read_tab('day').loc[:, cols_day]
day = day.loc[day['HouseholdID'].isin(household['HouseholdID'])]
day.shape

(32949, 8)

In [138]:
def _merge_keep_left(left, right, key):
  """Inner-join `right` onto `left` on `key`.

  Columns present in both tables get the suffix '_y' on the right-hand copy,
  and every column whose name ends in '_y' is then removed, so the left-hand
  copy is the one that survives.
  """
  merged = left.merge(right, left_on=key, right_on=key, how='inner',
                      suffixes=('', '_y'))
  return merged.drop(columns=merged.filter(regex='_y$').columns)


# trip -> day -> individual -> household -> PSU -> vehicle
trip_day = _merge_keep_left(trip, day, 'DayID')
trip_day_ind = _merge_keep_left(trip_day, individual, 'IndividualID')
trip_day_ind_hh = _merge_keep_left(trip_day_ind, household, 'HouseholdID')
trip_day_ind_hh_psu = _merge_keep_left(trip_day_ind_hh, psu, 'PSUID')
# NOTE(review): this is an inner join on VehicleID, so trips whose individual
# has no matching vehicle row are dropped here - confirm that is intended
all_tabs = _merge_keep_left(trip_day_ind_hh_psu, vehicle, 'VehicleID')

# drop irrelevant columns
# to keep things simple, only some of the vars are kept
unused_cols = [
    'DayID', 'ShortWalkTrip_B01ID', 'HouseholdID', 'PSUID',
    'SeriesCall_B01ID', 'MainMode_B04ID', 'TripPurpose_B04ID',
    'TripStart_B02ID', 'TripEnd_B02ID', 'TripOrigGOR_B02ID',
    'TripDestGOR_B02ID', 'TravelWeekDay_B02ID', 'TravelDayType_B01ID',
    'VehicleID', 'TWEWeekday_B01ID', 'TWSDay', 'TWSMonth',
    'TWSYear', 'TWSWeekday_B01ID', 'HHoldGOR_B02ID',
    'WalkBus_B01ID', 'WalkRail_B01ID', 'BusRail_B01ID',
    'VehNo', 'VehComMile_B01ID', 'VehBusMile_B01ID',
    'VehPriMile_B01ID', 'NumStages_B01ID',
]
all_tabs = all_tabs.drop(columns=unused_cols)

In [139]:
# show which columns are left in the merged trip-level table
all_tabs.columns

Index(['TripID', 'IndividualID', 'TripStart_B01ID', 'TripEnd_B01ID',
       'TripDisIncSW_B01ID', 'TripDisExSW_B01ID', 'TripTotalTime_B01ID',
       'TripTravTime_B01ID', 'TravelWeekDay_B01ID', 'TravelWeekDay_B03ID',
       'Age_B04ID', 'Sex_B01ID', 'MarStat_B01ID', 'EthGroupTS_B02ID',
       'EdAttn3_B01ID', 'CarAccess_B01ID', 'DrivLic_B02ID',
       'IndIncome2002_B02ID', 'WkPlace_B01ID', 'EcoStat_B02ID', 'SC_B01ID',
       'OftHome_B01ID', 'BusOut_B01ID', 'Educ_B01ID', 'WkMuch_B01ID',
       'OwnPhone_B01ID', 'TWSMonth_B01ID', 'TWEMonth_B01ID',
       'HHIncome2002_B02ID', 'Ten1_B02ID', 'HHoldStruct_B02ID',
       'HRPWorkStat_B02ID', 'HRPEmpStat_B01ID', 'HRPSEGWorkStat_B01ID',
       'HHoldEmploy_B01ID', 'Settlement2011EW_B03ID', 'HHoldOAClass2011_B03ID',
       'HHIncQDS2020Eng_B01ID', 'HHIncQIS2020Eng_B01ID', 'SurveyYear',
       'PSUStatsReg_B01ID', 'VehAvail_B01ID'],
      dtype='object')

Dealing with Missing values

In [140]:
# Columns that hold at least one negative code (negative codes mark missing
# answers in the NTS data)
has_negative = (all_tabs < 0).any()
missing_cols = has_negative[has_negative].index
missing_cols

Index(['TripStart_B01ID', 'TripEnd_B01ID', 'EdAttn3_B01ID', 'WkPlace_B01ID',
       'EcoStat_B02ID', 'SC_B01ID', 'OftHome_B01ID', 'BusOut_B01ID',
       'Educ_B01ID', 'WkMuch_B01ID', 'OwnPhone_B01ID', 'Ten1_B02ID',
       'HHoldStruct_B02ID', 'HRPWorkStat_B02ID', 'HRPEmpStat_B01ID',
       'HHoldOAClass2011_B03ID', 'HHIncQDS2020Eng_B01ID',
       'HHIncQIS2020Eng_B01ID'],
      dtype='object')

In [141]:
# print the frequency of every code, one affected column at a time
for col_name in missing_cols:
  code_counts = all_tabs[col_name].value_counts()
  print(code_counts)

 16    3873
 9     3804
 17    3644
 18    3470
 12    3421
 13    3328
 11    3295
 15    3224
 14    2863
 10    2744
 19    2568
 8     2476
 20    1740
 21    1043
 7      935
 22     766
-8      674
 23     561
 6      358
 24     327
 5       96
 1       80
 2       37
 3       24
 4       12
Name: TripStart_B01ID, dtype: int64
 9     3937
 16    3846
 18    3546
 13    3364
 17    3358
 12    3320
 11    3266
 15    3117
 19    2989
 10    2965
 14    2896
 20    2214
 8     1709
 21    1210
 22     819
-8      747
 23     614
 7      573
 24     398
 6      215
 1      125
 2       50
 5       48
 3       19
 4       18
Name: TripEnd_B01ID, dtype: int64
-9    45363
Name: EdAttn3_B01ID, dtype: int64
 1    22676
-9    13808
 3     4881
 2     2262
 4     1724
-8       12
Name: WkPlace_B01ID, dtype: int64
 1    23868
 4    11026
 2     7687
 6     2169
 5      407
 3      201
-9        5
Name: EcoStat_B02ID, dtype: int64
 2    18729
 3     8426
 4     6516
 5     5494
 1     3262


In [142]:
# Variables with many missing codes: removed as whole columns
miss_all = [
    'EdAttn3_B01ID', 'WkPlace_B01ID', 'OftHome_B01ID', 'BusOut_B01ID',
    'Educ_B01ID', 'WkMuch_B01ID', 'HHoldOAClass2011_B03ID',
    'HHIncQDS2020Eng_B01ID', 'HHIncQIS2020Eng_B01ID', 'OwnPhone_B01ID',
]
all_tabs = all_tabs.drop(columns=miss_all)

# Variables with few missing codes: the affected rows are removed instead.
# A row is kept only when every one of these columns is strictly positive.
miss_few = [
    'TripStart_B01ID', 'TripEnd_B01ID', 'EcoStat_B02ID', 'SC_B01ID', 'Ten1_B02ID',
    'HHoldStruct_B02ID',
    'HRPWorkStat_B02ID', 'HRPEmpStat_B01ID',
]
rows_complete = (all_tabs[miss_few] > 0).all(axis=1)
all_tabs = all_tabs[rows_complete]

# rows and columns remaining after the missing values are removed
all_tabs.shape


(41839, 32)

Dealing with categorical vars

In [143]:
# One-hot encode every column except the identifiers and the survey year.
# A single get_dummies call replaces the former concat/drop loop, which
# copied the whole frame once per column. The layout is unchanged: the
# untouched columns come first, followed by the '<column>_<code>' dummies in
# the original column order.
noncat_cols = ['TripID', 'IndividualID','SurveyYear']
cat_cols = [col for col in all_tabs.columns if col not in noncat_cols]
all_tabs_dummy = pd.get_dummies(all_tabs, columns=cat_cols)


In [144]:
# list the one-hot encoded columns; the cells below aggregate groups of them
all_tabs_dummy.columns

Index(['TripID', 'IndividualID', 'SurveyYear', 'TripStart_B01ID_1',
       'TripStart_B01ID_2', 'TripStart_B01ID_3', 'TripStart_B01ID_4',
       'TripStart_B01ID_5', 'TripStart_B01ID_6', 'TripStart_B01ID_7',
       ...
       'PSUStatsReg_B01ID_6', 'PSUStatsReg_B01ID_7', 'PSUStatsReg_B01ID_8',
       'PSUStatsReg_B01ID_9', 'PSUStatsReg_B01ID_10', 'PSUStatsReg_B01ID_11',
       'PSUStatsReg_B01ID_12', 'PSUStatsReg_B01ID_13', 'VehAvail_B01ID_1',
       'VehAvail_B01ID_2'],
      dtype='object', length=211)

In [145]:
# Collapse the 24 TripStart / TripEnd band dummies into four time-of-day
# flags (definitions based on Britannica). Columns are picked by NAME
# ('<prefix>_<band>') rather than by position, so the result does not depend
# on the exact column layout of all_tabs_dummy.
day_part_bands = {
    'night': [1, 2, 3, 4, 22, 23, 24],        # Night      9 pm to 4 am
    'evening': [18, 19, 20, 21],              # Evening    5 pm to 9 pm
    'afternoon': [13, 14, 15, 16, 17],        # Afternoon  12 pm to 5 pm
    'morning': [5, 6, 7, 8, 9, 10, 11, 12],   # Morning    4 am to 12 pm
}

# trip starts first, then trip ends; new columns are appended in this order
for flag_prefix, band_prefix in (('start', 'TripStart_B01ID'),
                                 ('end', 'TripEnd_B01ID')):
  for day_part, band_ids in day_part_bands.items():
    band_cols = [f'{band_prefix}_{band}' for band in band_ids]
    # reindex instead of plain selection: a band that never occurs has no
    # dummy column and must count as 0 rather than raise KeyError
    all_tabs_dummy[f'{flag_prefix}_{day_part}'] = (
        all_tabs_dummy.reindex(columns=band_cols, fill_value=0).sum(axis=1))


In [146]:
# Replace each distance band by a representative distance (band average).
# The band dummies are selected by name instead of by column position.

#TripDisIncSW
distInc = {'TripDisIncSW_B01ID_1':	0.5, #miles
'TripDisIncSW_B01ID_2':	1.5,
'TripDisIncSW_B01ID_3':	2.5,
'TripDisIncSW_B01ID_4':	4,
'TripDisIncSW_B01ID_5':	7.5,
'TripDisIncSW_B01ID_6':	12.5,
'TripDisIncSW_B01ID_7':	20,
'TripDisIncSW_B01ID_8':	30,
'TripDisIncSW_B01ID_9':	42.5,
'TripDisIncSW_B01ID_10':	75,
'TripDisIncSW_B01ID_11':	150,
'TripDisIncSW_B01ID_12':	250}

tmp_df = pd.DataFrame()
# idxmax over the one-hot columns gives back the name of the active band
tmp_df['dist_band'] = all_tabs_dummy.filter(
    regex=r'^TripDisIncSW_B01ID_\d+$').idxmax(axis=1)
# band name -> miles; a band missing from the dict would give NaN
all_tabs_dummy['distance'] = tmp_df['dist_band'].map(distInc)

#TripDisExSW_B01ID
distEx = {'TripDisExSW_B01ID_1':	0.5, #miles
'TripDisExSW_B01ID_2':	1.5,
'TripDisExSW_B01ID_3':	2.5,
'TripDisExSW_B01ID_4':	4,
'TripDisExSW_B01ID_5':	7.5,
'TripDisExSW_B01ID_6':	12.5,
'TripDisExSW_B01ID_7':	20,
'TripDisExSW_B01ID_8':	30,
'TripDisExSW_B01ID_9':	42.5,
'TripDisExSW_B01ID_10':	75,
'TripDisExSW_B01ID_11':	150,
'TripDisExSW_B01ID_12':	250}

# tmp_df is rebuilt here and read again by the speed cell further down, so
# its 'dist_band' column must end up holding the TripDisExSW band
tmp_df = pd.DataFrame()
tmp_df['dist_band'] = all_tabs_dummy.filter(
    regex=r'^TripDisExSW_B01ID_\d+$').idxmax(axis=1)
all_tabs_dummy['distance_ex'] = tmp_df['dist_band'].map(distEx)

In [147]:
# Collapse the 14 duration bands into three classes, for the total trip time
# and for the travel time. Columns are picked by name ('<prefix>_<band>')
# rather than by position.
duration_class_bands = {
    'short': range(1, 4),    # short  less than 15 min
    'mid': range(4, 7),      # mid    15 min to 1 hr
    'long': range(7, 15),    # long   more than 1 hr
}

# total trip time first, then travel time; columns are appended in this order
for class_prefix, band_prefix in (('time', 'TripTotalTime_B01ID'),
                                  ('travel', 'TripTravTime_B01ID')):
  for duration_class, band_ids in duration_class_bands.items():
    band_cols = [f'{band_prefix}_{band}' for band in band_ids]
    # reindex instead of plain selection: a band that never occurs has no
    # dummy column and must count as 0 rather than raise KeyError
    all_tabs_dummy[f'{class_prefix}_{duration_class}'] = (
        all_tabs_dummy.reindex(columns=band_cols, fill_value=0).sum(axis=1))

In [148]:
#min and max speed
# [lower, upper] edge of every TripTotalTime band (the speed cell multiplies
# distance/time by 60 to obtain mph, so these are minutes)
duration_minmax={
  1: [1,3],
  2: [3,8],
  3:	[8,15], 
  4:	[15, 30], 
  5:	[30,45], 
  6:	[45,60],
  7:	[60,90], 
  8:	[90,120], 
  9:	[120, 150], 
  10:	[150, 180], 
  11:	[180,240], 
  12:	[240,300], 
  13:	[300,360], 
  14:	[360,420]}

# [lower, upper] edge of every TripDisExSW band
distEx_minmax = {'TripDisExSW_B01ID_1':	[0.1,1], #miles
'TripDisExSW_B01ID_2':	[1,2],
'TripDisExSW_B01ID_3':	[2,3],
'TripDisExSW_B01ID_4':	[3,5],
'TripDisExSW_B01ID_5':	[5,10],
'TripDisExSW_B01ID_6':	[10,15],
'TripDisExSW_B01ID_7':	[15,25],
'TripDisExSW_B01ID_8':	[25,35],
'TripDisExSW_B01ID_9':	[35,50],
'TripDisExSW_B01ID_10':	[50,100],
'TripDisExSW_B01ID_11':	[100,200],
'TripDisExSW_B01ID_12':	[200,300]}

# Active TripTotalTime band per trip as an integer; the dummies are selected
# by name instead of by column position.
tmp_df['time_band'] = (
    all_tabs_dummy.filter(regex=r'^TripTotalTime_B01ID_\d+$')
    .idxmax(axis=1)
    .str.replace('TripTotalTime_B01ID_', '', regex=False)
    .astype(int))

# tmp_df['dist_band'] was set by the distance cell and holds the TripDisExSW
# band name; look up the band edges for distance and time
tmp_df['dist_min'] = tmp_df['dist_band'].map(
    {band: edges[0] for band, edges in distEx_minmax.items()})
tmp_df['dist_max'] = tmp_df['dist_band'].map(
    {band: edges[1] for band, edges in distEx_minmax.items()})
tmp_df['time_min'] = tmp_df['time_band'].map(
    {band: edges[0] for band, edges in duration_minmax.items()})
tmp_df['time_max'] = tmp_df['time_band'].map(
    {band: edges[1] for band, edges in duration_minmax.items()})

In [149]:
# Speed bounds in mph: upper bound = longest distance over shortest time,
# lower bound = shortest distance over longest time (x60 turns per-minute
# into per-hour)
all_tabs_dummy['speed_max'] = tmp_df['dist_max'].div(tmp_df['time_min']).mul(60)
all_tabs_dummy['speed_min'] = tmp_df['dist_min'].div(tmp_df['time_max']).mul(60)


In [150]:
# write the trip-level table to Drive, without the index column
all_tabs_dummy.to_csv(
    '/content/drive/MyDrive/socio_pred/UKDA-5340-tab/alltabs_dummy.csv',
    index=False,
)

In [152]:
# Per-individual mobility features: sum, mean and standard deviation of the
# trip-level indicators, over all trips and over weekday trips only.
# Columns are selected by name, and each statistic is computed only for the
# columns it needs instead of aggregating the whole frame and slicing it by
# position.
weekday_flag = 'TravelWeekDay_B03ID_1'

ind_gp = all_tabs_dummy.groupby('IndividualID')
ind_gp_weekday = all_tabs_dummy[all_tabs_dummy[weekday_flag]==1]\
.groupby('IndividualID')

# trip-level indicators, in the order they were added to all_tabs_dummy
band_features = ['start_night', 'start_evening', 'start_afternoon', 'start_morning',
                 'end_night', 'end_evening', 'end_afternoon', 'end_morning',
                 'distance', 'distance_ex',
                 'time_short', 'time_mid', 'time_long',
                 'travel_short', 'travel_mid', 'travel_long']
speed_features = ['speed_max', 'speed_min']

# features whose output name is not simply '<prefix><feature>'
special_names = {'distance': 'Distance', 'distance_ex': 'Distance_ex',
                 'speed_max': 'max_speed', 'speed_min': 'min_speed'}


def _mobility_stat(grouped, stat, features, prefix, special_prefix, suffix=''):
  """Aggregate `features` per individual and give the result its final names.

  grouped        : groupby object keyed on IndividualID
  stat           : aggregation name, 'sum', 'mean' or 'std'
  features       : columns to aggregate, in output order
  prefix         : prepended to ordinary feature names (e.g. 'Count_')
  special_prefix : prepended to the names in `special_names` (e.g. 'Total_')
  suffix         : appended to every output name (e.g. '_wk')

  Returns a DataFrame with IndividualID as its first column.
  """
  new_names = {}
  for feature in features:
    if feature in special_names:
      new_names[feature] = special_prefix + special_names[feature] + suffix
    else:
      new_names[feature] = prefix + feature + suffix
  return grouped[features].agg(stat).rename(columns=new_names).reset_index()


#total sum
mobility_sum_1 = ind_gp[[weekday_flag]].sum()\
.rename(columns={weekday_flag: 'Count_Weekday_Trips'}).reset_index() #weekday
mobility_sum_2 = _mobility_stat(ind_gp, 'sum', band_features, 'Count_', 'Total_')
#weekday sum:
mobility_sum_3 = _mobility_stat(ind_gp_weekday, 'sum', band_features,
                                'Count_', 'Total_', '_wk')

#total mean
mobility_mean_1 = ind_gp[[weekday_flag]].mean()\
.rename(columns={weekday_flag: '%_Weekday_Trips'}).reset_index() #weekday
mobility_mean_2 = _mobility_stat(ind_gp, 'mean', band_features + speed_features,
                                 '%_', 'avg_')
#weekday mean:
mobility_mean_3 = _mobility_stat(ind_gp_weekday, 'mean',
                                 band_features + speed_features,
                                 '%_', 'avg_', '_wk')

#total std
mobility_std_2 = _mobility_stat(ind_gp, 'std', band_features + speed_features,
                                'std_', 'std_')
#weekday std:
mobility_std_3 = _mobility_stat(ind_gp_weekday, 'std',
                                band_features + speed_features,
                                'std_', 'std_', '_wk')





In [153]:
from functools import reduce


def _left_join_on_individual(left, right):
  """Left-join two per-individual tables on IndividualID."""
  return pd.merge(left, right, left_on='IndividualID',
                  right_on='IndividualID', how='left')


# fold all mobility tables into one, starting from the weekday trip counts
dfs = [
    mobility_sum_1, mobility_sum_2, mobility_sum_3,
    mobility_mean_1, mobility_mean_2, mobility_mean_3,
    mobility_std_2, mobility_std_3,
]

mobility = reduce(_left_join_on_individual, dfs)



In [154]:
# Compute each per-individual aggregate once and reuse it; it was previously
# recomputed for every line that needed it.
ind_means = ind_gp.mean()
ind_sizes = ind_gp.size()

# per-individual mean of the dummies in positions 111-209 of the grouped
# frame (the output further down shows they start at the Age_B04ID dummies)
socio = ind_means.iloc[:,111:210].reset_index()

# one row per individual: ID, number of trips and survey year
individual_tab = pd.DataFrame()
individual_tab ['IndividualID'] = ind_sizes.index
individual_tab ['Total Number of Trips'] = ind_sizes.values
individual_tab ['SurveyYear'] = ind_means['SurveyYear'].values

In [155]:
# final per-individual table: trip counts + socio dummies + mobility features
dfs = [individual_tab, socio, mobility]

NTS_dummy = (
    individual_tab
    .merge(socio, left_on='IndividualID', right_on='IndividualID', how='inner')
    .merge(mobility, left_on='IndividualID', right_on='IndividualID', how='inner')
)



In [156]:
# display the final per-individual table (one row per IndividualID)
NTS_dummy

Unnamed: 0,IndividualID,Total Number of Trips,SurveyYear,Age_B04ID_4,Age_B04ID_5,Age_B04ID_6,Age_B04ID_7,Age_B04ID_8,Age_B04ID_9,Sex_B01ID_1,...,std_Distance_wk,std_Distance_ex_wk,std_time_short_wk,std_time_mid_wk,std_time_long_wk,std_travel_short_wk,std_travel_mid_wk,std_travel_long_wk,std_max_speed_wk,std_min_speed_wk
0,2019011363,16,2019.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.452267,0.452267,0.000000,0.452267,0.452267,0.000000,4.748804,1.809068
1,2019011428,18,2019.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.375534,0.375534,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.946959,1.011156
2,2019011564,23,2019.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,5.543389,5.543389,0.500000,0.516398,0.341565,0.500000,0.516398,0.341565,10.389270,2.381934
3,2019011569,15,2019.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,12.806946,12.806946,0.497245,0.000000,0.497245,0.497245,0.267261,0.513553,27.412657,4.821794
4,2019011570,13,2019.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,35.543401,35.543401,0.492366,0.389249,0.522233,0.492366,0.389249,0.522233,25.349410,9.194265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2440,2020001977,30,2020.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,2.097409,2.097409,0.229416,0.229416,0.000000,0.229416,0.229416,0.000000,13.135873,4.762679
2441,2020002004,15,2020.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.404520,0.404520,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.202495,2.776730
2442,2020002009,8,2020.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,19.442222,19.442222,0.462910,0.000000,0.462910,0.462910,0.000000,0.462910,6.091238,7.373638
2443,2020002010,6,2020.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.303315,1.807392


In [157]:
# write the per-individual table to Drive, without the index column
NTS_dummy.to_csv(
    '/content/drive/MyDrive/socio_pred/UKDA-5340-tab/NTS_dummy.csv',
    index=False,
)
