<a href="https://colab.research.google.com/github/arashk1990/covid19mobility/blob/main/feature_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import sklearn
from sklearn.cluster import KMeans
# Read NTS_dummy.csv into a DataFrame (all read_csv defaults).
NTS_dummy = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/socio_pred/UKDA-5340-tab/NTS_dummy.csv',
)

In [6]:
# Preview the first five rows to check the column layout.
NTS_dummy.head(n=5)

Unnamed: 0,IndividualID,Total Number of Trips,SurveyYear,Age_B04ID_4,Age_B04ID_5,Age_B04ID_6,Age_B04ID_7,Age_B04ID_8,Age_B04ID_9,Sex_B01ID_1,...,std_end_afternoon_wk,std_end_morning_wk,std_Distance_wk,std_Distance_ex_wk,std_time_short_wk,std_time_mid_wk,std_time_long_wk,std_travel_short_wk,std_travel_mid_wk,std_travel_long_wk
0,2019011363,16,2019.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.288675,0.522233,0.0,0.0,0.452267,0.452267,0.0,0.452267,0.452267,0.0
1,2019011428,18,2019.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.518875,0.50637,0.375534,0.375534,0.0,0.0,0.0,0.0,0.0,0.0
2,2019011564,23,2019.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.5,0.447214,5.543389,5.543389,0.5,0.516398,0.341565,0.5,0.516398,0.341565
3,2019011569,15,2019.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.468807,12.806946,12.806946,0.497245,0.0,0.497245,0.497245,0.267261,0.513553
4,2019011570,13,2019.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.514929,35.543401,35.543401,0.492366,0.389249,0.522233,0.492366,0.389249,0.522233


In [23]:
# Split the survey table by column position.
# mobility: every column from position 102 onwards, followed by column 1
# (presumably 'Total Number of Trips', going by the head() preview -- confirm).
mobility = pd.concat([NTS_dummy.iloc[:, 102:], NTS_dummy.iloc[:, 1:2]], axis=1)

# socio: columns 3..101; the age summary shows the first of them is Age_B04ID_4.
# .copy() detaches the slice from NTS_dummy: derived columns ('married',
# 'fulltime', ...) are assigned onto `socio` further down, and writing to a bare
# iloc slice raises SettingWithCopyWarning on pandas without copy-on-write.
socio = NTS_dummy.iloc[:, 3:102].copy()

In [30]:

# Sociodemographic column groups, each a positional slice of socio.columns
# given as (group name, start, stop) with stop exclusive. Positions not listed
# here (36-44, 71-81, 84-96) are not assigned to any group.
# The key 'ethincity' is misspelled but kept as-is because other cells look it up.
socio_cols = {
    group: socio.columns[start:stop]
    for group, start, stop in [
        ('age', 0, 6),
        ('sex', 6, 8),
        ('marital', 8, 13),
        ('ethincity', 13, 15),
        ('caraccess', 15, 19),
        ('license', 19, 20),
        ('income', 20, 23),
        ('work', 23, 29),
        ('socialclass', 29, 36),
        ('hhincome', 45, 48),
        ('tenancy', 48, 51),
        ('hh_struc', 51, 57),
        ('hh_work', 57, 61),
        ('hh_emp', 61, 65),
        ('hh_socialclass', 65, 71),
        ('urban_rural', 82, 84),
        ('car', 97, 99),
    ]
}

In [31]:
# Age: count of respondents per age-band dummy. The under-21 group
# (presumably Age_B04ID_4 -- confirm against the codebook) is a small proportion.
socio.loc[:, socio_cols['age']].sum(axis=0)

Age_B04ID_4     41.0
Age_B04ID_5    221.0
Age_B04ID_6    421.0
Age_B04ID_7    416.0
Age_B04ID_8    514.0
Age_B04ID_9    832.0
dtype: float64

In [32]:
# Collapse marital status into two indicators: the first marital column is kept
# as 'married'; the other four (single/divorced/separated, etc.) are summed
# row-wise into 'not_married'.
socio['married'] = socio.loc[:, socio_cols['marital'][0]]
socio['not_married'] = socio.loc[:, socio_cols['marital'][1:5]].sum(axis='columns')


In [33]:
# Ethnicity: the two groups are heavily imbalanced.
socio.loc[:, socio_cols['ethincity']].sum(axis=0)

EthGroupTS_B02ID_1    2278.0
EthGroupTS_B02ID_2     167.0
dtype: float64

In [34]:
# Car access: most respondents have car access and are the main driver
# (presumably CarAccess_B01ID_2 -- confirm against the codebook).
socio.loc[:, socio_cols['caraccess']].sum(axis=0)


CarAccess_B01ID_1      79.0
CarAccess_B01ID_2    2052.0
CarAccess_B01ID_3     313.0
CarAccess_B01ID_5       1.0
dtype: float64

In [35]:
# Driving licence: a single dummy column, and everyone has a licence.
socio.loc[:, socio_cols['license']].sum(axis=0)

DrivLic_B02ID_1    2445.0
dtype: float64

In [36]:
# Individual income: respondents are mostly in the lower income band.
socio.loc[:, socio_cols['income']].sum(axis=0)

IndIncome2002_B02ID_1    1392.0
IndIncome2002_B02ID_2     744.0
IndIncome2002_B02ID_3     309.0
dtype: float64

In [37]:
# Economic status: unemployed and students are rare; most respondents are
# either full-time or retired/disabled.
socio.loc[:, socio_cols['work']].sum(axis=0)

EcoStat_B02ID_1    1325.0
EcoStat_B02ID_2     367.0
EcoStat_B02ID_3      12.0
EcoStat_B02ID_4     646.0
EcoStat_B02ID_5      10.0
EcoStat_B02ID_6      85.0
dtype: float64

In [38]:
# Collapse economic status into three indicators: the first work column becomes
# 'fulltime', the second 'parttime', and the remaining four are summed row-wise
# into 'not_working'.
socio['fulltime'] = socio.loc[:, socio_cols['work'][0]]
socio['parttime'] = socio.loc[:, socio_cols['work'][1]]
socio['not_working'] = socio.loc[:, socio_cols['work'][2:6]].sum(axis='columns')


In [39]:
# Social class: mostly professional/technical.
socio.loc[:, socio_cols['socialclass']].sum(axis=0)

SC_B01ID_1     183.0
SC_B01ID_2    1016.0
SC_B01ID_3     482.0
SC_B01ID_4     416.0
SC_B01ID_5     298.0
SC_B01ID_6      45.0
SC_B01ID_7       5.0
dtype: float64

In [40]:
# Household income: unlike individual income, mostly higher-income households.
socio.loc[:, socio_cols['hhincome']].sum(axis=0)

HHIncome2002_B02ID_1     568.0
HHIncome2002_B02ID_2     720.0
HHIncome2002_B02ID_3    1157.0
dtype: float64

In [41]:
# Tenancy: imbalanced, mostly owners.
socio.loc[:, socio_cols['tenancy']].sum(axis=0)

Ten1_B02ID_1    2026.0
Ten1_B02ID_2     413.0
Ten1_B02ID_3       6.0
dtype: float64

In [42]:
# Household structure: not many single parents.
socio.loc[:, socio_cols['hh_struc']].sum(axis=0)

HHoldStruct_B02ID_1    281.0
HHoldStruct_B02ID_2    982.0
HHoldStruct_B02ID_3    405.0
HHoldStruct_B02ID_4     45.0
HHoldStruct_B02ID_5    539.0
HHoldStruct_B02ID_6    193.0
dtype: float64

In [43]:
# Household work status: fairly distributed, with few unemployed households.
socio.loc[:, socio_cols['hh_work']].sum(axis=0)

HRPWorkStat_B02ID_1    1365.0
HRPWorkStat_B02ID_2     311.0
HRPWorkStat_B02ID_3     673.0
HRPWorkStat_B02ID_4      96.0
dtype: float64

In [44]:
# Collapse household work status into three indicators: the first hh_work column
# becomes 'hh_fulltime', the second 'hh_parttime', and the remaining two are
# summed row-wise into 'hh_not_working'.
socio['hh_fulltime'] = socio.loc[:, socio_cols['hh_work'][0]]
socio['hh_parttime'] = socio.loc[:, socio_cols['hh_work'][1]]
socio['hh_not_working'] = socio.loc[:, socio_cols['hh_work'][2:4]].sum(axis='columns')


In [45]:
# Household employment status: mostly in the 'other' category
# (presumably HRPEmpStat_B01ID_3 -- confirm against the codebook).
socio.loc[:, socio_cols['hh_emp']].sum(axis=0)

HRPEmpStat_B01ID_1     380.0
HRPEmpStat_B01ID_2     265.0
HRPEmpStat_B01ID_3    1784.0
HRPEmpStat_B01ID_4      16.0
dtype: float64

In [46]:
# Household social class: noted by the author as mostly in the 'other' category.
# NOTE(review): the printed counts look fairly spread across groups -- confirm
# this note was not carried over from the previous cell.
socio.loc[:, socio_cols['hh_socialclass']].sum(axis=0)

HRPSEGWorkStat_B01ID_1    471.0
HRPSEGWorkStat_B01ID_2    663.0
HRPSEGWorkStat_B01ID_3    367.0
HRPSEGWorkStat_B01ID_4    192.0
HRPSEGWorkStat_B01ID_5    673.0
HRPSEGWorkStat_B01ID_6     79.0
dtype: float64

In [47]:
# Settlement type: mostly urban.
socio.loc[:, socio_cols['urban_rural']].sum(axis=0)

Settlement2011EW_B03ID_1    1814.0
Settlement2011EW_B03ID_2     631.0
dtype: float64

In [48]:
# Vehicle availability: almost everyone has a vehicle available to them.
socio.loc[:, socio_cols['car']].sum(axis=0)

VehAvail_B01ID_1    2432.0
VehAvail_B01ID_2      13.0
dtype: float64

In [58]:
# Variable groups to keep for the reduced data set.
# Unchanged groups are taken straight from socio_cols. The merged groups name
# their derived columns explicitly instead of using positions 99-106, which are
# only correct if the three merge cells ran exactly once and in that order.
socio_cols_2 = {
    'age': socio_cols['age'],
    'sex': socio_cols['sex'],
    'marital': pd.Index(['married', 'not_married']),
    'income': socio_cols['income'],
    'work': pd.Index(['fulltime', 'parttime', 'not_working']),
    'hhincome': socio_cols['hhincome'],
    'hh_struc': socio_cols['hh_struc'],
    'hh_work': pd.Index(['hh_fulltime', 'hh_parttime', 'hh_not_working']),
    'hh_socialclass': socio_cols['hh_socialclass'],
    'urban_rural': socio_cols['urban_rural'],
}

In [60]:
# Build the reduced socio-demographic frame with one column selection instead
# of growing an empty DataFrame group by group. Columns follow the insertion
# order of socio_cols_2; .copy() makes socio_2 independent of `socio`.
socio_2 = socio[[col for cols in socio_cols_2.values() for col in cols]].copy()

In [64]:
# Assemble the output table column-wise: identifiers, then the reduced
# socio-demographic variables, then the mobility block.
tmp = pd.concat(
    objs=[NTS_dummy.loc[:, ['IndividualID', 'SurveyYear']], socio_2],
    axis='columns',
)
nts_v2 = pd.concat(objs=[tmp, mobility], axis='columns')

In [67]:
# Write the reduced table to CSV without the row index.
nts_v2.to_csv(
    '/content/drive/MyDrive/socio_pred/UKDA-5340-tab/NTS_v2.csv',
    index=False,
)