In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.decomposition import PCA
from numpy import mean, std

In [None]:
# Load the NTS table and replace missing values with zero.
# NA occurs for the std of users with one trip, or for the count of users
# with no weekend trip.
NTS_dummy = pd.read_csv(
    '/content/drive/MyDrive/socio_pred/UKDA-5340-tab/NTS_v2.csv'
).fillna(0)

# Everything from column position 38 onwards goes into `mobility`.
mobility = NTS_dummy.iloc[:, 38:]
# Without the two identifier columns, the first 36 columns go into `socio`.
socio = NTS_dummy.drop(columns=['IndividualID', 'SurveyYear']).iloc[:, :36]

# Map each socio-demographic variable to the labels of its dummy columns.
# Bounds are (start, stop) positions in `socio.columns`. Positions 7, 9 and 35
# are not assigned to any variable -- presumably the complementary dummy of
# each binary variable; TODO confirm against the CSV header.
socio_cols = {
    name: socio.columns[start:stop]
    for name, (start, stop) in {
        'age': (0, 6),
        'sex': (6, 7),
        'marital': (8, 9),
        'income': (10, 13),
        'work': (13, 16),
        'hhincome': (16, 19),
        'hh_struc': (19, 25),
        'hh_work': (25, 28),
        'hh_socialclass': (28, 34),
        'urban_rural': (34, 35),
    }.items()
}

In [None]:
#select dependant var
def dependant(train,test,cols,var,dummy=0):
  """Extract the train/test targets for one socio-demographic variable.

  Args:
    train: DataFrame holding the dummy (indicator) columns of the training rows.
    test: DataFrame holding the same columns for the test rows.
    cols: mapping from variable name to the column labels of that variable.
    var: key of `cols` selecting the variable to predict.
    dummy: when 1, return the indicator columns themselves (passed through
      `pd.get_dummies`); otherwise return integer class labels.

  Returns:
    A tuple `(y_train, y_test, label_encoder)`. With `dummy=1` the targets are
    DataFrames and the encoder is returned unfitted. Otherwise the targets are
    arrays of encoded labels and the encoder has been fitted on the training
    labels only; use `label_encoder.inverse_transform(y)` to get the original
    labels back.
  """
  y_train = train[cols[var]]
  y_test = test[cols[var]]
  label_encoder = LabelEncoder()

  if dummy == 1:
    y_train = pd.get_dummies(y_train)
    y_test = pd.get_dummies(y_test)
    return y_train, y_test, label_encoder

  if y_train.shape[1] == 1:
    # A variable stored as a single indicator column: idxmax(axis=1) would
    # return that one column name for every row and collapse the target to a
    # single class, so the indicator values themselves are the labels.
    y_train = y_train.iloc[:, 0]
    y_test = y_test.iloc[:, 0]
  else:
    # Collapse the one-hot columns into one label column (the name of the
    # column holding the row maximum).
    y_train = y_train.idxmax(axis=1)
    y_test = y_test.idxmax(axis=1)

  # Fit on the training labels only, then encode both splits with that mapping.
  label_encoder.fit(y_train)
  y_train = label_encoder.transform(y_train)
  y_test = label_encoder.transform(y_test)
  return y_train,y_test,label_encoder

In [None]:
def RF(x_train,y_train,x_test,y_test,var,results,param_grid=None,cv=8,random_state=42):
  """Fit a baseline and a grid-searched Random Forest for one target.

  Args:
    x_train, x_test: feature matrices for the two splits.
    y_train, y_test: class labels for the two splits.
    var: name of the target variable; copied into the returned row.
    results: unused; kept so existing callers keep working.
    param_grid: grid handed to `GridSearchCV`. Defaults to the grid that was
      previously hard-coded in this function.
    cv: number of cross-validation folds used by the grid search.
    random_state: seed used for BOTH forests, so the tuned metrics are as
      reproducible as the baseline ones.

  Returns:
    `[var, acc_train, acc_test, f1_test, cv_acc, cv_test, cv_f1_test]`.
    The first three metrics come from the untuned forest. The last three come
    from the best estimator found by the grid search; note that `cv_acc` is
    that estimator's accuracy on the full training set, not the mean
    cross-validated score.
  """
  # Baseline: no hyperparameter tuning.
  model = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                 random_state=random_state)
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
  y_pred_train = model.predict(x_train)
  acc_train = metrics.accuracy_score(y_train, y_pred_train)
  acc_test = metrics.accuracy_score(y_test, y_pred)
  f1_test = metrics.f1_score(y_test, y_pred, average='macro')

  if param_grid is None:
    param_grid = {
        'max_depth': [5, 10, 15],
        'max_features': [4, 6],
        'min_samples_leaf': [5, 10],
        'min_samples_split': [5, 10],
        'n_estimators': [100, 200, 1000],
    }

  # Seed the searched estimator as well; without it every run of the search
  # grows different trees and the reported numbers change.
  classifier = RandomForestClassifier(random_state=random_state)
  grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid,
                             cv=cv, n_jobs=-1, verbose=0)
  grid_search.fit(x_train, y_train)

  # Score the best estimator returned by the search on both splits.
  best_model = grid_search.best_estimator_
  y_pred_train_cv = best_model.predict(x_train)
  y_pred_test_cv = best_model.predict(x_test)
  cv_acc = metrics.accuracy_score(y_train, y_pred_train_cv)
  cv_test = metrics.accuracy_score(y_test, y_pred_test_cv)
  cv_f1_test = metrics.f1_score(y_test, y_pred_test_cv, average='macro')

  return [var, acc_train, acc_test, f1_test, cv_acc, cv_test, cv_f1_test]

In [None]:
def LR(x_train,y_train,x_test,y_test,var,results):
  """Fit an untuned multinomial logistic regression and score it.

  Args:
    x_train, x_test: feature matrices for the two splits.
    y_train, y_test: class labels for the two splits.
    var: name of the target variable; copied into the returned row.
    results: unused by this function.

  Returns:
    `[var, train accuracy, test accuracy, macro F1 on the test split]`.
  """
  clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
  clf.fit(x_train, y_train)

  train_pred = clf.predict(x_train)
  test_pred = clf.predict(x_test)

  return [
      var,
      metrics.accuracy_score(y_train, train_pred),
      metrics.accuracy_score(y_test, test_pred),
      metrics.f1_score(y_test, test_pred, average='macro'),
  ]

In [None]:
def xgboost(x_train,y_train,x_test,y_test,var,results):
  """Fit an untuned XGBoost classifier and score it.

  The objective is "multi:softprob" when the training labels contain more
  than two distinct values and 'binary:logistic' otherwise.

  Args:
    x_train, x_test: feature matrices for the two splits.
    y_train, y_test: class labels for the two splits.
    var: name of the target variable; copied into the returned row.
    results: unused by this function.

  Returns:
    `[var, train accuracy, test accuracy, macro F1 on the test split]`.
  """
  n_classes = len(np.unique(y_train))
  obj = "multi:softprob" if n_classes > 2 else 'binary:logistic'

  booster = xgb.XGBClassifier(objective=obj, random_state=42)
  booster.fit(x_train, y_train)

  train_pred = booster.predict(x_train)
  test_pred = booster.predict(x_test)

  return [
      var,
      metrics.accuracy_score(y_train, train_pred),
      metrics.accuracy_score(y_test, test_pred),
      metrics.f1_score(y_test, test_pred, average='macro'),
  ]

In [None]:
def NN(x_train,y_train,x_test,y_test,var,results,epochs=200,batch_size=5):
  """Train a small dense network on indicator targets and score it.

  The network has two hidden ReLU layers (12 and 8 units). The output layer
  has one unit per target column: several columns use softmax with
  categorical cross-entropy, a single column uses sigmoid with binary
  cross-entropy.

  Args:
    x_train, x_test: feature matrices for the two splits.
    y_train, y_test: indicator (0/1) targets, one column per class, or a
      single column for a binary variable.
    var: name of the target variable; copied into the returned row.
    results: unused; kept so existing callers keep working.
    epochs: number of training epochs.
    batch_size: mini-batch size used for training.

  Returns:
    `[var, train accuracy, test accuracy, macro F1 on the test split]`, the
    same row layout as `LR` and `xgboost`. No seed is set here, so the values
    vary between runs.
  """
  x_train = np.asarray(x_train, dtype='float32')
  x_test = np.asarray(x_test, dtype='float32')
  y_train = np.asarray(y_train, dtype='float32')
  y_test = np.asarray(y_test, dtype='float32')
  # Accept 1-D binary targets by treating them as a single indicator column.
  if y_train.ndim == 1:
    y_train = y_train.reshape(-1, 1)
  if y_test.ndim == 1:
    y_test = y_test.reshape(-1, 1)

  n_features = x_train.shape[1]
  out_node = y_train.shape[1]

  # Decide from the number of target columns. Counting distinct values in a
  # 0/1 indicator matrix can never exceed two, so it cannot tell a
  # multi-class target from a binary one.
  if out_node > 1:
    act, loss = 'softmax', 'categorical_crossentropy'
  else:
    act, loss = 'sigmoid', 'binary_crossentropy'

  model = Sequential()
  model.add(Dense(12, input_shape=(n_features,), activation='relu'))
  model.add(Dense(8, activation='relu'))
  model.add(Dense(out_node, activation=act))
  model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
  model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

  def _to_labels(scores):
    # Turn indicator rows or predicted scores into integer class labels.
    scores = np.asarray(scores)
    if out_node > 1:
      return scores.argmax(axis=1)
    return (scores.ravel() > 0.5).astype(int)

  y_pred_train = _to_labels(model.predict(x_train, verbose=0))
  y_pred = _to_labels(model.predict(x_test, verbose=0))
  y_true_train = _to_labels(y_train)
  y_true_test = _to_labels(y_test)

  acc_train = metrics.accuracy_score(y_true_train, y_pred_train)
  acc_test = metrics.accuracy_score(y_true_test, y_pred)
  f1_test = metrics.f1_score(y_true_test, y_pred, average='macro')
  return [var, acc_train, acc_test, f1_test]


In [None]:
# Train Test split
# Predictors: every column from position 38 onwards.
mobility = NTS_dummy.iloc[:, 38:]

# Targets: the first 36 columns once the identifiers are dropped, minus the
# extra column of each binary variable.
socio = (
    NTS_dummy
    .drop(columns=['IndividualID', 'SurveyYear'])
    .iloc[:, 0:36]
    .drop(columns=['Sex_B01ID_2', 'not_married', 'Settlement2011EW_B03ID_2'])
)

# Seeded split so the same rows land in each partition on every run.
X_train, X_test, y_all_tr, y_all_ts = train_test_split(
    mobility, socio, random_state=42
)

In [None]:
# Normalise: the min-max scaler is fitted on the training features only and
# then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# PCA: keep enough components to explain 85% of the variance. The projection
# is fitted on the training split and reused for the test split.
pca = PCA(n_components=0.85)
PC_train = pca.fit_transform(X_train)
PC_test = pca.transform(X_test)

# Number of components that were retained.
pca.n_components_


10

# Sociodemographic prediction: Random Forest (RF)

In [None]:
# One Random Forest run per socio-demographic variable; every run adds one
# row of metrics to `results`.
results = []
for var in socio_cols:
  y_train, y_test, le = dependant(y_all_tr, y_all_ts, socio_cols, var)
  results.append(
      RF(PC_train, y_train, PC_test, y_test, var, results)
  )



In [None]:
# Collect the per-variable Random Forest metrics into one table.
RF_results = pd.DataFrame(
    results,
    columns=[
        'Y', 'Train Accuracy', 'Test Accuracy', 'F1 Macro',
        'CV Accuracy', 'CV Test Accuracy', 'CV F1 Macro',
    ],
)

# Persist the table next to the input data, then display it.
RF_results.to_csv(
    '/content/drive/MyDrive/socio_pred/UKDA-5340-tab/RF_results.csv',
    index=False,
)
RF_results

Unnamed: 0,Y,Train Accuracy,Test Accuracy,F1 Macro,CV Accuracy,CV Test Accuracy,CV F1 Macro
0,age,0.997272,0.356209,0.213756,0.665576,0.372549,0.205859
1,sex,0.992908,0.553922,0.553911,0.660666,0.545752,0.54493
2,marital,0.997818,0.580065,0.520942,0.932897,0.584967,0.506527
3,income,0.997818,0.54085,0.359669,0.714675,0.553922,0.331274
4,work,0.998363,0.612745,0.433115,0.863066,0.638889,0.455164
5,hhincome,0.998909,0.462418,0.399601,0.828696,0.47549,0.392702
6,hh_struc,0.997818,0.380719,0.177404,0.449536,0.418301,0.130025
7,hh_work,1.0,0.611111,0.429379,0.796508,0.616013,0.421949
8,hh_socialclass,0.998363,0.370915,0.247813,0.461538,0.385621,0.196811
9,urban_rural,1.0,0.728758,0.454358,0.864703,0.74183,0.449427


# Sociodemographic prediction: Logistic Regression (LR)

In [None]:
# One logistic-regression run per socio-demographic variable.
results = []
for var in socio_cols:
  y_train, y_test, le = dependant(y_all_tr, y_all_ts, socio_cols, var)
  results.append(
      LR(PC_train, y_train, PC_test, y_test, var, results)
  )

LR_results = pd.DataFrame(
    results,
    columns=['Y', 'Train Accuracy', 'Test Accuracy', 'F1 Macro'],
)

# Saving is currently disabled; uncomment to write the table to Drive.
# LR_results.to_csv('/content/drive/MyDrive/socio_pred/UKDA-5340-tab/LR_results.csv',
#                   index=False)
LR_results

Unnamed: 0,Y,Train Accuracy,Test Accuracy,F1 Macro
0,age,0.376432,0.362745,0.189356
1,sex,0.595745,0.55719,0.55699
2,marital,0.609929,0.607843,0.465284
3,income,0.584288,0.560458,0.310763
4,work,0.627932,0.620915,0.427839
5,hhincome,0.504092,0.468954,0.36026
6,hh_struc,0.415712,0.431373,0.161804
7,hh_work,0.61593,0.622549,0.414694
8,hh_socialclass,0.398254,0.379085,0.192555
9,urban_rural,0.740862,0.745098,0.426966


# Sociodemographic prediction: XGBoost (XGB)





In [None]:
# One XGBoost run per socio-demographic variable.
results = []
for var in socio_cols:
  y_train, y_test, le = dependant(y_all_tr, y_all_ts, socio_cols, var)
  results.append(
      xgboost(PC_train, y_train, PC_test, y_test, var, results)
  )

XGB_results = pd.DataFrame(
    results,
    columns=['Y', 'Train Accuracy', 'Test Accuracy', 'F1 Macro'],
)

# Saving is currently disabled; uncomment to write the table to Drive.
# XGB_results.to_csv('/content/drive/MyDrive/socio_pred/UKDA-5340-tab/XGB_results.csv',
#                    index=False)
XGB_results

Unnamed: 0,Y,Train Accuracy,Test Accuracy,F1 Macro
0,age,0.62084,0.383987,0.221601
1,sex,0.738134,0.552288,0.552288
2,marital,0.752864,0.589869,0.500389
3,income,0.690671,0.54085,0.338865
4,work,0.743044,0.632353,0.442745
5,hhincome,0.667212,0.436275,0.353806
6,hh_struc,0.573377,0.421569,0.180667
7,hh_work,0.725041,0.612745,0.411598
8,hh_socialclass,0.599564,0.379085,0.218909
9,urban_rural,0.779051,0.746732,0.473184


# Sociodemographic prediction: Neural Network (NN)


In [None]:
# One neural-network run per socio-demographic variable. `dummy=1` asks
# `dependant` for indicator-column targets instead of integer labels.
results = []
for var in socio_cols:
  y_train, y_test, le = dependant(y_all_tr, y_all_ts, socio_cols, var, dummy=1)
  results.append(
      NN(PC_train, y_train, PC_test, y_test, var, results)
  )

NN_results = pd.DataFrame(
    results,
    columns=['Y', 'Train Accuracy', 'Test Accuracy', 'F1 Macro'],
)

# Saving is currently disabled; uncomment to write the table to Drive.
# NN_results.to_csv('/content/drive/MyDrive/socio_pred/UKDA-5340-tab/NN_results.csv',
#                   index=False)
NN_results