In [35]:
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from numpy import mean, std

In [3]:
NTS_dummy = pd.read_csv('/content/drive/MyDrive/socio_pred/UKDA-5340-tab/NTS_v2.csv')
# replace NA with zero
#na occurs for std of users with one trip, or count of users with no weekend trip
NTS_dummy = NTS_dummy.fillna(0)

mobility = NTS_dummy.iloc[:,38:]
socio = NTS_dummy.drop(columns = ['IndividualID','SurveyYear']).iloc[:,0:36]

socio_cols = {'age' : socio.columns[0:6],
'sex' : socio.columns[6:8],
'marital' : socio.columns[8:10],
'income': socio.columns[10:13],
'work': socio.columns[13:16],
'hhincome': socio.columns[16:19],
'hh_struc': socio.columns[19:25],
'hh_work': socio.columns[25:28],
'hh_socialclass': socio.columns[28:34],
'urban_rural': socio.columns[34:36]}

In [50]:
#evaluation function
def evaluate(model, x_test, y_test):
    y_pred = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print (accuracy)
    print(confusion_matrix(y_test,y_pred))

In [76]:
#select dependant var
def dependant(train,test,cols,var):
  y_train = train[cols[var]]
  y_test = test[cols[var]]

  # convert to one columns labeles
  label_encoder = LabelEncoder()
  y_train = y_train.idxmax(axis=1)
  y_test = y_test.idxmax(axis=1)
  label_encoder.fit(y_train)
  y_train = label_encoder.transform(y_train)
  y_test = label_encoder.transform(y_test)

  #to get back labels
  #le.inverse_transform(y_train)
  return y_train,y_test,label_encoder

In [None]:
def RF(x_train,y_train,x_test,y_test,var):
  #no hyperparameter tuning
  model = RandomForestClassifier(n_estimators=200,n_jobs=-1, random_state=42)
  model.fit(x_train,y_train)
  y_pred = model.predict(x_test)
  print('Dependant Variable <', var, ' >:')
  print('Random Forest Accuracy, Test Set: ')
  print(metrics.accuracy_score(y_test, y_pred))
  print('Random Forest F1 Score, Test Set: ')
  print(metrics.f1_score(y_test, y_pred, average = 'macro'))

  # 8-fold crossvalidation

  param_grid = {
      'max_depth': [5,10,15],
      'max_features': [4,6],
      'min_samples_leaf': [5,10],
      'min_samples_split': [5,10],
      'n_estimators': [100, 200,1000]
  }
  # Create a based model
  classifier = RandomForestClassifier()

  # Instantiate the grid search model
  grid_search = GridSearchCV(estimator = classifier, param_grid = param_grid, 
                            cv = 8, n_jobs = -1, verbose = 0)

  # Fit the grid search to the data
  print('8-fold crossvalidation on training data: ')
  grid_search.fit(x_train,y_train)
  model = grid_search.best_estimator_
  y_pred_train = model.predict(x_train)
  y_pred_test = model.predict(x_test)
  print('Best CV Accuracy: ', metrics.accuracy_score(y_train,y_pred_train))
  print('Out of sample CV test accuracy: ',metrics.accuracy_score(y_test\
                                                                  ,y_pred_test))

In [77]:
# Train Test split
mobility = NTS_dummy.iloc[:,38:]
socio = NTS_dummy.drop(columns = ['IndividualID','SurveyYear']).iloc[:,0:36]
X_train, X_test, y_all_tr, y_all_ts = train_test_split(mobility, socio, random_state=42)

In [78]:
#normalize
scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),columns = X_train.columns)
#PCA
pca = PCA(n_components=0.85) #85% of variance explained
PC_train = pca.fit_transform(X_train)

#transoform test set
X_test = pd.DataFrame(scaler.transform(X_test),columns = X_test.columns)
PC_test = pca.transform(X_test)


#principalDf = pd.DataFrame(data = principalComponents)
pca.n_components_


10

#Sociodemographic prediction: RF

In [81]:
for var in socio_cols.keys():
  y_train,y_test, le = dependant(y_all_tr,y_all_ts,socio_cols,var)
  RF(PC_train,y_train,PC_test,y_test,var)



Dependant Variable < age  >:
Random Forest Accuracy, Test Set: 
0.3562091503267974
Random Forest F1 Score, Test Set: 
0.3562091503267974
8-fold crossvalidation on training data: 
Best CV Accuracy:  0.5090016366612111
Out of sample CV test accuracy:  0.34967320261437906
Dependant Variable < sex  >:
Random Forest Accuracy, Test Set: 
0.5473856209150327
Random Forest F1 Score, Test Set: 
0.5473856209150327
8-fold crossvalidation on training data: 


KeyboardInterrupt: ignored

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report the model performance
print('Mean Accuracy: %.3f' % (mean(n_scores)))

Mean Accuracy: 0.579
