In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [22]:
# Load the dataset:
survey_data = pd.read_csv("//Users//louis//Downloads//surveydata.csv")

# Q5 was "How many days per week do you work?". Converting it as type integer.
survey_data["Q5"] = pd.to_numeric(survey_data["Q5"], errors="coerce").fillna(0).astype(int)

In [23]:
# Define variable types:
numerical_vars = ["Q5"] # All variables, except for this one, are categorical in our dataset.
target_variable = "Q23"  # This is the predictor variable.
# Q23 was "Would you be interested in an app that will help you choose more sustainable modes of transport?".

categorical_vars = [col for col in survey_data.columns 
                    if col not in [target_variable] + numerical_vars]

# Remove blanks from the target variable:
survey_data = survey_data[survey_data[target_variable] != " "]
# If we don't remove this, the model will try to predict blank responses.

In [24]:
# Encode the target variable (0-Yes, 1-Maybe, 2-No, 3-I don't know):
le = LabelEncoder()
y = le.fit_transform(survey_data[target_variable]) # Again, this is our predictor variable (y).

# We are applying the encoder to our target variable here. This will result in 4 levels.
print("New Target Variable Encoding:", dict(zip(le.classes_, le.transform(le.classes_))))

New Target Variable Encoding: {'1': 0, '2': 1, '3': 2, '4': 3}


In [25]:
# Define variable (X):
X = survey_data[categorical_vars + numerical_vars]

# One-hot encode categorical vars.
X = pd.get_dummies(X, columns=categorical_vars, drop_first=True)
# drop_first ensures there are no issues of collinearity.

# Scale numerical variables:
scaler = StandardScaler()
X[numerical_vars] = scaler.fit_transform(X[numerical_vars])

# Split data for initial Logistic Regression model:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Logistic Regression Model:
log_reg = LogisticRegression(
    multi_class="multinomial",
     solver="lbfgs", # (L-BFGS)
      max_iter=500)
# Limited-memory Broyden–Fletcher–Goldfarb–Shanno finds the optimal weights for the logistic regression model.
log_reg.fit(X_train, y_train)

# Make predictions:
y_pred = log_reg.predict(X_test)

In [26]:
# Evaluate the model:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.37333333333333335

Classification Report:
               precision    recall  f1-score   support

           0       0.06      0.04      0.05        25
           1       0.35      0.32      0.33        50
           2       0.55      0.64      0.59        58
           3       0.11      0.12      0.11        17

    accuracy                           0.37       150
   macro avg       0.27      0.28      0.27       150
weighted avg       0.35      0.37      0.36       150



In [27]:
# Feature Importance from Logistic Regression:
feature_importance = pd.DataFrame(
    {"Feature": X.columns, 
      "Importance": log_reg.coef_[0]})
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)
print(feature_importance.to_string())

              Feature  Importance
45            Q13_6_1    1.194404
173             Q26_2    1.082967
54            Q14_1_1    1.052972
18              Q34_1    0.942319
140           Q20_8_1    0.801076
164           Q21_7_3    0.755724
158           Q21_5_3    0.696857
35              Q12_2    0.691313
7                Q7_7    0.687125
188             Q28_4    0.617354
50           Q13_11_1    0.615444
118           Q19_5_7    0.593908
39              Q12_7    0.585011
22            Q10_1_1    0.576920
163           Q21_7_1    0.566397
88            Q19_1_5    0.554680
166           Q21_8_1    0.550014
136           Q20_4_1    0.529951
10               Q8_1    0.521340
24           Q10_12_1    0.505083
20              Q34_3    0.503317
131           Q19_7_6    0.501199
16            Q33_3_1    0.496622
72           Q15_11_1    0.448165
167           Q21_8_3    0.446660
170           Q21_9_3    0.446536
59            Q14_7_1    0.442753
96            Q19_2_6    0.410790
169           

In [28]:
# Select only variables with positive feature importance.
filtered_features = feature_importance[feature_importance["Importance"] > 0]

# Extract feature names:
new_categorical_vars = filtered_features["Feature"].tolist()
print("Selected Features (Positive Importance):")
print(new_categorical_vars)

Selected Features (Positive Importance):
['Q13_6_1', 'Q26_2', 'Q14_1_1', 'Q34_1', 'Q20_8_1', 'Q21_7_3', 'Q21_5_3', 'Q12_2', 'Q7_7', 'Q28_4', 'Q13_11_1', 'Q19_5_7', 'Q12_7', 'Q10_1_1', 'Q21_7_1', 'Q19_1_5', 'Q21_8_1', 'Q20_4_1', 'Q8_1', 'Q10_12_1', 'Q34_3', 'Q19_7_6', 'Q33_3_1', 'Q15_11_1', 'Q21_8_3', 'Q21_9_3', 'Q14_7_1', 'Q19_2_6', 'Q21_9_1', 'Q26_7', 'Q19_6_2', 'Q13_7_1', 'Q19_5_1', 'Q16_3', 'Q16R_3', 'Q21_6_5', 'Q14_4_1', 'Q25_5', 'Q25_4', 'Income_3cat_4', 'Q25_3', 'Q19_4_4', 'Q33_2_1', 'Q19_2_1', 'Q21_4_5', 'Q18_1', 'Q8_3', 'Q7_6', 'Q19_3_4', 'Q19_6_3', 'Q7_8', 'Q14_9_1', 'Q19_4_3', 'Q21_1_5', 'Q20_2_1', 'Q21_2_5', 'Q18_2', 'Q20_5_1', 'Q19_5_4', 'Q28_9', 'Q12_1', 'Q13_12_1', 'Q16_5', 'Q16R_1', 'Q18_3', 'Q28_8', 'Q26_3', 'Q21_5_1', 'Q13_1_1', 'Q19_7_7', 'Q13_3_1', 'Q19_7_3', 'Q19_2_2', 'Q14_12_1', 'Q19_7_1', 'Q19_1_2', 'Q19_1_3', 'Q20_1_1', 'Q8_2', 'Q21_4_3', 'Income_3cat_8', 'Q19_3_7', 'Q21_3_5', 'Q7_drive_alone_3', 'Q7_3', 'Q15_3_1', 'Q19_4_5', 'Q20_3_1', 'Income_3cat_1', 'Q13_4_1

In [29]:
# Create another copy to avoid confusion:
survey_data_3level = survey_data.copy()

# Convert "I Don't Know" into "Maybe" to create a 3-Level Target Variable:
survey_data_3level[target_variable] = survey_data_3level[target_variable].replace({"4": "2"})

# Encode the target variable for 3-Level Classification:
le_3level = LabelEncoder()
y_3level = le_3level.fit_transform(survey_data_3level[target_variable])
print("3-Level Target Encoding:", dict(zip(le_3level.classes_, le_3level.transform(le_3level.classes_))))

3-Level Target Encoding: {'1': 0, '2': 1, '3': 2}


In [30]:
# Select only the important features for training all models:
important_features = filtered_features["Feature"].tolist()
X_important = X[new_categorical_vars]

In [31]:
# Split and Train 3-Level Logistic Regression Model:
X_train, X_test, y_train, y_test = train_test_split(X_important, y_3level, test_size=0.2, random_state=42, stratify=y_3level)
log_reg = LogisticRegression(
    multi_class="multinomial", 
     solver="lbfgs", 
      max_iter=500)
log_reg.fit(X_train, y_train)

# Evaluate 3-Level Logistic Regression Model:
y_pred = log_reg.predict(X_test)
print("3-Level Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report (3-Level):\n", classification_report(y_test, y_pred, target_names=le_3level.classes_))

3-Level Model Accuracy: 0.49333333333333335

Classification Report (3-Level):
               precision    recall  f1-score   support

           1       0.26      0.20      0.23        25
           2       0.51      0.58      0.54        67
           3       0.56      0.52      0.54        58

    accuracy                           0.49       150
   macro avg       0.44      0.43      0.43       150
weighted avg       0.48      0.49      0.49       150



In [32]:
# Train Random Forest Model on All Features:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest Model:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.5

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.49      0.66      0.56        67
           2       0.53      0.53      0.53        58

    accuracy                           0.50       150
   macro avg       0.34      0.40      0.36       150
weighted avg       0.43      0.50      0.46       150



In [33]:
# Feature Importance from Random Forest:
feature_importance_rf = pd.DataFrame(
    {"Feature": X_important.columns,
      "Importance": rf_model.feature_importances_})
feature_importance_rf = feature_importance_rf.sort_values(by="Importance", ascending=False)
print("\nRandom Forest Feature Importance:\n", feature_importance_rf.to_string())


Random Forest Feature Importance:
               Feature  Importance
97            Q21_6_3    0.026544
87            Q20_3_1    0.021471
4             Q20_8_1    0.020270
45              Q18_1    0.020060
5             Q21_7_3    0.019633
25            Q21_9_3    0.019270
66              Q26_3    0.019267
98            Q19_3_2    0.019081
37              Q25_5    0.018262
56              Q18_2    0.018227
79            Q21_4_3    0.018036
58            Q19_5_4    0.017785
90            Q21_4_1    0.017633
77            Q20_1_1    0.017462
67            Q21_5_1    0.017449
54            Q20_2_1    0.017291
7               Q12_2    0.017049
19           Q10_12_1    0.017007
6             Q21_5_3    0.016988
14            Q21_7_1    0.016703
27            Q19_2_6    0.016530
95           Q20_10_1    0.015634
41            Q19_4_4    0.015107
28            Q21_9_1    0.014444
52            Q19_4_3    0.014224
48            Q19_3_4    0.014027
75            Q19_1_2    0.013885
13          

In [34]:
# Train a Random Forest Model on Important Features Only:
X_train_rf_imp, X_test_rf_imp, y_train_rf_imp, y_test_rf_imp = train_test_split(X_important, y_3level, test_size=0.2, random_state=42, stratify=y_3level)
rf_model_imp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_imp.fit(X_train_rf_imp, y_train_rf_imp)
y_pred_rf_imp = rf_model_imp.predict(X_test_rf_imp)

# Evaluate Performance
print("Random Forest Accuracy (Important Features Only):", accuracy_score(y_test_rf_imp, y_pred_rf_imp))
print("\nRandom Forest Classification Report (Important Features Only):\n", classification_report(y_test_rf_imp, y_pred_rf_imp, target_names=le_3level.classes_))

Random Forest Accuracy (Important Features Only): 0.5

Random Forest Classification Report (Important Features Only):
               precision    recall  f1-score   support

           1       0.00      0.00      0.00        25
           2       0.49      0.66      0.56        67
           3       0.53      0.53      0.53        58

    accuracy                           0.50       150
   macro avg       0.34      0.40      0.36       150
weighted avg       0.43      0.50      0.46       150



In [35]:
# Train XGBoost Model on All Features:
xgb_model = XGBClassifier(eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate XGBoost Model:
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb, target_names=le_3level.classes_))

XGBoost Accuracy: 0.4266666666666667

XGBoost Classification Report:
               precision    recall  f1-score   support

           1       0.27      0.12      0.17        25
           2       0.43      0.49      0.46        67
           3       0.44      0.48      0.46        58

    accuracy                           0.43       150
   macro avg       0.38      0.37      0.36       150
weighted avg       0.41      0.43      0.41       150



In [36]:
# Train XGBoost Model on Important Features Only:
X_train_xgb_imp, X_test_xgb_imp, y_train_xgb_imp, y_test_xgb_imp = train_test_split(X_important, y_3level, test_size=0.2, random_state=42, stratify=y_3level)
xgb_model_imp = XGBClassifier(eval_metric="mlogloss", random_state=42)
xgb_model_imp.fit(X_train_xgb_imp, y_train_xgb_imp)
y_pred_xgb_imp = xgb_model_imp.predict(X_test_xgb_imp)

# Evaluate Performance:
print("XGBoost Accuracy (Important Features Only):", accuracy_score(y_test_xgb_imp, y_pred_xgb_imp))
print("\nXGBoost Classification Report (Important Features Only):\n", classification_report(y_test_xgb_imp, y_pred_xgb_imp, target_names=le_3level.classes_))

XGBoost Accuracy (Important Features Only): 0.4266666666666667

XGBoost Classification Report (Important Features Only):
               precision    recall  f1-score   support

           1       0.27      0.12      0.17        25
           2       0.43      0.49      0.46        67
           3       0.44      0.48      0.46        58

    accuracy                           0.43       150
   macro avg       0.38      0.37      0.36       150
weighted avg       0.41      0.43      0.41       150



In [37]:
# Feature Importance for Updated XGBoost Model:
feature_importance_xgb_imp = pd.DataFrame(
    {"Feature": X_important.columns,
      "Importance": xgb_model_imp.feature_importances_})
feature_importance_xgb_imp = feature_importance_xgb_imp.sort_values(by="Importance", ascending=False)
print("\nXGBoost Feature Importance After Retraining:\n", feature_importance_xgb_imp.to_string())


XGBoost Feature Importance After Retraining:
               Feature  Importance
2             Q14_1_1    0.026892
25            Q21_9_3    0.026757
3               Q34_1    0.025304
42            Q33_2_1    0.023985
43            Q19_2_1    0.022210
64              Q18_3    0.020229
17            Q20_4_1    0.019760
6             Q21_5_3    0.017528
97            Q21_6_3    0.016573
57            Q20_5_1    0.016541
89            Q13_4_1    0.016047
101          Q20_12_1    0.015671
62              Q16_5    0.014186
44            Q21_4_5    0.014017
82            Q21_3_5    0.013871
49            Q19_6_3    0.013581
13            Q10_1_1    0.013512
31            Q13_7_1    0.013419
8                Q7_7    0.013239
32            Q19_5_1    0.013161
96            Q19_3_5    0.012851
88      Income_3cat_1    0.012768
85            Q15_3_1    0.012672
73           Q14_12_1    0.012467
4             Q20_8_1    0.012375
67            Q21_5_1    0.011981
1               Q26_2    0.011938
3