In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [14]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Preliminary Cleaning/Setup

In [15]:

columns = [
"birth_year",
"gender",
"industry_desc",
"occupation_desc",
"organization_size",
"manage_others",
"household",
"years_at_job",
"metro_or_regional",
"rw_percentage_2020",
"org_encouraged_rw",
"org_prepared_for_rw",
"rw_is_common_at_org",
"rw_permission_is_attainable",
"rw_collaboration_easy",
"preferred_rw_percentage_2020",
"preferred_rw_percentage_future",
"if_no_covid_employer_encourage_rw",
"if_no_covid_employer_support_rw",
"if_no_covid_i_would_have_choice_about_rw",
"productivity_remote_vs_office",
"inperson_hours__commuting",
"inperson_hours__working",
"inperson_hours__personal_family_time",
"inperson_hours_domestic_responsibilities",
"remote_hours_commuting",
"remote_hours_working",
"remote_hours_personal_family_time",
"remote_hours_domestic_responsibilities",
]

target = ["preferred_rw_percentage_future"]



In [16]:
# Load the data
file_path = Path('merged_df_cleaned_040323.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Convert the target column values to More_or_same_productive and less_productive based on their values
x = dict.fromkeys(['0-1 days','1-2 days','2-3 days','No response'], '0-3 days remote')
df = df.replace(x)

x = dict.fromkeys(['3-4 days', '4-5 days'], '3-5 days remote')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()


Unnamed: 0,birth_year,gender,industry_desc,occupation_desc,organization_size,manage_others,household,years_at_job,metro_or_regional,rw_percentage_2020,...,if_no_covid_i_would_have_choice_about_rw,productivity_remote_vs_office,inperson_hours__commuting,inperson_hours__working,inperson_hours__personal_family_time,inperson_hours_domestic_responsibilities,remote_hours_commuting,remote_hours_working,remote_hours_personal_family_time,remote_hours_domestic_responsibilities
0,1972,Female,Other,Clerical and administrative,Between 20 and 199,No,Couple with no dependent children,More than 5 years,Regional,3-5 days remote,...,Strongly disagree,much more productive,2.0,8.0,2.0,2.0,0.5,8.0,3.5,2.0
1,1972,Male,Other,Managers,Between 1 and 4,Yes,Couple with dependent children,More than 5 years,Metro,0-3 days remote,...,Somewhat agree,less productive,2.0,7.0,3.0,3.0,0.0,7.0,3.0,3.0
2,1982,Male,Other,Managers,More than 200,Yes,One parent family with dependent children,More than 5 years,Metro,0-3 days remote,...,Somewhat agree,less productive,6.0,1.0,6.0,5.0,5.0,2.0,7.0,7.0
3,1987,Female,"Professional, Scientific and Technical Services",Professionals,Between 20 and 199,No,Couple with dependent children,Between 1 and 5 years,Metro,3-5 days remote,...,Somewhat agree,same productivity,1.0,9.0,1.0,2.0,0.0,9.0,3.0,2.0
4,1991,Male,Other,Managers,Between 5 and 19,Yes,Couple with no dependent children,More than 5 years,Metro,3-5 days remote,...,Strongly disagree,more productive,1.0,8.0,3.5,2.0,0.0,6.0,4.0,3.0


# Split the Data into Training and Testing

In [17]:
# Create our features
X = df.drop(columns= 'preferred_rw_percentage_future')
X = pd.get_dummies(X)

# Create our target
y = df[target]

In [18]:
X.describe()

Unnamed: 0,birth_year,inperson_hours__commuting,inperson_hours__working,inperson_hours__personal_family_time,inperson_hours_domestic_responsibilities,remote_hours_commuting,remote_hours_working,remote_hours_personal_family_time,remote_hours_domestic_responsibilities,gender_Female,...,if_no_covid_i_would_have_choice_about_rw_Somewhat unlikely,if_no_covid_i_would_have_choice_about_rw_Strongly agree,if_no_covid_i_would_have_choice_about_rw_Strongly disagree,if_no_covid_i_would_have_choice_about_rw_Very likely,if_no_covid_i_would_have_choice_about_rw_Very unlikely,productivity_remote_vs_office_less productive,productivity_remote_vs_office_more productive,productivity_remote_vs_office_much less productive,productivity_remote_vs_office_much more productive,productivity_remote_vs_office_same productivity
count,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,...,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0
mean,1977.012587,1.920543,7.622295,3.682458,2.42953,0.731305,7.91466,4.294395,2.719361,0.477973,...,0.073534,0.121563,0.038423,0.084134,0.065253,0.10997,0.214972,0.049023,0.371977,0.254058
std,12.059818,1.193116,1.813712,2.053795,1.586734,1.078736,2.028679,2.186404,1.640603,0.499597,...,0.261055,0.326835,0.192248,0.277634,0.247013,0.312904,0.410871,0.215952,0.483412,0.435402
min,1900.0,0.0,1.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1967.0,1.0,7.0,2.0,1.0,0.0,7.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1978.0,2.0,8.0,3.5,2.0,0.5,8.0,4.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1987.0,2.5,8.0,5.0,3.0,1.0,9.0,5.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2002.0,6.0,14.0,11.0,8.0,5.0,15.0,12.0,8.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Check the balance of our target values
y['preferred_rw_percentage_future'].value_counts()

0-3 days remote    1840
3-5 days remote    1179
Name: preferred_rw_percentage_future, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Ensemble Learners

In this section, we compaerd two ensemble algorithms to determine which algorithm results in the best performance. We trained a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . 

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [21]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier.
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [22]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[379,  79],
       [ 61, 236]])

In [23]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.8110618558216811

In [24]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

0-3 days remote       0.86      0.83      0.79      0.84      0.81      0.66       458
3-5 days remote       0.75      0.79      0.83      0.77      0.81      0.66       297

    avg / total       0.82      0.81      0.81      0.82      0.81      0.66       755



In [25]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_

sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.12324816745545442, 'preferred_rw_percentage_2020_0-3 days remote'),
 (0.11139825200428372, 'preferred_rw_percentage_2020_3-5 days remote'),
 (0.0537213115604658, 'rw_percentage_2020_3-5 days remote'),
 (0.039310797273153335, 'rw_percentage_2020_0-3 days remote'),
 (0.03403957301537556, 'birth_year'),
 (0.024566140933206092, 'remote_hours_personal_family_time'),
 (0.023104267500571866, 'inperson_hours__personal_family_time'),
 (0.022759068220373685, 'remote_hours_working'),
 (0.022541033155893193, 'inperson_hours__commuting'),
 (0.02213295567225459, 'inperson_hours__working'),
 (0.02132999949513569, 'inperson_hours_domestic_responsibilities'),
 (0.021083250080721115, 'remote_hours_domestic_responsibilities'),
 (0.01838489177171691, 'rw_collaboration_easy_Strongly agree'),
 (0.01837214058981317, 'manage_others_0-3 days remote'),
 (0.01726526309656081, 'remote_hours_commuting'),
 (0.01099588920855011, 'manage_others_No'),
 (0.010863447265962457, 'productivity_remote_vs_office_much mor

### Easy Ensemble AdaBoost Classifier

In [26]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=42)
eec.fit(X_train, y_train)



EasyEnsembleClassifier(random_state=42)

In [27]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.8110618558216811

In [28]:
# Display the confusion matrix
y_pred = eec.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[356 102]
 [ 32 265]]


In [29]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

0-3 days remote       0.92      0.78      0.89      0.84      0.83      0.69       458
3-5 days remote       0.72      0.89      0.78      0.80      0.83      0.70       297

    avg / total       0.84      0.82      0.85      0.82      0.83      0.69       755

