In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Preliminary Cleaning/Setup

In [4]:

columns = [
"birth_year",
"gender",
"industry_desc",
"occupation_desc",
"organization_size",
"manage_others",
"household",
"years_at_job",
"metro_or_regional",
"rw_percentage_2020",
"org_encouraged_rw",
"org_prepared_for_rw",
"rw_is_common_at_org",
"rw_permission_is_attainable",
"rw_collaboration_easy",
"preferred_rw_percentage_2020",
"preferred_rw_percentage_future",
"if_no_covid_employer_encourage_rw",
"if_no_covid_employer_support_rw",
"if_no_covid_i_would_have_choice_about_rw",
"productivity_remote_vs_office",
"inperson_hours__commuting",
"inperson_hours__working",
"inperson_hours__personal_family_time",
"inperson_hours_domestic_responsibilities",
"remote_hours_commuting",
"remote_hours_working",
"remote_hours_personal_family_time",
"remote_hours_domestic_responsibilities",
]

target = ["productivity_remote_vs_office"]

In [5]:
# Load the data
file_path = Path('merged_df_cleaned_040323.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Convert the target column values to More_or_same_productive and less_productive based on their values
x = dict.fromkeys(['more productive','much more productive','same productivity'], 'more_or_same_productive')
df = df.replace(x)

x = dict.fromkeys(['less productive', 'much less productive'], 'less_productive')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()


Unnamed: 0,birth_year,gender,industry_desc,occupation_desc,organization_size,manage_others,household,years_at_job,metro_or_regional,rw_percentage_2020,...,if_no_covid_i_would_have_choice_about_rw,productivity_remote_vs_office,inperson_hours__commuting,inperson_hours__working,inperson_hours__personal_family_time,inperson_hours_domestic_responsibilities,remote_hours_commuting,remote_hours_working,remote_hours_personal_family_time,remote_hours_domestic_responsibilities
0,1972,Female,Other,Clerical and administrative,Between 20 and 199,No,Couple with no dependent children,More than 5 years,Regional,4-5 days,...,Strongly disagree,more_or_same_productive,2.0,8.0,2.0,2.0,0.5,8.0,3.5,2.0
1,1972,Male,Other,Managers,Between 1 and 4,Yes,Couple with dependent children,More than 5 years,Metro,1-2 days,...,Somewhat agree,less_productive,2.0,7.0,3.0,3.0,0.0,7.0,3.0,3.0
2,1982,Male,Other,Managers,More than 200,Yes,One parent family with dependent children,More than 5 years,Metro,2-3 days,...,Somewhat agree,less_productive,6.0,1.0,6.0,5.0,5.0,2.0,7.0,7.0
3,1987,Female,"Professional, Scientific and Technical Services",Professionals,Between 20 and 199,No,Couple with dependent children,Between 1 and 5 years,Metro,4-5 days,...,Somewhat agree,more_or_same_productive,1.0,9.0,1.0,2.0,0.0,9.0,3.0,2.0
4,1991,Male,Other,Managers,Between 5 and 19,Yes,Couple with no dependent children,More than 5 years,Metro,4-5 days,...,Strongly disagree,more_or_same_productive,1.0,8.0,3.5,2.0,0.0,6.0,4.0,3.0


# Split the Data into Training and Testing

In [6]:
# Create our features
X = df.drop(columns= 'productivity_remote_vs_office')
X = pd.get_dummies(X)

# Create our target
y = df[target]

In [7]:
X.describe()

Unnamed: 0,birth_year,inperson_hours__commuting,inperson_hours__working,inperson_hours__personal_family_time,inperson_hours_domestic_responsibilities,remote_hours_commuting,remote_hours_working,remote_hours_personal_family_time,remote_hours_domestic_responsibilities,gender_Female,...,if_no_covid_i_would_have_choice_about_rw_Neither agree nor disagree,if_no_covid_i_would_have_choice_about_rw_Neither unlikely or likely,if_no_covid_i_would_have_choice_about_rw_Somewhat agree,if_no_covid_i_would_have_choice_about_rw_Somewhat disagree,if_no_covid_i_would_have_choice_about_rw_Somewhat likely,if_no_covid_i_would_have_choice_about_rw_Somewhat unlikely,if_no_covid_i_would_have_choice_about_rw_Strongly agree,if_no_covid_i_would_have_choice_about_rw_Strongly disagree,if_no_covid_i_would_have_choice_about_rw_Very likely,if_no_covid_i_would_have_choice_about_rw_Very unlikely
count,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,...,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0
mean,1977.012587,1.920543,7.622295,3.682458,2.42953,0.731305,7.91466,4.294395,2.719361,0.477973,...,0.103345,0.127194,0.181186,0.054654,0.150712,0.073534,0.121563,0.038423,0.084134,0.065253
std,12.059818,1.193116,1.813712,2.053795,1.586734,1.078736,2.028679,2.186404,1.640603,0.499597,...,0.30446,0.333246,0.385236,0.227341,0.357827,0.261055,0.326835,0.192248,0.277634,0.247013
min,1900.0,0.0,1.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1967.0,1.0,7.0,2.0,1.0,0.0,7.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1978.0,2.0,8.0,3.5,2.0,0.5,8.0,4.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1987.0,2.5,8.0,5.0,3.0,1.0,9.0,5.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2002.0,6.0,14.0,11.0,8.0,5.0,15.0,12.0,8.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Check the balance of our target values
y['productivity_remote_vs_office'].value_counts()

more_or_same_productive    2539
less_productive             480
Name: productivity_remote_vs_office, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Ensemble Learners

In this section, we compaerd two ensemble algorithms to determine which algorithm results in the best performance. We trained a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . 

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [10]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier.
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [11]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[  7, 105],
       [  1, 642]])

In [12]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.5304723950233281

In [13]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                               pre       rec       spe        f1       geo       iba       sup

        less_productive       0.88      0.06      1.00      0.12      0.25      0.06       112
more_or_same_productive       0.86      1.00      0.06      0.92      0.25      0.07       643

            avg / total       0.86      0.86      0.20      0.80      0.25      0.07       755



In [14]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_

sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.04710795133268984, 'birth_year'),
 (0.03522582211749134, 'remote_hours_personal_family_time'),
 (0.032169944142902064, 'remote_hours_working'),
 (0.03216724523188402, 'inperson_hours__personal_family_time'),
 (0.03167107990838756, 'inperson_hours_domestic_responsibilities'),
 (0.031456874926753385, 'remote_hours_domestic_responsibilities'),
 (0.029127808763287757, 'inperson_hours__working'),
 (0.028474811694017967, 'preferred_rw_percentage_future_0-1 days'),
 (0.027601789808104345, 'preferred_rw_percentage_2020_0-1 days'),
 (0.026814444478441495, 'inperson_hours__commuting'),
 (0.02401127499430594, 'remote_hours_commuting'),
 (0.019378786282843834, 'rw_collaboration_easy_Somewhat disagree'),
 (0.013711902546821933, 'rw_collaboration_easy_Strongly agree'),
 (0.012731572985368513, 'preferred_rw_percentage_future_4-5 days'),
 (0.010577504012254964, 'org_prepared_for_rw_Somewhat disagree'),
 (0.010287413603266729, 'preferred_rw_percentage_2020_4-5 days'),
 (0.010139835415232116, 'indus

### Easy Ensemble AdaBoost Classifier

In [15]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=42)
eec.fit(X_train, y_train)



EasyEnsembleClassifier(random_state=42)

In [16]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.5304723950233281

In [17]:
# Display the confusion matrix
y_pred = eec.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[ 69  43]
 [214 429]]


In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                               pre       rec       spe        f1       geo       iba       sup

        less_productive       0.24      0.62      0.67      0.35      0.64      0.41       112
more_or_same_productive       0.91      0.67      0.62      0.77      0.64      0.41       643

            avg / total       0.81      0.66      0.62      0.71      0.64      0.41       755

