# Worker Productivity Resampling 

In [39]:
import warnings
warnings.filterwarnings('ignore')

In [40]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV and Perform Preliminary Cleaning/Setup

In [41]:
columns = [
"birth_year",
"gender",
"industry_desc",
"occupation_desc",
"organization_size",
"manage_others",
"household",
"years_at_job",
"metro_or_regional",
"rw_percentage_2020",
"org_encouraged_rw",
"org_prepared_for_rw",
"rw_is_common_at_org",
"rw_permission_is_attainable",
"rw_collaboration_easy",
"preferred_rw_percentage_2020",
"preferred_rw_percentage_future",
"if_no_covid_employer_encourage_rw",
"if_no_covid_employer_support_rw",
"if_no_covid_i_would_have_choice_about_rw",
"productivity_remote_vs_office",
"inperson_hours__commuting",
"inperson_hours__working",
"inperson_hours__personal_family_time",
"inperson_hours_domestic_responsibilities",
"remote_hours_commuting",
"remote_hours_working",
"remote_hours_personal_family_time",
"remote_hours_domestic_responsibilities",
]

target = ["productivity_remote_vs_office"]

In [42]:
# Load the data
file_path = Path('merged_df_cleaned_040323.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Convert the target column values to More_or_same_productive and less_productive based on their values
x = dict.fromkeys(['more productive','much more productive','same productivity'], 'more_or_same_productive')
df = df.replace(x)

x = dict.fromkeys(['less productive', 'much less productive'], 'less_productive')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()




Unnamed: 0,birth_year,gender,industry_desc,occupation_desc,organization_size,manage_others,household,years_at_job,metro_or_regional,rw_percentage_2020,...,if_no_covid_i_would_have_choice_about_rw,productivity_remote_vs_office,inperson_hours__commuting,inperson_hours__working,inperson_hours__personal_family_time,inperson_hours_domestic_responsibilities,remote_hours_commuting,remote_hours_working,remote_hours_personal_family_time,remote_hours_domestic_responsibilities
0,1972,Female,Other,Clerical and administrative,Between 20 and 199,No,Couple with no dependent children,More than 5 years,Regional,4-5 days,...,Strongly disagree,more_or_same_productive,2.0,8.0,2.0,2.0,0.5,8.0,3.5,2.0
1,1972,Male,Other,Managers,Between 1 and 4,Yes,Couple with dependent children,More than 5 years,Metro,1-2 days,...,Somewhat agree,less_productive,2.0,7.0,3.0,3.0,0.0,7.0,3.0,3.0
2,1982,Male,Other,Managers,More than 200,Yes,One parent family with dependent children,More than 5 years,Metro,2-3 days,...,Somewhat agree,less_productive,6.0,1.0,6.0,5.0,5.0,2.0,7.0,7.0
3,1987,Female,"Professional, Scientific and Technical Services",Professionals,Between 20 and 199,No,Couple with dependent children,Between 1 and 5 years,Metro,4-5 days,...,Somewhat agree,more_or_same_productive,1.0,9.0,1.0,2.0,0.0,9.0,3.0,2.0
4,1991,Male,Other,Managers,Between 5 and 19,Yes,Couple with no dependent children,More than 5 years,Metro,4-5 days,...,Strongly disagree,more_or_same_productive,1.0,8.0,3.5,2.0,0.0,6.0,4.0,3.0


In [43]:
#show string datatypes for encoding
df.select_dtypes(include=['object'])

Unnamed: 0,gender,industry_desc,occupation_desc,organization_size,manage_others,household,years_at_job,metro_or_regional,rw_percentage_2020,org_encouraged_rw,org_prepared_for_rw,rw_is_common_at_org,rw_permission_is_attainable,rw_collaboration_easy,preferred_rw_percentage_2020,preferred_rw_percentage_future,if_no_covid_employer_encourage_rw,if_no_covid_employer_support_rw,if_no_covid_i_would_have_choice_about_rw,productivity_remote_vs_office
0,Female,Other,Clerical and administrative,Between 20 and 199,No,Couple with no dependent children,More than 5 years,Regional,4-5 days,Strongly agree,Somewhat agree,Somewhat agree,Strongly agree,Somewhat agree,4-5 days,4-5 days,Somewhat disagree,Somewhat disagree,Strongly disagree,more_or_same_productive
1,Male,Other,Managers,Between 1 and 4,Yes,Couple with dependent children,More than 5 years,Metro,1-2 days,Somewhat agree,Somewhat agree,Somewhat agree,Somewhat agree,Somewhat agree,1-2 days,1-2 days,Somewhat agree,Somewhat agree,Somewhat agree,less_productive
2,Male,Other,Managers,More than 200,Yes,One parent family with dependent children,More than 5 years,Metro,2-3 days,Neither agree nor disagree,Somewhat agree,Somewhat agree,Neither agree nor disagree,Somewhat agree,3-4 days,3-4 days,Somewhat agree,Somewhat agree,Somewhat agree,less_productive
3,Female,"Professional, Scientific and Technical Services",Professionals,Between 20 and 199,No,Couple with dependent children,Between 1 and 5 years,Metro,4-5 days,Strongly agree,Somewhat disagree,Somewhat agree,Strongly agree,Strongly agree,4-5 days,4-5 days,Somewhat agree,Strongly agree,Somewhat agree,more_or_same_productive
4,Male,Other,Managers,Between 5 and 19,Yes,Couple with no dependent children,More than 5 years,Metro,4-5 days,Somewhat disagree,Somewhat agree,Strongly agree,Somewhat disagree,Somewhat agree,4-5 days,3-4 days,Strongly disagree,Strongly disagree,Strongly disagree,more_or_same_productive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3014,Male,Other,"Managers - Chief Executives, General Managers ...",Between 5 and 19,Yes,Couple with dependent children,Between 1 and 5 years,Metro,1-2 days,Neither agree nor disagree,Somewhat agree,Neither agree nor disagree,Somewhat agree,Somewhat agree,2-3 days,1-2 days,Somewhat likely,Neither unlikely or likely,Somewhat likely,more_or_same_productive
3015,Female,Health Care and Social Assistance,Other,Between 5 and 19,Yes,Multiple family household,More than 5 years,Metro,0-1 days,Somewhat disagree,Somewhat disagree,Strongly disagree,Neither agree nor disagree,Neither agree nor disagree,0-1 days,0-1 days,Very unlikely,Very unlikely,Very unlikely,more_or_same_productive
3016,Female,Other,Other,Between 20 and 199,No,One parent family with dependent children,More than 5 years,Regional,0-1 days,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly agree,4-5 days,4-5 days,Very unlikely,Very unlikely,Very unlikely,more_or_same_productive
3017,Female,Education and Training,"Managers - Chief Executives, General Managers ...",Between 1 and 4,Yes,Couple with no dependent children,More than 5 years,Metro,0-1 days,Somewhat agree,Somewhat agree,Somewhat agree,Strongly agree,Somewhat agree,0-1 days,0-1 days,Somewhat likely,Somewhat likely,Somewhat likely,more_or_same_productive


# Split the Data into Training and Testing

In [44]:
# Create our features
X = df.drop(columns= 'productivity_remote_vs_office')
X = pd.get_dummies(X)

# Create our target
y = df[target]


In [45]:
X.describe()

Unnamed: 0,birth_year,inperson_hours__commuting,inperson_hours__working,inperson_hours__personal_family_time,inperson_hours_domestic_responsibilities,remote_hours_commuting,remote_hours_working,remote_hours_personal_family_time,remote_hours_domestic_responsibilities,gender_Female,...,if_no_covid_i_would_have_choice_about_rw_Neither agree nor disagree,if_no_covid_i_would_have_choice_about_rw_Neither unlikely or likely,if_no_covid_i_would_have_choice_about_rw_Somewhat agree,if_no_covid_i_would_have_choice_about_rw_Somewhat disagree,if_no_covid_i_would_have_choice_about_rw_Somewhat likely,if_no_covid_i_would_have_choice_about_rw_Somewhat unlikely,if_no_covid_i_would_have_choice_about_rw_Strongly agree,if_no_covid_i_would_have_choice_about_rw_Strongly disagree,if_no_covid_i_would_have_choice_about_rw_Very likely,if_no_covid_i_would_have_choice_about_rw_Very unlikely
count,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,...,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0,3019.0
mean,1977.012587,1.920543,7.622295,3.682458,2.42953,0.731305,7.91466,4.294395,2.719361,0.477973,...,0.103345,0.127194,0.181186,0.054654,0.150712,0.073534,0.121563,0.038423,0.084134,0.065253
std,12.059818,1.193116,1.813712,2.053795,1.586734,1.078736,2.028679,2.186404,1.640603,0.499597,...,0.30446,0.333246,0.385236,0.227341,0.357827,0.261055,0.326835,0.192248,0.277634,0.247013
min,1900.0,0.0,1.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1967.0,1.0,7.0,2.0,1.0,0.0,7.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1978.0,2.0,8.0,3.5,2.0,0.5,8.0,4.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1987.0,2.5,8.0,5.0,3.0,1.0,9.0,5.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2002.0,6.0,14.0,11.0,8.0,5.0,15.0,12.0,8.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
# Check the balance of our target values
y['productivity_remote_vs_office'].value_counts()

more_or_same_productive    2539
less_productive             480
Name: productivity_remote_vs_office, dtype: int64

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Oversampling

In this section,we compared two oversampling algorithms to determine which algorithm results in the best performance. We oversampled the data using the naive random oversampling algorithm and the SMOTE algorithm. 

We used a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [48]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)



In [49]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [50]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.6455162741612975

In [51]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[ 69,  43],
       [209, 434]])

In [52]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                               pre       rec       spe        f1       geo       iba       sup

        less_productive       0.25      0.62      0.67      0.35      0.64      0.41       112
more_or_same_productive       0.91      0.67      0.62      0.77      0.64      0.42       643

            avg / total       0.81      0.67      0.62      0.71      0.64      0.42       755



### SMOTE Oversampling

In [53]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [54]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [55]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5583828593645856

In [56]:
# Display the confusion matrix

confusion_matrix(y_test, y_pred)

array([[ 19,  93],
       [ 34, 609]])

In [57]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                               pre       rec       spe        f1       geo       iba       sup

        less_productive       0.36      0.17      0.95      0.23      0.40      0.15       112
more_or_same_productive       0.87      0.95      0.17      0.91      0.40      0.17       643

            avg / total       0.79      0.83      0.28      0.81      0.40      0.17       755



# Undersampling

In this section, we tested an undersampling algorithm to determine which algorithm results in the best performance compared to the oversampling algorithms above. We undersampled the data using the Cluster Centroids algorithm.

We used a random state of 1 for each sampling algorithm to ensure consistency between tests

In [58]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)


In [59]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [60]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.5583828593645856

In [61]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[106,   6],
       [596,  47]])

In [62]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                               pre       rec       spe        f1       geo       iba       sup

        less_productive       0.15      0.95      0.07      0.26      0.26      0.08       112
more_or_same_productive       0.89      0.07      0.95      0.14      0.26      0.06       643

            avg / total       0.78      0.20      0.82      0.15      0.26      0.06       755



# Combination (Over and Under) Sampling

In this section, we used a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. We resampled the data using the SMOTEENN algorithm.

We used a random state of 1 for each sampling algorithm to ensure consistency between tests

In [63]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [64]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [65]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.5097617196178627

In [66]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 74,  38],
       [195, 448]])

In [67]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                               pre       rec       spe        f1       geo       iba       sup

        less_productive       0.28      0.66      0.70      0.39      0.68      0.46       112
more_or_same_productive       0.92      0.70      0.66      0.79      0.68      0.46       643

            avg / total       0.83      0.69      0.67      0.73      0.68      0.46       755

