In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

<h1> HASE </h1>

<p> 
Before importing the data, we manually made some adjustments to the data. For commute times we for instance had to remove some unnecessary tokens (e.g. '-10 min' should just be '-10')
Also, for entries with missing data (which happened for 'GPA', the average of all the valid entries was used for that.
</p>

<h2> Importing data and creating dummy variables for categorical variables </h2>

<p> 
    The dataset ready for the regression is exported and saved under 'pre_processed.xlsx' </p>

In [4]:
df = pd.read_excel('data/survey_results.xlsx', header=0, names=['timestamp', 'consent_form_approval', 'semester', 'time_in_ch', 'family_lives_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place',
                                                            'commute_minutes', 'commute_feeling', 'support', 'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams',
                                                            'focus', 'distractions', 'switch', 'gpa', 'sleep', 'physical_health', 'productivity_change', 'well_being_1', 'well_being_2', 'well_being_3', 'well_being_4',
                                                            'well_being_5', 'well_being_6', 'well_being_7', 'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11'])

print(df.columns)
df.drop(['timestamp', 'consent_form_approval'], axis=1, inplace=True)

# how long have you lived in CH?
# create dummy variables
time_in_ch = pd.get_dummies(df['time_in_ch'])
# delete last dummy variable to escape the dummy variable trap
time_in_ch.drop(['lifetime'], axis=1, inplace=True)
time_in_ch.rename(columns = {'1 semester': 'in_ch_1_semester', '1 year': 'in_ch_1_year', '2-5 years': 'in_ch_2-5_years', '>5 years': 'in_ch_over_5_years'}, inplace=True)
df = pd.concat([df, time_in_ch], axis=1)
df.drop(['time_in_ch'], axis=1, inplace=True)

# Does your family live in Switzerland?
# create dummy variables
family_in_ch = pd.get_dummies(df['family_lives_in_ch'])
# delete last dummy variable to escape the dummy variable trap
family_in_ch.drop(["No"], axis=1, inplace=True)
family_in_ch.rename(columns = {"Yes": "family_in_ch"}, inplace=True)
df = pd.concat([df, family_in_ch], axis=1)
df.drop(['family_lives_in_ch'], axis=1, inplace=True)

df['intercept'] = 1

df.to_excel('data/pre_processed.xlsx')

df.head()

Index(['timestamp', 'consent_form_approval', 'semester', 'time_in_ch',
       'family_lives_in_ch', 'social_life', 'hobbies', 'class_in_person',
       'live_online', 'recorded', 'study_place', 'commute_minutes',
       'commute_feeling', 'support', 'schedule', 'task_management',
       'discipline', 'time_outside_work', 'seperation', 'workload', 'exams',
       'focus', 'distractions', 'switch', 'gpa', 'sleep', 'physical_health',
       'productivity_change', 'well_being_1', 'well_being_2', 'well_being_3',
       'well_being_4', 'well_being_5', 'well_being_6', 'well_being_7',
       'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11'],
      dtype='object')


Unnamed: 0,semester,social_life,hobbies,class_in_person,live_online,recorded,study_place,commute_minutes,commute_feeling,support,...,well_being_8,well_being_9,well_being_10,well_being_11,in_ch_1_semester,in_ch_1_year,in_ch_2-5_years,in_ch_over_5_years,family_in_ch,intercept
0,5,2,2,4,1,4,2,-45,2,1,...,1,4,2,4,0,0,1,0,0,1
1,15,2,4,4,3,3,3,-80,4,2,...,2,2,4,2,0,0,0,0,1,1
2,9,2,2,3,4,4,2,0,3,2,...,5,2,2,3,0,0,0,0,1,1
3,10,3,5,2,3,4,4,-240,5,2,...,4,2,3,3,0,0,0,0,1,1
4,3,5,4,2,2,2,5,0,5,3,...,5,3,3,2,0,0,0,0,1,1


<h2> Building multiple Linear Regression models </h2>

<p> Initially, all of the factors were included in the model. We then used Backward Elimination to end up with the best variables. </p>

<h3> Regressing Productivity Change </h3>

In [30]:
Y = df['productivity_change']

# original, wich all the columns in there
#X = df[['intercept', semester', 'in_ch_1_semester', 'in_ch_1_year', 'in_ch_2-5_years', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

X = df[['intercept', 'social_life', 'time_outside_work', 'focus', 'physical_health']]


results = sm.OLS(Y,X).fit()
results.summary()  

0,1,2,3
Dep. Variable:,productivity_change,R-squared:,0.752
Model:,OLS,Adj. R-squared:,0.726
Method:,Least Squares,F-statistic:,28.83
Date:,"Mon, 30 Nov 2020",Prob (F-statistic):,4.73e-11
Time:,11:34:06,Log-Likelihood:,-38.105
No. Observations:,43,AIC:,86.21
Df Residuals:,38,BIC:,95.02
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-0.6371,0.435,-1.464,0.151,-1.518,0.244
social_life,0.2156,0.096,2.237,0.031,0.020,0.411
time_outside_work,0.0815,0.025,3.232,0.003,0.030,0.133
focus,0.6168,0.076,8.104,0.000,0.463,0.771
physical_health,0.2305,0.080,2.897,0.006,0.069,0.392

0,1,2,3
Omnibus:,2.519,Durbin-Watson:,1.687
Prob(Omnibus):,0.284,Jarque-Bera (JB):,2.368
Skew:,-0.538,Prob(JB):,0.306
Kurtosis:,2.595,Cond. No.,34.5


<h3> Regressing Productivity Change without 'focus' </h3>

<p> The fact that 'focus' is strongly correlated with productivity, is not really surprising. However, in order to get some more useful results, we tried a model without the 'focus' factor to see if we can achieve similar R^2 results and some more information what might be causing higher focus </p>

In [57]:
# Trying a regression without focus, because that is like the best predictor but not really helpful (because we wan't to know what causes this focus)

Y = df['productivity_change']

#X = df[['intercept', 'semester', 'in_ch_1_semester', 'in_ch_1_year', 'in_ch_2-5_years', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

X = df[['intercept', 'commute_feeling', 'schedule']]


results = sm.OLS(Y,X).fit()
results.summary()  

0,1,2,3
Dep. Variable:,productivity_change,R-squared:,0.551
Model:,OLS,Adj. R-squared:,0.529
Method:,Least Squares,F-statistic:,24.57
Date:,"Mon, 30 Nov 2020",Prob (F-statistic):,1.1e-07
Time:,11:46:18,Log-Likelihood:,-50.864
No. Observations:,43,AIC:,107.7
Df Residuals:,40,BIC:,113.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,0.3519,0.383,0.918,0.364,-0.423,1.127
commute_feeling,0.2874,0.103,2.786,0.008,0.079,0.496
schedule,0.4899,0.095,5.163,0.000,0.298,0.682

0,1,2,3
Omnibus:,4.667,Durbin-Watson:,2.238
Prob(Omnibus):,0.097,Jarque-Bera (JB):,3.987
Skew:,0.745,Prob(JB):,0.136
Kurtosis:,3.062,Cond. No.,14.8


<h3> Regressing Well-Being Change </h3>

<p> Note that currently I just averaged all the well being answers from the survey. Will have to look up what method they used in the paper. </p>

In [77]:
# Predicting well_being
# https://link.springer.com/article/10.1186/1477-7525-11-66

Y = df[['well_being_1', 'well_being_2', 'well_being_3', 'well_being_4', 'well_being_5', 'well_being_6', 'well_being_7', 'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11']]
Y['well_being_10'] = Y['well_being_10'] * -1

well_being_sum = Y.sum(axis=1)
Y.loc[:,'well_being_sum'] = well_being_sum

Y_well_being = Y['well_being_sum']
Y_well_being.head()

#X = df[['intercept', 'semester', 'in_ch_1_semester', 'in_ch_1_year', 'in_ch_2-5_years', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

# X = df[['semester', 'in_ch_1_year', 'in_ch_2-5_years', 'family_in_ch', 'class_in_person', 'recorded', 'study_place', 'commute_feeling', 'task_management', 'discipline', 'exams', 'sleep']]
X = df[['intercept', 'in_ch_1_year', 'in_ch_2-5_years', 'family_in_ch', 'study_place', 
              'task_management', 'discipline', 'seperation', 'sleep']]


results = sm.OLS(Y_well_being,X).fit()
results.summary()  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['well_being_10'] = Y['well_being_10'] * -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


0,1,2,3
Dep. Variable:,well_being_sum,R-squared:,0.768
Model:,OLS,Adj. R-squared:,0.713
Method:,Least Squares,F-statistic:,14.04
Date:,"Mon, 30 Nov 2020",Prob (F-statistic):,9.14e-09
Time:,13:50:12,Log-Likelihood:,-127.97
No. Observations:,43,AIC:,273.9
Df Residuals:,34,BIC:,289.8
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,8.0996,4.171,1.942,0.060,-0.378,16.577
in_ch_1_year,-10.2549,3.158,-3.248,0.003,-16.672,-3.838
in_ch_2-5_years,-8.1675,2.713,-3.011,0.005,-13.680,-2.655
family_in_ch,-6.9071,2.453,-2.816,0.008,-11.892,-1.922
study_place,5.3370,1.238,4.311,0.000,2.821,7.853
task_management,-2.4262,0.967,-2.509,0.017,-4.392,-0.461
discipline,2.5662,0.952,2.697,0.011,0.632,4.500
seperation,1.5958,0.804,1.986,0.055,-0.037,3.229
sleep,1.9848,0.785,2.529,0.016,0.390,3.580

0,1,2,3
Omnibus:,2.267,Durbin-Watson:,2.24
Prob(Omnibus):,0.322,Jarque-Bera (JB):,2.143
Skew:,-0.491,Prob(JB):,0.342
Kurtosis:,2.519,Cond. No.,46.2


<h2> Testing our hypothesis </h2>

<h3> Perceived productivity </h3>

In [187]:
df = pd.read_excel('data/survey_results.xlsx', header=0, names=['timestamp', 'consent_form_approval', 'semester', 'time_in_ch', 'in_ch_over_5_years', 'family_lives_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place',
                                                            'commute_minutes', 'commute_feeling', 'support', 'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams',
                                                            'focus', 'distractions', 'switch', 'gpa', 'sleep', 'physical_health', 'productivity_change', 'well_being_1', 'well_being_2', 'well_being_3', 'well_being_4',
                                                            'well_being_5', 'well_being_6', 'well_being_7', 'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11'])

print(df.columns)
df.drop(['timestamp', 'consent_form_approval', 'time_in_ch'], axis=1, inplace=True)


# Does your family live in Switzerland?
# create dummy variables
family_in_ch = pd.get_dummies(df['family_lives_in_ch'])
# delete last dummy variable to escape the dummy variable trap
family_in_ch.drop(["No"], axis=1, inplace=True)
family_in_ch.rename(columns = {"Yes": "family_in_ch"}, inplace=True)
df = pd.concat([df, family_in_ch], axis=1)
df.drop(['family_lives_in_ch'], axis=1, inplace=True)

df['intercept'] = 1


df.to_excel('data/simple_pre_processed.xlsx')

df.head()

Index(['timestamp', 'consent_form_approval', 'semester', 'time_in_ch',
       'in_ch_over_5_years', 'family_lives_in_ch', 'social_life', 'hobbies',
       'class_in_person', 'live_online', 'recorded', 'study_place',
       'commute_minutes', 'commute_feeling', 'support', 'schedule',
       'task_management', 'discipline', 'time_outside_work', 'seperation',
       'workload', 'exams', 'focus', 'distractions', 'switch', 'gpa', 'sleep',
       'physical_health', 'productivity_change', 'well_being_1',
       'well_being_2', 'well_being_3', 'well_being_4', 'well_being_5',
       'well_being_6', 'well_being_7', 'well_being_8', 'well_being_9',
       'well_being_10', 'well_being_11'],
      dtype='object')


Unnamed: 0,semester,in_ch_over_5_years,social_life,hobbies,class_in_person,live_online,recorded,study_place,commute_minutes,commute_feeling,...,well_being_4,well_being_5,well_being_6,well_being_7,well_being_8,well_being_9,well_being_10,well_being_11,family_in_ch,intercept
0,5,0,2,2,4,1,4,2,-45,2,...,1,3,1,4,1,4,2,4,0,1
1,15,1,2,4,4,3,3,3,-80,4,...,2,3,2,2,2,2,4,2,1,1
2,9,1,2,2,3,4,4,2,0,3,...,3,2,1,2,5,2,2,3,1,1
3,10,1,3,5,2,3,4,4,-240,5,...,4,2,2,3,4,2,3,3,1,1
4,3,1,5,4,2,2,2,5,0,5,...,3,5,3,5,5,3,3,2,1,1


In [197]:
Y = df['productivity_change']

# original, wich all the columns in there
#X = df[['intercept', 'semester', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

X = df[['intercept', 'schedule', 'focus', 'time_outside_work']]



results = sm.OLS(Y,X).fit()
results.summary()  

0,1,2,3
Dep. Variable:,productivity_change,R-squared:,0.729
Model:,OLS,Adj. R-squared:,0.708
Method:,Least Squares,F-statistic:,34.94
Date:,"Mon, 30 Nov 2020",Prob (F-statistic):,3.88e-11
Time:,14:30:56,Log-Likelihood:,-40.036
No. Observations:,43,AIC:,88.07
Df Residuals:,39,BIC:,95.12
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,0.4215,0.241,1.752,0.088,-0.065,0.908
schedule,0.2592,0.087,2.967,0.005,0.083,0.436
focus,0.5093,0.092,5.557,0.000,0.324,0.695
time_outside_work,0.0608,0.027,2.284,0.028,0.007,0.115

0,1,2,3
Omnibus:,1.907,Durbin-Watson:,1.803
Prob(Omnibus):,0.385,Jarque-Bera (JB):,1.73
Skew:,-0.472,Prob(JB):,0.421
Kurtosis:,2.725,Cond. No.,17.0


<h3> Well being</h3>

In [206]:
# Predicting well_being
# https://link.springer.com/article/10.1186/1477-7525-11-66

Y = df[['well_being_1', 'well_being_2', 'well_being_3', 'well_being_4', 'well_being_5', 'well_being_6', 'well_being_7', 'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11']]
Y['well_being_10'] = Y['well_being_10'] * -1

well_being_sum = Y.sum(axis=1)
Y.loc[:,'well_being_sum'] = well_being_sum

Y_well_being = Y['well_being_sum']
Y_well_being.head()

#X = df[['intercept', 'semester', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

X = df[['intercept', 'support', 'study_place', 'sleep']]


results = sm.OLS(Y_well_being,X).fit()
results.summary()  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['well_being_10'] = Y['well_being_10'] * -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


0,1,2,3
Dep. Variable:,well_being_sum,R-squared:,0.662
Model:,OLS,Adj. R-squared:,0.636
Method:,Least Squares,F-statistic:,25.47
Date:,"Mon, 30 Nov 2020",Prob (F-statistic):,2.71e-09
Time:,14:32:01,Log-Likelihood:,-136.02
No. Observations:,43,AIC:,280.0
Df Residuals:,39,BIC:,287.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-0.1551,4.026,-0.039,0.969,-8.298,7.988
support,2.7922,1.056,2.644,0.012,0.656,4.929
study_place,5.2870,0.935,5.657,0.000,3.397,7.177
sleep,1.6683,0.827,2.018,0.050,-0.004,3.340

0,1,2,3
Omnibus:,2.242,Durbin-Watson:,1.848
Prob(Omnibus):,0.326,Jarque-Bera (JB):,1.405
Skew:,0.155,Prob(JB):,0.495
Kurtosis:,2.17,Cond. No.,25.8
