In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

<h1> HASE </h1>

<p> 
Before importing the data, we manually made some adjustments to the data. For commute times we for instance had to remove some unnecessary tokens (e.g. '-10 min' should just be '-10')
Also, for entries with missing data (which happened for 'GPA', the average of all the valid entries was used for that.
</p>

<h2> Importing data and creating dummy variables for categorical variables </h2>

<p> 
    The dataset ready for the regression is exported and saved under 'pre_processed.xlsx' </p>

In [2]:
df = pd.read_excel('data/survey_results.xlsx', header=0, names=['timestamp', 'consent_form_approval', 'semester', 'time_in_ch', 'family_lives_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place',
                                                            'commute_minutes', 'commute_feeling', 'support', 'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams',
                                                            'focus', 'distractions', 'switch', 'gpa', 'sleep', 'physical_health', 'productivity_change', 'well_being_1', 'well_being_2', 'well_being_3', 'well_being_4',
                                                            'well_being_5', 'well_being_6', 'well_being_7', 'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11'])

print(df.columns)
df.drop(['timestamp', 'consent_form_approval'], axis=1, inplace=True)

# how long have you lived in CH?
# create dummy variables
time_in_ch = pd.get_dummies(df['time_in_ch'])
# delete last dummy variable to escape the dummy variable trap
time_in_ch.drop(['lifetime'], axis=1, inplace=True)
time_in_ch.rename(columns = {'1 semester': 'in_ch_1_semester', '1 year': 'in_ch_1_year', '2-5 years': 'in_ch_2-5_years', '>5 years': 'in_ch_over_5_years'}, inplace=True)
df = pd.concat([df, time_in_ch], axis=1)
df.drop(['time_in_ch'], axis=1, inplace=True)

# Does your family live in Switzerland?
# create dummy variables
family_in_ch = pd.get_dummies(df['family_lives_in_ch'])
# delete last dummy variable to escape the dummy variable trap
family_in_ch.drop(["No"], axis=1, inplace=True)
family_in_ch.rename(columns = {"Yes": "family_in_ch"}, inplace=True)
df = pd.concat([df, family_in_ch], axis=1)
df.drop(['family_lives_in_ch'], axis=1, inplace=True)

df.to_excel('data/pre_processed.xlsx')

df.head()

Index(['timestamp', 'consent_form_approval', 'semester', 'time_in_ch',
       'family_lives_in_ch', 'social_life', 'hobbies', 'class_in_person',
       'live_online', 'recorded', 'study_place', 'commute_minutes',
       'commute_feeling', 'support', 'schedule', 'task_management',
       'discipline', 'time_outside_work', 'seperation', 'workload', 'exams',
       'focus', 'distractions', 'switch', 'gpa', 'sleep', 'physical_health',
       'productivity_change', 'well_being_1', 'well_being_2', 'well_being_3',
       'well_being_4', 'well_being_5', 'well_being_6', 'well_being_7',
       'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11'],
      dtype='object')


Unnamed: 0,semester,social_life,hobbies,class_in_person,live_online,recorded,study_place,commute_minutes,commute_feeling,support,...,well_being_7,well_being_8,well_being_9,well_being_10,well_being_11,in_ch_1_semester,in_ch_1_year,in_ch_2-5_years,in_ch_over_5_years,family_in_ch
0,2,4,2,4,1,5,3,45,4,5,...,4,5,4,1,5,0,0,1,0,1
1,11,4,3,3,3,5,5,-40,4,3,...,5,5,3,2,4,0,0,0,0,0
2,3,3,1,5,2,4,1,-50,4,1,...,3,2,1,4,1,0,1,0,0,0
3,5,2,2,4,1,4,2,-45,2,1,...,4,1,4,2,4,0,0,1,0,0
4,15,2,4,4,3,3,3,-80,4,2,...,2,2,2,4,2,0,0,0,0,1


<h2> Building multiple Linear Regression models </h2>

<p> Initially, all of the factors were included in the model. We then used Backward Elimination to end up with the best variables. </p>

<h3> Regressing Productivity Change </h3>

In [3]:
Y = df['productivity_change']

# original, wich all the columns in there
#X = df[['semester', 'in_ch_1_semester', 'in_ch_1_year', 'in_ch_2-5_years', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

X = df[[ 'in_ch_1_semester', 'social_life', 'focus']]

results = sm.OLS(Y,X).fit()
results.summary()  

0,1,2,3
Dep. Variable:,productivity_change,R-squared (uncentered):,0.944
Model:,OLS,Adj. R-squared (uncentered):,0.939
Method:,Least Squares,F-statistic:,175.6
Date:,"Tue, 24 Nov 2020",Prob (F-statistic):,1.55e-19
Time:,13:39:22,Log-Likelihood:,-35.134
No. Observations:,34,AIC:,76.27
Df Residuals:,31,BIC:,80.85
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
in_ch_1_semester,0.9290,0.385,2.411,0.022,0.143,1.715
social_life,0.2645,0.081,3.273,0.003,0.100,0.429
focus,0.6651,0.096,6.924,0.000,0.469,0.861

0,1,2,3
Omnibus:,5.304,Durbin-Watson:,1.583
Prob(Omnibus):,0.071,Jarque-Bera (JB):,4.384
Skew:,-0.877,Prob(JB):,0.112
Kurtosis:,3.127,Cond. No.,13.4


<h3> Regressing Productivity Change without 'focus' </h3>

<p> The fact that 'focus' is strongly correlated with productivity, is not really surprising. However, in order to get some more useful results, we tried a model without the 'focus' factor to see if we can achieve similar R^2 results and some more information what might be causing higher focus </p>

In [4]:
# Trying a regression without focus, because that is like the best predictor but not really helpful (because we wan't to know what causes this focus)

Y = df['productivity_change']

#X = df[['semester', 'in_ch_1_semester', 'in_ch_1_year', 'in_ch_2-5_years', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

X = df[['semester', 'in_ch_1_semester', 'in_ch_1_year', 'in_ch_2-5_years', 'in_ch_over_5_years', 'social_life', 'live_online', 'commute_feeling', 'support', 'time_outside_work', 'seperation', 'exams', 'gpa', 'physical_health']]


results = sm.OLS(Y,X).fit()
results.summary()  

0,1,2,3
Dep. Variable:,productivity_change,R-squared (uncentered):,0.952
Model:,OLS,Adj. R-squared (uncentered):,0.919
Method:,Least Squares,F-statistic:,28.5
Date:,"Tue, 24 Nov 2020",Prob (F-statistic):,3.77e-10
Time:,13:39:22,Log-Likelihood:,-32.552
No. Observations:,34,AIC:,93.1
Df Residuals:,20,BIC:,114.5
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
semester,0.3474,0.102,3.413,0.003,0.135,0.560
in_ch_1_semester,2.4390,0.839,2.907,0.009,0.689,4.189
in_ch_1_year,1.5095,0.678,2.227,0.038,0.095,2.924
in_ch_2-5_years,1.8714,0.806,2.322,0.031,0.190,3.553
in_ch_over_5_years,2.6298,0.856,3.071,0.006,0.844,4.416
social_life,0.6295,0.284,2.213,0.039,0.036,1.223
live_online,0.5512,0.156,3.527,0.002,0.225,0.877
commute_feeling,-0.5011,0.222,-2.256,0.035,-0.964,-0.038
support,-0.6543,0.251,-2.611,0.017,-1.177,-0.132

0,1,2,3
Omnibus:,1.842,Durbin-Watson:,1.364
Prob(Omnibus):,0.398,Jarque-Bera (JB):,1.386
Skew:,0.492,Prob(JB):,0.5
Kurtosis:,2.905,Cond. No.,112.0


<h3> Regressing Well-Being Change </h3>

<p> Note that currently I just averaged all the well being answers from the survey. Will have to look up what method they used in the paper. </p>

In [5]:
# Predicting well_being

Y = df[['well_being_1', 'well_being_2', 'well_being_3', 'well_being_4', 'well_being_5', 'well_being_6', 'well_being_7', 'well_being_8', 'well_being_9', 'well_being_10', 'well_being_11']]
well_being_mean = Y.mean(axis=1)
Y.loc[:,'well_being_mean'] = well_being_mean

Y_well_being = Y['well_being_mean']
Y_well_being.head()

#X = df[['semester', 'in_ch_1_semester', 'in_ch_1_year', 'in_ch_2-5_years', 'in_ch_over_5_years', 'family_in_ch', 'social_life', 'hobbies', 'class_in_person', 'live_online', 'recorded', 'study_place','commute_minutes', 'commute_feeling', 'support', 
#              'schedule', 'task_management', 'discipline', 'time_outside_work', 'seperation', 'workload', 'exams', 'focus', 'switch', 'gpa', 'sleep', 'physical_health']]

X = df[['commute_feeling', 'support', 'discipline', 'gpa']]

results = sm.OLS(Y_well_being,X).fit()
results.summary()  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


0,1,2,3
Dep. Variable:,well_being_mean,R-squared (uncentered):,0.981
Model:,OLS,Adj. R-squared (uncentered):,0.978
Method:,Least Squares,F-statistic:,386.5
Date:,"Tue, 24 Nov 2020",Prob (F-statistic):,2.46e-25
Time:,13:39:22,Log-Likelihood:,-21.474
No. Observations:,34,AIC:,50.95
Df Residuals:,30,BIC:,57.05
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
commute_feeling,0.2159,0.073,2.947,0.006,0.066,0.366
support,0.2600,0.091,2.846,0.008,0.073,0.447
discipline,0.2444,0.063,3.873,0.001,0.116,0.373
gpa,0.1765,0.057,3.118,0.004,0.061,0.292

0,1,2,3
Omnibus:,1.267,Durbin-Watson:,1.987
Prob(Omnibus):,0.531,Jarque-Bera (JB):,0.972
Skew:,-0.118,Prob(JB):,0.615
Kurtosis:,2.206,Cond. No.,9.42
