## Linear Regression Model using Statsmodels

In [1]:
import pandas as pd
import statsmodels.api as sm

In [10]:
# Read the cleaned dataset from disk into a DataFrame (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='data/data_clean.csv')

In [11]:
# Response variable: the averaged subjective well-being score
y = df['Subjective well-being average score Average']

# Predictors: every remaining column except the two ward identifier columns
# and the response itself, with an intercept column added.
X = sm.add_constant(
    df.drop(
        ['ward_code', 'Ward', 'Subjective well-being average score Average'],
        axis='columns',
    )
)

# Ordinary least squares fit of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Keep the summary object around and show it as plain text
model_summary = model.summary()
print(model_summary)

                                         OLS Regression Results                                        
Dep. Variable:     Subjective well-being average score Average   R-squared:                       0.065
Model:                                                     OLS   Adj. R-squared:                  0.046
Method:                                          Least Squares   F-statistic:                     3.525
Date:                                         Tue, 09 Jan 2024   Prob (F-statistic):           4.47e-05
Time:                                                 23:32:16   Log-Likelihood:                -865.95
No. Observations:                                          625   AIC:                             1758.
Df Residuals:                                              612   BIC:                             1816.
Df Model:                                                   12                                         
Covariance Type:                                     nonrobust  

In [12]:
# Print the name and estimate of every coefficient whose p-value is below 0.05.
#
# `model.params` and `model.pvalues` are Series labelled by term name, so they
# are filtered with a boolean mask instead of being indexed with integers:
# `series[i]` with an integer key on a string-labelled Series relies on pandas'
# deprecated positional fallback and emits a FutureWarning.
significant_params = model.params[model.pvalues < 0.05]

for term_name, coefficient in significant_params.items():
    print(term_name, coefficient)

Life Expectancy 2009-13 -0.11703666763113421
Unemployment rate Average -0.25391565533775873
population_per_hectare 0.26620865346379824
travel_work_bicycle -0.14056941411305016


  if model.pvalues[i] < 0.05:
  print(model.params.index[i], model.params[i])


In [13]:
# Refit the regression on the reduced set of predictors (those with p < 0.05
# in the full model). The response column is listed last.
columns = [
    'Life Expectancy 2009-13',
    'Unemployment rate Average',
    'population_per_hectare',
    'travel_work_bicycle',
    'Subjective well-being average score Average',
]
df_0 = df.loc[:, columns]

# Response variable
y0 = df_0['Subjective well-being average score Average']

# Predictors: everything in the subset except the response, plus an intercept column
X0 = sm.add_constant(
    df_0.drop('Subjective well-being average score Average', axis='columns')
)

# Ordinary least squares fit of y0 on X0
model0 = sm.OLS(endog=y0, exog=X0).fit()

# Keep the summary object around and show it as plain text
model_summary_0 = model0.summary()
print(model_summary_0)

                                         OLS Regression Results                                        
Dep. Variable:     Subjective well-being average score Average   R-squared:                       0.054
Model:                                                     OLS   Adj. R-squared:                  0.048
Method:                                          Least Squares   F-statistic:                     8.874
Date:                                         Tue, 09 Jan 2024   Prob (F-statistic):           5.68e-07
Time:                                                 23:32:55   Log-Likelihood:                -869.44
No. Observations:                                          625   AIC:                             1749.
Df Residuals:                                              620   BIC:                             1771.
Df Model:                                                    4                                         
Covariance Type:                                     nonrobust  