In [1]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import json

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

import statsmodels.api as sm

In [3]:
# Read the 2019 happiness CSV (path is relative to the notebook's working directory)

df = pd.read_csv(filepath_or_buffer='./datasets/happiness_2019.csv')

In [4]:
# Summarise every column at once: include='all' adds the non-numeric columns
# (count / unique / top / freq) next to the usual numeric statistics.

summary_stats = df.describe(include='all')
summary_stats

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156,156.0,156.0,156.0,156.0,156.0,156.0,156.0
unique,,156,,,,,,,
top,,Finland,,,,,,,
freq,,1,,,,,,,
mean,78.5,,5.407096,0.905147,1.208814,0.725244,0.392571,0.184846,0.110603
std,45.177428,,1.11312,0.398389,0.299191,0.242124,0.143289,0.095254,0.094538
min,1.0,,2.853,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.75,,4.5445,0.60275,1.05575,0.54775,0.308,0.10875,0.047
50%,78.5,,5.3795,0.96,1.2715,0.789,0.417,0.1775,0.0855
75%,117.25,,6.1845,1.2325,1.4525,0.88175,0.50725,0.24825,0.14125


In [5]:
# Count rows that are exact copies of an earlier row

duplicate_rows = df.duplicated()
duplicate_rows.sum()

0

In [6]:
# Per-column count of missing (NaN) entries

df.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [7]:
# Drop every row that has at least one missing value, then re-count the NaNs
# per column to confirm nothing is left.

clean_df = df.dropna(axis=0, how='any')

clean_df.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [9]:
# Normalise the column labels to snake_case: lower-case, spaces -> underscores
clean_df.columns = [label.lower().replace(' ', '_') for label in clean_df.columns]

clean_df.columns

Index(['overall_rank', 'country_or_region', 'score', 'gdp_per_capita',
       'social_support', 'healthy_life_expectancy',
       'freedom_to_make_life_choices', 'generosity',
       'perceptions_of_corruption'],
      dtype='object')

In [14]:
# Build the feature matrix X and target vector y, then split them into train and test sets

# Predictor columns fed to the regression; 'score' is the value being predicted.
feature_columns = ['gdp_per_capita', 'social_support', 'healthy_life_expectancy', 'perceptions_of_corruption']
X = clean_df[feature_columns]
y = clean_df['score']

# Fixed seed so the split -- and therefore the coefficients and metrics derived
# from it -- is the same on every Restart & Run All. A randomly drawn seed would
# make each run produce different numbers.
SPLIT_SEED = 42

# 25% of the rows are held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SPLIT_SEED)

In [15]:
# Fit a linear regression on the training split

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# R² measured on the same data the model was fitted on (in-sample fit quality).
r_sq = model.score(X_train, y_train)

In [16]:
# Evaluate the fitted model on the held-out test set, then fit a statsmodels OLS
# to get a full regression summary table

# Predictions for the test split.
y_pred_test = model.predict(X_test)
# Kept so that any code still reading the old (misleading) name keeps working.
y_pred_train = y_pred_test

# Error metrics on the test split; r_sq is reassigned here and from now on
# holds the test-set R².
r_sq = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)  # derive RMSE from the MSE already computed above

# statsmodels' OLS does not add an intercept term on its own, hence add_constant.
# NOTE: this OLS is fitted on the full dataset (X, y), not on the training split,
# so its numbers are not directly comparable to the test-set metrics above.
x = sm.add_constant(X)
results = sm.OLS(y, x).fit()
results.summary()

In [17]:
# Tabulate the fitted coefficients, one row per feature

cdf = pd.DataFrame(
    data=model.coef_,
    index=X.columns,
    columns=['Coefficients'],
)
cdf

Unnamed: 0,Coefficients
gdp_per_capita,0.64956
social_support,1.431894
healthy_life_expectancy,1.290839
perceptions_of_corruption,1.767437


In [None]:
# Bundle the cleaned data, a short description and the model metrics, then write
# everything to a JSON file

# Coefficients keyed by feature name, followed by the evaluation metrics and the
# name of the target column.
metrics_summary = {
    'Coefficients': cdf['Coefficients'].to_dict(),
    'R2': r_sq,
    'MAE': mae,
    'MSE': mse,
    'RMSE': rmse,
    "Reference": "score",
}

result = {
    'data': clean_df.to_dict(),
    'context': 'The dataset contains the happiness score of 156 countries based on the factors of GDP per capita, social support, healthy life expectancy, and perceptions of corruption. The goal is to predict the happiness score based on these factors.',
    'metrics': metrics_summary,
}

# Stream the JSON straight into the output file, pretty-printed with 4-space indent.
with open('lr_happiness_result.json', 'w') as out_file:
    json.dump(result, out_file, indent=4)