You can answer the following questions when performing an analysis on this dataset:

1. Which country is the happiest in the world?
2. What are the most important contributing factors to a nation’s happiness?
3. Is overall happiness increasing or decreasing?

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
np.random.seed(42)

## Defining Needed Functions

In [2]:
def Heatmap(df):
    """Show an interactive correlation heatmap for the numeric columns of ``df``.

    Non-numeric columns are excluded before the pairwise correlations are
    computed. The figure is displayed immediately; nothing is returned.
    """
    correlations = df.select_dtypes(include=['number']).corr()

    # Axis settings are built up front so the layout call stays on one line.
    x_axis = {'tickangle': 45, 'title': "Columns", 'side': 'top'}
    y_axis = {'title': "Columns"}

    heatmap = px.imshow(correlations, text_auto=True, width=1000, height=1000)
    heatmap.update_layout(title="Correlation Heatmap", xaxis=x_axis, yaxis=y_axis)
    heatmap.show()

In [3]:
def imputation(df):
    """Fill missing values in the numeric columns of ``df`` with IterativeImputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain NaNs. Only its numeric columns are used;
        ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed numeric columns, with the same column labels and the same
        row index as ``df``, so the result can be assigned straight back.
    """
    # Resolve the numeric columns from the argument itself instead of relying on
    # a notebook-level ``numeric_cols`` variable being defined before the call.
    numeric_cols = df.select_dtypes(include='number').columns

    imputer = IterativeImputer(max_iter=10)
    imputed_values = imputer.fit_transform(df[numeric_cols])

    # fit_transform returns a bare array; re-attach the caller's index so that
    # column assignment aligns row-for-row even when the index is not 0..n-1.
    return pd.DataFrame(imputed_values, columns=numeric_cols, index=df.index)

In [4]:
def Boxy(df, num_of_cols=2):
    """Draw one horizontal box plot per column of ``df`` on a subplot grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot each.
    num_of_cols : int, default 2
        Number of subplots per grid row.

    The figure is displayed immediately; nothing is returned.
    """
    # Size the grid from the frame being plotted rather than from notebook-level
    # ``num_of_rows`` / ``num_of_cols`` variables, so the function works on a
    # fresh kernel and for frames with any number of columns.
    num_of_rows = max(1, -(-len(df.columns) // num_of_cols))  # ceiling division

    fig = make_subplots(rows=num_of_rows, cols=num_of_cols, subplot_titles=list(df.columns))

    for i, col in enumerate(df.columns):
        # divmod gives the zero-based grid cell; plotly rows/cols are one-based.
        row, col_num = divmod(i, num_of_cols)
        fig.add_trace(go.Box(x=df[col], name=col), row=row + 1, col=col_num + 1)

    fig.update_layout(width=1100, height=num_of_rows * 400, title='skewing test')
    fig.show()

## Reading Original Data

In [5]:
# Raw string: the Windows backslashes in this path are otherwise parsed as
# (invalid) escape sequences, which triggers a warning when the cell runs.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
original_df = pd.read_csv(r'D:\Just Data\World Happiness Report\World-happiness-report-updated_2024.csv', encoding = 'ISO-8859-1')

  original_df = pd.read_csv('D:\Just Data\World Happiness Report\World-happiness-report-updated_2024.csv', encoding = 'ISO-8859-1')


## Exploring Data

In [6]:
# Preview the first 20 rows of the raw data.
original_df.head(20)

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.35,0.451,50.5,0.718,0.164,0.882,0.414,0.258
1,Afghanistan,2009,4.402,7.509,0.552,50.8,0.679,0.187,0.85,0.481,0.237
2,Afghanistan,2010,4.758,7.614,0.539,51.1,0.6,0.118,0.707,0.517,0.275
3,Afghanistan,2011,3.832,7.581,0.521,51.4,0.496,0.16,0.731,0.48,0.267
4,Afghanistan,2012,3.783,7.661,0.521,51.7,0.531,0.234,0.776,0.614,0.268
5,Afghanistan,2013,3.572,7.68,0.484,52.0,0.578,0.059,0.823,0.547,0.273
6,Afghanistan,2014,3.131,7.671,0.526,52.3,0.509,0.102,0.871,0.492,0.375
7,Afghanistan,2015,3.983,7.654,0.529,52.6,0.389,0.078,0.881,0.491,0.339
8,Afghanistan,2016,4.22,7.65,0.559,52.925,0.523,0.04,0.793,0.501,0.348
9,Afghanistan,2017,2.662,7.648,0.491,53.25,0.427,-0.123,0.954,0.435,0.371


In [7]:
# Work on a copy so original_df keeps the data exactly as it was read.
df = original_df.copy()

In [8]:
# Count the missing values in each column.
df.isnull().sum()

Country name                          0
year                                  0
Life Ladder                           0
Log GDP per capita                   28
Social support                       13
Healthy life expectancy at birth     63
Freedom to make life choices         36
Generosity                           81
Perceptions of corruption           125
Positive affect                      24
Negative affect                      16
dtype: int64

In [9]:
# (rows, columns) of the working frame.
df.shape

(2363, 11)

In [10]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2363 entries, 0 to 2362
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2363 non-null   object 
 1   year                              2363 non-null   int64  
 2   Life Ladder                       2363 non-null   float64
 3   Log GDP per capita                2335 non-null   float64
 4   Social support                    2350 non-null   float64
 5   Healthy life expectancy at birth  2300 non-null   float64
 6   Freedom to make life choices      2327 non-null   float64
 7   Generosity                        2282 non-null   float64
 8   Perceptions of corruption         2238 non-null   float64
 9   Positive affect                   2339 non-null   float64
 10  Negative affect                   2347 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 203.2+ KB


In [11]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
count,2363.0,2363.0,2335.0,2350.0,2300.0,2327.0,2282.0,2238.0,2339.0,2347.0
mean,2014.76386,5.483566,9.399671,0.809369,63.401828,0.750282,9.8e-05,0.743971,0.651882,0.273151
std,5.059436,1.125522,1.152069,0.121212,6.842644,0.139357,0.161388,0.184865,0.10624,0.087131
min,2005.0,1.281,5.527,0.228,6.72,0.228,-0.34,0.035,0.179,0.083
25%,2011.0,4.647,8.5065,0.744,59.195,0.661,-0.112,0.687,0.572,0.209
50%,2015.0,5.449,9.503,0.8345,65.1,0.771,-0.022,0.7985,0.663,0.262
75%,2019.0,6.3235,10.3925,0.904,68.5525,0.862,0.09375,0.86775,0.737,0.326
max,2023.0,8.019,11.676,0.987,74.6,0.985,0.7,0.983,0.884,0.705


## Imputation to fill NaN values

In [13]:
# Labels of every numeric column in df; later cells reuse this Index.
# 'year' is numeric too, so it goes through the imputer and comes back as float.
numeric_cols = df.select_dtypes(include='number').columns

# Impute the numeric columns, then write them into a copy so df keeps its NaNs.
df_imputed = imputation(df)
df_updated = df.copy()
# Column assignment aligns on the row index, so df_imputed must share df's index.
df_updated[numeric_cols] = df_imputed
df_updated.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008.0,3.724,7.35,0.451,50.5,0.718,0.164,0.882,0.414,0.258
1,Afghanistan,2009.0,4.402,7.509,0.552,50.8,0.679,0.187,0.85,0.481,0.237
2,Afghanistan,2010.0,4.758,7.614,0.539,51.1,0.6,0.118,0.707,0.517,0.275
3,Afghanistan,2011.0,3.832,7.581,0.521,51.4,0.496,0.16,0.731,0.48,0.267
4,Afghanistan,2012.0,3.783,7.661,0.521,51.7,0.531,0.234,0.776,0.614,0.268


In [14]:
# Count the missing values left in each column after imputation.
df_updated.isna().sum()

Country name                        0
year                                0
Life Ladder                         0
Log GDP per capita                  0
Social support                      0
Healthy life expectancy at birth    0
Freedom to make life choices        0
Generosity                          0
Perceptions of corruption           0
Positive affect                     0
Negative affect                     0
dtype: int64

## Checking column distributions to choose an outlier-handling technique

In [15]:
# Grid layout for the per-column histograms: a fixed number of plots per row
# and just enough rows to hold every column.
num_of_cols = 2
# Ceiling division, so no empty trailing row is added when the column count is
# an exact multiple of num_of_cols; at least one row keeps make_subplots valid.
num_of_rows = max(1, -(-len(df_updated.columns) // num_of_cols))
fig = make_subplots(rows=num_of_rows, cols=num_of_cols, subplot_titles=list(df_updated.columns))

for i, col in enumerate(df_updated.columns):
    # divmod yields the zero-based grid cell; plotly rows/cols are one-based.
    row, col_num = divmod(i, num_of_cols)
    row, col_num = row + 1, col_num + 1
    fig.add_trace(go.Histogram(x=df_updated[col], name=col), row=row, col=col_num)

fig.update_layout(width=1100, height=num_of_rows * 400, title='Distribution of Columns Values')
fig.show()

In [16]:
# Box plots of every column before any outlier handling.
Boxy(df_updated)

In [17]:
# Row/column count before outlier removal, for comparison afterwards.
df_updated.shape

(2363, 11)

## Removing Outliers using IQR

In [18]:
# Column-by-column 1.5*IQR filter. The frame is narrowed after every column, so
# the quartiles of each later column are computed on the rows that survived the
# earlier columns' filters (the result therefore depends on column order, and
# re-running this cell filters the already-filtered frame again).
for col in numeric_cols:
    Q1, Q3 = df_updated[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1

    lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR

    # between() is inclusive on both ends, i.e. lower_limit <= value <= upper_limit.
    inside_fences = df_updated[col].between(lower_limit, upper_limit)
    df_updated = df_updated[inside_fences]

Boxy(df_updated)

In [19]:
# Row/column count after outlier removal.
df_updated.shape

(2030, 11)

In [20]:
# Number of distinct countries still present after outlier removal.
df_updated['Country name'].nunique()

159

## Finding Correlations between Features

In [26]:
# NOTE(review): this plots `df` (the frame before imputation and outlier removal),
# not `df_updated` - confirm that is intended. The cell's execution count is also
# out of sequence with its neighbours; re-run the notebook top to bottom before sharing.
Heatmap(df)

## Adding Happiness Score column based on weights of other columns towards happiness

In [21]:
# Weight given to each indicator in the composite score.
# NOTE(review): the weights add up to 2.4 rather than 1, and 'Perceptions of
# corruption' contributes positively to the score - confirm both are intended.
weights = {
    'Life Ladder': 0.5,
    'Log GDP per capita': 0.5,
    'Social support': 0.5,
    'Healthy life expectancy at birth': 0.4,
    'Freedom to make life choices': 0.3,
    'Perceptions of corruption': 0.2,
}

# The scaled columns are exactly the weighted ones, in the same order.
features = list(weights)

# Min-max scale the indicators, overwriting the original values in df_updated.
scaler = MinMaxScaler()
df_updated[features] = scaler.fit_transform(df_updated[features])

# Weighted sum of the scaled indicators, accumulated column by column.
df_updated['happiness score'] = sum(
    df_updated[name] * weight
    for name, weight in weights.items()
)

df_updated.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,happiness score
1,Afghanistan,2009.0,0.351631,0.27556,0.099792,0.148175,0.499182,0.187,0.77265,0.481,0.237,0.727046
2,Afghanistan,2010.0,0.422434,0.293943,0.072765,0.158912,0.369885,0.118,0.528205,0.517,0.275,0.674742
3,Afghanistan,2011.0,0.238266,0.288165,0.035343,0.169649,0.199673,0.16,0.569231,0.48,0.267,0.522495
4,Afghanistan,2012.0,0.22852,0.302171,0.035343,0.180387,0.256956,0.234,0.646154,0.614,0.268,0.561489
6,Afghanistan,2014.0,0.098846,0.303922,0.045738,0.201861,0.220949,0.102,0.808547,0.492,0.375,0.532992


In [22]:
# Mean happiness score per country across all of its remaining rows,
# ordered from the highest mean to the lowest.
Happiest_countries = (
    df_updated
    .groupby('Country name')['happiness score']
    .mean()
    .sort_values(ascending=False)
)
Happiest_countries

Country name
Iceland        2.142942
Sweden         2.044426
Norway         2.039166
Switzerland    2.015894
Luxembourg     1.999976
                 ...   
Haiti          0.636781
South Sudan    0.633517
Rwanda         0.631109
Afghanistan    0.586074
Burundi        0.423700
Name: happiness score, Length: 159, dtype: float64

## Plotting the 30 happiest countries

In [23]:
def Bar_Plot(df, top_n=30):
    """Plot the ``top_n`` largest values of a Series as a horizontal bar chart.

    Parameters
    ----------
    df : pandas.Series
        Despite the parameter name, a Series is expected (``nlargest`` is called
        without a column argument); its index supplies the bar labels.
    top_n : int, default 30
        How many of the largest entries to show.

    The figure is displayed immediately; nothing is returned.
    """
    # nlargest returns a new Series, so chain the ascending sort instead of
    # sorting in place.
    top_entries = df.nlargest(top_n).sort_values()

    fig = px.bar(top_entries, orientation='h')
    fig.update_layout(width=1000, height=1000, xaxis_title='Happiness Score',
                      yaxis_title='Countries', yaxis_tickangle=45, title='Happiest_countries')
    fig.show()

Bar_Plot(Happiest_countries)

In [24]:
# Preview the frame, now holding the scaled indicators and the 'happiness score' column.
df_updated.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,happiness score
1,Afghanistan,2009.0,0.351631,0.27556,0.099792,0.148175,0.499182,0.187,0.77265,0.481,0.237,0.727046
2,Afghanistan,2010.0,0.422434,0.293943,0.072765,0.158912,0.369885,0.118,0.528205,0.517,0.275,0.674742
3,Afghanistan,2011.0,0.238266,0.288165,0.035343,0.169649,0.199673,0.16,0.569231,0.48,0.267,0.522495
4,Afghanistan,2012.0,0.22852,0.302171,0.035343,0.180387,0.256956,0.234,0.646154,0.614,0.268,0.561489
6,Afghanistan,2014.0,0.098846,0.303922,0.045738,0.201861,0.220949,0.102,0.808547,0.492,0.375,0.532992


## Happiness rate over the years

In [25]:
# Mean happiness score per year, as a two-column frame ordered by year.
Happiness_over_years = (
    df_updated
    .groupby("year")["happiness score"]
    .mean()
    .reset_index()
    .sort_values(by='year', ascending=True)
)

# One horizontal bar per year.
fig = px.bar(Happiness_over_years, x='happiness score', y='year', orientation='h')
fig.update_layout(
    title='Happiness Over Years',
    xaxis_title='Happiness score',
    yaxis_title='Years',
    yaxis_tickangle=45,
    width=800,
    height=800,
)

fig.show()