In [25]:
from src.dataset import HappinessDataset
import logging
import numpy as np
import pandas as pd
from ctgan import CTGAN



In [None]:
!pip install pgmpy
!pip install copulas

In [2]:
# logging configuration
logging.basicConfig(
    level=logging.INFO,
    format="%(name)s:%(lineno)s [%(levelname)s]: %(message)s",
)

In [3]:
DATA = HappinessDataset.from_kaggle()



src.dataset:73 [INFO]: Loaded data for years: ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']


In [4]:
CHECK_MARK = "\u2705"
CROSS_MARK = "\u274C"

for twenty_xy_str in DATA.get_years():
    twenty_xy = int(twenty_xy_str)
    xy = twenty_xy - 2000

    assert (
        DATA[twenty_xy] is DATA[xy]
    ), f"{CROSS_MARK} failed at year {twenty_xy}"

    print(f"{CHECK_MARK} {twenty_xy=} and {xy=} have the same df. :)")

✅ twenty_xy=2015 and xy=15 have the same df. :)
✅ twenty_xy=2016 and xy=16 have the same df. :)
✅ twenty_xy=2017 and xy=17 have the same df. :)
✅ twenty_xy=2018 and xy=18 have the same df. :)
✅ twenty_xy=2019 and xy=19 have the same df. :)
✅ twenty_xy=2020 and xy=20 have the same df. :)
✅ twenty_xy=2021 and xy=21 have the same df. :)
✅ twenty_xy=2022 and xy=22 have the same df. :)
✅ twenty_xy=2023 and xy=23 have the same df. :)


In [5]:
data_2023 = DATA[2023]

In [6]:
len(data_2023)

137

In [51]:
data_2023.to_csv("happiness_2023.csv", index=False)

### Adding noice

In [8]:
df = data_2023

def add_noise(df, noise_level=0.05):
    noisy_df = df.copy()
    numeric_cols = [
        'happiness_score', 'gdp_per_capita', 'social_support',
        'healthy_life_expectancy', 'freedom_to_make_life_choices',
        'generosity', 'perceptions_of_corruption'
    ]
    for col in numeric_cols:
        noise = np.random.normal(0, noise_level * df[col].std(), size=df[col].shape)
        noisy_df[col] += noise
    return noisy_df

# Generate multiple synthetic datasets
num_synthetic_copies = 5  # Adjust this number to generate more or fewer datasets
synthetic_dfs = [add_noise(df, noise_level=0.05) for _ in range(num_synthetic_copies)]

# Concatenate the original DataFrame with all synthetic DataFrames
final_df = pd.concat([df] + synthetic_dfs, axis=0).reset_index(drop=True)

In [9]:
final_df.describe()

Unnamed: 0,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,generosity,perceptions_of_corruption
count,822.0,822.0,822.0,816.0,822.0,822.0,822.0
mean,5.539679,1.407774,1.155933,0.365935,0.539989,0.148579,0.145864
std,1.13672,0.431698,0.325811,0.156362,0.149031,0.075828,0.126423
min,1.743263,-0.025029,-0.018121,-0.012594,-0.012086,-0.002885,-0.007882
25%,4.728543,1.09405,0.966122,0.248234,0.455905,0.097558,0.058733
50%,5.684448,1.446456,1.231278,0.393053,0.557307,0.13629,0.109753
75%,6.354949,1.787836,1.402117,0.48683,0.656,0.198089,0.18626
max,7.875061,2.215017,1.637111,0.711225,0.777815,0.430263,0.570001


In [10]:
df.describe()

Unnamed: 0,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,generosity,perceptions_of_corruption
count,137.0,137.0,137.0,136.0,137.0,137.0,137.0
mean,5.539796,1.406985,1.156212,0.366176,0.54,0.148474,0.145898
std,1.139929,0.432963,0.326322,0.156691,0.149501,0.076053,0.126723
min,1.859,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.724,1.099,0.962,0.2485,0.455,0.097,0.06
50%,5.684,1.449,1.227,0.3895,0.557,0.137,0.111
75%,6.334,1.798,1.401,0.4875,0.656,0.199,0.187
max,7.804,2.2,1.62,0.702,0.772,0.422,0.561


### CTGAN

In [19]:
from ctgan import CTGAN
from ctgan import load_demo

synth_data = df.drop(columns=["country",  'healthy_life_expectancy'])

discrete_columns = ['region']

ctgan = CTGAN(epochs=10)
ctgan.fit(synth_data, discrete_columns)

# Create synthetic data
synthetic_data = ctgan.sample(1000)

rdt.transformers.null:119 [INFO]: Guidance: There are no missing values in column happiness_score. Extra column not created.
rdt.transformers.null:119 [INFO]: Guidance: There are no missing values in column gdp_per_capita. Extra column not created.
rdt.transformers.null:119 [INFO]: Guidance: There are no missing values in column social_support. Extra column not created.
rdt.transformers.null:119 [INFO]: Guidance: There are no missing values in column freedom_to_make_life_choices. Extra column not created.
rdt.transformers.null:119 [INFO]: Guidance: There are no missing values in column generosity. Extra column not created.
rdt.transformers.null:119 [INFO]: Guidance: There are no missing values in column perceptions_of_corruption. Extra column not created.


In [20]:
synthetic_data.describe()

Unnamed: 0,happiness_score,gdp_per_capita,social_support,freedom_to_make_life_choices,generosity,perceptions_of_corruption
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,4.244884,2.039567,0.792013,0.57855,0.173524,0.239614
std,1.465174,0.387813,0.457518,0.219878,0.124831,0.181772
min,0.789742,0.095347,-0.249484,-0.242032,-0.173418,-0.115263
25%,3.198996,1.840935,0.397811,0.446626,0.08462,0.101081
50%,4.178791,2.10457,0.922344,0.60336,0.166471,0.192688
75%,5.253108,2.324664,1.140549,0.747861,0.250151,0.372063
max,8.381159,2.8043,1.735828,0.966621,0.598372,0.757771


### Bayesian network

In [53]:
import pandas as pd
import numpy as np

# Display the first few rows
print("Original DataFrame:")
print(df.head())

# Drop unique identifier 'country'
df_synth = df.drop(['country'], axis=1)

# Encode categorical variables (e.g., 'region')
df_synth['region'] = df_synth['region'].astype('category').cat.codes

# Reset index
df_synth = df_synth.reset_index(drop=True)

# Handle missing values if any
print("\nMissing Values Before Imputation:")
print(df_synth.isnull().sum())

# Fill numerical missing values with mean
numeric_cols = [
    'happiness_score', 'gdp_per_capita', 'social_support',
    'healthy_life_expectancy', 'freedom_to_make_life_choices',
    'generosity', 'perceptions_of_corruption'
]
df_synth[numeric_cols] = df_synth[numeric_cols].fillna(df_synth[numeric_cols].mean())

# Fill categorical missing values with mode
df_synth['region'].fillna(df_synth['region'].mode()[0], inplace=True)

print("\nMissing Values After Imputation:")
print(df_synth.isnull().sum())


Original DataFrame:
       country                        region  happiness_score  gdp_per_capita  \
0      Finland                Western Europe            7.804           1.888   
1      Denmark                Western Europe            7.586           1.949   
2      Iceland                Western Europe            7.530           1.926   
3       Israel  Middle East and North Africa            7.473           1.833   
4  Netherlands                Western Europe            7.403           1.942   

   social_support  healthy_life_expectancy  freedom_to_make_life_choices  \
0           1.585                    0.535                         0.772   
1           1.548                    0.537                         0.734   
2           1.620                    0.559                         0.738   
3           1.521                    0.577                         0.569   
4           1.488                    0.545                         0.672   

   generosity  perceptions_of_corrup

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_synth['region'].fillna(df_synth['region'].mode()[0], inplace=True)


### VAE

In [54]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler

def generate_synthetic_data_vae(df, num_samples=400, latent_dim=2):
    """Generates synthetic data using a Variational Autoencoder (VAE).

    Args:
        df: The original pandas DataFrame containing the data.
        num_samples: The number of synthetic samples to generate.
        latent_dim: The dimensionality of the latent space.

    Returns:
        A pandas DataFrame containing the synthetic data.
    """

    # 1. Data Preprocessing
    # a. Handle NaN/inf values
    df.fillna(df.mean(), inplace=True)  # Impute NaNs with mean
    df = df.replace([np.inf, -np.inf], np.nan).dropna()  # Remove infinite values

    # b. Scale numerical features
    scaler = MinMaxScaler()
    numerical_cols = ['happiness_score', 'gdp_per_capita', 'social_support',
                       'healthy_life_expectancy', 'freedom_to_make_life_choices',
                       'generosity', 'perceptions_of_corruption']
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # 2. Build the VAE Model
    class Sampling(layers.Layer):
        """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
        def call(self, inputs):
            z_mean, z_log_var = inputs
            batch = tf.shape(z_mean)[0]
            dim = tf.shape(z_mean)[1]
            epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
            return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    # Encoder
    encoder_inputs = keras.Input(shape=(df.shape[1],))
    x = layers.Dense(64, activation="relu")(encoder_inputs)
    x = layers.Dense(32, activation="relu")(x)
    z_mean = layers.Dense(latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")

    # Decoder
    latent_inputs = keras.Input(shape=(latent_dim,))
    x = layers.Dense(32, activation="relu")(latent_inputs)
    x = layers.Dense(64, activation="relu")(x)
    decoder_outputs = layers.Dense(df.shape[1], activation="sigmoid")(x)  # Sigmoid for scaled data
    decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")

    # VAE
    outputs = decoder(encoder(encoder_inputs)[2])
    vae = keras.Model(encoder_inputs, outputs, name="vae")

    # 3. Train the VAE
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # Adjusted learning rate
    vae.compile(optimizer=optimizer, loss="mse") 
    vae.fit(df, df, epochs=100, batch_size=32)  # Adjusted hyperparameters

    # 4. Generate Synthetic Data
    random_latent_vectors = tf.random.normal(shape=(num_samples, latent_dim))
    synthetic_data = decoder.predict(random_latent_vectors)

    # 5. Inverse Transform (Scaling)
    synthetic_data = scaler.inverse_transform(synthetic_data)  # Rescale to original range

    # Convert to DataFrame
    synthetic_df = pd.DataFrame(synthetic_data, columns=df.columns)

    return synthetic_df

no_countries_df = df.drop(['country', 'region'], axis=1)

synthetic_df = generate_synthetic_data_vae(no_countries_df)

ModuleNotFoundError: No module named 'distutils'

### Copula

In [None]:
import pandas as pd
import numpy as np
from copulas.multivariate import GaussianMultivariate
# Assuming your original data is in a DataFrame called 'df'

def generate_synthetic_data_copula(df, num_samples=137):
    """Generates synthetic data using a Gaussian copula.

    Args:
        df: The original pandas DataFrame containing the data.
        num_samples: The number of synthetic samples to generate.

    Returns:
        A pandas DataFrame containing the synthetic data.
    """
    df.fillna(df.mean(), inplace=True)  # Impute NaNs with mean
    df = df.replace([np.inf, -np.inf], np.nan).dropna()  # Remove infinite values

    # b. Scale numerical features
    scaler = MinMaxScaler()
    numerical_cols = ['happiness_score', 'gdp_per_capita', 'social_support',
                       'healthy_life_expectancy', 'freedom_to_make_life_choices',
                       'generosity', 'perceptions_of_corruption']
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # 1. Fit a Gaussian copula to the data
    copula = GaussianMultivariate()
    copula.fit(df)

    # 2. Generate synthetic data from the copula
    synthetic_data = copula.sample(num_samples)

    synthetic_data = scaler.inverse_transform(synthetic_data)  # Rescale to original range

    # 3. Convert to DataFrame
    synthetic_df = pd.DataFrame(synthetic_data, columns=df.columns)
    

    return synthetic_df

no_countries_df = df.drop(['country', 'region'], axis=1)
synthetic_df = generate_synthetic_data_copula(no_countries_df)
print(synthetic_df.head())