In [None]:
# Importing pre-requisite libraries

import pandas as pd # For dealing with dataframes (i.e. pd.read_csv())
import matplotlib.pyplot as plt # Displaying graphs 
from sklearn.model_selection import train_test_split # Seperating the training and testing process
from sklearn.impute import SimpleImputer # For replacing missing values
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline # Easier preprocessing
from sklearn.ensemble import RandomForestRegressor # Will be used for the model
from sklearn.preprocessing import OrdinalEncoder # For dealing with categorical values
from sklearn.metrics import mean_absolute_error # MAE for model accuracy determination
from xgboost import XGBRegressor # Alternative model


## Preprocessing of data + Training + Testing

In [None]:
# Feature columns fed to the models (used for both training and validation).
data_features = [
    'Country name',
    'Regional indicator',
    'Freedom to make life choices',
    'Perceptions of corruption',
]

# Model inputs and the prediction target (happiness "Ladder score").
X = data[data_features]
y = data['Ladder score']

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Find the categorical (object-dtype) feature columns

# One boolean per training column: True where the column's dtype is `object`.
# Only the exact `object` dtype is matched here; columns stored under a
# dedicated string or categorical dtype would be skipped -- TODO confirm the
# dtypes produced by the loading step.
s = X_train.dtypes.eq('object')
object_cols = [column for column, is_object in s.items() if is_object]

print("Categorical variables:", object_cols)

In [None]:
# Stack the categorical columns of the training and validation splits
# (training rows first) into a single frame for fitting the encoder.
combined_X = pd.concat([split[object_cols] for split in (X_train, X_valid)])

In [None]:
# Ordinal Encoding categorical variables

# Work on copies so X_train / X_valid keep their original categorical values.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Fit the encoder on the training split only, so nothing from the validation
# split influences the preprocessing (the pooled `combined_X` frame is
# deliberately not used for fitting). Categories that occur only in the
# validation split are encoded as -1 instead of raising an error.
# Requires scikit-learn >= 0.24 for `handle_unknown='use_encoded_value'`.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                 unknown_value=-1)

if object_cols:  # nothing to encode when no categorical column was detected
    ordinal_encoder.fit(X_train[object_cols])
    label_X_train[object_cols] = ordinal_encoder.transform(X_train[object_cols])
    label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

### Dealing with missing values

#### Imputing missing values

In [None]:
# Imputing missing values
# Each missing entry is replaced with the mean of its column; the means are
# learned from the training split and reused unchanged on the validation split.
my_imputer = SimpleImputer(strategy='mean')

# The imputer returns bare arrays, so both the column names and the original
# row index are restored here. Keeping the index means the imputed frames stay
# label-aligned with y_train / y_valid (a default 0..n-1 index would silently
# misalign any later index-based join, concat or comparison).
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(label_X_train),
                               columns=label_X_train.columns,
                               index=label_X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(label_X_valid),
                               columns=label_X_valid.columns,
                               index=label_X_valid.index)

#### Removing null values (alternative approach)

In [None]:
# Alternative to imputation: discard every feature column that has a gap

# Names of the feature columns holding at least one null in the training split.
missing_cols = X_train.columns[X_train.isnull().any().to_numpy()].tolist()

# Remove those columns from the encoded training and validation frames alike.
reduced_X_train = label_X_train.drop(columns=missing_cols)
reduced_X_valid = label_X_valid.drop(columns=missing_cols)

#### Random Forest Regressor (Training + Testing)

In [None]:
# Approach 1: random forest trained on the mean-imputed features

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=150, random_state=1).fit(
    imputed_X_train, y_train
)
predictions = model.predict(imputed_X_valid)  # validation-set predictions

# Report the mean absolute error on the validation split (header, then value).
print("MAE from Approach 1 (Imputed Values):",
      mean_absolute_error(y_valid, predictions),
      sep="\n")

In [None]:
# Approach 2: random forest trained with the null-containing columns dropped

# NOTE(review): this forest uses 500 trees while the Approach 1 forest uses
# 150, so the two MAE values differ in more than the missing-value strategy
# -- confirm this is intended.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=500, random_state=1).fit(
    reduced_X_train, y_train
)
predictions = model.predict(reduced_X_valid)  # validation-set predictions

# Report the mean absolute error on the validation split (header, then value).
print("MAE from Approach 2 (Dropped Values):",
      mean_absolute_error(y_valid, predictions),
      sep="\n")

#### XGBoost + Parameter Tuning

In [None]:
# We will use imputed values, as they've provided the best accuracy

# `early_stopping_rounds` is configured on the estimator itself: passing it to
# `fit()` was deprecated in XGBoost 1.6 and removed in 2.0, where it raises a
# TypeError. The constructor form requires XGBoost >= 1.6.
my_model = XGBRegressor(n_estimators=500, learning_rate=0.85, random_state=1,
                        early_stopping_rounds=5)

# Training stops once the validation error has not improved for 5 rounds.
# NOTE(review): the validation split doubles as the early-stopping set, so the
# MAE printed below is measured on data that also chose the stopping round.
my_model.fit(imputed_X_train, y_train,
             eval_set=[(imputed_X_valid, y_valid)],
             verbose=False)

pred = my_model.predict(imputed_X_valid)
print("MAE from Approach 3 (XGB Regressor + Imputation): ")
print(mean_absolute_error(y_valid, pred))