## Starting with 10 features

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import accuracy_score


In [None]:
# Load the raw ESS10 dataset from CSV. The path is relative, so this cell
# depends on the notebook's current working directory.
df = pd.read_csv("../data/ESS10/ESS10.csv")

| feature | min     | max       | missing-answer codes |
| ------- | ------- | --------- | -------------------- |
| sclmeet | 1       | 7         | 77, 88, 99 |
| inprdsc | 0       | 6         | 77, 88, 99 |
| sclact  | 1       | 5         | 7, 8, 9    |
| health  | 1       | 5         | 7, 8, 9    |
| rlgdgr  | 0       | 10        | 77, 88, 99 |
| dscrgrp | 1(yes)  | 2(no)     | 7, 8, 9    |
| ctzcntr | 1(yes)  | 2(no)     | 7, 8, 9    |
| brncntr | 1(yes)  | 2(no)     | 7, 8, 9    |
| gndr    | 1(male) | 0(female) | 9          |



In [None]:
def feature_scale_map(df, feature, missing_codes=None):
    """Re-code an ordinal column in place as dense 0-based ranks, -1 for missing.

    Every valid answer is replaced by its rank among the observed valid
    values (smallest valid value -> 0, next -> 1, ...). Every missing-answer
    code, and every NaN, is replaced by -1. The column ends up as int64.

    Args:
        df: DataFrame that owns the column; ``df[feature]`` is overwritten.
        feature: Name of a numeric column in ``df``.
        missing_codes: Optional iterable of values to treat as missing.
            When ``None`` (the default) the codes are inferred: the observed
            values are sorted and everything after the first jump larger
            than 1 is considered a missing-answer code (e.g. ``1..7``
            followed by ``77, 88, 99``). That heuristic misfires when a
            valid answer is never observed, so pass the codes explicitly
            for such columns.

    Returns:
        None. The frame is modified in place.
    """
    # NaN is handled separately below; it must not take part in the sort/diff.
    observed = np.sort(df[feature].dropna().unique())

    if missing_codes is not None:
        is_missing = np.isin(observed, list(missing_codes))
    else:
        is_missing = np.zeros(observed.shape, dtype=bool)
        gap_positions = np.flatnonzero(np.diff(observed) > 1)
        if gap_positions.size:
            # Flag *all* values past the first gap, not only the largest one,
            # so that 77 and 88 are not mistaken for valid ordinal levels.
            is_missing[gap_positions[0] + 1:] = True

    mapping = {val: rank for rank, val in enumerate(observed[~is_missing])}
    mapping.update({val: -1 for val in observed[is_missing]})

    # Every non-NaN value has a key in `mapping`, so the only NaN left after
    # `map` are the original NaN entries; those count as missing too.
    df[feature] = df[feature].map(mapping).fillna(-1).astype("int64")

In [None]:
# Select the country code, the survey-item columns and the `happy` column.
base_df = df[
    [
        'cntry', 'gndr', 'sclmeet', 'inprdsc', 'sclact', 'health',
        'rlgdgr', 'dscrgrp', 'ctzcntr', 'brncntr', 'happy',
    ]
]

# Discard rows whose `happy` value is one of the codes 77 / 88 / 99,
# then renumber the remaining rows from 0.
mask = base_df["happy"].isin([77, 88, 99])
base_df = base_df.loc[~mask].reset_index(drop=True)

# Re-code every column except the first one ('cntry') in place.
for col in base_df.columns[1:]:
    feature_scale_map(base_df, col)


In [None]:
# Min-max scale every column except the first and the last one of base_df
# (those two are left out of the scaled frame).
feature_cols = base_df.columns[1:-1]
minmax = MinMaxScaler()
base_df_scaled = pd.DataFrame(
    minmax.fit_transform(base_df[feature_cols]),
    columns=feature_cols,
)

In [None]:
# One-hot encode the country column into a dense array.
encoder = OneHotEncoder()
encoded_countries = encoder.fit_transform(base_df[['cntry']]).toarray()
country_dummies = pd.DataFrame(
    encoded_countries,
    columns=encoder.get_feature_names_out(),
)

# Put the country indicators, the scaled features and `happy` side by side,
# and store everything as float.
df_encoded = pd.concat(
    [country_dummies, base_df_scaled, base_df["happy"]],
    axis=1,
).astype(float)
df_encoded.head()

In [None]:
# Separate the `happy` column (y) from all remaining columns (X).
y = df_encoded["happy"].copy()
X = df_encoded.drop(columns="happy").copy()

# Hold out 20% of the rows for testing; the seed is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Multi-class XGBoost classifier; the class count is the number of distinct
# values found in y.
xgb_params = {
    "objective": 'multi:softprob',
    "num_class": len(set(y)),
    "eval_metric": 'mlogloss',
}
clf = xgb.XGBClassifier(**xgb_params)

# Train on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out split and report the share that match.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")