In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier

| feature | min     | max       | missing-value codes (*) |
| ------- | ------- | --------- | ----------------------- |
| sclmeet | 1       | 7         | 77, 88, 99              |
| inprdsc | 0       | 6         | 77, 88, 99              |
| sclact  | 1       | 5         | 7, 8, 9                 |
| health  | 1       | 5         | 7, 8, 9                 |
| rlgdgr  | 0       | 10        | 77, 88, 99              |
| dscrgrp | 1(yes)  | 2(no)     | 7, 8, 9                 |
| ctzcntr | 1(yes)  | 2(no)     | 7, 8, 9                 |
| brncntr | 1(yes)  | 2(no)     | 7, 8, 9                 |
| gndr    | 1(male) | 0(female) | 9                       |


In [None]:
# Load the manually filtered ESS10 extract and build the modelling frame `data`.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
DATA_PATH = '/home/mehrshad/code/arthurcornelio88/how-happy-in-europe/data/20240319_ESS10_manually-filtered_arthurcornelio88.csv'

# Columns removed before modelling: the CSV's saved index column plus the
# survey variables this notebook does not use. Each label is listed once
# (the original list named 'edulvlfb' twice).
DROPPED_COLUMNS = [
    "Unnamed: 0",
    'livecnta', 'gndr13',
    'yrbrn6', 'yrbrn7', 'yrbrn8', 'yrbrn9', 'yrbrn10', 'yrbrn11', 'yrbrn12', 'yrbrn13',
    'rshipa8', 'rshipa9', 'rshipa10', 'rshipa11', 'rshipa12', 'rshipa13',
    'edulvlb', 'pdjobyr', 'isco08', 'edulvlpb', 'isco08p', 'wkhtotp',
    'edulvlfb', 'edulvlmb', 'anctry1', 'anctry2',
]

# Values of the target `happy` that are excluded from the data set.
HAPPY_EXCLUDED_CODES = [77, 88, 99]

# One chain instead of rebinding `data` three times, so re-running the cell
# always rebuilds the frame from the file.
data = (
    pd.read_csv(DATA_PATH)
    .drop(columns=DROPPED_COLUMNS)
    .loc[lambda frame: ~frame["happy"].isin(HAPPY_EXCLUDED_CODES)]
    .reset_index(drop=True)
)
data.head()

In [None]:
# Print a summary of the cleaned frame: column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# List the distinct values present in `sclact`, in ascending order.
data['sclact'].sort_values().unique()

In [None]:
def feature_scale_map(df, feature, missing_value=-1):
    """Re-code one numeric survey column of ``df`` in place.

    The distinct non-null values of ``df[feature]`` are sorted. If two
    neighbouring values differ by more than 1, the first such gap is taken as
    the boundary between the answer scale and the special codes that follow
    it (e.g. ``1..7`` followed by ``77, 88, 99``):

    * values up to the gap are replaced by their 0-based rank;
    * every value after the gap is replaced by ``missing_value``.

    Without a gap, all values are simply replaced by their 0-based rank.
    NaN entries are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; ``df[feature]`` is overwritten.
    feature : str
        Name of the column to re-code.
    missing_value : scalar, default -1
        Replacement for the codes found after the first gap. Pass ``np.nan``
        to turn them into real missing values.

    Returns
    -------
    None
        The frame is modified in place.
    """
    observed = np.sort(df[feature].dropna().unique())
    if observed.size == 0:
        # Empty or all-NaN column: nothing to re-code.
        return

    # Positions i where observed[i + 1] - observed[i] > 1.
    gap_positions = np.flatnonzero(np.diff(observed) > 1)
    # The value *at* the first gap position still belongs to the answer scale,
    # hence the + 1 when turning the position into a count.
    n_valid = gap_positions[0] + 1 if gap_positions.size else observed.size

    mapping = {value: rank for rank, value in enumerate(observed[:n_valid])}
    mapping.update(dict.fromkeys(observed[n_valid:], missing_value))

    # Every non-null value is a key of `mapping`; NaN has no key and stays NaN.
    df[feature] = df[feature].map(mapping)

In [None]:
# Split the frame into features and the target.
X = data.drop(columns='happy')
y = data['happy']

# Re-code every non-object column. `feature_scale_map` overwrites the column of
# the frame it is given, so work on an explicit copy of the numeric part.
X_cols = X.select_dtypes(exclude='object').copy()
for col in X_cols.columns:
    feature_scale_map(X_cols, col)

# `X_cols` is a separate frame: without this write-back the model would be
# fitted on the raw codes in `X` and the re-coding above would be discarded.
X[X_cols.columns] = X_cols

In [None]:
# One seed for every stochastic step (split, SMOTE, XGBoost) so that
# re-running the notebook reproduces the same accuracy.
RANDOM_STATE = 0

categorical_cols = X[['cntry', 'ctzcntr', 'brncntr', 'gndr', 'dscrgrp']].columns
# The remaining non-object columns are treated as numerical. The categorical
# columns are excluded here; otherwise any of them that is numeric would be fed
# to the model twice (once min-max scaled, once one-hot encoded).
numerical_cols = X_cols.columns.difference(categorical_cols, sort=False)

numerical_imputer = SimpleImputer(strategy='constant')
categorical_imputer = SimpleImputer(strategy='constant')

# Step names ('imputer', 'scaler', 'encoder', 'num', 'cat', ...) are the keys
# used to address parameters, e.g. 'preprocessor__num__imputer__strategy'.
numerical_preprocessor = Pipeline(steps=[
    ('imputer', numerical_imputer),
    ('scaler', MinMaxScaler())
])
categorical_preprocessor = Pipeline(steps=[
    ('imputer', categorical_imputer),
    # handle_unknown='ignore': a category that only shows up at predict time is
    # encoded as all zeros instead of raising an error.
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_cols),
        ('cat', categorical_preprocessor, categorical_cols)
    ])

smote = SMOTE(random_state=RANDOM_STATE)

# imblearn's pipeline applies the SMOTE step only while fitting, so the
# held-out rows are never resampled.
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', XGBClassifier(objective='multi:softprob',
                                 num_class=len(set(y)),
                                 eval_metric='mlogloss',
                                 random_state=RANDOM_STATE))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Show every parameter of the pipeline; nested ones are keyed as '<step>__<param>',
# which is the naming a parameter grid uses to address them.
pipeline.get_params()

In [None]:
# Hyper-parameter search over the classifier and the two imputers.
# Keys follow the '<step>__<param>' naming of the pipeline's nested parameters.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'preprocessor__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant']
}

# 2 * 2 * 3 * 3 * 3 * 2 = 216 candidates, each fitted on 5 folds.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Reuses X_train / X_test / y_train / y_test from the cell that fitted the
# baseline pipeline; the original repeated the identical, deterministic
# train_test_split call here.
grid_search.fit(X_train, y_train)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters found: {best_parameters}")
print(f"Best cross-validated score: {best_score * 100:.2f}%")

# Score the best estimator found by the search on the held-out rows.
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
