In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

| feature | min     | max       | *          |
| ------- | ------- | --------- | ---------- |
| sclmeet | 1       | 7         | 77, 88, 99 |
| inprdsc | 0       | 6         | 77, 88, 99 |
| sclact  | 1       | 5         | 7, 8, 9    |
| health  | 1       | 5         | 7, 8, 9    |
| rlgdgr  | 0       | 10        | 77, 88, 99 |
| dscrgrp | 1(yes)  | 2(no)     | 7, 8, 9    |
| ctzcntr | 1(yes)  | 2(no)     | 7, 8, 9    |
| brncntr | 1(yes)  | 2(no)     | 7, 8, 9    |
| gndr    | 1(male) | 0(female) | 9          |


In [2]:
data = pd.read_csv('/home/mehrshad/code/arthurcornelio88/how-happy-in-europe/data/20240319_ESS10_manually-filtered_arthurcornelio88.csv')
data = data.drop("Unnamed: 0", axis=1)
data= data.drop(['livecnta','gndr13','yrbrn6','yrbrn7','yrbrn8','yrbrn9','yrbrn10','yrbrn11','yrbrn12','yrbrn13','rshipa8','rshipa9','rshipa10','rshipa11','rshipa12', 'rshipa13',
                    'edulvlb','pdjobyr','isco08','edulvlpb','isco08p','wkhtotp','edulvlfb','edulvlfb','edulvlmb','anctry1','anctry2'], axis=1)
mask = data["happy"].isin([77, 88, 99])
data = data[~mask].reset_index(drop=True)
data.head()

  data = pd.read_csv('/home/mehrshad/code/arthurcornelio88/how-happy-in-europe/data/20240319_ESS10_manually-filtered_arthurcornelio88.csv')


Unnamed: 0,idno,cntry,netusoft,ppltrst,pplfair,pplhlp,polintr,psppsgva,actrolga,psppipla,...,vdtpitre,vdtpscre,vdtpaure,vdtpvire,vdtpoire,vdtpntre,vdtpapre,vdtprere,vdtpdkre,vdtpnare
0,10038,BE,5,6,7,4,2,2,1,3,...,0,0,0,0,0,0,1,0,0,0
1,10053,BE,5,3,4,3,4,1,4,2,...,0,0,0,0,0,1,0,0,0,0
2,10055,BE,5,6,8,5,2,3,3,3,...,0,0,0,0,0,0,1,0,0,0
3,10062,BE,5,7,5,5,4,2,1,2,...,0,0,0,0,0,0,1,0,0,0
4,10064,BE,5,3,8,8,1,2,2,2,...,0,0,0,0,0,0,1,0,0,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37521 entries, 0 to 37520
Columns: 386 entries, idno to vdtpnare
dtypes: float64(15), int64(364), object(7)
memory usage: 110.5+ MB


In [4]:
data['sclact'].sort_values().unique()

array([1, 2, 3, 4, 5, 7, 8, 9])

In [5]:
def feature_scale_map(df, feature):
    map_dict = {}

    all_vals = np.sort(df[feature].unique())

    if np.any(all_vals[1:] - all_vals[0:-1] > 1):
        min_ind = np.where(all_vals[1:] - all_vals[0:-1] > 1)[0][0]
        vals = all_vals[:min_ind].copy()
        min_val = vals.min()
        if min_val == 1:
            vals = vals - 1
        for i, val in enumerate(all_vals[:-1]):
            map_dict[val] = i
        map_dict[all_vals[-1]] = -1
        df[feature] = df[feature].replace(map_dict)
    else:
        for i, val in enumerate(all_vals):
            map_dict[val] = i
        df[feature] = df[feature].replace(map_dict)

In [6]:
X = data.drop('happy',axis=1)
y = data['happy']
numerical_cols = X.select_dtypes(exclude='object')
for col in numerical_cols.columns:
    feature_scale_map(numerical_cols, col)

In [7]:
# numerical_cols = numerical_cols.columns
# categorical_cols = X[['cntry','ctzcntr','brncntr','gndr', 'dscrgrp']].columns

# numerical_transformer = MinMaxScaler()
# categorical_transformer = OneHotEncoder(drop='if_binary')

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', XGBClassifier(objective='multi:softprob',
#                                  num_class=len(set(y)),
#                                  eval_metric='mlogloss'))
# ])

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# pipeline.fit(X_train, y_train)

# y_pred = pipeline.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy * 100:.2f}%")


In [9]:
y.isnull().sum()

0

In [8]:
numerical_cols = numerical_cols.columns
categorical_cols = X[['cntry','ctzcntr','brncntr','gndr', 'dscrgrp']].columns


numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(drop='if_binary')


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


smote = SMOTE()

pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', XGBClassifier(objective='multi:softprob',
                                 num_class=len(set(y)),
                                 eval_metric='mlogloss'))
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict using the pipeline
y_pred = pipeline.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [15]:
y.isna().sum()

0